From e98e2fe52a8614b1473d8f19847036afd8309445 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Tue, 25 Jun 2024 12:21:53 -0500 Subject: [PATCH 01/40] Initial FLI-based implementation --- .../_core/launcher/dragon/dragonBackend.py | 30 ++++- .../_core/mli/comm/channel/dragonchannel.py | 12 +- smartsim/_core/mli/comm/channel/dragonfli.py | 54 +++++++++ .../infrastructure/control/workermanager.py | 33 +++--- .../_core/mli/infrastructure/worker/worker.py | 106 ++++++++++++++---- smartsim/_core/mli/message_handler.py | 10 +- 6 files changed, 192 insertions(+), 53 deletions(-) create mode 100644 smartsim/_core/mli/comm/channel/dragonfli.py diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py index 2456606623..9ec4cc93e9 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -26,6 +26,7 @@ import collections import functools import itertools +import os import time import typing as t from dataclasses import dataclass, field @@ -38,10 +39,13 @@ # isort: off import dragon.infrastructure.connection as dragon_connection import dragon.infrastructure.policy as dragon_policy +from dragon.infrastructure.process_desc import ProcessOptions +from dragon.data.ddict.ddict import DDict import dragon.native.group_state as dragon_group_state import dragon.native.process as dragon_process import dragon.native.process_group as dragon_process_group import dragon.native.machine as dragon_machine +import multiprocessing as mp # pylint: enable=import-error # isort: on @@ -75,6 +79,9 @@ def __str__(self) -> str: return self.value +mp.set_start_method("dragon") + + @dataclass class ProcessGroupInfo: status: SmartSimStatus @@ -187,6 +194,7 @@ def __init__(self, pid: int) -> None: self._view = DragonBackendView(self) logger.debug(self._view.host_desc) + self._infra_ddict: t.Optional[DDict] = None @property def hosts(self) -> list[str]: @@ -391,6 +399,20 @@ def _stop_steps(self) -> None: self._group_infos[step_id].status = SmartSimStatus.STATUS_CANCELLED self._group_infos[step_id].return_codes = [-9] + @property + def infra_ddict(self) -> str: + """Create a Dragon distributed dictionary and return its + serialized descriptor + """ + if self._infra_ddict is None: + logger.info("Creating DDict") + self._infra_ddict = DDict() # todo: parametrize + logger.info("Created DDict") + self._infra_ddict["creation"] = str(time.time()) + logger.info(self._infra_ddict["creation"]) + + return self._infra_ddict.serialize() + def _start_steps(self) -> None: self._heartbeat() with self._queue_lock: @@ -406,6 +428,7 @@ def _start_steps(self) -> None: placement=dragon_policy.Policy.Placement.HOST_NAME, host_name=hosts[0], ) + options = ProcessOptions(make_inf_channels=True) grp = dragon_process_group.ProcessGroup( restart=False, pmi_enabled=request.pmi_enabled, policy=global_policy ) @@ -421,10 +444,15 @@ def _start_steps(self) -> None: target=request.exe, args=request.exe_args, cwd=request.path, - env={**request.current_env, **request.env}, + env={ + **request.current_env, + **request.env, + "SS_DRG_DDICT": self.infra_ddict, + }, stdout=dragon_process.Popen.PIPE, stderr=dragon_process.Popen.PIPE, policy=local_policy, + options=options, ) grp.add_process(nproc=request.tasks_per_node, template=tmp_proc) diff --git a/smartsim/_core/mli/comm/channel/dragonchannel.py b/smartsim/_core/mli/comm/channel/dragonchannel.py index 4fd26861ca..d4dbfa3ba0 100644 --- a/smartsim/_core/mli/comm/channel/dragonchannel.py +++ 
b/smartsim/_core/mli/comm/channel/dragonchannel.py @@ -24,16 +24,13 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import typing as t import smartsim._core.mli.comm.channel.channel as cch from smartsim.log import get_logger logger = get_logger(__name__) -if t.TYPE_CHECKING: - import dragon.channels as dch - import dragon.utils as du +import dragon.channels as dch class DragonCommChannel(cch.CommChannelBase): @@ -42,11 +39,10 @@ class DragonCommChannel(cch.CommChannelBase): def __init__(self, key: bytes) -> None: """Initialize the DragonCommChannel instance""" super().__init__(key) - # todo: do we need memory pool information to construct the channel correctly? - self._channel: "dch.Channel" = du.get_channel(key) + self._channel: dch.Channel = dch.Channel.attach(key) def send(self, value: bytes) -> None: """Send a message throuh the underlying communication channel :param value: The value to send""" - logger.debug(f"Channel {self.descriptor.decode('utf-8')} sending message") - self._channel.send_bytes(value) + with self._channel.sendh(timeout=None) as sendh: + sendh.send_bytes(value) diff --git a/smartsim/_core/mli/comm/channel/dragonfli.py b/smartsim/_core/mli/comm/channel/dragonfli.py new file mode 100644 index 0000000000..f601bb2eb8 --- /dev/null +++ b/smartsim/_core/mli/comm/channel/dragonfli.py @@ -0,0 +1,54 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# isort: off +import dragon +from dragon import fli +import dragon.channels as dch + +# isort: on + + +import smartsim._core.mli.comm.channel.channel as cch +from smartsim.log import get_logger + +logger = get_logger(__name__) + + +class DragonFLIChannel(cch.CommChannelBase): + """Passes messages by writing to a Dragon FLI Channel""" + + def __init__(self, fli_desc: bytes) -> None: + """Initialize the DragonFLIChannel instance""" + super().__init__(fli_desc) + # todo: do we need memory pool information to construct the channel correctly? 
+ self._channel: "dch.Channel" = fli.FLInterface.attach(fli_desc) + + def send(self, value: bytes) -> None: + """Send a message throuh the underlying communication channel + :param value: The value to send""" + with self._channel.sendh(timeout=None) as sendh: + sendh.send_bytes(value) diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py index b3b79f7f30..588dc8e28d 100644 --- a/smartsim/_core/mli/infrastructure/control/workermanager.py +++ b/smartsim/_core/mli/infrastructure/control/workermanager.py @@ -24,14 +24,19 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import multiprocessing as mp +# isort: off +import dragon +from dragon import fli + +# isort: on +import time import typing as t import numpy as np from smartsim._core.entrypoints.service import Service from smartsim._core.mli.comm.channel.channel import CommChannelBase -from smartsim._core.mli.comm.channel.dragonchannel import DragonCommChannel +from smartsim._core.mli.comm.channel.dragonfli import DragonFLIChannel from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStore from smartsim._core.mli.infrastructure.worker.worker import ( InferenceReply, @@ -84,12 +89,6 @@ def deserialize_message( None # these will really be tensors already ) - # # client example - # msg = Message() - # t = torch.Tensor() - # msg.inputs = [custom_byte_converter(t)] - # mli_client.request_inference(msg) - # # end client input_meta: t.List[t.Any] = [] if request.input.which() == "inputKeys": @@ -163,12 +162,12 @@ class WorkerManager(Service): def __init__( self, - task_queue: "mp.Queue[bytes]", + file_like_interface: fli.FLInterface, worker: MachineLearningWorkerBase, feature_store: t.Optional[FeatureStore] = None, as_service: bool = False, cooldown: int = 0, - comm_channel_type: t.Type[CommChannelBase] = DragonCommChannel, + comm_channel_type: t.Type[CommChannelBase] = DragonFLIChannel, ) -> None: """Initialize the WorkerManager :param task_queue: The queue to monitor for new tasks @@ -182,7 +181,7 @@ def __init__( super().__init__(as_service, cooldown) """a collection of workers the manager is controlling""" - self._task_queue: "mp.Queue[bytes]" = task_queue + self._task_queue: fli.FLInterface = file_like_interface """the queue the manager monitors for new tasks""" self._feature_store: t.Optional[FeatureStore] = feature_store """a feature store to retrieve models from""" @@ -232,7 +231,12 @@ def _on_iteration(self) -> None: return # perform default deserialization of the message envelope - request_bytes: bytes = self._task_queue.get() + # perform default deserialization of the message envelope + with self._task_queue.recvh(timeout=None) as recvh: + try: + request_bytes, _ = recvh.recv_bytes(timeout=None) + except fli.FLIEOT as exc: + return request = deserialize_message(request_bytes, self._comm_channel_type) if not self._validate_request(request): @@ -246,17 +250,12 @@ def _on_iteration(self) -> None: fetch_input_result = self._worker.fetch_inputs(request, self._feature_store) transformed_input = self._worker.transform_input(request, fetch_input_result) - # batch: t.Collection[_Datum] = transform_result.transformed_input - # if self._batch_size: - # batch = self._worker.batch_requests(transform_result, self._batch_size) - reply = InferenceReply() try: execute_result = self._worker.execute( request, model_result, transformed_input ) - 
transformed_output = self._worker.transform_output(request, execute_result) if request.output_keys: diff --git a/smartsim/_core/mli/infrastructure/worker/worker.py b/smartsim/_core/mli/infrastructure/worker/worker.py index 99b51e178d..8992b2b6ea 100644 --- a/smartsim/_core/mli/infrastructure/worker/worker.py +++ b/smartsim/_core/mli/infrastructure/worker/worker.py @@ -24,12 +24,17 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +import io import typing as t from abc import ABC, abstractmethod +import numpy as np +import torch + import smartsim.error as sse from smartsim._core.mli.comm.channel.channel import CommChannelBase from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStore +from smartsim._core.mli.mli_schemas.tensor import tensor_capnp from smartsim.log import get_logger logger = get_logger(__name__) @@ -106,9 +111,10 @@ def __init__(self, result: t.Any) -> None: class FetchInputResult: """A wrapper around fetched inputs""" - def __init__(self, result: t.List[bytes]) -> None: + def __init__(self, result: t.List[bytes], meta: t.List[t.Any]) -> None: """Initialize the object""" self.inputs = result + self.meta = meta class TransformOutputResult: @@ -122,7 +128,6 @@ def __init__( self.shape = shape self.order = order self.dtype = dtype - # todo: determine if each output must have an individual (shape, order, dtype) class CreateInputBatchResult: @@ -152,8 +157,6 @@ def fetch_model( :param request: The request that triggered the pipeline :param feature_store: The feature store used for persistence :return: Raw bytes of the model""" - if not feature_store: - raise ValueError("Feature store is required for model retrieval") if request.raw_model: # Should we cache model in the feature store? 
@@ -162,6 +165,9 @@ def fetch_model( # short-circuit and return the directly supplied model return FetchModelResult(request.raw_model) + if not feature_store: + raise ValueError("Feature store is required for model retrieval") + if not request.model_key: raise sse.SmartSimError( "Key must be provided to retrieve model from feature store" @@ -185,8 +191,12 @@ def fetch_inputs( :param request: The request that triggered the pipeline :param feature_store: The feature store used for persistence :return: the fetched input""" + + if request.raw_inputs: + return FetchInputResult(request.raw_inputs, request.input_meta) + if not feature_store: - raise ValueError("Feature store is required for input retrieval") + raise ValueError("No input and no feature store provided") if request.input_keys: data: t.List[bytes] = [] @@ -201,9 +211,6 @@ def fetch_inputs( ) from ex return FetchInputResult(data) - if request.raw_inputs: - return FetchInputResult(request.raw_inputs) - raise ValueError("No input source") @staticmethod @@ -250,14 +257,6 @@ class MachineLearningWorkerBase(MachineLearningWorkerCore, ABC): """Abstrct base class providing contract for a machine learning worker implementation.""" - # @staticmethod - # @abstractmethod - # def deserialize(request: InferenceRequest) -> InferenceRequest: - # """Given a collection of data serialized to bytes, convert the bytes - # to a proper representation used by the ML backend - # :param data_blob: inference request as a byte-serialized blob - # :return: InferenceRequest deserialized from the input""" - @staticmethod @abstractmethod def load_model( @@ -303,11 +302,70 @@ def transform_output( :param execute_result: The result of inference wrapped in an ExecuteResult :return:""" - # @staticmethod - # @abstractmethod - # def serialize_reply( - # request: InferenceRequest, results: OutputTransformResult - # ) -> bytes: - # """Given an output, serialize to bytes for transport - # :param reply: The result of the inference pipeline - # :return: a byte-serialized version of the reply""" + +class TorchWorker(MachineLearningWorkerBase): + """A worker that executes a PyTorch model.""" + + @staticmethod + def load_model( + request: InferenceRequest, fetch_result: FetchModelResult + ) -> LoadModelResult: + model_bytes = fetch_result.model_bytes or request.raw_model + if not model_bytes: + raise ValueError("Unable to load model without reference object") + + _device_to_torch = {"cpu": "cpu", "gpu": "cuda"} + device = _device_to_torch[str(request.device)] + model: torch.nn.Module = torch.jit.load(io.BytesIO(model_bytes), map_location=device) # type: ignore[no-untyped-call] + result = LoadModelResult(model) + return result + + @staticmethod + def transform_input( + request: InferenceRequest, fetch_result: FetchInputResult + ) -> TransformInputResult: + result = [] + + _device_to_torch = {"cpu": "cpu", "gpu": "cuda"} + device = _device_to_torch[str(request.device)] + for item, item_meta in zip(fetch_result.inputs, fetch_result.meta): + td: tensor_capnp.TensorDescriptor = item_meta + result.append( + torch.tensor( + np.frombuffer(item, dtype=str(td.dataType)).reshape(td.dimensions) + ).to(device) + ) + return TransformInputResult(result) + # return data # note: this fails copy test! 
+ + @staticmethod + def execute( + request: InferenceRequest, + load_result: LoadModelResult, + transform_result: TransformInputResult, + ) -> ExecuteResult: + if not load_result.model: + raise sse.SmartSimError("Model must be loaded to execute") + + model: torch.nn.Module = load_result.model + model.eval() + results = [model(tensor).detach() for tensor in transform_result.transformed] + + execute_result = ExecuteResult(results) + return execute_result + + @staticmethod + def transform_output( + request: InferenceRequest, + execute_result: ExecuteResult, + ) -> TransformOutputResult: + if str(request.device) != "cpu": + transformed = [ + item.to("cpu").clone() for item in execute_result.predictions + ] + # todo: need the shape from latest schemas added here. + return TransformOutputResult(transformed, None, "c", "float32") # fixme + else: + return TransformOutputResult( + execute_result.predictions, None, "c", "float32" + ) # fixme diff --git a/smartsim/_core/mli/message_handler.py b/smartsim/_core/mli/message_handler.py index 733fa83d98..4a5725bd9e 100644 --- a/smartsim/_core/mli/message_handler.py +++ b/smartsim/_core/mli/message_handler.py @@ -391,7 +391,9 @@ def deserialize_request(request_bytes: t.ByteString) -> request_capnp.Request: :param request_bytes: Bytes to be deserialized into a Request """ - bytes_message = request_capnp.Request.from_bytes(request_bytes) + bytes_message = request_capnp.Request.from_bytes( + request_bytes, traversal_limit_in_words=2**63 + ) with bytes_message as message: return message @@ -484,7 +486,7 @@ def _assign_custom_response_attributes( response.customAttributes.tf = custom_attrs # type: ignore else: raise ValueError("""Invalid custom attribute class name. - Expected 'TensorFlowResponseAttributes' or + Expected 'TensorFlowResponseAttributes' or 'TorchResponseAttributes'.""") except Exception as e: raise ValueError("Error assigning custom attributes to response.") from e @@ -529,7 +531,9 @@ def deserialize_response(response_bytes: t.ByteString) -> response_capnp.Respons """ Deserializes a serialized response message. 
""" - bytes_message = response_capnp.Response.from_bytes(response_bytes) + bytes_message = response_capnp.Response.from_bytes( + response_bytes, traversal_limit_in_words=2**63 + ) with bytes_message as message: return message From 043f0e74e68ad07846ffce9a0013eb6cf1919c09 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Tue, 25 Jun 2024 13:42:44 -0500 Subject: [PATCH 02/40] Add inference example stub --- .../high_throughput_inference/mli_driver.py | 35 +++++ .../high_throughput_inference/mock_app.py | 129 ++++++++++++++++++ .../standalone_workermanager.py | 46 +++++++ 3 files changed, 210 insertions(+) create mode 100644 examples/high_throughput_inference/mli_driver.py create mode 100644 examples/high_throughput_inference/mock_app.py create mode 100644 examples/high_throughput_inference/standalone_workermanager.py diff --git a/examples/high_throughput_inference/mli_driver.py b/examples/high_throughput_inference/mli_driver.py new file mode 100644 index 0000000000..187a7b8214 --- /dev/null +++ b/examples/high_throughput_inference/mli_driver.py @@ -0,0 +1,35 @@ +import sys +from smartsim import Experiment +from smartsim.status import TERMINAL_STATUSES +import time + +worker_manager_script_name = "standalone_workermanager.py" +app_script_name = "mock_app.py" +device = "cpu" + + +exp = Experiment("MLI_proto", launcher="dragon") + +worker_manager_rs = exp.create_run_settings(sys.executable, [worker_manager_script_name]) +worker_manager = exp.create_model("worker_manager", run_settings=worker_manager_rs) +worker_manager.attach_generator_files(to_copy=[worker_manager_script_name]) + + +app_rs = exp.create_run_settings(sys.executable, exe_args = [app_script_name, "--device", device]) +app = exp.create_model("app", run_settings=app_rs) +app.attach_generator_files(to_copy=[app_script_name], to_symlink=[f"resnet50.{device.upper()}.pt"]) + + +exp.generate(worker_manager, app, overwrite=True) +exp.start(worker_manager, app, block=False) + +while True: + if exp.get_status(app)[0] in TERMINAL_STATUSES: + exp.stop(worker_manager) + break + if exp.get_status(worker_manager)[0] in TERMINAL_STATUSES: + exp.stop(app) + break + time.sleep(5) + +print("Exiting.") \ No newline at end of file diff --git a/examples/high_throughput_inference/mock_app.py b/examples/high_throughput_inference/mock_app.py new file mode 100644 index 0000000000..d6f8253b70 --- /dev/null +++ b/examples/high_throughput_inference/mock_app.py @@ -0,0 +1,129 @@ +# isort: off +import dragon +from dragon import fli +from dragon.channels import Channel +import dragon.channels +from dragon.data.ddict.ddict import DDict +from dragon.globalservices.api_setup import connect_to_infrastructure +from dragon.utils import b64decode, b64encode + +# isort: on + +import argparse +import io +import numpy +import os +import tabulate +import time +import torch +import typing as t + +from smartsim._core.mli.message_handler import MessageHandler + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser("Mock application") + parser.add_argument("--device", default="cpu") + + args = parser.parse_args() + + connect_to_infrastructure() + ddict_str = os.environ["SS_DRG_DDICT"] + + ddict = DDict.attach(ddict_str) + + to_worker_fli_str = None + + while to_worker_fli_str is None: + try: + to_worker_fli_str = ddict["to_worker_fli"] + except Exception as e: + time.sleep(1) + + to_worker_fli = fli.FLInterface.attach(b64decode(to_worker_fli_str)) + + batch_size = 32 + model = torch.jit.load(f"resnet50.{args.device.upper()}.pt") + buffer = io.BytesIO() + batch = 
torch.randn((batch_size, 3, 224, 224), dtype=torch.float32) + scripted = torch.jit.trace(model, batch) + torch.jit.save(scripted, buffer) + + total_iterations = 10 + + headers=[ + "batch_size", + "build_tensor", + "build_request", + "serialize_request", + "send", + "receive", + "deserialize_response", + "deserialize_tensor", + ] + + print(",".join(headers)) + + for batch_size in [1, 8, 32, 64, 128]: + + timings = [] + for iteration_number in range(total_iterations + int(batch_size==1)): + + timings.append([batch_size]) + + batch = torch.randn((batch_size, 3, 224, 224), dtype=torch.float32) + + expected_device: t.Literal["cpu", "gpu"] = args.device.lower() + + start = time.perf_counter() + interm = start + built_tensor = MessageHandler.build_tensor( + batch.numpy(), "c", "float32", list(batch.shape) + ) + timings[-1].append(time.perf_counter() - interm) + interm = time.perf_counter() + + from_worker_ch = Channel.make_process_local() + + request = MessageHandler.build_request( + reply_channel=from_worker_ch.serialize(), + model=buffer.getvalue(), + device=expected_device, + inputs=[built_tensor], + outputs=[], + output_descriptors=[], + custom_attributes=None, + ) + + timings[-1].append(time.perf_counter() - interm) + interm = time.perf_counter() + request_bytes = MessageHandler.serialize_request(request) + timings[-1].append(time.perf_counter() - interm) + interm = time.perf_counter() + with to_worker_fli.sendh(timeout=None) as to_sendh: + to_sendh.send_bytes(request_bytes) + timings[-1].append(time.perf_counter() - interm) + interm = time.perf_counter() + + with from_worker_ch.recvh(timeout=None) as from_recvh: + resp = from_recvh.recv_bytes(timeout=None) + timings[-1].append(time.perf_counter() - interm) + interm = time.perf_counter() + response = MessageHandler.deserialize_response(resp) + timings[-1].append(time.perf_counter() - interm) + interm = time.perf_counter() + result = torch.from_numpy( + numpy.frombuffer( + response.result.data[0].blob, + dtype=str(response.result.data[0].tensorDescriptor.dataType), + ) + ) + + timings[-1].append(time.perf_counter() - interm) + interm = time.perf_counter() + + # duration = time.perf_counter() - start + # print(f"{duration:.3f} s") + + print(",".join(str(timing) for timing in timings[-1])) diff --git a/examples/high_throughput_inference/standalone_workermanager.py b/examples/high_throughput_inference/standalone_workermanager.py new file mode 100644 index 0000000000..7ddeff0094 --- /dev/null +++ b/examples/high_throughput_inference/standalone_workermanager.py @@ -0,0 +1,46 @@ +# isort: off +import dragon +from dragon import fli +from dragon.channels import Channel +from dragon.data.ddict.ddict import DDict +from dragon.utils import b64decode, b64encode +from dragon.globalservices.api_setup import connect_to_infrastructure +# isort: on +import logging +import multiprocessing as mp +import os +import pathlib +import shutil +import time + + +from smartsim._core.mli.comm.channel.dragonfli import DragonFLIChannel +from smartsim._core.mli.infrastructure.worker.worker import TorchWorker +from smartsim._core.mli.infrastructure.control.workermanager import ( + DragonCommChannel, + WorkerManager, +) + +if __name__ == "__main__": + connect_to_infrastructure() + mp.set_start_method("dragon") + ddict_str = os.environ["SS_DRG_DDICT"] + ddict = DDict.attach(ddict_str) + + to_worker_channel = Channel.make_process_local() + to_worker_manager_channel = Channel.make_process_local() + channels = [Channel.make_process_local() for _ in range(100)] + to_worker_fli = 
fli.FLInterface(main_ch=to_worker_channel, manager_ch=to_worker_manager_channel, stream_channels=channels) + ddict["to_worker_fli"] = b64encode(to_worker_fli.serialize()) + + torch_worker = TorchWorker() + + worker_manager = WorkerManager( + file_like_interface=to_worker_fli, + worker=torch_worker, + feature_store=None, + as_service=True, + cooldown=10, + comm_channel_type=DragonCommChannel, + ) + worker_manager.execute() From efc9e839d2c317a49662776b710993e43c88f75c Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Tue, 25 Jun 2024 17:09:50 -0500 Subject: [PATCH 03/40] Lint, style, black magic --- .../high_throughput_inference/mli_driver.py | 2 +- .../standalone_workermanager.py | 3 +- .../_core/launcher/dragon/dragonBackend.py | 3 +- .../_core/mli/infrastructure/worker/worker.py | 30 +++++++++++-------- 4 files changed, 21 insertions(+), 17 deletions(-) diff --git a/examples/high_throughput_inference/mli_driver.py b/examples/high_throughput_inference/mli_driver.py index 187a7b8214..833766cbef 100644 --- a/examples/high_throughput_inference/mli_driver.py +++ b/examples/high_throughput_inference/mli_driver.py @@ -5,7 +5,7 @@ worker_manager_script_name = "standalone_workermanager.py" app_script_name = "mock_app.py" -device = "cpu" +device = "gpu" exp = Experiment("MLI_proto", launcher="dragon") diff --git a/examples/high_throughput_inference/standalone_workermanager.py b/examples/high_throughput_inference/standalone_workermanager.py index 7ddeff0094..bb93c613ce 100644 --- a/examples/high_throughput_inference/standalone_workermanager.py +++ b/examples/high_throughput_inference/standalone_workermanager.py @@ -14,10 +14,9 @@ import time -from smartsim._core.mli.comm.channel.dragonfli import DragonFLIChannel +from smartsim._core.mli.comm.channel.dragonchannel import DragonCommChannel from smartsim._core.mli.infrastructure.worker.worker import TorchWorker from smartsim._core.mli.infrastructure.control.workermanager import ( - DragonCommChannel, WorkerManager, ) diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py index 9ec4cc93e9..d103579115 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -26,7 +26,6 @@ import collections import functools import itertools -import os import time import typing as t from dataclasses import dataclass, field @@ -411,7 +410,7 @@ def infra_ddict(self) -> str: self._infra_ddict["creation"] = str(time.time()) logger.info(self._infra_ddict["creation"]) - return self._infra_ddict.serialize() + return str(self._infra_ddict.serialize()) def _start_steps(self) -> None: self._heartbeat() diff --git a/smartsim/_core/mli/infrastructure/worker/worker.py b/smartsim/_core/mli/infrastructure/worker/worker.py index 8992b2b6ea..295b2573c8 100644 --- a/smartsim/_core/mli/infrastructure/worker/worker.py +++ b/smartsim/_core/mli/infrastructure/worker/worker.py @@ -111,7 +111,7 @@ def __init__(self, result: t.Any) -> None: class FetchInputResult: """A wrapper around fetched inputs""" - def __init__(self, result: t.List[bytes], meta: t.List[t.Any]) -> None: + def __init__(self, result: t.List[bytes], meta: t.Optional[t.List[t.Any]]) -> None: """Initialize the object""" self.inputs = result self.meta = meta @@ -121,7 +121,7 @@ class TransformOutputResult: """A wrapper around inference results transformed for transmission""" def __init__( - self, result: t.Any, shape: t.List[int], order: str, dtype: str + self, result: t.Any, shape: t.Optional[t.List[int]], order: 
str, dtype: str ) -> None: """Initialize the OutputTransformResult""" self.outputs = result @@ -209,7 +209,9 @@ def fetch_inputs( raise sse.SmartSimError( f"Model could not be retrieved with key {input_}" ) from ex - return FetchInputResult(data) + return FetchInputResult( + data, None + ) # fixme: need to get both tensor and descriptor raise ValueError("No input source") @@ -316,7 +318,9 @@ def load_model( _device_to_torch = {"cpu": "cpu", "gpu": "cuda"} device = _device_to_torch[str(request.device)] - model: torch.nn.Module = torch.jit.load(io.BytesIO(model_bytes), map_location=device) # type: ignore[no-untyped-call] + buffer = io.BytesIO(model_bytes) + # type: ignore-next[no-untyped-call] + model = torch.jit.load(buffer, map_location=device) result = LoadModelResult(model) return result @@ -328,12 +332,14 @@ def transform_input( _device_to_torch = {"cpu": "cpu", "gpu": "cuda"} device = _device_to_torch[str(request.device)] + if fetch_result.meta is None: + raise ValueError("Cannot reconstruct tensor without meta information") for item, item_meta in zip(fetch_result.inputs, fetch_result.meta): - td: tensor_capnp.TensorDescriptor = item_meta + tensor_desc: tensor_capnp.TensorDescriptor = item_meta result.append( - torch.tensor( - np.frombuffer(item, dtype=str(td.dataType)).reshape(td.dimensions) - ).to(device) + torch.tensor(np.frombuffer(item, dtype=str(tensor_desc.dataType))) + .to(device) + .reshape(tuple(dim for dim in tensor_desc.dimensions)) ) return TransformInputResult(result) # return data # note: this fails copy test! @@ -365,7 +371,7 @@ def transform_output( ] # todo: need the shape from latest schemas added here. return TransformOutputResult(transformed, None, "c", "float32") # fixme - else: - return TransformOutputResult( - execute_result.predictions, None, "c", "float32" - ) # fixme + + return TransformOutputResult( + execute_result.predictions, None, "c", "float32" + ) # fixme From ed3c42a10b812963e2de28c6e89918dfe0efbc07 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Tue, 25 Jun 2024 18:07:56 -0500 Subject: [PATCH 04/40] Bring up to feature branch --- .../infrastructure/control/workermanager.py | 24 +++++++++++++++---- .../_core/mli/infrastructure/worker/worker.py | 24 ++++++++++--------- 2 files changed, 32 insertions(+), 16 deletions(-) diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py index 67b1627bb5..f46ced8756 100644 --- a/smartsim/_core/mli/infrastructure/control/workermanager.py +++ b/smartsim/_core/mli/infrastructure/control/workermanager.py @@ -54,7 +54,9 @@ def deserialize_message( - data_blob: bytes, channel_type: t.Type[CommChannelBase] + data_blob: bytes, + channel_type: t.Type[CommChannelBase], + device: t.Literal["cpu", "gpu"], ) -> InferenceRequest: """Deserialize a message from a byte stream into an InferenceRequest :param data_blob: The byte stream to deserialize""" @@ -166,6 +168,7 @@ def __init__( as_service: bool = False, cooldown: int = 0, comm_channel_type: t.Type[CommChannelBase] = DragonFLIChannel, + device: t.Literal["cpu", "gpu"] = "cpu", ) -> None: """Initialize the WorkerManager :param task_queue: The queue to monitor for new tasks @@ -187,6 +190,8 @@ def __init__( """The ML Worker implementation""" self._comm_channel_type = comm_channel_type """The type of communication channel to construct for callbacks""" + self._device = device + """Device on which workers need to run""" def _validate_request(self, request: InferenceRequest) -> bool: """Ensure the request 
can be processed. @@ -236,17 +241,24 @@ def _on_iteration(self) -> None: except fli.FLIEOT as exc: return - request = deserialize_message(request_bytes, self._comm_channel_type) + request = deserialize_message( + request_bytes, self._comm_channel_type, self._device + ) if not self._validate_request(request): return + # # let the worker perform additional custom deserialization # request = self._worker.deserialize(request_bytes) fetch_model_result = self._worker.fetch_model(request, self._feature_store) - model_result = self._worker.load_model(request, fetch_model_result) + model_result = self._worker.load_model( + request, fetch_model_result, self._device + ) fetch_input_result = self._worker.fetch_inputs(request, self._feature_store) - transformed_input = self._worker.transform_input(request, fetch_input_result) + transformed_input = self._worker.transform_input( + request, fetch_input_result, self._device + ) reply = InferenceReply() @@ -254,7 +266,9 @@ def _on_iteration(self) -> None: execute_result = self._worker.execute( request, model_result, transformed_input ) - transformed_output = self._worker.transform_output(request, execute_result) + transformed_output = self._worker.transform_output( + request, execute_result, self._device + ) if request.output_keys: reply.output_keys = self._worker.place_output( diff --git a/smartsim/_core/mli/infrastructure/worker/worker.py b/smartsim/_core/mli/infrastructure/worker/worker.py index 9b813a9e9b..08c4997554 100644 --- a/smartsim/_core/mli/infrastructure/worker/worker.py +++ b/smartsim/_core/mli/infrastructure/worker/worker.py @@ -260,21 +260,23 @@ class MachineLearningWorkerBase(MachineLearningWorkerCore, ABC): @staticmethod @abstractmethod def load_model( - request: InferenceRequest, fetch_result: FetchModelResult + request: InferenceRequest, fetch_result: FetchModelResult, device: str ) -> LoadModelResult: """Given a loaded MachineLearningModel, ensure it is loaded into device memory :param request: The request that triggered the pipeline + :param device: The device on which the model must be placed :return: ModelLoadResult wrapping the model loaded for the request""" @staticmethod @abstractmethod def transform_input( - request: InferenceRequest, fetch_result: FetchInputResult + request: InferenceRequest, fetch_result: FetchInputResult, device: str ) -> TransformInputResult: """Given a collection of data, perform a transformation on the data :param request: The request that triggered the pipeline :param fetch_result: Raw output from fetching inputs out of a feature store + :param device: The device on which the transformed input must be placed :return: The transformed inputs wrapped in a InputTransformResult""" @staticmethod @@ -293,13 +295,13 @@ def execute( @staticmethod @abstractmethod def transform_output( - request: InferenceRequest, - execute_result: ExecuteResult, + request: InferenceRequest, execute_result: ExecuteResult, result_device: str ) -> TransformOutputResult: """Given inference results, perform transformations required to transmit results to the requestor. 
:param request: The request that triggered the pipeline :param execute_result: The result of inference wrapped in an ExecuteResult + :param result_device: The device on which the result of inference is placed :return:""" @@ -308,28 +310,27 @@ class TorchWorker(MachineLearningWorkerBase): @staticmethod def load_model( - request: InferenceRequest, fetch_result: FetchModelResult + request: InferenceRequest, fetch_result: FetchModelResult, device: str ) -> LoadModelResult: model_bytes = fetch_result.model_bytes or request.raw_model if not model_bytes: raise ValueError("Unable to load model without reference object") _device_to_torch = {"cpu": "cpu", "gpu": "cuda"} - device = _device_to_torch[str(request.device)] + device = _device_to_torch[device] buffer = io.BytesIO(model_bytes) - # type: ignore-next[no-untyped-call] - model = torch.jit.load(buffer, map_location=device) + model = torch.jit.load(buffer, map_location=device) # type: ignore result = LoadModelResult(model) return result @staticmethod def transform_input( - request: InferenceRequest, fetch_result: FetchInputResult + request: InferenceRequest, fetch_result: FetchInputResult, device: str ) -> TransformInputResult: result = [] _device_to_torch = {"cpu": "cpu", "gpu": "cuda"} - device = _device_to_torch[str(request.device)] + device = _device_to_torch[device] if fetch_result.meta is None: raise ValueError("Cannot reconstruct tensor without meta information") for item, item_meta in zip(fetch_result.inputs, fetch_result.meta): @@ -362,8 +363,9 @@ def execute( def transform_output( request: InferenceRequest, execute_result: ExecuteResult, + result_device: str, ) -> TransformOutputResult: - if str(request.device) != "cpu": + if result_device != "cpu": transformed = [ item.to("cpu").clone() for item in execute_result.predictions ] From e5be26bdcd8d55e6b3b9669fa9bd5492ffd89390 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Tue, 25 Jun 2024 18:08:14 -0500 Subject: [PATCH 05/40] Update example --- examples/high_throughput_inference/mli_driver.py | 13 ++++++++----- examples/high_throughput_inference/mock_app.py | 3 --- .../standalone_workermanager.py | 11 +++++------ 3 files changed, 13 insertions(+), 14 deletions(-) diff --git a/examples/high_throughput_inference/mli_driver.py b/examples/high_throughput_inference/mli_driver.py index 833766cbef..d32d88e51b 100644 --- a/examples/high_throughput_inference/mli_driver.py +++ b/examples/high_throughput_inference/mli_driver.py @@ -1,23 +1,26 @@ +import os import sys from smartsim import Experiment from smartsim.status import TERMINAL_STATUSES import time -worker_manager_script_name = "standalone_workermanager.py" -app_script_name = "mock_app.py" device = "gpu" +filedir = os.path.dirname(__file__) +worker_manager_script_name = os.path.join(filedir, "standalone_workermanager.py") +app_script_name = os.path.join(filedir, "mock_app.py") +model_name = os.path.join(filedir, f"resnet50.{device.upper()}.pt") -exp = Experiment("MLI_proto", launcher="dragon") +exp = Experiment("MLI_proto", launcher="dragon", exp_path=os.path.join(filedir, "MLI_proto")) -worker_manager_rs = exp.create_run_settings(sys.executable, [worker_manager_script_name]) +worker_manager_rs = exp.create_run_settings(sys.executable, [worker_manager_script_name, "--device", device]) worker_manager = exp.create_model("worker_manager", run_settings=worker_manager_rs) worker_manager.attach_generator_files(to_copy=[worker_manager_script_name]) app_rs = exp.create_run_settings(sys.executable, exe_args = [app_script_name, "--device", device]) app 
= exp.create_model("app", run_settings=app_rs) -app.attach_generator_files(to_copy=[app_script_name], to_symlink=[f"resnet50.{device.upper()}.pt"]) +app.attach_generator_files(to_copy=[app_script_name], to_symlink=[model_name]) exp.generate(worker_manager, app, overwrite=True) diff --git a/examples/high_throughput_inference/mock_app.py b/examples/high_throughput_inference/mock_app.py index d6f8253b70..afc0c836b8 100644 --- a/examples/high_throughput_inference/mock_app.py +++ b/examples/high_throughput_inference/mock_app.py @@ -74,8 +74,6 @@ batch = torch.randn((batch_size, 3, 224, 224), dtype=torch.float32) - expected_device: t.Literal["cpu", "gpu"] = args.device.lower() - start = time.perf_counter() interm = start built_tensor = MessageHandler.build_tensor( @@ -89,7 +87,6 @@ request = MessageHandler.build_request( reply_channel=from_worker_ch.serialize(), model=buffer.getvalue(), - device=expected_device, inputs=[built_tensor], outputs=[], output_descriptors=[], diff --git a/examples/high_throughput_inference/standalone_workermanager.py b/examples/high_throughput_inference/standalone_workermanager.py index bb93c613ce..32d534f360 100644 --- a/examples/high_throughput_inference/standalone_workermanager.py +++ b/examples/high_throughput_inference/standalone_workermanager.py @@ -6,12 +6,8 @@ from dragon.utils import b64decode, b64encode from dragon.globalservices.api_setup import connect_to_infrastructure # isort: on -import logging -import multiprocessing as mp +import argparse import os -import pathlib -import shutil -import time from smartsim._core.mli.comm.channel.dragonchannel import DragonCommChannel @@ -21,8 +17,10 @@ ) if __name__ == "__main__": + parser = argparse.ArgumentParser("Worker Manager") + parser.add_argument("--device", default="gpu") + args = parser.parse_args() connect_to_infrastructure() - mp.set_start_method("dragon") ddict_str = os.environ["SS_DRG_DDICT"] ddict = DDict.attach(ddict_str) @@ -41,5 +39,6 @@ as_service=True, cooldown=10, comm_channel_type=DragonCommChannel, + device = args.device, ) worker_manager.execute() From a23010fb9726e4c18997bee279a0553bbaa473f0 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Tue, 25 Jun 2024 18:17:30 -0500 Subject: [PATCH 06/40] Change the changelog --- doc/changelog.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/changelog.md b/doc/changelog.md index e86c93de66..d146d1973a 100644 --- a/doc/changelog.md +++ b/doc/changelog.md @@ -17,7 +17,7 @@ Description - Added schemas and MessageHandler class for de/serialization of inference requests and response messages - Removed device from schemas, MessageHandler and tests - +- Add TorchWorker first implementation and mock inference app example ### Development branch From 3c20f464d512c7b3a1ead1981efb96842e7a14bb Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Tue, 25 Jun 2024 18:38:12 -0500 Subject: [PATCH 07/40] Make style --- smartsim/_core/mli/infrastructure/control/workermanager.py | 1 - 1 file changed, 1 deletion(-) diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py index f46ced8756..7a5f168fe4 100644 --- a/smartsim/_core/mli/infrastructure/control/workermanager.py +++ b/smartsim/_core/mli/infrastructure/control/workermanager.py @@ -247,7 +247,6 @@ def _on_iteration(self) -> None: if not self._validate_request(request): return - # # let the worker perform additional custom deserialization # request = self._worker.deserialize(request_bytes) From 
b9ed5ba8baa9fc355640f8c2461a0ce7d16cf56b Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Wed, 26 Jun 2024 09:51:07 -0500 Subject: [PATCH 08/40] Attempt to mitigate import dragon error --- .../_core/mli/infrastructure/control/workermanager.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py index 7a5f168fe4..607f94982d 100644 --- a/smartsim/_core/mli/infrastructure/control/workermanager.py +++ b/smartsim/_core/mli/infrastructure/control/workermanager.py @@ -24,9 +24,15 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +import sys + # isort: off -import dragon -from dragon import fli +try: + import dragon + from dragon import fli +except ImportError as exc: + if not "pytest" in sys.modules: + raise exc from None # isort: on import time From 0de06f3b6c0fa4747b471989a8068e4e609829a0 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Wed, 26 Jun 2024 10:20:27 -0500 Subject: [PATCH 09/40] Import dragon optional --- smartsim/_core/mli/comm/channel/dragonchannel.py | 9 ++++++--- smartsim/_core/mli/comm/channel/dragonfli.py | 12 ++++++++---- .../mli/infrastructure/control/workermanager.py | 2 +- 3 files changed, 15 insertions(+), 8 deletions(-) diff --git a/smartsim/_core/mli/comm/channel/dragonchannel.py b/smartsim/_core/mli/comm/channel/dragonchannel.py index d4dbfa3ba0..e79fd2dfcf 100644 --- a/smartsim/_core/mli/comm/channel/dragonchannel.py +++ b/smartsim/_core/mli/comm/channel/dragonchannel.py @@ -24,14 +24,17 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - +import sys import smartsim._core.mli.comm.channel.channel as cch from smartsim.log import get_logger logger = get_logger(__name__) -import dragon.channels as dch - +try: + import dragon.channels as dch +except ImportError as exc: + if not "pytest" in sys.modules: + raise exc from None class DragonCommChannel(cch.CommChannelBase): """Passes messages by writing to a Dragon channel""" diff --git a/smartsim/_core/mli/comm/channel/dragonfli.py b/smartsim/_core/mli/comm/channel/dragonfli.py index f601bb2eb8..3992241380 100644 --- a/smartsim/_core/mli/comm/channel/dragonfli.py +++ b/smartsim/_core/mli/comm/channel/dragonfli.py @@ -24,11 +24,15 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-# isort: off -import dragon -from dragon import fli -import dragon.channels as dch +import sys +# isort: off +try: + from dragon import fli + import dragon.channels as dch +except ImportError as exc: + if not "pytest" in sys.modules: + raise exc from None # isort: on diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py index 607f94982d..6003869e46 100644 --- a/smartsim/_core/mli/infrastructure/control/workermanager.py +++ b/smartsim/_core/mli/infrastructure/control/workermanager.py @@ -168,7 +168,7 @@ class WorkerManager(Service): def __init__( self, - file_like_interface: fli.FLInterface, + file_like_interface: "fli.FLInterface", worker: MachineLearningWorkerBase, feature_store: t.Optional[FeatureStore] = None, as_service: bool = False, From d051385a963f2c18e55792b30316cd41eb2ca357 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Wed, 26 Jun 2024 10:28:23 -0500 Subject: [PATCH 10/40] isort --- smartsim/_core/mli/comm/channel/dragonchannel.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/smartsim/_core/mli/comm/channel/dragonchannel.py b/smartsim/_core/mli/comm/channel/dragonchannel.py index e79fd2dfcf..872eb32350 100644 --- a/smartsim/_core/mli/comm/channel/dragonchannel.py +++ b/smartsim/_core/mli/comm/channel/dragonchannel.py @@ -25,6 +25,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import sys + import smartsim._core.mli.comm.channel.channel as cch from smartsim.log import get_logger @@ -36,6 +37,7 @@ if not "pytest" in sys.modules: raise exc from None + class DragonCommChannel(cch.CommChannelBase): """Passes messages by writing to a Dragon channel""" From e77b1cd5c9c8359aa7be27b2a3d61c398eaa7d04 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Wed, 26 Jun 2024 11:33:47 -0500 Subject: [PATCH 11/40] Fix imports in dragon backend tests --- smartsim/_core/launcher/dragon/dragonBackend.py | 10 ++++------ tests/test_dragon_backend.py | 10 ++++++++++ 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py index d103579115..f0e450a19c 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -36,15 +36,14 @@ # pylint: disable=import-error # isort: off +import dragon.data.ddict.ddict as dragon_ddict import dragon.infrastructure.connection as dragon_connection import dragon.infrastructure.policy as dragon_policy -from dragon.infrastructure.process_desc import ProcessOptions -from dragon.data.ddict.ddict import DDict +import dragon.infrastructure.process_desc as dragon_process_desc import dragon.native.group_state as dragon_group_state import dragon.native.process as dragon_process import dragon.native.process_group as dragon_process_group import dragon.native.machine as dragon_machine -import multiprocessing as mp # pylint: enable=import-error # isort: on @@ -78,7 +77,6 @@ def __str__(self) -> str: return self.value -mp.set_start_method("dragon") @dataclass @@ -405,7 +403,7 @@ def infra_ddict(self) -> str: """ if self._infra_ddict is None: logger.info("Creating DDict") - self._infra_ddict = DDict() # todo: parametrize + self._infra_ddict = dragon_ddict.DDict() # todo: parametrize logger.info("Created DDict") self._infra_ddict["creation"] = str(time.time()) logger.info(self._infra_ddict["creation"]) @@ -427,7 +425,7 @@ def _start_steps(self) -> None: placement=dragon_policy.Policy.Placement.HOST_NAME, host_name=hosts[0], 
) - options = ProcessOptions(make_inf_channels=True) + options = dragon_process_desc.ProcessOptions(make_inf_channels=True) grp = dragon_process_group.ProcessGroup( restart=False, pmi_enabled=request.pmi_enabled, policy=global_policy ) diff --git a/tests/test_dragon_backend.py b/tests/test_dragon_backend.py index a510f660a5..f284f38d99 100644 --- a/tests/test_dragon_backend.py +++ b/tests/test_dragon_backend.py @@ -103,6 +103,16 @@ def get_mock_backend(monkeypatch: pytest.MonkeyPatch) -> "DragonBackend": "dragon.infrastructure.connection", MagicMock(), ) + monkeypatch.setitem( + sys.modules, + "dragon.infrastructure.process_desc", + MagicMock(), + ) + monkeypatch.setitem( + sys.modules, + "dragon.data.ddict.ddict", + MagicMock(), + ) monkeypatch.setitem( sys.modules, "dragon.infrastructure.policy", From a90888d44d3e9ef2207a97c6b0936418daf4d06c Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Wed, 26 Jun 2024 11:36:26 -0500 Subject: [PATCH 12/40] Style --- smartsim/_core/launcher/dragon/dragonBackend.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py index f0e450a19c..d91f73e3c5 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -77,8 +77,6 @@ def __str__(self) -> str: return self.value - - @dataclass class ProcessGroupInfo: status: SmartSimStatus From b4312215184478186e837ab193cc609fb53f4698 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Wed, 26 Jun 2024 11:40:14 -0500 Subject: [PATCH 13/40] Fix type --- smartsim/_core/launcher/dragon/dragonBackend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py index d91f73e3c5..52f69ec41f 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -189,7 +189,7 @@ def __init__(self, pid: int) -> None: self._view = DragonBackendView(self) logger.debug(self._view.host_desc) - self._infra_ddict: t.Optional[DDict] = None + self._infra_ddict: t.Optional[dragon_ddict.DDict] = None @property def hosts(self) -> list[str]: From 23efebc25027d908703e80e059a3c431d5f7d434 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Wed, 26 Jun 2024 12:38:55 -0500 Subject: [PATCH 14/40] Rename examples dir --- ex/high_throughput_inference/mli_driver.py | 38 ++++++ ex/high_throughput_inference/mock_app.py | 126 ++++++++++++++++++ .../standalone_workermanager.py | 44 ++++++ 3 files changed, 208 insertions(+) create mode 100644 ex/high_throughput_inference/mli_driver.py create mode 100644 ex/high_throughput_inference/mock_app.py create mode 100644 ex/high_throughput_inference/standalone_workermanager.py diff --git a/ex/high_throughput_inference/mli_driver.py b/ex/high_throughput_inference/mli_driver.py new file mode 100644 index 0000000000..7b8db5ed83 --- /dev/null +++ b/ex/high_throughput_inference/mli_driver.py @@ -0,0 +1,38 @@ +import os +import sys +from smartsim import Experiment +from smartsim.status import TERMINAL_STATUSES +import time + +device = "cpu" +filedir = os.path.dirname(__file__) +worker_manager_script_name = os.path.join(filedir, "standalone_workermanager.py") +app_script_name = os.path.join(filedir, "mock_app.py") +model_name = os.path.join(filedir, f"resnet50.{device.upper()}.pt") + + +exp = Experiment("MLI_proto", launcher="dragon", exp_path=os.path.join(filedir, "MLI_proto")) + +worker_manager_rs = 
exp.create_run_settings(sys.executable, [worker_manager_script_name, "--device", device]) +worker_manager = exp.create_model("worker_manager", run_settings=worker_manager_rs) +worker_manager.attach_generator_files(to_copy=[worker_manager_script_name]) + + +app_rs = exp.create_run_settings(sys.executable, exe_args = [app_script_name, "--device", device]) +app = exp.create_model("app", run_settings=app_rs) +app.attach_generator_files(to_copy=[app_script_name], to_symlink=[model_name]) + + +exp.generate(worker_manager, app, overwrite=True) +exp.start(worker_manager, app, block=False) + +while True: + if exp.get_status(app)[0] in TERMINAL_STATUSES: + exp.stop(worker_manager) + break + if exp.get_status(worker_manager)[0] in TERMINAL_STATUSES: + exp.stop(app) + break + time.sleep(5) + +print("Exiting.") \ No newline at end of file diff --git a/ex/high_throughput_inference/mock_app.py b/ex/high_throughput_inference/mock_app.py new file mode 100644 index 0000000000..afc0c836b8 --- /dev/null +++ b/ex/high_throughput_inference/mock_app.py @@ -0,0 +1,126 @@ +# isort: off +import dragon +from dragon import fli +from dragon.channels import Channel +import dragon.channels +from dragon.data.ddict.ddict import DDict +from dragon.globalservices.api_setup import connect_to_infrastructure +from dragon.utils import b64decode, b64encode + +# isort: on + +import argparse +import io +import numpy +import os +import tabulate +import time +import torch +import typing as t + +from smartsim._core.mli.message_handler import MessageHandler + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser("Mock application") + parser.add_argument("--device", default="cpu") + + args = parser.parse_args() + + connect_to_infrastructure() + ddict_str = os.environ["SS_DRG_DDICT"] + + ddict = DDict.attach(ddict_str) + + to_worker_fli_str = None + + while to_worker_fli_str is None: + try: + to_worker_fli_str = ddict["to_worker_fli"] + except Exception as e: + time.sleep(1) + + to_worker_fli = fli.FLInterface.attach(b64decode(to_worker_fli_str)) + + batch_size = 32 + model = torch.jit.load(f"resnet50.{args.device.upper()}.pt") + buffer = io.BytesIO() + batch = torch.randn((batch_size, 3, 224, 224), dtype=torch.float32) + scripted = torch.jit.trace(model, batch) + torch.jit.save(scripted, buffer) + + total_iterations = 10 + + headers=[ + "batch_size", + "build_tensor", + "build_request", + "serialize_request", + "send", + "receive", + "deserialize_response", + "deserialize_tensor", + ] + + print(",".join(headers)) + + for batch_size in [1, 8, 32, 64, 128]: + + timings = [] + for iteration_number in range(total_iterations + int(batch_size==1)): + + timings.append([batch_size]) + + batch = torch.randn((batch_size, 3, 224, 224), dtype=torch.float32) + + start = time.perf_counter() + interm = start + built_tensor = MessageHandler.build_tensor( + batch.numpy(), "c", "float32", list(batch.shape) + ) + timings[-1].append(time.perf_counter() - interm) + interm = time.perf_counter() + + from_worker_ch = Channel.make_process_local() + + request = MessageHandler.build_request( + reply_channel=from_worker_ch.serialize(), + model=buffer.getvalue(), + inputs=[built_tensor], + outputs=[], + output_descriptors=[], + custom_attributes=None, + ) + + timings[-1].append(time.perf_counter() - interm) + interm = time.perf_counter() + request_bytes = MessageHandler.serialize_request(request) + timings[-1].append(time.perf_counter() - interm) + interm = time.perf_counter() + with to_worker_fli.sendh(timeout=None) as to_sendh: + 
to_sendh.send_bytes(request_bytes) + timings[-1].append(time.perf_counter() - interm) + interm = time.perf_counter() + + with from_worker_ch.recvh(timeout=None) as from_recvh: + resp = from_recvh.recv_bytes(timeout=None) + timings[-1].append(time.perf_counter() - interm) + interm = time.perf_counter() + response = MessageHandler.deserialize_response(resp) + timings[-1].append(time.perf_counter() - interm) + interm = time.perf_counter() + result = torch.from_numpy( + numpy.frombuffer( + response.result.data[0].blob, + dtype=str(response.result.data[0].tensorDescriptor.dataType), + ) + ) + + timings[-1].append(time.perf_counter() - interm) + interm = time.perf_counter() + + # duration = time.perf_counter() - start + # print(f"{duration:.3f} s") + + print(",".join(str(timing) for timing in timings[-1])) diff --git a/ex/high_throughput_inference/standalone_workermanager.py b/ex/high_throughput_inference/standalone_workermanager.py new file mode 100644 index 0000000000..32d534f360 --- /dev/null +++ b/ex/high_throughput_inference/standalone_workermanager.py @@ -0,0 +1,44 @@ +# isort: off +import dragon +from dragon import fli +from dragon.channels import Channel +from dragon.data.ddict.ddict import DDict +from dragon.utils import b64decode, b64encode +from dragon.globalservices.api_setup import connect_to_infrastructure +# isort: on +import argparse +import os + + +from smartsim._core.mli.comm.channel.dragonchannel import DragonCommChannel +from smartsim._core.mli.infrastructure.worker.worker import TorchWorker +from smartsim._core.mli.infrastructure.control.workermanager import ( + WorkerManager, +) + +if __name__ == "__main__": + parser = argparse.ArgumentParser("Worker Manager") + parser.add_argument("--device", default="gpu") + args = parser.parse_args() + connect_to_infrastructure() + ddict_str = os.environ["SS_DRG_DDICT"] + ddict = DDict.attach(ddict_str) + + to_worker_channel = Channel.make_process_local() + to_worker_manager_channel = Channel.make_process_local() + channels = [Channel.make_process_local() for _ in range(100)] + to_worker_fli = fli.FLInterface(main_ch=to_worker_channel, manager_ch=to_worker_manager_channel, stream_channels=channels) + ddict["to_worker_fli"] = b64encode(to_worker_fli.serialize()) + + torch_worker = TorchWorker() + + worker_manager = WorkerManager( + file_like_interface=to_worker_fli, + worker=torch_worker, + feature_store=None, + as_service=True, + cooldown=10, + comm_channel_type=DragonCommChannel, + device = args.device, + ) + worker_manager.execute() From 09b9d249c5c2147a062f95356c943c4da8e534b9 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Wed, 26 Jun 2024 12:48:11 -0500 Subject: [PATCH 15/40] Remove old dir --- .../high_throughput_inference/mli_driver.py | 38 ------ .../high_throughput_inference/mock_app.py | 126 ------------------ .../standalone_workermanager.py | 44 ------ 3 files changed, 208 deletions(-) delete mode 100644 examples/high_throughput_inference/mli_driver.py delete mode 100644 examples/high_throughput_inference/mock_app.py delete mode 100644 examples/high_throughput_inference/standalone_workermanager.py diff --git a/examples/high_throughput_inference/mli_driver.py b/examples/high_throughput_inference/mli_driver.py deleted file mode 100644 index d32d88e51b..0000000000 --- a/examples/high_throughput_inference/mli_driver.py +++ /dev/null @@ -1,38 +0,0 @@ -import os -import sys -from smartsim import Experiment -from smartsim.status import TERMINAL_STATUSES -import time - -device = "gpu" -filedir = os.path.dirname(__file__) 
-worker_manager_script_name = os.path.join(filedir, "standalone_workermanager.py") -app_script_name = os.path.join(filedir, "mock_app.py") -model_name = os.path.join(filedir, f"resnet50.{device.upper()}.pt") - - -exp = Experiment("MLI_proto", launcher="dragon", exp_path=os.path.join(filedir, "MLI_proto")) - -worker_manager_rs = exp.create_run_settings(sys.executable, [worker_manager_script_name, "--device", device]) -worker_manager = exp.create_model("worker_manager", run_settings=worker_manager_rs) -worker_manager.attach_generator_files(to_copy=[worker_manager_script_name]) - - -app_rs = exp.create_run_settings(sys.executable, exe_args = [app_script_name, "--device", device]) -app = exp.create_model("app", run_settings=app_rs) -app.attach_generator_files(to_copy=[app_script_name], to_symlink=[model_name]) - - -exp.generate(worker_manager, app, overwrite=True) -exp.start(worker_manager, app, block=False) - -while True: - if exp.get_status(app)[0] in TERMINAL_STATUSES: - exp.stop(worker_manager) - break - if exp.get_status(worker_manager)[0] in TERMINAL_STATUSES: - exp.stop(app) - break - time.sleep(5) - -print("Exiting.") \ No newline at end of file diff --git a/examples/high_throughput_inference/mock_app.py b/examples/high_throughput_inference/mock_app.py deleted file mode 100644 index afc0c836b8..0000000000 --- a/examples/high_throughput_inference/mock_app.py +++ /dev/null @@ -1,126 +0,0 @@ -# isort: off -import dragon -from dragon import fli -from dragon.channels import Channel -import dragon.channels -from dragon.data.ddict.ddict import DDict -from dragon.globalservices.api_setup import connect_to_infrastructure -from dragon.utils import b64decode, b64encode - -# isort: on - -import argparse -import io -import numpy -import os -import tabulate -import time -import torch -import typing as t - -from smartsim._core.mli.message_handler import MessageHandler - - -if __name__ == "__main__": - - parser = argparse.ArgumentParser("Mock application") - parser.add_argument("--device", default="cpu") - - args = parser.parse_args() - - connect_to_infrastructure() - ddict_str = os.environ["SS_DRG_DDICT"] - - ddict = DDict.attach(ddict_str) - - to_worker_fli_str = None - - while to_worker_fli_str is None: - try: - to_worker_fli_str = ddict["to_worker_fli"] - except Exception as e: - time.sleep(1) - - to_worker_fli = fli.FLInterface.attach(b64decode(to_worker_fli_str)) - - batch_size = 32 - model = torch.jit.load(f"resnet50.{args.device.upper()}.pt") - buffer = io.BytesIO() - batch = torch.randn((batch_size, 3, 224, 224), dtype=torch.float32) - scripted = torch.jit.trace(model, batch) - torch.jit.save(scripted, buffer) - - total_iterations = 10 - - headers=[ - "batch_size", - "build_tensor", - "build_request", - "serialize_request", - "send", - "receive", - "deserialize_response", - "deserialize_tensor", - ] - - print(",".join(headers)) - - for batch_size in [1, 8, 32, 64, 128]: - - timings = [] - for iteration_number in range(total_iterations + int(batch_size==1)): - - timings.append([batch_size]) - - batch = torch.randn((batch_size, 3, 224, 224), dtype=torch.float32) - - start = time.perf_counter() - interm = start - built_tensor = MessageHandler.build_tensor( - batch.numpy(), "c", "float32", list(batch.shape) - ) - timings[-1].append(time.perf_counter() - interm) - interm = time.perf_counter() - - from_worker_ch = Channel.make_process_local() - - request = MessageHandler.build_request( - reply_channel=from_worker_ch.serialize(), - model=buffer.getvalue(), - inputs=[built_tensor], - outputs=[], - 
output_descriptors=[], - custom_attributes=None, - ) - - timings[-1].append(time.perf_counter() - interm) - interm = time.perf_counter() - request_bytes = MessageHandler.serialize_request(request) - timings[-1].append(time.perf_counter() - interm) - interm = time.perf_counter() - with to_worker_fli.sendh(timeout=None) as to_sendh: - to_sendh.send_bytes(request_bytes) - timings[-1].append(time.perf_counter() - interm) - interm = time.perf_counter() - - with from_worker_ch.recvh(timeout=None) as from_recvh: - resp = from_recvh.recv_bytes(timeout=None) - timings[-1].append(time.perf_counter() - interm) - interm = time.perf_counter() - response = MessageHandler.deserialize_response(resp) - timings[-1].append(time.perf_counter() - interm) - interm = time.perf_counter() - result = torch.from_numpy( - numpy.frombuffer( - response.result.data[0].blob, - dtype=str(response.result.data[0].tensorDescriptor.dataType), - ) - ) - - timings[-1].append(time.perf_counter() - interm) - interm = time.perf_counter() - - # duration = time.perf_counter() - start - # print(f"{duration:.3f} s") - - print(",".join(str(timing) for timing in timings[-1])) diff --git a/examples/high_throughput_inference/standalone_workermanager.py b/examples/high_throughput_inference/standalone_workermanager.py deleted file mode 100644 index 32d534f360..0000000000 --- a/examples/high_throughput_inference/standalone_workermanager.py +++ /dev/null @@ -1,44 +0,0 @@ -# isort: off -import dragon -from dragon import fli -from dragon.channels import Channel -from dragon.data.ddict.ddict import DDict -from dragon.utils import b64decode, b64encode -from dragon.globalservices.api_setup import connect_to_infrastructure -# isort: on -import argparse -import os - - -from smartsim._core.mli.comm.channel.dragonchannel import DragonCommChannel -from smartsim._core.mli.infrastructure.worker.worker import TorchWorker -from smartsim._core.mli.infrastructure.control.workermanager import ( - WorkerManager, -) - -if __name__ == "__main__": - parser = argparse.ArgumentParser("Worker Manager") - parser.add_argument("--device", default="gpu") - args = parser.parse_args() - connect_to_infrastructure() - ddict_str = os.environ["SS_DRG_DDICT"] - ddict = DDict.attach(ddict_str) - - to_worker_channel = Channel.make_process_local() - to_worker_manager_channel = Channel.make_process_local() - channels = [Channel.make_process_local() for _ in range(100)] - to_worker_fli = fli.FLInterface(main_ch=to_worker_channel, manager_ch=to_worker_manager_channel, stream_channels=channels) - ddict["to_worker_fli"] = b64encode(to_worker_fli.serialize()) - - torch_worker = TorchWorker() - - worker_manager = WorkerManager( - file_like_interface=to_worker_fli, - worker=torch_worker, - feature_store=None, - as_service=True, - cooldown=10, - comm_channel_type=DragonCommChannel, - device = args.device, - ) - worker_manager.execute() From 56d8e50f4f7e9fddb9e4d79ba0b1fe556e400684 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Wed, 26 Jun 2024 18:47:40 -0500 Subject: [PATCH 16/40] Add tests for torch worker --- ex/high_throughput_inference/mock_app.py | 5 +- .../standalone_workermanager.py | 2 +- .../mli/infrastructure/worker/torch_worker.py | 118 ++++++++++++ .../_core/mli/infrastructure/worker/worker.py | 91 +-------- tests/mli/test_torch_worker.py | 173 ++++++++++++++++++ tests/mli/test_worker_manager.py | 12 +- 6 files changed, 309 insertions(+), 92 deletions(-) create mode 100644 smartsim/_core/mli/infrastructure/worker/torch_worker.py create mode 100644 
tests/mli/test_torch_worker.py diff --git a/ex/high_throughput_inference/mock_app.py b/ex/high_throughput_inference/mock_app.py index afc0c836b8..d22792d15b 100644 --- a/ex/high_throughput_inference/mock_app.py +++ b/ex/high_throughput_inference/mock_app.py @@ -5,7 +5,7 @@ import dragon.channels from dragon.data.ddict.ddict import DDict from dragon.globalservices.api_setup import connect_to_infrastructure -from dragon.utils import b64decode, b64encode +from dragon.utils import b64decode # isort: on @@ -13,11 +13,8 @@ import io import numpy import os -import tabulate import time import torch -import typing as t - from smartsim._core.mli.message_handler import MessageHandler diff --git a/ex/high_throughput_inference/standalone_workermanager.py b/ex/high_throughput_inference/standalone_workermanager.py index 32d534f360..40fefcc372 100644 --- a/ex/high_throughput_inference/standalone_workermanager.py +++ b/ex/high_throughput_inference/standalone_workermanager.py @@ -11,7 +11,7 @@ from smartsim._core.mli.comm.channel.dragonchannel import DragonCommChannel -from smartsim._core.mli.infrastructure.worker.worker import TorchWorker +from smartsim._core.mli.infrastructure.worker.torch_worker import TorchWorker from smartsim._core.mli.infrastructure.control.workermanager import ( WorkerManager, ) diff --git a/smartsim/_core/mli/infrastructure/worker/torch_worker.py b/smartsim/_core/mli/infrastructure/worker/torch_worker.py new file mode 100644 index 0000000000..c350499c20 --- /dev/null +++ b/smartsim/_core/mli/infrastructure/worker/torch_worker.py @@ -0,0 +1,118 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
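TorchWorker, added below in its own module, operates on TorchScript bytes: the client traces and serializes the model with torch.jit, and load_model reloads those bytes onto the requested device (mapping "gpu" to "cuda"). A small round trip, shown as a hedged sketch independent of the MLI plumbing:

    import io

    import torch

    model = torch.nn.Linear(4, 2)
    example = torch.randn(1, 4)

    # client side: trace and serialize (mirrors the tracing done in mock_app.py)
    buffer = io.BytesIO()
    torch.jit.save(torch.jit.trace(model, example), buffer)
    model_bytes = buffer.getvalue()

    # worker side: reload onto the target device (mirrors TorchWorker.load_model)
    reloaded = torch.jit.load(io.BytesIO(model_bytes), map_location="cpu")
    assert reloaded(example).shape == (1, 2)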
+ +import io + +import numpy as np +import torch + +from .....error import SmartSimError +from .....log import get_logger +from ...mli_schemas.tensor import tensor_capnp +from .worker import ( + ExecuteResult, + FetchInputResult, + FetchModelResult, + InferenceRequest, + LoadModelResult, + MachineLearningWorkerBase, + TransformInputResult, + TransformOutputResult, +) + +logger = get_logger(__name__) + + +class TorchWorker(MachineLearningWorkerBase): + """A worker that executes a PyTorch model.""" + + @staticmethod + def load_model( + request: InferenceRequest, fetch_result: FetchModelResult, device: str + ) -> LoadModelResult: + model_bytes = fetch_result.model_bytes or request.raw_model + if not model_bytes: + raise ValueError("Unable to load model without reference object") + + _device_to_torch = {"cpu": "cpu", "gpu": "cuda"} + device = _device_to_torch[device] + buffer = io.BytesIO(model_bytes) + model = torch.jit.load(buffer, map_location=device) # type: ignore + result = LoadModelResult(model) + return result + + @staticmethod + def transform_input( + request: InferenceRequest, fetch_result: FetchInputResult, device: str + ) -> TransformInputResult: + result = [] + + _device_to_torch = {"cpu": "cpu", "gpu": "cuda"} + device = _device_to_torch[device] + if fetch_result.meta is None: + raise ValueError("Cannot reconstruct tensor without meta information") + for item, item_meta in zip(fetch_result.inputs, fetch_result.meta): + tensor_desc: tensor_capnp.TensorDescriptor = item_meta + result.append( + torch.tensor(np.frombuffer(item, dtype=str(tensor_desc.dataType))) + .to(device) + .reshape(tuple(dim for dim in tensor_desc.dimensions)) + ) + return TransformInputResult(result) + # return data # note: this fails copy test! + + @staticmethod + def execute( + request: InferenceRequest, + load_result: LoadModelResult, + transform_result: TransformInputResult, + ) -> ExecuteResult: + if not load_result.model: + raise SmartSimError("Model must be loaded to execute") + + model: torch.nn.Module = load_result.model + model.eval() + results = [model(tensor).detach() for tensor in transform_result.transformed] + + execute_result = ExecuteResult(results) + return execute_result + + @staticmethod + def transform_output( + request: InferenceRequest, + execute_result: ExecuteResult, + result_device: str, + ) -> TransformOutputResult: + if result_device != "cpu": + transformed = [ + item.to("cpu").clone() for item in execute_result.predictions + ] + # todo: need the shape from latest schemas added here. + return TransformOutputResult(transformed, None, "c", "float32") # fixme + + return TransformOutputResult( + execute_result.predictions, None, "c", "float32" + ) # fixme diff --git a/smartsim/_core/mli/infrastructure/worker/worker.py b/smartsim/_core/mli/infrastructure/worker/worker.py index 08c4997554..24dc734d00 100644 --- a/smartsim/_core/mli/infrastructure/worker/worker.py +++ b/smartsim/_core/mli/infrastructure/worker/worker.py @@ -24,18 +24,13 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
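transform_input above rebuilds device-resident tensors from the raw bytes and the TensorDescriptor metadata (data type and dimensions) carried by the request. The reconstruction reduces to a few lines; a hedged, self-contained round trip (the helper name is invented for the example):

    import numpy as np
    import torch


    def rebuild_tensor(blob: bytes, dtype: str, dims: list, device: str = "cpu") -> torch.Tensor:
        # bytes + (dtype, dimensions) -> tensor, as TorchWorker.transform_input does
        return torch.tensor(np.frombuffer(blob, dtype=dtype)).to(device).reshape(tuple(dims))


    original = torch.randn(2, 3)  # float32 by default
    rebuilt = rebuild_tensor(original.numpy().tobytes(), "float32", [2, 3])
    assert torch.equal(original, rebuilt)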
-import io import typing as t from abc import ABC, abstractmethod -import numpy as np -import torch - -import smartsim.error as sse -from smartsim._core.mli.comm.channel.channel import CommChannelBase -from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStore -from smartsim._core.mli.mli_schemas.tensor import tensor_capnp -from smartsim.log import get_logger +from .....error import SmartSimError +from .....log import get_logger +from ...comm.channel.channel import CommChannelBase +from ...infrastructure.storage.featurestore import FeatureStore logger = get_logger(__name__) @@ -167,7 +162,7 @@ def fetch_model( raise ValueError("Feature store is required for model retrieval") if not request.model_key: - raise sse.SmartSimError( + raise SmartSimError( "Key must be provided to retrieve model from feature store" ) @@ -176,7 +171,7 @@ def fetch_model( return FetchModelResult(raw_bytes) except FileNotFoundError as ex: logger.exception(ex) - raise sse.SmartSimError( + raise SmartSimError( f"Model could not be retrieved with key {request.model_key}" ) from ex @@ -204,7 +199,7 @@ def fetch_inputs( data.append(tensor_bytes) except KeyError as ex: logger.exception(ex) - raise sse.SmartSimError( + raise SmartSimError( f"Model could not be retrieved with key {input_}" ) from ex return FetchInputResult( @@ -303,75 +298,3 @@ def transform_output( :param execute_result: The result of inference wrapped in an ExecuteResult :param result_device: The device on which the result of inference is placed :return:""" - - -class TorchWorker(MachineLearningWorkerBase): - """A worker that executes a PyTorch model.""" - - @staticmethod - def load_model( - request: InferenceRequest, fetch_result: FetchModelResult, device: str - ) -> LoadModelResult: - model_bytes = fetch_result.model_bytes or request.raw_model - if not model_bytes: - raise ValueError("Unable to load model without reference object") - - _device_to_torch = {"cpu": "cpu", "gpu": "cuda"} - device = _device_to_torch[device] - buffer = io.BytesIO(model_bytes) - model = torch.jit.load(buffer, map_location=device) # type: ignore - result = LoadModelResult(model) - return result - - @staticmethod - def transform_input( - request: InferenceRequest, fetch_result: FetchInputResult, device: str - ) -> TransformInputResult: - result = [] - - _device_to_torch = {"cpu": "cpu", "gpu": "cuda"} - device = _device_to_torch[device] - if fetch_result.meta is None: - raise ValueError("Cannot reconstruct tensor without meta information") - for item, item_meta in zip(fetch_result.inputs, fetch_result.meta): - tensor_desc: tensor_capnp.TensorDescriptor = item_meta - result.append( - torch.tensor(np.frombuffer(item, dtype=str(tensor_desc.dataType))) - .to(device) - .reshape(tuple(dim for dim in tensor_desc.dimensions)) - ) - return TransformInputResult(result) - # return data # note: this fails copy test! 
- - @staticmethod - def execute( - request: InferenceRequest, - load_result: LoadModelResult, - transform_result: TransformInputResult, - ) -> ExecuteResult: - if not load_result.model: - raise sse.SmartSimError("Model must be loaded to execute") - - model: torch.nn.Module = load_result.model - model.eval() - results = [model(tensor).detach() for tensor in transform_result.transformed] - - execute_result = ExecuteResult(results) - return execute_result - - @staticmethod - def transform_output( - request: InferenceRequest, - execute_result: ExecuteResult, - result_device: str, - ) -> TransformOutputResult: - if result_device != "cpu": - transformed = [ - item.to("cpu").clone() for item in execute_result.predictions - ] - # todo: need the shape from latest schemas added here. - return TransformOutputResult(transformed, None, "c", "float32") # fixme - - return TransformOutputResult( - execute_result.predictions, None, "c", "float32" - ) # fixme diff --git a/tests/mli/test_torch_worker.py b/tests/mli/test_torch_worker.py new file mode 100644 index 0000000000..0b1cd4ccf3 --- /dev/null +++ b/tests/mli/test_torch_worker.py @@ -0,0 +1,173 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
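With the Torch-specific code out of worker.py, MachineLearningWorkerBase becomes the extension point for other frameworks, and the tests that follow exercise the Torch implementation stage by stage. A hedged skeleton of what another backend would have to provide (the class name and empty bodies are illustrative only; the signatures mirror TorchWorker above):

    from smartsim._core.mli.infrastructure.worker.worker import (
        ExecuteResult,
        FetchInputResult,
        FetchModelResult,
        InferenceRequest,
        LoadModelResult,
        MachineLearningWorkerBase,
        TransformInputResult,
        TransformOutputResult,
    )


    class MyFrameworkWorker(MachineLearningWorkerBase):
        """Illustrative stub of a non-Torch worker backend."""

        @staticmethod
        def load_model(
            request: InferenceRequest, fetch_result: FetchModelResult, device: str
        ) -> LoadModelResult:
            raise NotImplementedError

        @staticmethod
        def transform_input(
            request: InferenceRequest, fetch_result: FetchInputResult, device: str
        ) -> TransformInputResult:
            raise NotImplementedError

        @staticmethod
        def execute(
            request: InferenceRequest,
            load_result: LoadModelResult,
            transform_result: TransformInputResult,
        ) -> ExecuteResult:
            raise NotImplementedError

        @staticmethod
        def transform_output(
            request: InferenceRequest,
            execute_result: ExecuteResult,
            result_device: str,
        ) -> TransformOutputResult:
            raise NotImplementedError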
+ +import io + +import numpy as np +import pytest +import torch +from torch import nn +from torch.nn import functional as F + +from smartsim._core.mli.infrastructure.worker.torch_worker import TorchWorker +from smartsim._core.mli.infrastructure.worker.worker import ( + ExecuteResult, + FetchInputResult, + FetchModelResult, + InferenceRequest, + LoadModelResult, + TransformInputResult, +) +from smartsim._core.mli.message_handler import MessageHandler +from smartsim.log import get_logger + +logger = get_logger(__name__) +# The tests in this file belong to the group_a group +pytestmark = pytest.mark.group_a + + +# simple MNIST in PyTorch +class Net(nn.Module): + def __init__(self): + super(Net, self).__init__() + self.conv1 = nn.Conv2d(1, 32, 3, 1) + self.conv2 = nn.Conv2d(32, 64, 3, 1) + self.dropout1 = nn.Dropout(0.25) + self.dropout2 = nn.Dropout(0.5) + self.fc1 = nn.Linear(9216, 128) + self.fc2 = nn.Linear(128, 10) + + def forward(self, x): + x = self.conv1(x) + x = F.relu(x) + x = self.conv2(x) + x = F.relu(x) + x = F.max_pool2d(x, 2) + x = self.dropout1(x) + x = torch.flatten(x, 1) + x = self.fc1(x) + x = F.relu(x) + x = self.dropout2(x) + x = self.fc2(x) + output = F.log_softmax(x, dim=1) + return output + + +torch_device = {"cpu": "cpu", "gpu": "cuda"} + + +def get_batch() -> torch.Tensor: + return torch.rand(20, 1, 28, 28) + + +def create_torch_model(): + n = Net() + example_forward_input = get_batch() + module = torch.jit.trace(n, example_forward_input) + model_buffer = io.BytesIO() + torch.jit.save(module, model_buffer) + return model_buffer.getvalue() + + +def get_request() -> InferenceRequest: + + tensors = [get_batch() for _ in range(2)] + serialized_tensors = [ + MessageHandler.build_tensor(tensor.numpy(), "c", "float32", list(tensor.shape)) + for tensor in tensors + ] + + return InferenceRequest( + model_key="model", + callback=None, + raw_inputs=[s_tensor.blob for s_tensor in serialized_tensors], + input_keys=None, + input_meta=[s_tensor.tensorDescriptor for s_tensor in serialized_tensors], + output_keys=None, + raw_model=create_torch_model(), + batch_size=0, + ) + + +sample_request: InferenceRequest = get_request() +worker = TorchWorker() + + +def test_load_model(mlutils) -> None: + fetch_model_result = FetchModelResult(sample_request.raw_model) + load_model_result = worker.load_model( + sample_request, fetch_model_result, mlutils.get_test_device().lower() + ) + + assert load_model_result.model( + get_batch().to(torch_device[mlutils.get_test_device().lower()]) + ).shape == torch.Size((20, 10)) + + +def test_transform_input(mlutils) -> None: + fetch_input_result = FetchInputResult( + sample_request.raw_inputs, sample_request.input_meta + ) + + transform_input_result = worker.transform_input( + sample_request, fetch_input_result, mlutils.get_test_device().lower() + ) + + assert all( + transformed.shape == get_batch().shape + for transformed in transform_input_result.transformed + ) + + +def test_execute(mlutils) -> None: + load_model_result = LoadModelResult( + Net().to(torch_device[mlutils.get_test_device().lower()]) + ) + transform_result = TransformInputResult( + [ + get_batch().to(torch_device[mlutils.get_test_device().lower()]) + for _ in range(2) + ] + ) + + execute_result = worker.execute(sample_request, load_model_result, transform_result) + + assert all( + result.shape == torch.Size((20, 10)) for result in execute_result.predictions + ) + + +def test_transform_output(mlutils): + execute_result = ExecuteResult([torch.rand((20, 10)) for _ in range(2)]) + + 
transformed_output = worker.transform_output( + sample_request, execute_result, torch_device[mlutils.get_test_device().lower()] + ) + + assert transformed_output.outputs == execute_result.predictions + assert transformed_output.shape == None + assert transformed_output.order == "c" + assert transformed_output.dtype == "float32" diff --git a/tests/mli/test_worker_manager.py b/tests/mli/test_worker_manager.py index 01502ec521..46cae5b2e4 100644 --- a/tests/mli/test_worker_manager.py +++ b/tests/mli/test_worker_manager.py @@ -29,10 +29,14 @@ import multiprocessing as mp import pathlib import time -import typing as t import pytest -import torch + +should_run = True +try: + import torch +except ImportError: + should_run = False from smartsim._core.mli.infrastructure.control.workermanager import WorkerManager from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStore @@ -44,9 +48,11 @@ from .worker import IntegratedTorchWorker logger = get_logger(__name__) -# The tests in this file belong to the group_b group +# The tests in this file belong to the group_a group pytestmark = pytest.mark.group_a +pytest.mark.skipif(not should_run, "Test needs PyTorch to run") + def mock_work(worker_manager_queue: "mp.Queue[bytes]") -> None: """Mock event producer for triggering the inference pipeline""" From 6cec83ea4697761b3d297cc8fd50cd44a568af64 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Thu, 27 Jun 2024 08:14:24 -0500 Subject: [PATCH 17/40] Switch to sender-supplied channels in app example --- ex/high_throughput_inference/mock_app.py | 6 ++++-- ex/high_throughput_inference/standalone_workermanager.py | 3 +-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/ex/high_throughput_inference/mock_app.py b/ex/high_throughput_inference/mock_app.py index d22792d15b..8a00e8f0e4 100644 --- a/ex/high_throughput_inference/mock_app.py +++ b/ex/high_throughput_inference/mock_app.py @@ -62,6 +62,9 @@ print(",".join(headers)) + from_worker_ch = Channel.make_process_local() + to_worker_ch = Channel.make_process_local() + for batch_size in [1, 8, 32, 64, 128]: timings = [] @@ -79,7 +82,6 @@ timings[-1].append(time.perf_counter() - interm) interm = time.perf_counter() - from_worker_ch = Channel.make_process_local() request = MessageHandler.build_request( reply_channel=from_worker_ch.serialize(), @@ -95,7 +97,7 @@ request_bytes = MessageHandler.serialize_request(request) timings[-1].append(time.perf_counter() - interm) interm = time.perf_counter() - with to_worker_fli.sendh(timeout=None) as to_sendh: + with to_worker_fli.sendh(timeout=None, stream_channel=to_worker_ch) as to_sendh: to_sendh.send_bytes(request_bytes) timings[-1].append(time.perf_counter() - interm) interm = time.perf_counter() diff --git a/ex/high_throughput_inference/standalone_workermanager.py b/ex/high_throughput_inference/standalone_workermanager.py index 40fefcc372..cdc97f4c2e 100644 --- a/ex/high_throughput_inference/standalone_workermanager.py +++ b/ex/high_throughput_inference/standalone_workermanager.py @@ -26,8 +26,7 @@ to_worker_channel = Channel.make_process_local() to_worker_manager_channel = Channel.make_process_local() - channels = [Channel.make_process_local() for _ in range(100)] - to_worker_fli = fli.FLInterface(main_ch=to_worker_channel, manager_ch=to_worker_manager_channel, stream_channels=channels) + to_worker_fli = fli.FLInterface(main_ch=to_worker_channel, manager_ch=to_worker_manager_channel) ddict["to_worker_fli"] = b64encode(to_worker_fli.serialize()) torch_worker = TorchWorker() From 
3ad6d445662a611539b40cb72fcba1a0b4ea102f Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Thu, 27 Jun 2024 16:55:59 -0500 Subject: [PATCH 18/40] Add prototype client for mock app --- ex/high_throughput_inference/mli_driver.py | 2 +- ex/high_throughput_inference/mock_app.py | 206 ++++++++++++--------- 2 files changed, 116 insertions(+), 92 deletions(-) diff --git a/ex/high_throughput_inference/mli_driver.py b/ex/high_throughput_inference/mli_driver.py index 7b8db5ed83..d32d88e51b 100644 --- a/ex/high_throughput_inference/mli_driver.py +++ b/ex/high_throughput_inference/mli_driver.py @@ -4,7 +4,7 @@ from smartsim.status import TERMINAL_STATUSES import time -device = "cpu" +device = "gpu" filedir = os.path.dirname(__file__) worker_manager_script_name = os.path.join(filedir, "standalone_workermanager.py") app_script_name = os.path.join(filedir, "mock_app.py") diff --git a/ex/high_throughput_inference/mock_app.py b/ex/high_throughput_inference/mock_app.py index 8a00e8f0e4..aa3aaeb3ee 100644 --- a/ex/high_throughput_inference/mock_app.py +++ b/ex/high_throughput_inference/mock_app.py @@ -15,111 +15,135 @@ import os import time import torch +import numbers + +from collections import OrderedDict from smartsim._core.mli.message_handler import MessageHandler +class ProtoClient: + def __init__(self, timing_on: bool): + connect_to_infrastructure() + ddict_str = os.environ["SS_DRG_DDICT"] + self._ddict = DDict.attach(ddict_str) + to_worker_fli_str = None + while to_worker_fli_str is None: + try: + to_worker_fli_str = self._ddict["to_worker_fli"] + self._to_worker_fli = fli.FLInterface.attach(b64decode(to_worker_fli_str)) + except KeyError: + time.sleep(1) + self._from_worker_ch = Channel.make_process_local() + self._from_worker_ch_serialized = self._from_worker_ch.serialize() + self._to_worker_ch = Channel.make_process_local() + + self._start = None + self._interm = None + self._timings: OrderedDict[str, list[numbers.Number]] = OrderedDict() + self._timing_on = timing_on + + def _add_label_to_timings(self, label: str): + if label not in self._timings: + self._timings[label] = [] + + @staticmethod + def _format_number(number: numbers.Number): + return f"{number:0.4e}" + + def start_timings(self, batch_size: int): + if self._timing_on: + self._add_label_to_timings("batch_size") + self._timings["batch_size"].append(batch_size) + self._start = time.perf_counter() + self._interm = time.perf_counter() + + def end_timings(self): + if self._timing_on: + self._add_label_to_timings("total_time") + self._timings["total_time"].append(self._format_number(time.perf_counter()-self._start)) + + def measure_time(self, label: str): + if self._timing_on: + self._add_label_to_timings(label) + self._timings[label].append(self._format_number(time.perf_counter()-self._interm)) + self._interm = time.perf_counter() + + def print_timings(self, to_file: bool = False): + print(" ".join(self._timings.keys())) + value_array = numpy.array([value for value in self._timings.values()], dtype=float) + value_array = numpy.transpose(value_array) + for i in range(value_array.shape[0]): + print(" ".join(self._format_number(value) for value in value_array[i])) + if to_file: + numpy.save("timings.npy", value_array) + numpy.savetxt("timings.txt", value_array) + + + def run_model(self, model: bytes, batch: torch.Tensor): + self.start_timings(batch.shape[0]) + built_tensor = MessageHandler.build_tensor( + batch.numpy(), "c", "float32", list(batch.shape)) + self.measure_time("build_tensor") + request = MessageHandler.build_request( + 
reply_channel=self._from_worker_ch_serialized, + model=model, + inputs=[built_tensor], + outputs=[], + output_descriptors=[], + custom_attributes=None, + ) + self.measure_time("build_request") + request_bytes = MessageHandler.serialize_request(request) + self.measure_time("serialize_request") + with self._to_worker_fli.sendh(timeout=None, stream_channel=self._to_worker_ch) as to_sendh: + to_sendh.send_bytes(request_bytes) + + self.measure_time("send") + with self._from_worker_ch.recvh(timeout=None) as from_recvh: + resp = from_recvh.recv_bytes(timeout=None) + self.measure_time("receive") + response = MessageHandler.deserialize_response(resp) + self.measure_time("deserialize_response") + result = torch.from_numpy( + numpy.frombuffer( + response.result.data[0].blob, + dtype=str(response.result.data[0].tensorDescriptor.dataType), + ) + ) + self.measure_time("deserialize_tensor") -if __name__ == "__main__": + self.end_timings() + return result - parser = argparse.ArgumentParser("Mock application") - parser.add_argument("--device", default="cpu") - args = parser.parse_args() +class ResNetWrapper(): + def __init__(self, model: str): + self._model = torch.jit.load(model) + buffer = io.BytesIO() + scripted = torch.jit.trace(self._model, self.get_batch()) + torch.jit.save(scripted, buffer) + self._serialized_model = buffer.getvalue() - connect_to_infrastructure() - ddict_str = os.environ["SS_DRG_DDICT"] + def get_batch(self, batch_size: int=32): + return torch.randn((batch_size, 3, 224, 224), dtype=torch.float32) - ddict = DDict.attach(ddict_str) + @property + def model(self): + return self._serialized_model - to_worker_fli_str = None +if __name__ == "__main__": - while to_worker_fli_str is None: - try: - to_worker_fli_str = ddict["to_worker_fli"] - except Exception as e: - time.sleep(1) + parser = argparse.ArgumentParser("Mock application") + parser.add_argument("--device", default="cpu") + args = parser.parse_args() - to_worker_fli = fli.FLInterface.attach(b64decode(to_worker_fli_str)) + resnet = ResNetWrapper(f"resnet50.{args.device.upper()}.pt") - batch_size = 32 - model = torch.jit.load(f"resnet50.{args.device.upper()}.pt") - buffer = io.BytesIO() - batch = torch.randn((batch_size, 3, 224, 224), dtype=torch.float32) - scripted = torch.jit.trace(model, batch) - torch.jit.save(scripted, buffer) + client = ProtoClient(timing_on=True) total_iterations = 10 - headers=[ - "batch_size", - "build_tensor", - "build_request", - "serialize_request", - "send", - "receive", - "deserialize_response", - "deserialize_tensor", - ] - - print(",".join(headers)) - - from_worker_ch = Channel.make_process_local() - to_worker_ch = Channel.make_process_local() - for batch_size in [1, 8, 32, 64, 128]: - - timings = [] for iteration_number in range(total_iterations + int(batch_size==1)): + client.run_model(resnet.model, resnet.get_batch(batch_size)) - timings.append([batch_size]) - - batch = torch.randn((batch_size, 3, 224, 224), dtype=torch.float32) - - start = time.perf_counter() - interm = start - built_tensor = MessageHandler.build_tensor( - batch.numpy(), "c", "float32", list(batch.shape) - ) - timings[-1].append(time.perf_counter() - interm) - interm = time.perf_counter() - - - request = MessageHandler.build_request( - reply_channel=from_worker_ch.serialize(), - model=buffer.getvalue(), - inputs=[built_tensor], - outputs=[], - output_descriptors=[], - custom_attributes=None, - ) - - timings[-1].append(time.perf_counter() - interm) - interm = time.perf_counter() - request_bytes = 
MessageHandler.serialize_request(request) - timings[-1].append(time.perf_counter() - interm) - interm = time.perf_counter() - with to_worker_fli.sendh(timeout=None, stream_channel=to_worker_ch) as to_sendh: - to_sendh.send_bytes(request_bytes) - timings[-1].append(time.perf_counter() - interm) - interm = time.perf_counter() - - with from_worker_ch.recvh(timeout=None) as from_recvh: - resp = from_recvh.recv_bytes(timeout=None) - timings[-1].append(time.perf_counter() - interm) - interm = time.perf_counter() - response = MessageHandler.deserialize_response(resp) - timings[-1].append(time.perf_counter() - interm) - interm = time.perf_counter() - result = torch.from_numpy( - numpy.frombuffer( - response.result.data[0].blob, - dtype=str(response.result.data[0].tensorDescriptor.dataType), - ) - ) - - timings[-1].append(time.perf_counter() - interm) - interm = time.perf_counter() - - # duration = time.perf_counter() - start - # print(f"{duration:.3f} s") - - print(",".join(str(timing) for timing in timings[-1])) + client.print_timings(to_file=True) \ No newline at end of file From bd5f13357b181ee07e2df880b519d8464c8af174 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Fri, 28 Jun 2024 14:55:18 -0500 Subject: [PATCH 19/40] Update mock app --- ex/high_throughput_inference/mli_driver.py | 5 +++-- ex/high_throughput_inference/mock_app.py | 9 +++++++-- ex/high_throughput_inference/standalone_workermanager.py | 3 +-- 3 files changed, 11 insertions(+), 6 deletions(-) diff --git a/ex/high_throughput_inference/mli_driver.py b/ex/high_throughput_inference/mli_driver.py index d32d88e51b..9b899f4124 100644 --- a/ex/high_throughput_inference/mli_driver.py +++ b/ex/high_throughput_inference/mli_driver.py @@ -10,8 +10,9 @@ app_script_name = os.path.join(filedir, "mock_app.py") model_name = os.path.join(filedir, f"resnet50.{device.upper()}.pt") - -exp = Experiment("MLI_proto", launcher="dragon", exp_path=os.path.join(filedir, "MLI_proto")) +exp_path = os.path.join(filedir, "MLI_proto") +os.makedirs(exp_path, exist_ok=True) +exp = Experiment("MLI_proto", launcher="dragon", exp_path=exp_path) worker_manager_rs = exp.create_run_settings(sys.executable, [worker_manager_script_name, "--device", device]) worker_manager = exp.create_model("worker_manager", run_settings=worker_manager_rs) diff --git a/ex/high_throughput_inference/mock_app.py b/ex/high_throughput_inference/mock_app.py index aa3aaeb3ee..666d7fcc91 100644 --- a/ex/high_throughput_inference/mock_app.py +++ b/ex/high_throughput_inference/mock_app.py @@ -19,6 +19,9 @@ from collections import OrderedDict from smartsim._core.mli.message_handler import MessageHandler +from smartsim.log import get_logger + +logger = get_logger("App") class ProtoClient: def __init__(self, timing_on: bool): @@ -140,10 +143,12 @@ def model(self): client = ProtoClient(timing_on=True) - total_iterations = 10 + total_iterations = 100 - for batch_size in [1, 8, 32, 64, 128]: + for batch_size in [1, 2, 4, 8, 16, 32, 64, 128]: + logger.info(f"Batch size: {batch_size}") for iteration_number in range(total_iterations + int(batch_size==1)): + logger.info(f"Iteration: {iteration_number}") client.run_model(resnet.model, resnet.get_batch(batch_size)) client.print_timings(to_file=True) \ No newline at end of file diff --git a/ex/high_throughput_inference/standalone_workermanager.py b/ex/high_throughput_inference/standalone_workermanager.py index cdc97f4c2e..ccefcbf584 100644 --- a/ex/high_throughput_inference/standalone_workermanager.py +++ 
b/ex/high_throughput_inference/standalone_workermanager.py @@ -25,8 +25,7 @@ ddict = DDict.attach(ddict_str) to_worker_channel = Channel.make_process_local() - to_worker_manager_channel = Channel.make_process_local() - to_worker_fli = fli.FLInterface(main_ch=to_worker_channel, manager_ch=to_worker_manager_channel) + to_worker_fli = fli.FLInterface(main_ch=to_worker_channel, manager_ch=None) ddict["to_worker_fli"] = b64encode(to_worker_fli.serialize()) torch_worker = TorchWorker() From 3e343ee5dff7d85646a39db1b56123efa575f387 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Thu, 4 Jul 2024 05:40:59 -0500 Subject: [PATCH 20/40] Changes to feature store --- smartsim/_core/launcher/dragon/dragonBackend.py | 2 +- .../infrastructure/storage/dragonfeaturestore.py | 12 ++++-------- .../mli/infrastructure/worker/torch_worker.py | 2 +- smartsim/_core/mli/infrastructure/worker/worker.py | 14 +++++++++++++- smartsim/_core/mli/message_handler.py | 4 +++- 5 files changed, 22 insertions(+), 12 deletions(-) diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py index 52f69ec41f..856de38030 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -401,7 +401,7 @@ def infra_ddict(self) -> str: """ if self._infra_ddict is None: logger.info("Creating DDict") - self._infra_ddict = dragon_ddict.DDict() # todo: parametrize + self._infra_ddict = dragon_ddict.DDict(n_nodes=len(self._hosts), total_mem=len(self._hosts)*1024**3) # todo: parametrize logger.info("Created DDict") self._infra_ddict["creation"] = str(time.time()) logger.info(self._infra_ddict["creation"]) diff --git a/smartsim/_core/mli/infrastructure/storage/dragonfeaturestore.py b/smartsim/_core/mli/infrastructure/storage/dragonfeaturestore.py index ea8f06977d..53f2f461f8 100644 --- a/smartsim/_core/mli/infrastructure/storage/dragonfeaturestore.py +++ b/smartsim/_core/mli/infrastructure/storage/dragonfeaturestore.py @@ -47,24 +47,20 @@ def __init__(self, storage: "DragonDict") -> None: def __getitem__(self, key: str) -> t.Any: """Retrieve an item using key :param key: Unique key of an item to retrieve from the feature store""" - key_ = key.encode("utf-8") try: - return self._storage[key_] + return self._storage[key] except Exception as ex: # note: explicitly avoid round-trip to check for key existence raise sse.SmartSimError(f"{key} not found in feature store") from ex - def __setitem__(self, key: str, value: bytes) -> None: + def __setitem__(self, key: str, value: str) -> None: """Assign a value using key :param key: Unique key of an item to set in the feature store :param value: Value to persist in the feature store""" - key_ = key.encode("utf-8") - self._storage[key_] = value + self._storage[key] = value - def __contains__(self, key: t.Union[str, bytes]) -> bool: + def __contains__(self, key: t.Union[str]) -> bool: """Membership operator to test for a key existing within the feature store. 
Return `True` if the key is found, `False` otherwise :param key: Unique key of an item to retrieve from the feature store""" - if isinstance(key, str): - key = key.encode("utf-8") return key in self._storage diff --git a/smartsim/_core/mli/infrastructure/worker/torch_worker.py b/smartsim/_core/mli/infrastructure/worker/torch_worker.py index c350499c20..122b9ddf2f 100644 --- a/smartsim/_core/mli/infrastructure/worker/torch_worker.py +++ b/smartsim/_core/mli/infrastructure/worker/torch_worker.py @@ -108,7 +108,7 @@ def transform_output( ) -> TransformOutputResult: if result_device != "cpu": transformed = [ - item.to("cpu").clone() for item in execute_result.predictions + item.to("cpu") for item in execute_result.predictions ] # todo: need the shape from latest schemas added here. return TransformOutputResult(transformed, None, "c", "float32") # fixme diff --git a/smartsim/_core/mli/infrastructure/worker/worker.py b/smartsim/_core/mli/infrastructure/worker/worker.py index 24dc734d00..40696ac22f 100644 --- a/smartsim/_core/mli/infrastructure/worker/worker.py +++ b/smartsim/_core/mli/infrastructure/worker/worker.py @@ -32,6 +32,18 @@ from ...comm.channel.channel import CommChannelBase from ...infrastructure.storage.featurestore import FeatureStore +import sys + +# isort: off +try: + import dragon + from dragon.utils import b64decode +except ImportError as exc: + if not "pytest" in sys.modules: + raise exc from None + +# isort: on + logger = get_logger(__name__) @@ -167,7 +179,7 @@ def fetch_model( ) try: - raw_bytes = feature_store[request.model_key] + raw_bytes = b64decode(feature_store[request.model_key]) return FetchModelResult(raw_bytes) except FileNotFoundError as ex: logger.exception(ex) diff --git a/smartsim/_core/mli/message_handler.py b/smartsim/_core/mli/message_handler.py index fd8f6aeed7..1928db2f7c 100644 --- a/smartsim/_core/mli/message_handler.py +++ b/smartsim/_core/mli/message_handler.py @@ -200,7 +200,9 @@ def _assign_model( if isinstance(model, bytes): request.model.modelData = model else: - request.model.modelKey = model # type: ignore + model_key = data_references_capnp.ModelKey() + model_key.key = model + request.model.modelKey = model_key # type: ignore except Exception as e: raise ValueError("Error building model portion of request.") from e From a2bed267d8dbc1af109cad6708557afb11687d0a Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Fri, 5 Jul 2024 17:45:32 +0200 Subject: [PATCH 21/40] Make style --- smartsim/_core/launcher/dragon/dragonBackend.py | 4 +++- smartsim/_core/mli/infrastructure/worker/torch_worker.py | 4 +--- smartsim/_core/mli/infrastructure/worker/worker.py | 3 +-- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py index 856de38030..dcc5c8392b 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -401,7 +401,9 @@ def infra_ddict(self) -> str: """ if self._infra_ddict is None: logger.info("Creating DDict") - self._infra_ddict = dragon_ddict.DDict(n_nodes=len(self._hosts), total_mem=len(self._hosts)*1024**3) # todo: parametrize + self._infra_ddict = dragon_ddict.DDict( + n_nodes=len(self._hosts), total_mem=len(self._hosts) * 1024**3 + ) # todo: parametrize logger.info("Created DDict") self._infra_ddict["creation"] = str(time.time()) logger.info(self._infra_ddict["creation"]) diff --git a/smartsim/_core/mli/infrastructure/worker/torch_worker.py 
b/smartsim/_core/mli/infrastructure/worker/torch_worker.py index 122b9ddf2f..28237dc422 100644 --- a/smartsim/_core/mli/infrastructure/worker/torch_worker.py +++ b/smartsim/_core/mli/infrastructure/worker/torch_worker.py @@ -107,9 +107,7 @@ def transform_output( result_device: str, ) -> TransformOutputResult: if result_device != "cpu": - transformed = [ - item.to("cpu") for item in execute_result.predictions - ] + transformed = [item.to("cpu") for item in execute_result.predictions] # todo: need the shape from latest schemas added here. return TransformOutputResult(transformed, None, "c", "float32") # fixme diff --git a/smartsim/_core/mli/infrastructure/worker/worker.py b/smartsim/_core/mli/infrastructure/worker/worker.py index 73eff4e8ea..e368935a0d 100644 --- a/smartsim/_core/mli/infrastructure/worker/worker.py +++ b/smartsim/_core/mli/infrastructure/worker/worker.py @@ -24,6 +24,7 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +import sys import typing as t from abc import ABC, abstractmethod @@ -33,8 +34,6 @@ from ...infrastructure.storage.featurestore import FeatureStore from ...mli_schemas.model.model_capnp import Model -import sys - # isort: off try: import dragon From 36e92d9dabcdd013cdba637a2629e19c15896cb5 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Fri, 5 Jul 2024 18:07:31 +0200 Subject: [PATCH 22/40] Fix typing --- .../mli/infrastructure/storage/featurestore.py | 2 +- .../_core/mli/infrastructure/worker/torch_worker.py | 13 ++++++++----- smartsim/_core/mli/infrastructure/worker/worker.py | 2 +- 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/smartsim/_core/mli/infrastructure/storage/featurestore.py b/smartsim/_core/mli/infrastructure/storage/featurestore.py index ec4086b732..e18643e932 100644 --- a/smartsim/_core/mli/infrastructure/storage/featurestore.py +++ b/smartsim/_core/mli/infrastructure/storage/featurestore.py @@ -37,7 +37,7 @@ def __getitem__(self, key: str) -> bytes: :param key: Unique key of an item to retrieve from the feature store""" @abstractmethod - def __setitem__(self, key: str, value: bytes) -> None: + def __setitem__(self, key: str, value: str) -> None: """Assign a value using key :param key: Unique key of an item to set in the feature store :param value: Value to persist in the feature store""" diff --git a/smartsim/_core/mli/infrastructure/worker/torch_worker.py b/smartsim/_core/mli/infrastructure/worker/torch_worker.py index 28237dc422..e21513648b 100644 --- a/smartsim/_core/mli/infrastructure/worker/torch_worker.py +++ b/smartsim/_core/mli/infrastructure/worker/torch_worker.py @@ -53,13 +53,16 @@ class TorchWorker(MachineLearningWorkerBase): def load_model( request: InferenceRequest, fetch_result: FetchModelResult, device: str ) -> LoadModelResult: - model_bytes = fetch_result.model_bytes or request.raw_model - if not model_bytes: + if fetch_result.model_bytes: + model_bytes = fetch_result.model_bytes + elif request.raw_model and request.raw_model.data: + model_bytes = request.raw_model.data + else: raise ValueError("Unable to load model without reference object") - _device_to_torch = {"cpu": "cpu", "gpu": "cuda"} - device = _device_to_torch[device] - buffer = io.BytesIO(model_bytes) + device_to_torch = {"cpu": "cpu", "gpu": "cuda"} + device = device_to_torch[device] + buffer = io.BytesIO(initial_bytes=model_bytes) model = torch.jit.load(buffer, map_location=device) # type: ignore result = LoadModelResult(model) return result diff 
--git a/smartsim/_core/mli/infrastructure/worker/worker.py b/smartsim/_core/mli/infrastructure/worker/worker.py index e368935a0d..fb061348ee 100644 --- a/smartsim/_core/mli/infrastructure/worker/worker.py +++ b/smartsim/_core/mli/infrastructure/worker/worker.py @@ -148,7 +148,7 @@ class FetchModelResult: def __init__(self, result: bytes) -> None: """Initialize the object""" - self.model_bytes = result + self.model_bytes: bytes = result class MachineLearningWorkerCore: From 59840a3be12576eedce2528d93a8b601a768973e Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Fri, 5 Jul 2024 18:17:18 +0200 Subject: [PATCH 23/40] Fix lint --- smartsim/_core/mli/infrastructure/worker/torch_worker.py | 4 ++-- smartsim/_core/mli/infrastructure/worker/worker.py | 4 +--- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/smartsim/_core/mli/infrastructure/worker/torch_worker.py b/smartsim/_core/mli/infrastructure/worker/torch_worker.py index e21513648b..a4e725ab99 100644 --- a/smartsim/_core/mli/infrastructure/worker/torch_worker.py +++ b/smartsim/_core/mli/infrastructure/worker/torch_worker.py @@ -73,8 +73,8 @@ def transform_input( ) -> TransformInputResult: result = [] - _device_to_torch = {"cpu": "cpu", "gpu": "cuda"} - device = _device_to_torch[device] + device_to_torch = {"cpu": "cpu", "gpu": "cuda"} + device = device_to_torch[device] if fetch_result.meta is None: raise ValueError("Cannot reconstruct tensor without meta information") for item, item_meta in zip(fetch_result.inputs, fetch_result.meta): diff --git a/smartsim/_core/mli/infrastructure/worker/worker.py b/smartsim/_core/mli/infrastructure/worker/worker.py index fb061348ee..fe82ea2a3e 100644 --- a/smartsim/_core/mli/infrastructure/worker/worker.py +++ b/smartsim/_core/mli/infrastructure/worker/worker.py @@ -36,12 +36,10 @@ # isort: off try: - import dragon from dragon.utils import b64decode except ImportError as exc: - if not "pytest" in sys.modules: + if "pytest" not in sys.modules: raise exc from None - # isort: on logger = get_logger(__name__) From b35b37dd89bf6f7fd7a93c339e79643046d48abe Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Fri, 5 Jul 2024 18:32:00 +0200 Subject: [PATCH 24/40] Remove duplicated/useless comments --- smartsim/_core/mli/infrastructure/control/workermanager.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py index 4e276d2507..f0cae497a0 100644 --- a/smartsim/_core/mli/infrastructure/control/workermanager.py +++ b/smartsim/_core/mli/infrastructure/control/workermanager.py @@ -240,7 +240,6 @@ def _on_iteration(self) -> None: logger.warning("No queue to check for tasks") return - # perform default deserialization of the message envelope # perform default deserialization of the message envelope with self._task_queue.recvh(timeout=None) as recvh: try: @@ -254,9 +253,6 @@ def _on_iteration(self) -> None: if not self._validate_request(request): return - # # let the worker perform additional custom deserialization - # request = self._worker.deserialize(request_bytes) - fetch_model_result = self._worker.fetch_model(request, self._feature_store) model_result = self._worker.load_model( request, fetch_model_result, self._device @@ -294,7 +290,6 @@ def _on_iteration(self) -> None: response = build_reply(reply) - # serialized = self._worker.serialize_reply(request, transformed_output) serialized_resp = MessageHandler.serialize_response(response) # type: ignore if request.callback: 
request.callback.send(serialized_resp) From 51e0b17bdbf22683759597ece523778b6d7bd953 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Tue, 9 Jul 2024 12:37:22 -0500 Subject: [PATCH 25/40] Bring up to date with new schema --- ex/high_throughput_inference/mli_driver.py | 9 ++- ex/high_throughput_inference/mock_app.py | 30 +++++++++- .../standalone_workermanager.py | 57 +++++++++++++++++-- smartsim/_core/entrypoints/service.py | 3 +- smartsim/_core/mli/comm/channel/channel.py | 7 ++- .../_core/mli/comm/channel/dragonchannel.py | 6 ++ smartsim/_core/mli/comm/channel/dragonfli.py | 29 ++++++---- .../infrastructure/control/workermanager.py | 20 ++----- .../_core/mli/infrastructure/worker/worker.py | 11 ++-- 9 files changed, 128 insertions(+), 44 deletions(-) diff --git a/ex/high_throughput_inference/mli_driver.py b/ex/high_throughput_inference/mli_driver.py index 9b899f4124..4a3dd034e8 100644 --- a/ex/high_throughput_inference/mli_driver.py +++ b/ex/high_throughput_inference/mli_driver.py @@ -1,6 +1,11 @@ + + import os +import base64 +import cloudpickle import sys from smartsim import Experiment +from smartsim._core.mli.infrastructure.worker.torch_worker import TorchWorker from smartsim.status import TERMINAL_STATUSES import time @@ -14,7 +19,9 @@ os.makedirs(exp_path, exist_ok=True) exp = Experiment("MLI_proto", launcher="dragon", exp_path=exp_path) -worker_manager_rs = exp.create_run_settings(sys.executable, [worker_manager_script_name, "--device", device]) +torch_worker_str = base64.b64encode(cloudpickle.dumps(TorchWorker)).decode("ascii") + +worker_manager_rs = exp.create_run_settings(sys.executable, [worker_manager_script_name, "--device", device, "--worker_class", torch_worker_str]) worker_manager = exp.create_model("worker_manager", run_settings=worker_manager_rs) worker_manager.attach_generator_files(to_copy=[worker_manager_script_name]) diff --git a/ex/high_throughput_inference/mock_app.py b/ex/high_throughput_inference/mock_app.py index 666d7fcc91..df0ba55c76 100644 --- a/ex/high_throughput_inference/mock_app.py +++ b/ex/high_throughput_inference/mock_app.py @@ -1,3 +1,29 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
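The driver above now ships the worker class itself to the standalone worker manager: the class object is cloudpickled, base64-encoded so it survives as a plain command-line argument, and rebuilt into an instance on the other side. A short round trip of the encoding used by mli_driver.py and standalone_workermanager.py:

    import base64

    import cloudpickle

    from smartsim._core.mli.infrastructure.worker.torch_worker import TorchWorker

    # driver side: class -> pickled bytes -> ascii-safe CLI argument
    worker_arg = base64.b64encode(cloudpickle.dumps(TorchWorker)).decode("ascii")

    # worker manager side: argument -> class -> instance
    worker_cls = cloudpickle.loads(base64.b64decode(worker_arg.encode("ascii")))
    torch_worker = worker_cls()
    assert isinstance(torch_worker, TorchWorker)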
+ # isort: off import dragon from dragon import fli @@ -32,7 +58,7 @@ def __init__(self, timing_on: bool): while to_worker_fli_str is None: try: to_worker_fli_str = self._ddict["to_worker_fli"] - self._to_worker_fli = fli.FLInterface.attach(b64decode(to_worker_fli_str)) + self._to_worker_fli = fli.FLInterface.attach(to_worker_fli_str) except KeyError: time.sleep(1) self._from_worker_ch = Channel.make_process_local() @@ -88,7 +114,7 @@ def run_model(self, model: bytes, batch: torch.Tensor): self.measure_time("build_tensor") request = MessageHandler.build_request( reply_channel=self._from_worker_ch_serialized, - model=model, + model=MessageHandler.build_model(model, "resnet-50", "1.0"), inputs=[built_tensor], outputs=[], output_descriptors=[], diff --git a/ex/high_throughput_inference/standalone_workermanager.py b/ex/high_throughput_inference/standalone_workermanager.py index ccefcbf584..991e869581 100644 --- a/ex/high_throughput_inference/standalone_workermanager.py +++ b/ex/high_throughput_inference/standalone_workermanager.py @@ -1,3 +1,29 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
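With the updated schema the request no longer embeds raw model bytes directly: the client first wraps them in a Model message via MessageHandler.build_model, giving the model a name and version, and passes that to build_request. A minimal sketch mirroring the calls in mock_app.py; the model bytes and reply-channel descriptor below are placeholders, not values from the codebase:

    import numpy as np

    from smartsim._core.mli.message_handler import MessageHandler

    batch = np.random.rand(2, 3, 224, 224).astype("float32")
    tensor = MessageHandler.build_tensor(batch, "c", "float32", list(batch.shape))
    model = MessageHandler.build_model(b"<torchscript bytes>", "resnet-50", "1.0")

    request = MessageHandler.build_request(
        reply_channel=b"<serialized reply channel>",  # normally from_worker_ch.serialize()
        model=model,
        inputs=[tensor],
        outputs=[],
        output_descriptors=[],
        custom_attributes=None,
    )
    request_bytes = MessageHandler.serialize_request(request)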
+ # isort: off import dragon from dragon import fli @@ -7,10 +33,12 @@ from dragon.globalservices.api_setup import connect_to_infrastructure # isort: on import argparse +import base64 +import cloudpickle import os - from smartsim._core.mli.comm.channel.dragonchannel import DragonCommChannel +from smartsim._core.mli.comm.channel.dragonfli import DragonFLIChannel from smartsim._core.mli.infrastructure.worker.torch_worker import TorchWorker from smartsim._core.mli.infrastructure.control.workermanager import ( WorkerManager, @@ -18,7 +46,23 @@ if __name__ == "__main__": parser = argparse.ArgumentParser("Worker Manager") - parser.add_argument("--device", default="gpu") + parser.add_argument( + "--device", + type=str, + default="gpu", + choices="gpu cpu".split(), + help="Device on which the inference takes place", + ) + parser.add_argument( + "--worker_class", + type=str, + required=True, + help="Serialized class of worker to run", + ) + parser.add_argument( + "--num_workers", type=int, default=1, help="Number of workers to run" + ) + args = parser.parse_args() connect_to_infrastructure() ddict_str = os.environ["SS_DRG_DDICT"] @@ -26,12 +70,13 @@ to_worker_channel = Channel.make_process_local() to_worker_fli = fli.FLInterface(main_ch=to_worker_channel, manager_ch=None) - ddict["to_worker_fli"] = b64encode(to_worker_fli.serialize()) - - torch_worker = TorchWorker() + to_worker_fli_serialized = to_worker_fli.serialize() + ddict["to_worker_fli"] = to_worker_fli_serialized + torch_worker = cloudpickle.loads(base64.b64decode(args.worker_class.encode('ascii')))() + comm_channel = DragonFLIChannel(to_worker_fli_serialized) worker_manager = WorkerManager( - file_like_interface=to_worker_fli, + task_queue=comm_channel, worker=torch_worker, feature_store=None, as_service=True, diff --git a/smartsim/_core/entrypoints/service.py b/smartsim/_core/entrypoints/service.py index e03df6bea1..6b4ef74b67 100644 --- a/smartsim/_core/entrypoints/service.py +++ b/smartsim/_core/entrypoints/service.py @@ -46,7 +46,8 @@ def __init__( :param as_service: Determines if the host will run until shutdown criteria are met or as a run-once instance :param cooldown: Period of time to allow service to run before automatic - shutdown, in seconds. A non-zero, positive integer.""" + shutdown, in seconds. A non-zero, positive integer. 
+ :param loop_delay: delay between iterations of the event loop""" self._as_service = as_service """If the service should run until shutdown function returns True""" self._cooldown = abs(cooldown) diff --git a/smartsim/_core/mli/comm/channel/channel.py b/smartsim/_core/mli/comm/channel/channel.py index 201ab9deab..2318896a9b 100644 --- a/smartsim/_core/mli/comm/channel/channel.py +++ b/smartsim/_core/mli/comm/channel/channel.py @@ -41,9 +41,14 @@ def __init__(self, descriptor: t.Union[str, bytes]) -> None: @abstractmethod def send(self, value: bytes) -> None: - """Send a message throuh the underlying communication channel + """Send a message through the underlying communication channel :param value: The value to send""" + @abstractmethod + def recv(self) -> bytes: + """Receieve a message through the underlying communication channel + :returns: the received message""" + @property def descriptor(self) -> bytes: """Return the channel descriptor for the underlying dragon channel""" diff --git a/smartsim/_core/mli/comm/channel/dragonchannel.py b/smartsim/_core/mli/comm/channel/dragonchannel.py index 872eb32350..fb1a0c51c1 100644 --- a/smartsim/_core/mli/comm/channel/dragonchannel.py +++ b/smartsim/_core/mli/comm/channel/dragonchannel.py @@ -51,3 +51,9 @@ def send(self, value: bytes) -> None: :param value: The value to send""" with self._channel.sendh(timeout=None) as sendh: sendh.send_bytes(value) + + def recv(self) -> bytes: + """Receieve a message through the underlying communication channel + :returns: the received message""" + with self._channel.recvh(timeout=None) as recvh: + return recvh.recv_bytes(timeout=None) \ No newline at end of file diff --git a/smartsim/_core/mli/comm/channel/dragonfli.py b/smartsim/_core/mli/comm/channel/dragonfli.py index 3992241380..ebf824b7db 100644 --- a/smartsim/_core/mli/comm/channel/dragonfli.py +++ b/smartsim/_core/mli/comm/channel/dragonfli.py @@ -24,18 +24,12 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import sys - # isort: off -try: - from dragon import fli - import dragon.channels as dch -except ImportError as exc: - if not "pytest" in sys.modules: - raise exc from None +from dragon import fli +import dragon.channels as dch # isort: on - +import sys import smartsim._core.mli.comm.channel.channel as cch from smartsim.log import get_logger @@ -45,14 +39,25 @@ class DragonFLIChannel(cch.CommChannelBase): """Passes messages by writing to a Dragon FLI Channel""" - def __init__(self, fli_desc: bytes) -> None: + def __init__(self, fli_desc: str) -> None: """Initialize the DragonFLIChannel instance""" super().__init__(fli_desc) # todo: do we need memory pool information to construct the channel correctly? 
- self._channel: "dch.Channel" = fli.FLInterface.attach(fli_desc) + self._channel: "fli" = fli.FLInterface.attach(fli_desc) def send(self, value: bytes) -> None: - """Send a message throuh the underlying communication channel + """Send a message through the underlying communication channel :param value: The value to send""" with self._channel.sendh(timeout=None) as sendh: sendh.send_bytes(value) + + def recv(self) -> bytes: + """Receieve a message through the underlying communication channel + :returns: the received message""" + with self._channel.recvh(timeout=None) as recvh: + try: + request_bytes: bytes + request_bytes, _ = recvh.recv_bytes(timeout=None) + return request_bytes + except fli.FLIEOT as exc: + return b'' \ No newline at end of file diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py index f0cae497a0..6f31972727 100644 --- a/smartsim/_core/mli/infrastructure/control/workermanager.py +++ b/smartsim/_core/mli/infrastructure/control/workermanager.py @@ -27,14 +27,10 @@ import sys # isort: off -try: - import dragon - from dragon import fli -except ImportError as exc: - if not "pytest" in sys.modules: - raise exc from None - +import dragon +from dragon import fli # isort: on + import time import typing as t @@ -169,7 +165,7 @@ class WorkerManager(Service): def __init__( self, - file_like_interface: "fli.FLInterface", + task_queue: CommChannelBase, worker: MachineLearningWorkerBase, feature_store: t.Optional[FeatureStore] = None, as_service: bool = False, @@ -189,7 +185,7 @@ def __init__( super().__init__(as_service, cooldown) """a collection of workers the manager is controlling""" - self._task_queue: fli.FLInterface = file_like_interface + self._task_queue: CommChannelBase = task_queue """the queue the manager monitors for new tasks""" self._feature_store: t.Optional[FeatureStore] = feature_store """a feature store to retrieve models from""" @@ -241,11 +237,7 @@ def _on_iteration(self) -> None: return # perform default deserialization of the message envelope - with self._task_queue.recvh(timeout=None) as recvh: - try: - request_bytes, _ = recvh.recv_bytes(timeout=None) - except fli.FLIEOT as exc: - return + request_bytes = self._task_queue.recv() request = deserialize_message( request_bytes, self._comm_channel_type, self._device diff --git a/smartsim/_core/mli/infrastructure/worker/worker.py b/smartsim/_core/mli/infrastructure/worker/worker.py index fe82ea2a3e..808c9cf9bf 100644 --- a/smartsim/_core/mli/infrastructure/worker/worker.py +++ b/smartsim/_core/mli/infrastructure/worker/worker.py @@ -24,6 +24,10 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+# isort: off +from dragon.utils import b64decode +# isort: on + import sys import typing as t from abc import ABC, abstractmethod @@ -34,13 +38,6 @@ from ...infrastructure.storage.featurestore import FeatureStore from ...mli_schemas.model.model_capnp import Model -# isort: off -try: - from dragon.utils import b64decode -except ImportError as exc: - if "pytest" not in sys.modules: - raise exc from None -# isort: on logger = get_logger(__name__) From 1fcf17d4456f99a6ad34d6360879e2e2a2b24f12 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Wed, 10 Jul 2024 11:06:08 -0500 Subject: [PATCH 26/40] Add feature store prototype caching --- ex/high_throughput_inference/mli_driver.py | 7 +- ex/high_throughput_inference/mock_app.py | 19 +++- .../standalone_workermanager.py | 10 +- smartsim/_core/entrypoints/service.py | 17 ++++ .../_core/mli/comm/channel/dragonchannel.py | 3 +- smartsim/_core/mli/comm/channel/dragonfli.py | 4 +- .../infrastructure/control/workermanager.py | 96 ++++++++++++++++--- .../storage/dragonfeaturestore.py | 15 ++- .../infrastructure/storage/featurestore.py | 5 +- .../_core/mli/infrastructure/worker/worker.py | 10 +- tests/mli/test_worker_manager.py | 8 +- 11 files changed, 147 insertions(+), 47 deletions(-) diff --git a/ex/high_throughput_inference/mli_driver.py b/ex/high_throughput_inference/mli_driver.py index 4a3dd034e8..4e68fdfbcb 100644 --- a/ex/high_throughput_inference/mli_driver.py +++ b/ex/high_throughput_inference/mli_driver.py @@ -8,6 +8,7 @@ from smartsim._core.mli.infrastructure.worker.torch_worker import TorchWorker from smartsim.status import TERMINAL_STATUSES import time +import typing as t device = "gpu" filedir = os.path.dirname(__file__) @@ -15,7 +16,11 @@ app_script_name = os.path.join(filedir, "mock_app.py") model_name = os.path.join(filedir, f"resnet50.{device.upper()}.pt") -exp_path = os.path.join(filedir, "MLI_proto") +transport: t.Literal["hsta", "tcp"] = "hsta" + +os.environ["SMARTSIM_DRAGON_TRANSPORT"] = transport + +exp_path = os.path.join(filedir, f"MLI_proto_{transport.upper()}") os.makedirs(exp_path, exist_ok=True) exp = Experiment("MLI_proto", launcher="dragon", exp_path=exp_path) diff --git a/ex/high_throughput_inference/mock_app.py b/ex/high_throughput_inference/mock_app.py index df0ba55c76..4ecce58ac7 100644 --- a/ex/high_throughput_inference/mock_app.py +++ b/ex/high_throughput_inference/mock_app.py @@ -31,7 +31,7 @@ import dragon.channels from dragon.data.ddict.ddict import DDict from dragon.globalservices.api_setup import connect_to_infrastructure -from dragon.utils import b64decode +from dragon.utils import b64decode, b64encode # isort: on @@ -107,7 +107,7 @@ def print_timings(self, to_file: bool = False): numpy.savetxt("timings.txt", value_array) - def run_model(self, model: bytes, batch: torch.Tensor): + def run_model(self, model: bytes | str, batch: torch.Tensor): self.start_timings(batch.shape[0]) built_tensor = MessageHandler.build_tensor( batch.numpy(), "c", "float32", list(batch.shape)) @@ -143,10 +143,14 @@ def run_model(self, model: bytes, batch: torch.Tensor): self.end_timings() return result + def set_model(self, key: str, model: bytes): + self._ddict[key] = b64encode(model) + class ResNetWrapper(): - def __init__(self, model: str): + def __init__(self, name: str, model: str): self._model = torch.jit.load(model) + self._name = name buffer = io.BytesIO() scripted = torch.jit.trace(self._model, self.get_batch()) torch.jit.save(scripted, buffer) @@ -159,15 +163,20 @@ def get_batch(self, batch_size: int=32): def model(self): return 
self._serialized_model + @property + def name(self): + return self._name + if __name__ == "__main__": parser = argparse.ArgumentParser("Mock application") parser.add_argument("--device", default="cpu") args = parser.parse_args() - resnet = ResNetWrapper(f"resnet50.{args.device.upper()}.pt") + resnet = ResNetWrapper("resnet50", f"resnet50.{args.device.upper()}.pt") client = ProtoClient(timing_on=True) + client.set_model(resnet.name, resnet.model) total_iterations = 100 @@ -175,6 +184,6 @@ def model(self): logger.info(f"Batch size: {batch_size}") for iteration_number in range(total_iterations + int(batch_size==1)): logger.info(f"Iteration: {iteration_number}") - client.run_model(resnet.model, resnet.get_batch(batch_size)) + client.run_model(resnet.name, resnet.get_batch(batch_size)) client.print_timings(to_file=True) \ No newline at end of file diff --git a/ex/high_throughput_inference/standalone_workermanager.py b/ex/high_throughput_inference/standalone_workermanager.py index 991e869581..f3e8e7c589 100644 --- a/ex/high_throughput_inference/standalone_workermanager.py +++ b/ex/high_throughput_inference/standalone_workermanager.py @@ -38,11 +38,11 @@ import os from smartsim._core.mli.comm.channel.dragonchannel import DragonCommChannel +from smartsim._core.mli.infrastructure.storage.dragonfeaturestore import DragonFeatureStore from smartsim._core.mli.comm.channel.dragonfli import DragonFLIChannel from smartsim._core.mli.infrastructure.worker.torch_worker import TorchWorker -from smartsim._core.mli.infrastructure.control.workermanager import ( - WorkerManager, -) +from smartsim._core.mli.infrastructure.control.workermanager import WorkerManager + if __name__ == "__main__": parser = argparse.ArgumentParser("Worker Manager") @@ -74,11 +74,13 @@ ddict["to_worker_fli"] = to_worker_fli_serialized torch_worker = cloudpickle.loads(base64.b64decode(args.worker_class.encode('ascii')))() + + dfs = DragonFeatureStore(ddict) comm_channel = DragonFLIChannel(to_worker_fli_serialized) worker_manager = WorkerManager( task_queue=comm_channel, worker=torch_worker, - feature_store=None, + feature_store=dfs, as_service=True, cooldown=10, comm_channel_type=DragonCommChannel, diff --git a/smartsim/_core/entrypoints/service.py b/smartsim/_core/entrypoints/service.py index 6b4ef74b67..df9c2bbef6 100644 --- a/smartsim/_core/entrypoints/service.py +++ b/smartsim/_core/entrypoints/service.py @@ -103,6 +103,23 @@ def execute(self) -> None: running = True cooldown_start: t.Optional[datetime.datetime] = None + headers = [ + "batch_size", + "w_deserialize", + "w_fetch_model", + "w_load_model", + "w_fetch_input", + "w_transform_input", + "w_execute", + "w_transform_output", + "w_assign_output", + "w_build_reply", + "w_serialize_resp", + "w_send", + ] + + print(",".join(headers)) + while running: self._on_iteration() diff --git a/smartsim/_core/mli/comm/channel/dragonchannel.py b/smartsim/_core/mli/comm/channel/dragonchannel.py index fb1a0c51c1..1409747a91 100644 --- a/smartsim/_core/mli/comm/channel/dragonchannel.py +++ b/smartsim/_core/mli/comm/channel/dragonchannel.py @@ -56,4 +56,5 @@ def recv(self) -> bytes: """Receieve a message through the underlying communication channel :returns: the received message""" with self._channel.recvh(timeout=None) as recvh: - return recvh.recv_bytes(timeout=None) \ No newline at end of file + message_bytes: bytes = recvh.recv_bytes(timeout=None) + return message_bytes diff --git a/smartsim/_core/mli/comm/channel/dragonfli.py b/smartsim/_core/mli/comm/channel/dragonfli.py index 
ebf824b7db..0c1aba94e3 100644 --- a/smartsim/_core/mli/comm/channel/dragonfli.py +++ b/smartsim/_core/mli/comm/channel/dragonfli.py @@ -27,9 +27,11 @@ # isort: off from dragon import fli import dragon.channels as dch + # isort: on import sys + import smartsim._core.mli.comm.channel.channel as cch from smartsim.log import get_logger @@ -60,4 +62,4 @@ def recv(self) -> bytes: request_bytes, _ = recvh.recv_bytes(timeout=None) return request_bytes except fli.FLIEOT as exc: - return b'' \ No newline at end of file + return b"" diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py index 6f31972727..d3cc2d84ae 100644 --- a/smartsim/_core/mli/infrastructure/control/workermanager.py +++ b/smartsim/_core/mli/infrastructure/control/workermanager.py @@ -29,6 +29,7 @@ # isort: off import dragon from dragon import fli + # isort: on import time @@ -36,18 +37,20 @@ import numpy as np -from smartsim._core.entrypoints.service import Service -from smartsim._core.mli.comm.channel.channel import CommChannelBase -from smartsim._core.mli.comm.channel.dragonfli import DragonFLIChannel -from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStore -from smartsim._core.mli.infrastructure.worker.worker import ( +from .....error import SmartSimError +from .....log import get_logger +from ....entrypoints.service import Service +from ...comm.channel.channel import CommChannelBase +from ...comm.channel.dragonfli import DragonFLIChannel +from ...infrastructure.storage.featurestore import FeatureStore +from ...infrastructure.worker.worker import ( InferenceReply, InferenceRequest, + LoadModelResult, MachineLearningWorkerBase, ) -from smartsim._core.mli.message_handler import MessageHandler -from smartsim._core.mli.mli_schemas.response.response_capnp import Response -from smartsim.log import get_logger +from ...message_handler import MessageHandler +from ...mli_schemas.response.response_capnp import Response if t.TYPE_CHECKING: from smartsim._core.mli.mli_schemas.model.model_capnp import Model @@ -195,6 +198,8 @@ def __init__( """The type of communication channel to construct for callbacks""" self._device = device """Device on which workers need to run""" + self._cached_models: dict[str, t.Any] = {} + """Dictionary of previously loaded models""" def _validate_request(self, request: InferenceRequest) -> bool: """Ensure the request can be processed. 
@@ -236,34 +241,84 @@ def _on_iteration(self) -> None: logger.warning("No queue to check for tasks") return + timings = [] # perform default deserialization of the message envelope - request_bytes = self._task_queue.recv() + request_bytes: bytes = self._task_queue.recv() + interm = time.perf_counter() request = deserialize_message( request_bytes, self._comm_channel_type, self._device ) if not self._validate_request(request): return - fetch_model_result = self._worker.fetch_model(request, self._feature_store) - model_result = self._worker.load_model( - request, fetch_model_result, self._device - ) + timings.append(time.perf_counter() - interm) + interm = time.perf_counter() + + if not request.raw_model: + if not request.model_key: + raise SmartSimError("Neither key, nor model provided") + + if request.model_key in self._cached_models: + timings.append(time.perf_counter() - interm) + interm = time.perf_counter() + model_result = LoadModelResult(self._cached_models[request.model_key]) + + else: + fetch_model_result = None + while True: + try: + interm = time.perf_counter() + fetch_model_result = self._worker.fetch_model( + request, self._feature_store + ) + except KeyError: + time.sleep(0.1) + else: + break + + if fetch_model_result is None: + raise SmartSimError("Could not retrieve model from feature store") + timings.append(time.perf_counter() - interm) + interm = time.perf_counter() + model_result = self._worker.load_model( + request, fetch_model_result, self._device + ) + self._cached_models[request.model_key] = model_result.model + else: + fetch_model_result = self._worker.fetch_model(request, None) + model_result = self._worker.load_model( + request, fetch_result=fetch_model_result, device=self._device + ) + + timings.append(time.perf_counter() - interm) + interm = time.perf_counter() fetch_input_result = self._worker.fetch_inputs(request, self._feature_store) + + timings.append(time.perf_counter() - interm) + interm = time.perf_counter() transformed_input = self._worker.transform_input( request, fetch_input_result, self._device ) + timings.append(time.perf_counter() - interm) + interm = time.perf_counter() + reply = InferenceReply() try: execute_result = self._worker.execute( request, model_result, transformed_input ) + + timings.append(time.perf_counter() - interm) + interm = time.perf_counter() transformed_output = self._worker.transform_output( request, execute_result, self._device ) + timings.append(time.perf_counter() - interm) + interm = time.perf_counter() if request.output_keys: reply.output_keys = self._worker.place_output( request, transformed_output, self._feature_store @@ -274,6 +329,9 @@ def _on_iteration(self) -> None: logger.exception("Error executing worker") reply.failed = True + timings.append(time.perf_counter() - interm) + interm = time.perf_counter() + if reply.failed: response = build_failure_reply("fail", "failure-occurred") else: @@ -282,10 +340,22 @@ def _on_iteration(self) -> None: response = build_reply(reply) + timings.append(time.perf_counter() - interm) + interm = time.perf_counter() + + # serialized = self._worker.serialize_reply(request, transformed_output) serialized_resp = MessageHandler.serialize_response(response) # type: ignore + + timings.append(time.perf_counter() - interm) + interm = time.perf_counter() if request.callback: request.callback.send(serialized_resp) + timings.append(time.perf_counter() - interm) + interm = time.perf_counter() + + print(" ".join(str(time) for time in timings)) + def _can_shutdown(self) -> bool: """Return true when the 
criteria to shut down the service are met.""" # todo: determine shutdown criteria diff --git a/smartsim/_core/mli/infrastructure/storage/dragonfeaturestore.py b/smartsim/_core/mli/infrastructure/storage/dragonfeaturestore.py index 53f2f461f8..fbd18438f5 100644 --- a/smartsim/_core/mli/infrastructure/storage/dragonfeaturestore.py +++ b/smartsim/_core/mli/infrastructure/storage/dragonfeaturestore.py @@ -44,22 +44,27 @@ def __init__(self, storage: "DragonDict") -> None: """Initialize the DragonFeatureStore instance""" self._storage = storage - def __getitem__(self, key: str) -> t.Any: + def __getitem__(self, key: str) -> t.Union[str, bytes]: """Retrieve an item using key :param key: Unique key of an item to retrieve from the feature store""" try: - return self._storage[key] + value: t.Union[str, bytes] = self._storage[key] + return value + except KeyError as ex: + raise ex except Exception as ex: # note: explicitly avoid round-trip to check for key existence - raise sse.SmartSimError(f"{key} not found in feature store") from ex + raise sse.SmartSimError( + f"Could not get value for existing key {key}, error:\n{ex}" + ) from ex - def __setitem__(self, key: str, value: str) -> None: + def __setitem__(self, key: str, value: t.Union[str, bytes]) -> None: """Assign a value using key :param key: Unique key of an item to set in the feature store :param value: Value to persist in the feature store""" self._storage[key] = value - def __contains__(self, key: t.Union[str]) -> bool: + def __contains__(self, key: str) -> bool: """Membership operator to test for a key existing within the feature store. Return `True` if the key is found, `False` otherwise :param key: Unique key of an item to retrieve from the feature store""" diff --git a/smartsim/_core/mli/infrastructure/storage/featurestore.py b/smartsim/_core/mli/infrastructure/storage/featurestore.py index e18643e932..553e13b10f 100644 --- a/smartsim/_core/mli/infrastructure/storage/featurestore.py +++ b/smartsim/_core/mli/infrastructure/storage/featurestore.py @@ -24,6 +24,7 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +import typing as t from abc import ABC, abstractmethod @@ -32,12 +33,12 @@ class FeatureStore(ABC): values from a feature store implementation""" @abstractmethod - def __getitem__(self, key: str) -> bytes: + def __getitem__(self, key: str) -> t.Union[str, bytes]: """Retrieve an item using key :param key: Unique key of an item to retrieve from the feature store""" @abstractmethod - def __setitem__(self, key: str, value: str) -> None: + def __setitem__(self, key: str, value: t.Union[str, bytes]) -> None: """Assign a value using key :param key: Unique key of an item to set in the feature store :param value: Value to persist in the feature store""" diff --git a/smartsim/_core/mli/infrastructure/worker/worker.py b/smartsim/_core/mli/infrastructure/worker/worker.py index 808c9cf9bf..900a8241de 100644 --- a/smartsim/_core/mli/infrastructure/worker/worker.py +++ b/smartsim/_core/mli/infrastructure/worker/worker.py @@ -24,11 +24,6 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-# isort: off -from dragon.utils import b64decode -# isort: on - -import sys import typing as t from abc import ABC, abstractmethod @@ -38,7 +33,6 @@ from ...infrastructure.storage.featurestore import FeatureStore from ...mli_schemas.model.model_capnp import Model - logger = get_logger(__name__) @@ -174,7 +168,7 @@ def fetch_model( ) try: - raw_bytes = b64decode(feature_store[request.model_key]) + raw_bytes: bytes = t.cast(bytes, feature_store[request.model_key]) return FetchModelResult(raw_bytes) except FileNotFoundError as ex: logger.exception(ex) @@ -202,7 +196,7 @@ def fetch_inputs( data: t.List[bytes] = [] for input_ in request.input_keys: try: - tensor_bytes = feature_store[input_] + tensor_bytes = t.cast(bytes, feature_store[input_]) data.append(tensor_bytes) except KeyError as ex: logger.exception(ex) diff --git a/tests/mli/test_worker_manager.py b/tests/mli/test_worker_manager.py index 46cae5b2e4..62bd711ebb 100644 --- a/tests/mli/test_worker_manager.py +++ b/tests/mli/test_worker_manager.py @@ -32,11 +32,7 @@ import pytest -should_run = True -try: - import torch -except ImportError: - should_run = False +pytest.importorskip("torch") from smartsim._core.mli.infrastructure.control.workermanager import WorkerManager from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStore @@ -51,8 +47,6 @@ # The tests in this file belong to the group_a group pytestmark = pytest.mark.group_a -pytest.mark.skipif(not should_run, "Test needs PyTorch to run") - def mock_work(worker_manager_queue: "mp.Queue[bytes]") -> None: """Mock event producer for triggering the inference pipeline""" From d76f88014cebe7a76175b06178d27ca32195841d Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Wed, 10 Jul 2024 13:10:08 -0500 Subject: [PATCH 27/40] Add redis driver, fix FLI --- ex/high_throughput_inference/mock_app.py | 10 ++- .../mock_app_redis.py | 88 +++++++++++++++++++ ex/high_throughput_inference/redis_driver.py | 65 ++++++++++++++ smartsim/_core/mli/comm/channel/dragonfli.py | 12 ++- .../infrastructure/control/workermanager.py | 2 +- 5 files changed, 170 insertions(+), 7 deletions(-) create mode 100644 ex/high_throughput_inference/mock_app_redis.py create mode 100644 ex/high_throughput_inference/redis_driver.py diff --git a/ex/high_throughput_inference/mock_app.py b/ex/high_throughput_inference/mock_app.py index 4ecce58ac7..45246db2e5 100644 --- a/ex/high_throughput_inference/mock_app.py +++ b/ex/high_throughput_inference/mock_app.py @@ -112,9 +112,14 @@ def run_model(self, model: bytes | str, batch: torch.Tensor): built_tensor = MessageHandler.build_tensor( batch.numpy(), "c", "float32", list(batch.shape)) self.measure_time("build_tensor") + built_model = None + if isinstance(model, str): + model_arg = MessageHandler.build_model_key(model) + else: + model_arg = MessageHandler.build_model(model, "resnet-50", "1.0") request = MessageHandler.build_request( reply_channel=self._from_worker_ch_serialized, - model=MessageHandler.build_model(model, "resnet-50", "1.0"), + model= model_arg, inputs=[built_tensor], outputs=[], output_descriptors=[], @@ -125,6 +130,7 @@ def run_model(self, model: bytes | str, batch: torch.Tensor): self.measure_time("serialize_request") with self._to_worker_fli.sendh(timeout=None, stream_channel=self._to_worker_ch) as to_sendh: to_sendh.send_bytes(request_bytes) + logger.info(f"Message size: {len(request_bytes)} bytes") self.measure_time("send") with self._from_worker_ch.recvh(timeout=None) as from_recvh: @@ -144,7 +150,7 @@ def run_model(self, model: bytes | str, batch: 
torch.Tensor): return result def set_model(self, key: str, model: bytes): - self._ddict[key] = b64encode(model) + self._ddict[key] = model class ResNetWrapper(): diff --git a/ex/high_throughput_inference/mock_app_redis.py b/ex/high_throughput_inference/mock_app_redis.py new file mode 100644 index 0000000000..c56b4fb8b4 --- /dev/null +++ b/ex/high_throughput_inference/mock_app_redis.py @@ -0,0 +1,88 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import argparse +import io +import numpy +import time +import torch +from smartsim.log import get_logger +from smartredis import Client + +logger = get_logger("App") + +class ResNetWrapper(): + def __init__(self, name: str, model: str): + self._model = torch.jit.load(model) + self._name = name + buffer = io.BytesIO() + scripted = torch.jit.trace(self._model, self.get_batch()) + torch.jit.save(scripted, buffer) + self._serialized_model = buffer.getvalue() + + def get_batch(self, batch_size: int=32): + return torch.randn((batch_size, 3, 224, 224), dtype=torch.float32) + + @property + def model(self): + return self._serialized_model + + @property + def name(self): + return self._name + +if __name__ == "__main__": + + parser = argparse.ArgumentParser("Mock application") + parser.add_argument("--device", default="cpu") + args = parser.parse_args() + + resnet = ResNetWrapper("resnet50", f"resnet50.{args.device.upper()}.pt") + + client = Client(cluster=False, address=None) + client.set_model(resnet.name, resnet.model, backend='TORCH', device=args.device.upper()) + + total_iterations = 100 + timings=[] + for batch_size in [1, 2, 4, 8, 16, 32, 64, 128]: + logger.info(f"Batch size: {batch_size}") + for iteration_number in range(total_iterations + int(batch_size==1)): + timing = [batch_size] + logger.info(f"Iteration: {iteration_number}") + start = time.perf_counter() + client.put_tensor(name="batch", data=resnet.get_batch(batch_size).numpy()) + client.run_model(name=resnet.name, inputs=["batch"], outputs=["result"]) + result = client.get_tensor(name="result") + end = time.perf_counter() + timing.append(end-start) + timings.append(timing) + + + + timings_np = numpy.asarray(timings) + numpy.save("timings.npy", timings_np) + for timing in timings: + print(" ".join(str(t) for t in timing)) diff --git a/ex/high_throughput_inference/redis_driver.py b/ex/high_throughput_inference/redis_driver.py new file mode 100644 index 0000000000..ceddba4ef7 --- /dev/null +++ b/ex/high_throughput_inference/redis_driver.py @@ -0,0 +1,65 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import os +import sys +from smartsim import Experiment +from smartsim.status import TERMINAL_STATUSES +import time +import typing as t + +device = "gpu" +filedir = os.path.dirname(__file__) +app_script_name = os.path.join(filedir, "mock_app_redis.py") +model_name = os.path.join(filedir, f"resnet50.{device.upper()}.pt") + + +exp_path = os.path.join(filedir, "redis_ai") +os.makedirs(exp_path, exist_ok=True) +exp = Experiment("redis_ai", launcher="slurm", exp_path=exp_path) + +db = exp.create_database(interface="hsn0") + +app_rs = exp.create_run_settings(sys.executable, exe_args = [app_script_name, "--device", device]) +app_rs.set_nodes(1) +app_rs.set_tasks(1) +app = exp.create_model("app", run_settings=app_rs) +app.attach_generator_files(to_copy=[app_script_name], to_symlink=[model_name]) + +exp.generate(db, app, overwrite=True) + +exp.start(db, app, block=False) + +while True: + if exp.get_status(app)[0] in TERMINAL_STATUSES: + exp.stop(db) + break + if exp.get_status(db)[0] in TERMINAL_STATUSES: + exp.stop(app) + break + time.sleep(5) + +print("Exiting.") \ No newline at end of file diff --git a/smartsim/_core/mli/comm/channel/dragonfli.py b/smartsim/_core/mli/comm/channel/dragonfli.py index 0c1aba94e3..eb3175e445 100644 --- a/smartsim/_core/mli/comm/channel/dragonfli.py +++ b/smartsim/_core/mli/comm/channel/dragonfli.py @@ -31,6 +31,7 @@ # isort: on import sys +import typing as t import smartsim._core.mli.comm.channel.channel as cch from smartsim.log import get_logger @@ -41,22 +42,25 @@ class DragonFLIChannel(cch.CommChannelBase): """Passes messages by writing to a Dragon FLI Channel""" - def __init__(self, fli_desc: str) -> None: + def __init__(self, fli_desc: str, sender_supplied: bool = True) -> None: """Initialize the DragonFLIChannel instance""" super().__init__(fli_desc) # todo: do we need memory pool information to construct the channel correctly? 
- self._channel: "fli" = fli.FLInterface.attach(fli_desc) + self._fli: "fli" = fli.FLInterface.attach(fli_desc) + self._channel: t.Optional["dch"] = ( + dch.Channel.make_process_local() if sender_supplied else None + ) def send(self, value: bytes) -> None: """Send a message through the underlying communication channel :param value: The value to send""" - with self._channel.sendh(timeout=None) as sendh: + with self._fli.sendh(timeout=None, stream_channel=self._channel) as sendh: sendh.send_bytes(value) def recv(self) -> bytes: """Receieve a message through the underlying communication channel :returns: the received message""" - with self._channel.recvh(timeout=None) as recvh: + with self._fli.recvh(timeout=None) as recvh: try: request_bytes: bytes request_bytes, _ = recvh.recv_bytes(timeout=None) diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py index d3cc2d84ae..60e263f337 100644 --- a/smartsim/_core/mli/infrastructure/control/workermanager.py +++ b/smartsim/_core/mli/infrastructure/control/workermanager.py @@ -259,7 +259,7 @@ def _on_iteration(self) -> None: if not request.model_key: raise SmartSimError("Neither key, nor model provided") - if request.model_key in self._cached_models: + if False and (request.model_key in self._cached_models): timings.append(time.perf_counter() - interm) interm = time.perf_counter() model_result = LoadModelResult(self._cached_models[request.model_key]) From 3938ec8dbe9964235e6ed4791600257b08b9f3eb Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Thu, 11 Jul 2024 12:27:34 -0500 Subject: [PATCH 28/40] Update post-merge --- ex/high_throughput_inference/mli_driver.py | 1 - .../standalone_workermanager.py | 11 ++- .../infrastructure/control/workermanager.py | 68 +++++++++---------- .../mli/infrastructure/environmentloader.py | 11 +-- 4 files changed, 49 insertions(+), 42 deletions(-) diff --git a/ex/high_throughput_inference/mli_driver.py b/ex/high_throughput_inference/mli_driver.py index 4e68fdfbcb..6da559aa6f 100644 --- a/ex/high_throughput_inference/mli_driver.py +++ b/ex/high_throughput_inference/mli_driver.py @@ -30,7 +30,6 @@ worker_manager = exp.create_model("worker_manager", run_settings=worker_manager_rs) worker_manager.attach_generator_files(to_copy=[worker_manager_script_name]) - app_rs = exp.create_run_settings(sys.executable, exe_args = [app_script_name, "--device", device]) app = exp.create_model("app", run_settings=app_rs) app.attach_generator_files(to_copy=[app_script_name], to_symlink=[model_name]) diff --git a/ex/high_throughput_inference/standalone_workermanager.py b/ex/high_throughput_inference/standalone_workermanager.py index f3e8e7c589..c56e11a7c3 100644 --- a/ex/high_throughput_inference/standalone_workermanager.py +++ b/ex/high_throughput_inference/standalone_workermanager.py @@ -35,6 +35,7 @@ import argparse import base64 import cloudpickle +import pickle import os from smartsim._core.mli.comm.channel.dragonchannel import DragonCommChannel @@ -42,6 +43,7 @@ from smartsim._core.mli.comm.channel.dragonfli import DragonFLIChannel from smartsim._core.mli.infrastructure.worker.torch_worker import TorchWorker from smartsim._core.mli.infrastructure.control.workermanager import WorkerManager +from smartsim._core.mli.infrastructure.environmentloader import EnvironmentConfigLoader if __name__ == "__main__": @@ -77,10 +79,15 @@ dfs = DragonFeatureStore(ddict) comm_channel = DragonFLIChannel(to_worker_fli_serialized) + + os.environ["SSFeatureStore"] = 
base64.b64encode(pickle.dumps(dfs)).decode("utf-8") + os.environ["SSQueue"] = base64.b64encode(to_worker_fli_serialized).decode("utf-8") + + config_loader = EnvironmentConfigLoader() + worker_manager = WorkerManager( - task_queue=comm_channel, + config_loader=config_loader, worker=torch_worker, - feature_store=dfs, as_service=True, cooldown=10, comm_channel_type=DragonCommChannel, diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py index eaa77bdf3e..8c06351fb5 100644 --- a/smartsim/_core/mli/infrastructure/control/workermanager.py +++ b/smartsim/_core/mli/infrastructure/control/workermanager.py @@ -41,7 +41,7 @@ from .....log import get_logger from ....entrypoints.service import Service from ...comm.channel.channel import CommChannelBase -from ...comm.channel.dragonfli import DragonFLIChannel +from ...comm.channel.dragonchannel import DragonCommChannel from ...infrastructure.environmentloader import EnvironmentConfigLoader from ...infrastructure.storage.featurestore import FeatureStore from ...infrastructure.worker.worker import ( @@ -175,7 +175,7 @@ def __init__( worker: MachineLearningWorkerBase, as_service: bool = False, cooldown: int = 0, - comm_channel_type: t.Type[CommChannelBase] = DragonFLIChannel, + comm_channel_type: t.Type[CommChannelBase] = DragonCommChannel, device: t.Literal["cpu", "gpu"] = "cpu", ) -> None: """Initialize the WorkerManager @@ -244,34 +244,34 @@ def _on_iteration(self) -> None: logger.warning("No queue to check for tasks") return - timings = [] + timings = [] # timing # perform default deserialization of the message envelope request_bytes: bytes = self._task_queue.recv() - interm = time.perf_counter() + interm = time.perf_counter() # timing request = deserialize_message( request_bytes, self._comm_channel_type, self._device ) if not self._validate_request(request): return - timings.append(time.perf_counter() - interm) - interm = time.perf_counter() + timings.append(time.perf_counter() - interm) # timing + interm = time.perf_counter() # timing if not request.raw_model: - if not request.model_key: - raise SmartSimError("Neither key, nor model provided") - - if False and (request.model_key in self._cached_models): - timings.append(time.perf_counter() - interm) - interm = time.perf_counter() + if request.model_key is None: + # A valid request should never get here. 
+ raise ValueError("Could not read model key") + if request.model_key in self._cached_models: + timings.append(time.perf_counter() - interm) # timing + interm = time.perf_counter() # timing model_result = LoadModelResult(self._cached_models[request.model_key]) else: fetch_model_result = None while True: try: - interm = time.perf_counter() + interm = time.perf_counter() # timing fetch_model_result = self._worker.fetch_model( request, self._feature_store ) @@ -282,8 +282,8 @@ def _on_iteration(self) -> None: if fetch_model_result is None: raise SmartSimError("Could not retrieve model from feature store") - timings.append(time.perf_counter() - interm) - interm = time.perf_counter() + timings.append(time.perf_counter() - interm) # timing + interm = time.perf_counter() # timing model_result = self._worker.load_model( request, fetch_model_result, self._device ) @@ -294,18 +294,18 @@ def _on_iteration(self) -> None: request, fetch_result=fetch_model_result, device=self._device ) - timings.append(time.perf_counter() - interm) - interm = time.perf_counter() + timings.append(time.perf_counter() - interm) # timing + interm = time.perf_counter() # timing fetch_input_result = self._worker.fetch_inputs(request, self._feature_store) - timings.append(time.perf_counter() - interm) - interm = time.perf_counter() + timings.append(time.perf_counter() - interm) # timing + interm = time.perf_counter() # timing transformed_input = self._worker.transform_input( request, fetch_input_result, self._device ) - timings.append(time.perf_counter() - interm) - interm = time.perf_counter() + timings.append(time.perf_counter() - interm) # timing + interm = time.perf_counter() # timing reply = InferenceReply() @@ -314,14 +314,14 @@ def _on_iteration(self) -> None: request, model_result, transformed_input ) - timings.append(time.perf_counter() - interm) - interm = time.perf_counter() + timings.append(time.perf_counter() - interm) # timing + interm = time.perf_counter() # timing transformed_output = self._worker.transform_output( request, execute_result, self._device ) - timings.append(time.perf_counter() - interm) - interm = time.perf_counter() + timings.append(time.perf_counter() - interm) # timing + interm = time.perf_counter() # timing if request.output_keys: reply.output_keys = self._worker.place_output( request, transformed_output, self._feature_store @@ -332,8 +332,8 @@ def _on_iteration(self) -> None: logger.exception("Error executing worker") reply.failed = True - timings.append(time.perf_counter() - interm) - interm = time.perf_counter() + timings.append(time.perf_counter() - interm) # timing + interm = time.perf_counter() # timing if reply.failed: response = build_failure_reply("fail", "failure-occurred") @@ -343,21 +343,21 @@ def _on_iteration(self) -> None: response = build_reply(reply) - timings.append(time.perf_counter() - interm) - interm = time.perf_counter() + timings.append(time.perf_counter() - interm) # timing + interm = time.perf_counter() # timing # serialized = self._worker.serialize_reply(request, transformed_output) serialized_resp = MessageHandler.serialize_response(response) # type: ignore - timings.append(time.perf_counter() - interm) - interm = time.perf_counter() + timings.append(time.perf_counter() - interm) # timing + interm = time.perf_counter() # timing if request.callback: request.callback.send(serialized_resp) - timings.append(time.perf_counter() - interm) - interm = time.perf_counter() + timings.append(time.perf_counter() - interm) # timing + interm = time.perf_counter() # timing - 
print(" ".join(str(time) for time in timings)) + print(" ".join(str(time) for time in timings)) # timing def _can_shutdown(self) -> bool: """Return true when the criteria to shut down the service are met.""" diff --git a/smartsim/_core/mli/infrastructure/environmentloader.py b/smartsim/_core/mli/infrastructure/environmentloader.py index 267b668f63..f5e9532103 100644 --- a/smartsim/_core/mli/infrastructure/environmentloader.py +++ b/smartsim/_core/mli/infrastructure/environmentloader.py @@ -32,6 +32,7 @@ from dragon.fli import FLInterface # pylint: disable=all from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStore +from smartsim._core.mli.comm.channel.dragonfli import DragonFLIChannel class EnvironmentConfigLoader: @@ -41,10 +42,10 @@ class EnvironmentConfigLoader: """ def __init__(self) -> None: - self._feature_store_descriptor = os.getenv("SSFeatureStore", None) - self._queue_descriptor = os.getenv("SSQueue", None) + self._feature_store_descriptor: t.Optional[str] = os.getenv("SSFeatureStore", None) + self._queue_descriptor: t.Optional[str] = os.getenv("SSQueue", None) self.feature_store: t.Optional[FeatureStore] = None - self.queue: t.Optional["FLInterface"] = None + self.queue: t.Optional[DragonFLIChannel] = None def get_feature_store(self) -> t.Optional[FeatureStore]: """Loads the Feature Store previously set in SSFeatureStore""" @@ -54,8 +55,8 @@ def get_feature_store(self) -> t.Optional[FeatureStore]: ) return self.feature_store - def get_queue(self) -> t.Optional["FLInterface"]: + def get_queue(self, sender_supplied: bool = True) -> t.Optional[DragonFLIChannel]: """Returns the Queue previously set in SSQueue""" if self._queue_descriptor is not None: - self.queue = FLInterface.attach(base64.b64decode(self._queue_descriptor)) + self.queue = DragonFLIChannel(fli_desc=base64.b64decode(self._queue_descriptor), sender_supplied=sender_supplied) return self.queue From 273a7d952fdcaa89984b654ce4b46c272c1c2bbd Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Thu, 11 Jul 2024 13:15:38 -0500 Subject: [PATCH 29/40] Fix typing --- smartsim/_core/mli/comm/channel/dragonfli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/smartsim/_core/mli/comm/channel/dragonfli.py b/smartsim/_core/mli/comm/channel/dragonfli.py index eb3175e445..75f8fb4bfc 100644 --- a/smartsim/_core/mli/comm/channel/dragonfli.py +++ b/smartsim/_core/mli/comm/channel/dragonfli.py @@ -42,7 +42,7 @@ class DragonFLIChannel(cch.CommChannelBase): """Passes messages by writing to a Dragon FLI Channel""" - def __init__(self, fli_desc: str, sender_supplied: bool = True) -> None: + def __init__(self, fli_desc: bytes, sender_supplied: bool = True) -> None: """Initialize the DragonFLIChannel instance""" super().__init__(fli_desc) # todo: do we need memory pool information to construct the channel correctly? 
From a12d9232914ff9c2cf8def6224a3bb08896b80d9 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Thu, 11 Jul 2024 13:50:35 -0500 Subject: [PATCH 30/40] isort --- .../_core/mli/infrastructure/environmentloader.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/smartsim/_core/mli/infrastructure/environmentloader.py b/smartsim/_core/mli/infrastructure/environmentloader.py index f5e9532103..9f6770623d 100644 --- a/smartsim/_core/mli/infrastructure/environmentloader.py +++ b/smartsim/_core/mli/infrastructure/environmentloader.py @@ -31,8 +31,8 @@ from dragon.fli import FLInterface # pylint: disable=all -from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStore from smartsim._core.mli.comm.channel.dragonfli import DragonFLIChannel +from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStore class EnvironmentConfigLoader: @@ -42,7 +42,9 @@ class EnvironmentConfigLoader: """ def __init__(self) -> None: - self._feature_store_descriptor: t.Optional[str] = os.getenv("SSFeatureStore", None) + self._feature_store_descriptor: t.Optional[str] = os.getenv( + "SSFeatureStore", None + ) self._queue_descriptor: t.Optional[str] = os.getenv("SSQueue", None) self.feature_store: t.Optional[FeatureStore] = None self.queue: t.Optional[DragonFLIChannel] = None @@ -58,5 +60,8 @@ def get_feature_store(self) -> t.Optional[FeatureStore]: def get_queue(self, sender_supplied: bool = True) -> t.Optional[DragonFLIChannel]: """Returns the Queue previously set in SSQueue""" if self._queue_descriptor is not None: - self.queue = DragonFLIChannel(fli_desc=base64.b64decode(self._queue_descriptor), sender_supplied=sender_supplied) + self.queue = DragonFLIChannel( + fli_desc=base64.b64decode(self._queue_descriptor), + sender_supplied=sender_supplied, + ) return self.queue From 38b0de15266288b4a959bbbcb244e131407555ea Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Thu, 11 Jul 2024 14:42:16 -0500 Subject: [PATCH 31/40] Update envloader test --- tests/dragon/test_environment_loader.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tests/dragon/test_environment_loader.py b/tests/dragon/test_environment_loader.py index d339fec885..00db0a9d32 100644 --- a/tests/dragon/test_environment_loader.py +++ b/tests/dragon/test_environment_loader.py @@ -64,10 +64,9 @@ def test_environment_loader_attach_FLI(content, monkeypatch): config = EnvironmentConfigLoader() config_queue = config.get_queue() - new_sender = config_queue.sendh(use_main_as_stream_channel=True) - new_sender.send_bytes(content) + new_sender = config_queue.send(content) - old_recv = queue.recvh(use_main_as_stream_channel=True) + old_recv = queue.recvh() result, _ = old_recv.recv_bytes() assert result == content @@ -81,7 +80,7 @@ def test_environment_loader_serialize_FLI(monkeypatch): config = EnvironmentConfigLoader() config_queue = config.get_queue() - assert config_queue.serialize() == queue.serialize() + assert config_queue._fli.serialize() == queue.serialize() def test_environment_loader_FLI_fails(monkeypatch): From 53eb0457fb0762f62b938065f11b7b830f1fe588 Mon Sep 17 00:00:00 2001 From: Alyssa Cote Date: Thu, 11 Jul 2024 16:24:16 -0500 Subject: [PATCH 32/40] no more data blob --- smartsim/_core/mli/mli_schemas/request/request.capnp | 2 +- smartsim/_core/mli/mli_schemas/tensor/tensor.capnp | 5 ----- 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/smartsim/_core/mli/mli_schemas/request/request.capnp b/smartsim/_core/mli/mli_schemas/request/request.capnp index 
f9508cb54f..6d290fb599 100644 --- a/smartsim/_core/mli/mli_schemas/request/request.capnp +++ b/smartsim/_core/mli/mli_schemas/request/request.capnp @@ -43,7 +43,7 @@ struct Request { } input :union { keys @3 :List(DataRef.TensorKey); - data @4 :List(Tensors.Tensor); + descriptors @4 :List(Tensors.TensorDescriptor); } output @5 :List(DataRef.TensorKey); outputDescriptors @6 :List(Tensors.OutputDescriptor); diff --git a/smartsim/_core/mli/mli_schemas/tensor/tensor.capnp b/smartsim/_core/mli/mli_schemas/tensor/tensor.capnp index aca1ce0836..3d70296209 100644 --- a/smartsim/_core/mli/mli_schemas/tensor/tensor.capnp +++ b/smartsim/_core/mli/mli_schemas/tensor/tensor.capnp @@ -61,11 +61,6 @@ enum ReturnNumericalType { auto @ 11; } -struct Tensor { - blob @0 :Data; - tensorDescriptor @1 :TensorDescriptor; -} - struct TensorDescriptor { dimensions @0 :List(Int32); order @1 :Order; From e64532de392c226c4543863d8a7dfc5b5f5bac0d Mon Sep 17 00:00:00 2001 From: Alyssa Cote Date: Thu, 11 Jul 2024 15:59:15 -0700 Subject: [PATCH 33/40] fixing up worker manager --- .../infrastructure/control/workermanager.py | 24 ++++++----- .../_core/mli/infrastructure/worker/worker.py | 2 +- smartsim/_core/mli/message_handler.py | 42 +++++++++--------- .../mli/mli_schemas/request/request_capnp.pyi | 18 +++++--- .../mli/mli_schemas/response/response.capnp | 2 +- .../mli_schemas/response/response_capnp.pyi | 18 +++++--- .../_core/mli/mli_schemas/tensor/tensor.capnp | 2 +- .../mli/mli_schemas/tensor/tensor_capnp.py | 3 -- .../mli/mli_schemas/tensor/tensor_capnp.pyi | 43 ------------------- tests/mli/test_torch_worker.py | 9 ++-- 10 files changed, 66 insertions(+), 97 deletions(-) diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py index 8c06351fb5..1c571dc2f2 100644 --- a/smartsim/_core/mli/infrastructure/control/workermanager.py +++ b/smartsim/_core/mli/infrastructure/control/workermanager.py @@ -103,9 +103,9 @@ def deserialize_message( if request.input.which() == "keys": input_keys = [input_key.key for input_key in request.input.keys] - elif request.input.which() == "data": - input_bytes = [data.blob for data in request.input.data] - input_meta = [data.tensorDescriptor for data in request.input.data] + elif request.input.which() == "descriptors": + # input_bytes = [data.blob for data in request.input.data] + input_meta = [request.input.descriptors] inference_request = InferenceRequest( model_key=model_key, @@ -137,20 +137,16 @@ def prepare_outputs(reply: InferenceReply) -> t.List[t.Any]: msg_key = MessageHandler.build_tensor_key(key) prepared_outputs.append(msg_key) elif reply.outputs: - arrays: t.List[np.ndarray[t.Any, np.dtype[t.Any]]] = [ - output.numpy() for output in reply.outputs - ] - for tensor in arrays: + for _ in reply.outputs: # todo: need to have the output attributes specified in the req? # maybe, add `MessageHandler.dtype_of(tensor)`? # can `build_tensor` do dtype and shape? 
- msg_tensor = MessageHandler.build_tensor( - tensor, + msg_tensor_desc = MessageHandler.build_tensor_descriptor( "c", "float32", [1], ) - prepared_outputs.append(msg_tensor) + prepared_outputs.append(msg_tensor_desc) return prepared_outputs @@ -252,6 +248,11 @@ def _on_iteration(self) -> None: request = deserialize_message( request_bytes, self._comm_channel_type, self._device ) + + if request.input_meta: + for _ in request.input_meta: + request.raw_inputs.append(self._task_queue.recv()) + if not self._validate_request(request): return @@ -353,6 +354,9 @@ def _on_iteration(self) -> None: interm = time.perf_counter() # timing if request.callback: request.callback.send(serialized_resp) + if reply.outputs: + for output in reply.outputs: + request.callback.send(output) timings.append(time.perf_counter() - interm) # timing interm = time.perf_counter() # timing diff --git a/smartsim/_core/mli/infrastructure/worker/worker.py b/smartsim/_core/mli/infrastructure/worker/worker.py index 900a8241de..f76e05bcc0 100644 --- a/smartsim/_core/mli/infrastructure/worker/worker.py +++ b/smartsim/_core/mli/infrastructure/worker/worker.py @@ -56,7 +56,7 @@ def __init__( self.model_key = model_key self.raw_model = raw_model self.callback = callback - self.raw_inputs = raw_inputs + self.raw_inputs = raw_inputs or [] self.input_keys = input_keys or [] self.input_meta = input_meta or [] self.output_keys = output_keys or [] diff --git a/smartsim/_core/mli/message_handler.py b/smartsim/_core/mli/message_handler.py index bcf1cfdf14..d5e2549bae 100644 --- a/smartsim/_core/mli/message_handler.py +++ b/smartsim/_core/mli/message_handler.py @@ -25,8 +25,6 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import typing as t -import numpy as np - from .mli_schemas.data import data_references_capnp from .mli_schemas.model import model_capnp from .mli_schemas.request import request_capnp @@ -38,17 +36,15 @@ class MessageHandler: @staticmethod - def build_tensor( - tensor: np.ndarray[t.Any, np.dtype[t.Any]], + def build_tensor_descriptor( order: "tensor_capnp.Order", data_type: "tensor_capnp.NumericalType", dimensions: t.List[int], - ) -> tensor_capnp.Tensor: + ) -> tensor_capnp.TensorDescriptor: """ - Builds a Tensor message using the provided data, + Builds a TensorDescriptor message using the provided order, data type, and dimensions. - :param tensor: Tensor to build the message around :param order: Order of the tensor, such as row-major (c) or column-major (f) :param data_type: Data type of the tensor :param dimensions: Dimensions of the tensor @@ -59,15 +55,12 @@ def build_tensor( description.order = order description.dataType = data_type description.dimensions = dimensions - built_tensor = tensor_capnp.Tensor.new_message() - built_tensor.blob = tensor.tobytes() # tensor channel instead? - built_tensor.tensorDescriptor = description except Exception as e: raise ValueError( - "Error building tensor." + "Error building tensor descriptor." 
) from e # TODO: create custom exception - return built_tensor + return description @staticmethod def build_output_tensor_descriptor( @@ -248,7 +241,8 @@ def _assign_reply_channel( def _assign_inputs( request: request_capnp.Request, inputs: t.Union[ - t.List[data_references_capnp.TensorKey], t.List[tensor_capnp.Tensor] + t.List[data_references_capnp.TensorKey], + t.List[tensor_capnp.TensorDescriptor], ], ) -> None: """ @@ -262,13 +256,14 @@ def _assign_inputs( if inputs: display_name = inputs[0].schema.node.displayName # type: ignore input_class_name = display_name.split(":")[-1] - if input_class_name == "Tensor": - request.input.data = inputs # type: ignore + if input_class_name == "TensorDescriptor": + request.input.descriptors = inputs # type: ignore elif input_class_name == "TensorKey": request.input.keys = inputs # type: ignore else: raise ValueError( - "Invalid input class name. Expected 'Tensor' or 'TensorKey'." + """Invalid input class name. Expected + 'TensorDescriptor' or 'TensorKey'.""" ) except Exception as e: raise ValueError("Error building inputs portion of request.") from e @@ -351,7 +346,8 @@ def build_request( reply_channel: bytes, model: t.Union[data_references_capnp.ModelKey, model_capnp.Model], inputs: t.Union[ - t.List[data_references_capnp.TensorKey], t.List[tensor_capnp.Tensor] + t.List[data_references_capnp.TensorKey], + t.List[tensor_capnp.TensorDescriptor], ], outputs: t.List[data_references_capnp.TensorKey], output_descriptors: t.List[tensor_capnp.OutputDescriptor], @@ -437,7 +433,8 @@ def _assign_message(response: response_capnp.Response, message: str) -> None: def _assign_result( response: response_capnp.Response, result: t.Union[ - t.List[tensor_capnp.Tensor], t.List[data_references_capnp.TensorKey] + t.List[tensor_capnp.TensorDescriptor], + t.List[data_references_capnp.TensorKey], ], ) -> None: """ @@ -452,13 +449,13 @@ def _assign_result( first_result = result[0] display_name = first_result.schema.node.displayName # type: ignore result_class_name = display_name.split(":")[-1] - if result_class_name == "Tensor": - response.result.data = result # type: ignore + if result_class_name == "TensorDescriptor": + response.result.descriptors = result # type: ignore elif result_class_name == "TensorKey": response.result.keys = result # type: ignore else: raise ValueError("""Invalid custom attribute class name. 
- Expected 'Tensor' or 'TensorKey'.""") + Expected 'TensorDescriptor' or 'TensorKey'.""") except Exception as e: raise ValueError("Error assigning result to response.") from e @@ -501,7 +498,8 @@ def build_response( status: "response_capnp.StatusEnum", message: str, result: t.Union[ - t.List[tensor_capnp.Tensor], t.List[data_references_capnp.TensorKey] + t.List[tensor_capnp.TensorDescriptor], + t.List[data_references_capnp.TensorKey], ], custom_attributes: t.Union[ response_attributes_capnp.TorchResponseAttributes, diff --git a/smartsim/_core/mli/mli_schemas/request/request_capnp.pyi b/smartsim/_core/mli/mli_schemas/request/request_capnp.pyi index 39093f61ad..54dcdcfecc 100644 --- a/smartsim/_core/mli/mli_schemas/request/request_capnp.pyi +++ b/smartsim/_core/mli/mli_schemas/request/request_capnp.pyi @@ -47,9 +47,9 @@ from ..tensor.tensor_capnp import ( OutputDescriptor, OutputDescriptorBuilder, OutputDescriptorReader, - Tensor, - TensorBuilder, - TensorReader, + TensorDescriptor, + TensorDescriptorBuilder, + TensorDescriptorReader, ) from .request_attributes.request_attributes_capnp import ( TensorFlowRequestAttributes, @@ -143,8 +143,10 @@ class Request: class Input: keys: Sequence[TensorKey | TensorKeyBuilder | TensorKeyReader] - data: Sequence[Tensor | TensorBuilder | TensorReader] - def which(self) -> Literal["keys", "data"]: ... + descriptors: Sequence[ + TensorDescriptor | TensorDescriptorBuilder | TensorDescriptorReader + ] + def which(self) -> Literal["keys", "descriptors"]: ... @staticmethod @contextmanager def from_bytes( @@ -164,12 +166,14 @@ class Request: class InputReader(Request.Input): keys: Sequence[TensorKeyReader] - data: Sequence[TensorReader] + descriptors: Sequence[TensorDescriptorReader] def as_builder(self) -> Request.InputBuilder: ... class InputBuilder(Request.Input): keys: Sequence[TensorKey | TensorKeyBuilder | TensorKeyReader] - data: Sequence[Tensor | TensorBuilder | TensorReader] + descriptors: Sequence[ + TensorDescriptor | TensorDescriptorBuilder | TensorDescriptorReader + ] @staticmethod def from_dict(dictionary: dict) -> Request.InputBuilder: ... def copy(self) -> Request.InputBuilder: ... 
diff --git a/smartsim/_core/mli/mli_schemas/response/response.capnp b/smartsim/_core/mli/mli_schemas/response/response.capnp index 67375b5a97..01b1f67e86 100644 --- a/smartsim/_core/mli/mli_schemas/response/response.capnp +++ b/smartsim/_core/mli/mli_schemas/response/response.capnp @@ -41,7 +41,7 @@ struct Response { message @1 :Text; result :union { keys @2 :List(DataRef.TensorKey); - data @3 :List(Tensors.Tensor); + descriptors @3 :List(Tensors.TensorDescriptor); } customAttributes :union { torch @4 :ResponseAttributes.TorchResponseAttributes; diff --git a/smartsim/_core/mli/mli_schemas/response/response_capnp.pyi b/smartsim/_core/mli/mli_schemas/response/response_capnp.pyi index f6d7f8444e..6253422af2 100644 --- a/smartsim/_core/mli/mli_schemas/response/response_capnp.pyi +++ b/smartsim/_core/mli/mli_schemas/response/response_capnp.pyi @@ -35,7 +35,11 @@ from io import BufferedWriter from typing import Iterator, Literal, Sequence, overload from ..data.data_references_capnp import TensorKey, TensorKeyBuilder, TensorKeyReader -from ..tensor.tensor_capnp import Tensor, TensorBuilder, TensorReader +from ..tensor.tensor_capnp import ( + TensorDescriptor, + TensorDescriptorBuilder, + TensorDescriptorReader, +) from .response_attributes.response_attributes_capnp import ( TensorFlowResponseAttributes, TensorFlowResponseAttributesBuilder, @@ -50,8 +54,10 @@ StatusEnum = Literal["complete", "fail", "timeout"] class Response: class Result: keys: Sequence[TensorKey | TensorKeyBuilder | TensorKeyReader] - data: Sequence[Tensor | TensorBuilder | TensorReader] - def which(self) -> Literal["keys", "data"]: ... + descriptors: Sequence[ + TensorDescriptor | TensorDescriptorBuilder | TensorDescriptorReader + ] + def which(self) -> Literal["keys", "descriptors"]: ... @staticmethod @contextmanager def from_bytes( @@ -71,12 +77,14 @@ class Response: class ResultReader(Response.Result): keys: Sequence[TensorKeyReader] - data: Sequence[TensorReader] + descriptors: Sequence[TensorDescriptorReader] def as_builder(self) -> Response.ResultBuilder: ... class ResultBuilder(Response.Result): keys: Sequence[TensorKey | TensorKeyBuilder | TensorKeyReader] - data: Sequence[Tensor | TensorBuilder | TensorReader] + descriptors: Sequence[ + TensorDescriptor | TensorDescriptorBuilder | TensorDescriptorReader + ] @staticmethod def from_dict(dictionary: dict) -> Response.ResultBuilder: ... def copy(self) -> Response.ResultBuilder: ... 
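On the reply side, `Response.result` now holds `TensorDescriptor`s as well, so a client reads the serialized response first and then pulls one raw-bytes message per descriptor from the reply channel. The sketch below is an assumption-laden illustration, not the library's API surface: `from_recvh` is a placeholder for an open receive handle whose `recv_bytes` returns plain bytes (as in the mock app's reply channel); an FLI receive handle instead returns a `(bytes, hint)` tuple, so the unpacking would need to change for that case.

```python
import numpy as np

from smartsim._core.mli.message_handler import MessageHandler


def read_inference_reply(from_recvh):
    """Sketch: rebuild result tensors from a descriptor-based response.

    Assumes `from_recvh.recv_bytes(timeout=None)` returns bytes directly;
    adjust the unpacking if the handle returns a (bytes, hint) tuple.
    """
    resp_bytes = from_recvh.recv_bytes(timeout=None)
    response = MessageHandler.deserialize_response(resp_bytes)

    tensors = []
    if response.result.which() == "descriptors":
        # the worker is expected to send one raw-bytes message per descriptor,
        # in the same order as response.result.descriptors
        for desc in response.result.descriptors:
            blob = from_recvh.recv_bytes(timeout=None)
            # dtype comes from the descriptor; reshaping would use
            # desc.dimensions once the worker reports real output shapes
            tensors.append(np.frombuffer(blob, dtype=str(desc.dataType)))
    return response, tensors
```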
diff --git a/smartsim/_core/mli/mli_schemas/tensor/tensor.capnp b/smartsim/_core/mli/mli_schemas/tensor/tensor.capnp index 3d70296209..4b2218b166 100644 --- a/smartsim/_core/mli/mli_schemas/tensor/tensor.capnp +++ b/smartsim/_core/mli/mli_schemas/tensor/tensor.capnp @@ -58,7 +58,7 @@ enum ReturnNumericalType { float32 @8; float64 @9; none @10; - auto @ 11; + auto @11; } struct TensorDescriptor { diff --git a/smartsim/_core/mli/mli_schemas/tensor/tensor_capnp.py b/smartsim/_core/mli/mli_schemas/tensor/tensor_capnp.py index aa7f1e7b18..8c9d6c9029 100644 --- a/smartsim/_core/mli/mli_schemas/tensor/tensor_capnp.py +++ b/smartsim/_core/mli/mli_schemas/tensor/tensor_capnp.py @@ -33,9 +33,6 @@ capnp.remove_import_hook() here = os.path.dirname(os.path.abspath(__file__)) module_file = os.path.abspath(os.path.join(here, "tensor.capnp")) -Tensor = capnp.load(module_file).Tensor -TensorBuilder = Tensor -TensorReader = Tensor TensorDescriptor = capnp.load(module_file).TensorDescriptor TensorDescriptorBuilder = TensorDescriptor TensorDescriptorReader = TensorDescriptor diff --git a/smartsim/_core/mli/mli_schemas/tensor/tensor_capnp.pyi b/smartsim/_core/mli/mli_schemas/tensor/tensor_capnp.pyi index 7e7222ef54..b55f26b452 100644 --- a/smartsim/_core/mli/mli_schemas/tensor/tensor_capnp.pyi +++ b/smartsim/_core/mli/mli_schemas/tensor/tensor_capnp.pyi @@ -101,49 +101,6 @@ class TensorDescriptorBuilder(TensorDescriptor): @staticmethod def write_packed(file: BufferedWriter) -> None: ... -class Tensor: - blob: bytes - tensorDescriptor: ( - TensorDescriptor | TensorDescriptorBuilder | TensorDescriptorReader - ) - def init(self, name: Literal["tensorDescriptor"]) -> TensorDescriptor: ... - @staticmethod - @contextmanager - def from_bytes( - data: bytes, - traversal_limit_in_words: int | None = ..., - nesting_limit: int | None = ..., - ) -> Iterator[TensorReader]: ... - @staticmethod - def from_bytes_packed( - data: bytes, - traversal_limit_in_words: int | None = ..., - nesting_limit: int | None = ..., - ) -> TensorReader: ... - @staticmethod - def new_message() -> TensorBuilder: ... - def to_dict(self) -> dict: ... - -class TensorReader(Tensor): - tensorDescriptor: TensorDescriptorReader - def as_builder(self) -> TensorBuilder: ... - -class TensorBuilder(Tensor): - tensorDescriptor: ( - TensorDescriptor | TensorDescriptorBuilder | TensorDescriptorReader - ) - @staticmethod - def from_dict(dictionary: dict) -> TensorBuilder: ... - def copy(self) -> TensorBuilder: ... - def to_bytes(self) -> bytes: ... - def to_bytes_packed(self) -> bytes: ... - def to_segments(self) -> list[bytes]: ... - def as_reader(self) -> TensorReader: ... - @staticmethod - def write(file: BufferedWriter) -> None: ... - @staticmethod - def write_packed(file: BufferedWriter) -> None: ... 
- class OutputDescriptor: order: Order optionalKeys: Sequence[TensorKey | TensorKeyBuilder | TensorKeyReader] diff --git a/tests/mli/test_torch_worker.py b/tests/mli/test_torch_worker.py index 0b1cd4ccf3..f159c15a0e 100644 --- a/tests/mli/test_torch_worker.py +++ b/tests/mli/test_torch_worker.py @@ -95,17 +95,18 @@ def create_torch_model(): def get_request() -> InferenceRequest: tensors = [get_batch() for _ in range(2)] - serialized_tensors = [ - MessageHandler.build_tensor(tensor.numpy(), "c", "float32", list(tensor.shape)) + tensor_numpy = [tensor.numpy() for tensor in tensors] + serialized_tensors_descriptors = [ + MessageHandler.build_tensor_descriptor("c", "float32", list(tensor.shape)) for tensor in tensors ] return InferenceRequest( model_key="model", callback=None, - raw_inputs=[s_tensor.blob for s_tensor in serialized_tensors], + raw_inputs=tensor_numpy, input_keys=None, - input_meta=[s_tensor.tensorDescriptor for s_tensor in serialized_tensors], + input_meta=serialized_tensors_descriptors, output_keys=None, raw_model=create_torch_model(), batch_size=0, From 52f5e74ea0bf80f4375e23b527034e6a5a453452 Mon Sep 17 00:00:00 2001 From: Alyssa Cote Date: Thu, 11 Jul 2024 19:06:21 -0500 Subject: [PATCH 34/40] fixed tests, maybe fixed mock app? --- ex/high_throughput_inference/mock_app.py | 25 +- smartsim/_core/mli/message_handler.py | 6 +- .../test_message_handler/test_build_tensor.py | 185 ------- .../test_build_tensor_desc.py | 90 ++++ tests/test_message_handler/test_request.py | 491 ++---------------- tests/test_message_handler/test_response.py | 248 ++------- 6 files changed, 209 insertions(+), 836 deletions(-) delete mode 100644 tests/test_message_handler/test_build_tensor.py create mode 100644 tests/test_message_handler/test_build_tensor_desc.py diff --git a/ex/high_throughput_inference/mock_app.py b/ex/high_throughput_inference/mock_app.py index 45246db2e5..9cd59d2206 100644 --- a/ex/high_throughput_inference/mock_app.py +++ b/ex/high_throughput_inference/mock_app.py @@ -108,10 +108,13 @@ def print_timings(self, to_file: bool = False): def run_model(self, model: bytes | str, batch: torch.Tensor): + tensors = [batch.numpy()] self.start_timings(batch.shape[0]) - built_tensor = MessageHandler.build_tensor( - batch.numpy(), "c", "float32", list(batch.shape)) - self.measure_time("build_tensor") + # built_tensor = MessageHandler.build_tensor( + # batch.numpy(), "c", "float32", list(batch.shape)) + built_tensor_desc = MessageHandler.build_tensor_descriptor( + "c", "float32", list(batch.shape)) + self.measure_time("build_tensor_descriptor") built_model = None if isinstance(model, str): model_arg = MessageHandler.build_model_key(model) @@ -120,7 +123,7 @@ def run_model(self, model: bytes | str, batch: torch.Tensor): request = MessageHandler.build_request( reply_channel=self._from_worker_ch_serialized, model= model_arg, - inputs=[built_tensor], + inputs=[built_tensor_desc], outputs=[], output_descriptors=[], custom_attributes=None, @@ -130,6 +133,8 @@ def run_model(self, model: bytes | str, batch: torch.Tensor): self.measure_time("serialize_request") with self._to_worker_fli.sendh(timeout=None, stream_channel=self._to_worker_ch) as to_sendh: to_sendh.send_bytes(request_bytes) + for t in tensors: + to_sendh.send_bytes(t.tobytes()) # NOT FAST ENOUGH!!! 
logger.info(f"Message size: {len(request_bytes)} bytes") self.measure_time("send") @@ -138,12 +143,20 @@ def run_model(self, model: bytes | str, batch: torch.Tensor): self.measure_time("receive") response = MessageHandler.deserialize_response(resp) self.measure_time("deserialize_response") + # list of data blobs? recv depending on the len(esponse.result.descriptors)? + data_blob = from_recvh.recv_bytes(timeout=None) result = torch.from_numpy( numpy.frombuffer( - response.result.data[0].blob, - dtype=str(response.result.data[0].tensorDescriptor.dataType), + data_blob, + dtype=str(response.result.descriptors[0].dataType), ) ) + # result = torch.from_numpy( + # numpy.frombuffer( + # response.result.data[0].blob, + # dtype=str(response.result.data[0].tensorDescriptor.dataType), + # ) + # ) self.measure_time("deserialize_tensor") self.end_timings() diff --git a/smartsim/_core/mli/message_handler.py b/smartsim/_core/mli/message_handler.py index d5e2549bae..5599af5d2e 100644 --- a/smartsim/_core/mli/message_handler.py +++ b/smartsim/_core/mli/message_handler.py @@ -261,10 +261,8 @@ def _assign_inputs( elif input_class_name == "TensorKey": request.input.keys = inputs # type: ignore else: - raise ValueError( - """Invalid input class name. Expected - 'TensorDescriptor' or 'TensorKey'.""" - ) + raise ValueError("""Invalid input class name. Expected + 'TensorDescriptor' or 'TensorKey'.""") except Exception as e: raise ValueError("Error building inputs portion of request.") from e diff --git a/tests/test_message_handler/test_build_tensor.py b/tests/test_message_handler/test_build_tensor.py deleted file mode 100644 index aa7bd4e6e2..0000000000 --- a/tests/test_message_handler/test_build_tensor.py +++ /dev/null @@ -1,185 +0,0 @@ -# BSD 2-Clause License -# -# Copyright (c) 2021-2024, Hewlett Packard Enterprise -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -import pytest - -try: - import tensorflow as tf -except ImportError: - should_run_tf = False -else: - should_run_tf = True - - small_tf_tensor = tf.zeros((3, 2, 5), dtype=tf.int8) - small_tf_tensor = small_tf_tensor.numpy() - medium_tf_tensor = tf.ones((1040, 1040, 3), dtype=tf.int64) - medium_tf_tensor = medium_tf_tensor.numpy() - - -try: - import torch -except ImportError: - should_run_torch = False -else: - should_run_torch = True - - small_torch_tensor = torch.zeros((3, 2, 5), dtype=torch.int8) - small_torch_tensor = small_torch_tensor.numpy() - medium_torch_tensor = torch.ones((1040, 1040, 3), dtype=torch.int64) - medium_torch_tensor = medium_torch_tensor.numpy() - -from smartsim._core.mli.message_handler import MessageHandler - -# The tests in this file belong to the group_a group -pytestmark = pytest.mark.group_a - -handler = MessageHandler() - - -@pytest.mark.skipif(not should_run_torch, reason="Test needs Torch to run") -@pytest.mark.parametrize( - "tensor, dtype, order, dimension", - [ - pytest.param( - small_torch_tensor, - "int8", - "c", - [3, 2, 5], - id="small torch tensor", - ), - pytest.param( - medium_torch_tensor, - "int64", - "c", - [1040, 1040, 3], - id="medium torch tensor", - ), - ], -) -def test_build_torch_tensor_successful(tensor, dtype, order, dimension): - built_tensor = handler.build_tensor(tensor, order, dtype, dimension) - assert built_tensor is not None - assert type(built_tensor.blob) == bytes - assert built_tensor.tensorDescriptor.order == order - assert built_tensor.tensorDescriptor.dataType == dtype - for i, j in zip(built_tensor.tensorDescriptor.dimensions, dimension): - assert i == j - - -@pytest.mark.skipif(not should_run_tf, reason="Test needs TF to run") -@pytest.mark.parametrize( - "tensor, dtype, order, dimension", - [ - pytest.param( - small_tf_tensor, - "int8", - "c", - [3, 2, 5], - id="small tf tensor", - ), - pytest.param( - medium_tf_tensor, - "int64", - "c", - [1040, 1040, 3], - id="medium tf tensor", - ), - ], -) -def test_build_tf_tensor_successful(tensor, dtype, order, dimension): - built_tensor = handler.build_tensor(tensor, order, dtype, dimension) - assert built_tensor is not None - assert type(built_tensor.blob) == bytes - assert built_tensor.tensorDescriptor.order == order - assert built_tensor.tensorDescriptor.dataType == dtype - for i, j in zip(built_tensor.tensorDescriptor.dimensions, dimension): - assert i == j - - -@pytest.mark.skipif(not should_run_torch, reason="Test needs Torch to run") -@pytest.mark.parametrize( - "tensor, dtype, order, dimension", - [ - pytest.param([1, 2, 4], "c", "int8", [1, 2, 3], id="bad tensor type"), - pytest.param( - small_torch_tensor, - "bad_order", - "int8", - [3, 2, 5], - id="bad order type", - ), - pytest.param( - small_torch_tensor, - "f", - "bad_num_type", - [3, 2, 5], - id="bad numerical type", - ), - pytest.param( - small_torch_tensor, - "f", - "int8", - "bad shape type", - id="bad shape type", - ), - ], -) -def test_build_torch_tensor_bad_input(tensor, dtype, order, dimension): - with pytest.raises(ValueError): - built_tensor = handler.build_tensor(tensor, order, dtype, dimension) - - -@pytest.mark.skipif(not should_run_tf, reason="Test needs TF to run") -@pytest.mark.parametrize( - "tensor, dtype, order, dimension", - [ - pytest.param([1, 2, 4], "c", "int8", [1, 2, 3], id="bad tensor type"), - pytest.param( - small_tf_tensor, - "bad_order", - "int8", - [3, 2, 5], - id="bad order type", - ), - pytest.param( - small_tf_tensor, - "f", - "bad_num_type", - [3, 2, 5], - id="bad numerical 
type", - ), - pytest.param( - small_tf_tensor, - "f", - "int8", - "bad shape type", - id="bad shape type", - ), - ], -) -def test_build_tf_tensor_bad_input(tensor, dtype, order, dimension): - with pytest.raises(ValueError): - built_tensor = handler.build_tensor(tensor, order, dtype, dimension) diff --git a/tests/test_message_handler/test_build_tensor_desc.py b/tests/test_message_handler/test_build_tensor_desc.py new file mode 100644 index 0000000000..45126fb16c --- /dev/null +++ b/tests/test_message_handler/test_build_tensor_desc.py @@ -0,0 +1,90 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import pytest + +from smartsim._core.mli.message_handler import MessageHandler + +# The tests in this file belong to the group_a group +pytestmark = pytest.mark.group_a + +handler = MessageHandler() + + +@pytest.mark.parametrize( + "dtype, order, dimension", + [ + pytest.param( + "int8", + "c", + [3, 2, 5], + id="small torch tensor", + ), + pytest.param( + "int64", + "c", + [1040, 1040, 3], + id="medium torch tensor", + ), + ], +) +def test_build_tensor_descriptor_successful(dtype, order, dimension): + built_tensor_descriptor = handler.build_tensor_descriptor(order, dtype, dimension) + assert built_tensor_descriptor is not None + assert built_tensor_descriptor.order == order + assert built_tensor_descriptor.dataType == dtype + for i, j in zip(built_tensor_descriptor.dimensions, dimension): + assert i == j + + +@pytest.mark.parametrize( + "dtype, order, dimension", + [ + pytest.param( + "bad_order", + "int8", + [3, 2, 5], + id="bad order type", + ), + pytest.param( + "f", + "bad_num_type", + [3, 2, 5], + id="bad numerical type", + ), + pytest.param( + "f", + "int8", + "bad shape type", + id="bad shape type", + ), + ], +) +def test_build_tensor_descriptor_unsuccessful(dtype, order, dimension): + with pytest.raises(ValueError): + built_tensor_descriptor = handler.build_tensor_descriptor( + order, dtype, dimension + ) diff --git a/tests/test_message_handler/test_request.py b/tests/test_message_handler/test_request.py index b1fedaa024..5a8a091d90 100644 --- a/tests/test_message_handler/test_request.py +++ b/tests/test_message_handler/test_request.py @@ -28,46 +28,6 @@ from smartsim._core.mli.message_handler import MessageHandler -try: - import tensorflow as tf -except ImportError: - should_run_tf = False -else: - should_run_tf = True - tflow1 = tf.zeros((3, 2, 5), dtype=tf.int8) - tflow2 = tf.ones((10, 10, 3), dtype=tf.int64) - - tensor_3 = MessageHandler.build_tensor( - tflow1.numpy(), "c", "int8", list(tflow1.shape) - ) - tensor_4 = MessageHandler.build_tensor( - tflow2.numpy(), "c", "int64", list(tflow2.shape) - ) - - tf_attributes = MessageHandler.build_tf_request_attributes( - name="tf", tensor_type="sparse" - ) - - -try: - import torch -except ImportError: - should_run_torch = False -else: - should_run_torch = True - - torch1 = torch.zeros((3, 2, 5), dtype=torch.int8) - torch2 = torch.ones((10, 10, 3), dtype=torch.int64) - - tensor_1 = MessageHandler.build_tensor( - torch1.numpy(), "c", "int8", list(torch1.shape) - ) - tensor_2 = MessageHandler.build_tensor( - torch2.numpy(), "c", "int64", list(torch2.shape) - ) - - torch_attributes = MessageHandler.build_torch_request_attributes("sparse") - # The tests in this file belong to the group_a group pytestmark = pytest.mark.group_a @@ -87,123 +47,54 @@ output_descriptor3 = MessageHandler.build_output_tensor_descriptor( "c", [output_key1], "none", [1, 2, 3] ) +torch_attributes = MessageHandler.build_torch_request_attributes("sparse") +tf_attributes = MessageHandler.build_tf_request_attributes( + name="tf", tensor_type="sparse" +) +tensor_1 = MessageHandler.build_tensor_descriptor("c", "int8", [1]) +tensor_2 = MessageHandler.build_tensor_descriptor("c", "int64", [3, 2]) +tensor_3 = MessageHandler.build_tensor_descriptor("f", "int8", [1]) +tensor_4 = MessageHandler.build_tensor_descriptor("f", "int64", [3, 2]) -if should_run_tf: - tf_indirect_request = MessageHandler.build_request( - b"reply", - model, - [input_key1, input_key2], - [output_key1, output_key2], - [output_descriptor1, output_descriptor2, output_descriptor3], - tf_attributes, - ) - 
tf_direct_request = MessageHandler.build_request( - b"reply", - model, - [tensor_3, tensor_4], - [], - [output_descriptor1, output_descriptor2], - tf_attributes, - ) +tf_indirect_request = MessageHandler.build_request( + b"reply", + model, + [input_key1, input_key2], + [output_key1, output_key2], + [output_descriptor1, output_descriptor2, output_descriptor3], + tf_attributes, +) -if should_run_torch: - torch_indirect_request = MessageHandler.build_request( - b"reply", - model, - [input_key1, input_key2], - [output_key1, output_key2], - [output_descriptor1, output_descriptor2, output_descriptor3], - torch_attributes, - ) - torch_direct_request = MessageHandler.build_request( - b"reply", - model, - [tensor_1, tensor_2], - [], - [output_descriptor1, output_descriptor2], - torch_attributes, - ) +tf_direct_request = MessageHandler.build_request( + b"reply", + model, + [tensor_3, tensor_4], + [], + [output_descriptor1, output_descriptor2], + tf_attributes, +) +torch_indirect_request = MessageHandler.build_request( + b"reply", + model, + [input_key1, input_key2], + [output_key1, output_key2], + [output_descriptor1, output_descriptor2, output_descriptor3], + torch_attributes, +) -@pytest.mark.skipif(not should_run_tf, reason="Test needs TF to run") -@pytest.mark.parametrize( - "reply_channel, model, input, output, output_descriptors, custom_attributes", - [ - pytest.param( - b"reply channel", - model_key, - [input_key1, input_key2], - [output_key1, output_key2], - [output_descriptor1], - tf_attributes, - ), - pytest.param( - b"another reply channel", - model, - [input_key1], - [output_key2], - [output_descriptor1], - tf_attributes, - ), - pytest.param( - b"another reply channel", - model, - [input_key1], - [output_key2], - [output_descriptor1], - tf_attributes, - ), - pytest.param( - b"reply channel", - model_key, - [input_key1], - [output_key1], - [output_descriptor1], - None, - ), - ], +torch_direct_request = MessageHandler.build_request( + b"reply", + model, + [tensor_1, tensor_2], + [], + [output_descriptor1, output_descriptor2], + torch_attributes, ) -def test_build_request_indirect_tf_successful( - reply_channel, model, input, output, output_descriptors, custom_attributes -): - built_request = MessageHandler.build_request( - reply_channel, - model, - input, - output, - output_descriptors, - custom_attributes, - ) - assert built_request is not None - assert built_request.replyChannel.reply == reply_channel - if built_request.model.which() == "key": - assert built_request.model.key.key == model.key - else: - assert built_request.model.data.data == model.data - assert built_request.model.data.name == model.name - assert built_request.model.data.version == model.version - assert built_request.input.which() == "keys" - assert built_request.input.keys[0].key == input[0].key - assert len(built_request.input.keys) == len(input) - assert len(built_request.output) == len(output) - for i, j in zip(built_request.outputDescriptors, output_descriptors): - assert i.order == j.order - if built_request.customAttributes.which() == "tf": - assert ( - built_request.customAttributes.tf.tensorType == custom_attributes.tensorType - ) - elif built_request.customAttributes.which() == "torch": - assert ( - built_request.customAttributes.torch.tensorType - == custom_attributes.tensorType - ) - else: - assert built_request.customAttributes.none == custom_attributes -@pytest.mark.skipif(not should_run_torch, reason="Test needs Torch to run") @pytest.mark.parametrize( "reply_channel, model, input, output, 
output_descriptors, custom_attributes", [ @@ -221,7 +112,7 @@ def test_build_request_indirect_tf_successful( [input_key1], [output_key2], [output_descriptor1], - torch_attributes, + tf_attributes, ), pytest.param( b"another reply channel", @@ -241,7 +132,7 @@ def test_build_request_indirect_tf_successful( ), ], ) -def test_build_request_indirect_torch_successful( +def test_build_request_indirect_successful( reply_channel, model, input, output, output_descriptors, custom_attributes ): built_request = MessageHandler.build_request( @@ -279,108 +170,6 @@ def test_build_request_indirect_torch_successful( assert built_request.customAttributes.none == custom_attributes -@pytest.mark.skipif(not should_run_torch, reason="Test needs Torch to run") -@pytest.mark.parametrize( - "reply_channel, model, input, output, output_descriptors, custom_attributes", - [ - pytest.param( - [], - model_key, - [input_key1, input_key2], - [output_key1, output_key2], - [output_descriptor1], - torch_attributes, - id="bad channel", - ), - pytest.param( - b"reply channel", - "bad model", - [input_key1], - [output_key2], - [output_descriptor1], - torch_attributes, - id="bad model", - ), - pytest.param( - b"reply channel", - model_key, - ["input_key1", "input_key2"], - [output_key1, output_key2], - [output_descriptor1], - torch_attributes, - id="bad inputs", - ), - pytest.param( - b"reply channel", - model_key, - [model_key], - [output_key1, output_key2], - [output_descriptor1], - torch_attributes, - id="bad input schema type", - ), - pytest.param( - b"reply channel", - model_key, - [input_key1], - ["output_key1", "output_key2"], - [output_descriptor1], - torch_attributes, - id="bad outputs", - ), - pytest.param( - b"reply channel", - model_key, - [input_key1], - [model_key], - [output_descriptor1], - torch_attributes, - id="bad output schema type", - ), - pytest.param( - b"reply channel", - model_key, - [input_key1], - [output_key1, output_key2], - [output_descriptor1], - "bad attributes", - id="bad custom attributes", - ), - pytest.param( - b"reply channel", - model_key, - [input_key1], - [output_key1, output_key2], - [output_descriptor1], - model_key, - id="bad custom attributes schema type", - ), - pytest.param( - b"reply channel", - model_key, - [input_key1], - [output_key1, output_key2], - "bad descriptors", - torch_attributes, - id="bad output descriptors", - ), - ], -) -def test_build_request_indirect_torch_unsuccessful( - reply_channel, model, input, output, output_descriptors, custom_attributes -): - with pytest.raises(ValueError): - built_request = MessageHandler.build_request( - reply_channel, - model, - input, - output, - output_descriptors, - custom_attributes, - ) - - -@pytest.mark.skipif(not should_run_tf, reason="Test needs TF to run") @pytest.mark.parametrize( "reply_channel, model, input, output, output_descriptors, custom_attributes", [ @@ -399,7 +188,7 @@ def test_build_request_indirect_torch_unsuccessful( [input_key1], [output_key2], [output_descriptor1], - tf_attributes, + torch_attributes, id="bad model", ), pytest.param( @@ -417,7 +206,7 @@ def test_build_request_indirect_torch_unsuccessful( [model_key], [output_key1, output_key2], [output_descriptor1], - tf_attributes, + torch_attributes, id="bad input schema type", ), pytest.param( @@ -462,12 +251,12 @@ def test_build_request_indirect_torch_unsuccessful( [input_key1], [output_key1, output_key2], "bad descriptors", - tf_attributes, + torch_attributes, id="bad output descriptors", ), ], ) -def test_build_request_indirect_tf_unsuccessful( +def 
test_build_request_indirect_unsuccessful( reply_channel, model, input, output, output_descriptors, custom_attributes ): with pytest.raises(ValueError): @@ -481,7 +270,6 @@ def test_build_request_indirect_tf_unsuccessful( ) -@pytest.mark.skipif(not should_run_torch, reason="Test needs Torch to run") @pytest.mark.parametrize( "reply_channel, model, input, output, output_descriptors, custom_attributes", [ @@ -499,88 +287,12 @@ def test_build_request_indirect_tf_unsuccessful( [tensor_1], [], [output_descriptor3], - torch_attributes, - ), - pytest.param( - b"another reply channel", - model, - [tensor_2], - [], - [output_descriptor1], - torch_attributes, - ), - pytest.param( - b"another reply channel", - model, - [tensor_1], - [], - [output_descriptor1], - None, - ), - ], -) -def test_build_request_direct_torch_successful( - reply_channel, model, input, output, output_descriptors, custom_attributes -): - built_request = MessageHandler.build_request( - reply_channel, - model, - input, - output, - output_descriptors, - custom_attributes, - ) - assert built_request is not None - assert built_request.replyChannel.reply == reply_channel - if built_request.model.which() == "key": - assert built_request.model.key.key == model.key - else: - assert built_request.model.data.data == model.data - assert built_request.model.data.name == model.name - assert built_request.model.data.version == model.version - assert built_request.input.which() == "data" - assert built_request.input.data[0].blob == input[0].blob - assert len(built_request.input.data) == len(input) - assert len(built_request.output) == len(output) - for i, j in zip(built_request.outputDescriptors, output_descriptors): - assert i.order == j.order - if built_request.customAttributes.which() == "tf": - assert ( - built_request.customAttributes.tf.tensorType == custom_attributes.tensorType - ) - elif built_request.customAttributes.which() == "torch": - assert ( - built_request.customAttributes.torch.tensorType - == custom_attributes.tensorType - ) - else: - assert built_request.customAttributes.none == custom_attributes - - -@pytest.mark.skipif(not should_run_tf, reason="Test needs TF to run") -@pytest.mark.parametrize( - "reply_channel, model, input, output, output_descriptors, custom_attributes", - [ - pytest.param( - b"reply channel", - model_key, - [tensor_3, tensor_4], - [], - [output_descriptor2], - tf_attributes, - ), - pytest.param( - b"another reply channel", - model, - [tensor_4], - [], - [output_descriptor3], tf_attributes, ), pytest.param( b"another reply channel", model, - [tensor_4], + [tensor_2], [], [output_descriptor1], tf_attributes, @@ -588,14 +300,14 @@ def test_build_request_direct_torch_successful( pytest.param( b"another reply channel", model, - [tensor_3], + [tensor_1], [], [output_descriptor1], None, ), ], ) -def test_build_request_direct_tf_successful( +def test_build_request_direct_successful( reply_channel, model, input, output, output_descriptors, custom_attributes ): built_request = MessageHandler.build_request( @@ -614,9 +326,8 @@ def test_build_request_direct_tf_successful( assert built_request.model.data.data == model.data assert built_request.model.data.name == model.name assert built_request.model.data.version == model.version - assert built_request.input.which() == "data" - assert built_request.input.data[0].blob == input[0].blob - assert len(built_request.input.data) == len(input) + assert built_request.input.which() == "descriptors" + assert len(built_request.input.descriptors) == len(input) assert 
len(built_request.output) == len(output) for i, j in zip(built_request.outputDescriptors, output_descriptors): assert i.order == j.order @@ -633,81 +344,6 @@ def test_build_request_direct_tf_successful( assert built_request.customAttributes.none == custom_attributes -@pytest.mark.skipif(not should_run_torch, reason="Test needs Torch to run") -@pytest.mark.parametrize( - "reply_channel, model, input, output, output_descriptors, custom_attributes", - [ - pytest.param( - [], - model_key, - [tensor_1, tensor_2], - [], - [output_descriptor2], - torch_attributes, - id="bad channel", - ), - pytest.param( - b"reply channel", - "bad model", - [tensor_1], - [], - [output_descriptor2], - torch_attributes, - id="bad model", - ), - pytest.param( - b"reply channel", - model_key, - ["input_key1", "input_key2"], - [], - [output_descriptor2], - torch_attributes, - id="bad inputs", - ), - pytest.param( - b"reply channel", - model_key, - [], - ["output_key1", "output_key2"], - [output_descriptor2], - torch_attributes, - id="bad outputs", - ), - pytest.param( - b"reply channel", - model_key, - [tensor_1], - [], - [output_descriptor2], - "bad attributes", - id="bad custom attributes", - ), - pytest.param( - b"reply_channel", - model_key, - [tensor_1, tensor_2], - [], - ["output_descriptor2"], - torch_attributes, - id="bad output descriptors", - ), - ], -) -def test_build_torch_request_direct_unsuccessful( - reply_channel, model, input, output, output_descriptors, custom_attributes -): - with pytest.raises(ValueError): - built_request = MessageHandler.build_request( - reply_channel, - model, - input, - output, - output_descriptors, - custom_attributes, - ) - - -@pytest.mark.skipif(not should_run_tf, reason="Test needs TF to run") @pytest.mark.parametrize( "reply_channel, model, input, output, output_descriptors, custom_attributes", [ @@ -735,7 +371,7 @@ def test_build_torch_request_direct_unsuccessful( ["input_key1", "input_key2"], [], [output_descriptor2], - tf_attributes, + torch_attributes, id="bad inputs", ), pytest.param( @@ -762,12 +398,12 @@ def test_build_torch_request_direct_unsuccessful( [tensor_3, tensor_4], [], ["output_descriptor2"], - tf_attributes, + torch_attributes, id="bad output descriptors", ), ], ) -def test_build_tf_request_direct_unsuccessful( +def test_build_request_direct_unsuccessful( reply_channel, model, input, output, output_descriptors, custom_attributes ): with pytest.raises(ValueError): @@ -781,31 +417,16 @@ def test_build_tf_request_direct_unsuccessful( ) -@pytest.mark.skipif(not should_run_torch, reason="Test needs Torch to run") @pytest.mark.parametrize( "req", [ + pytest.param(tf_indirect_request, id="tf indirect"), + pytest.param(tf_direct_request, id="tf direct"), pytest.param(torch_indirect_request, id="indirect"), pytest.param(torch_direct_request, id="direct"), ], ) -def test_serialize_torch_request_successful(req): - serialized = MessageHandler.serialize_request(req) - assert type(serialized) == bytes - - deserialized = MessageHandler.deserialize_request(serialized) - assert deserialized.to_dict() == req.to_dict() - - -@pytest.mark.skipif(not should_run_tf, reason="Test needs TF to run") -@pytest.mark.parametrize( - "req", - [ - pytest.param(tf_indirect_request, id="indirect"), - pytest.param(tf_direct_request, id="direct"), - ], -) -def test_serialize_tf_request_successful(req): +def test_serialize_request_successful(req): serialized = MessageHandler.serialize_request(req) assert type(serialized) == bytes diff --git a/tests/test_message_handler/test_response.py 
b/tests/test_message_handler/test_response.py index 9d59a18793..03bd9ba73f 100644 --- a/tests/test_message_handler/test_response.py +++ b/tests/test_message_handler/test_response.py @@ -28,60 +28,6 @@ from smartsim._core.mli.message_handler import MessageHandler -try: - import tensorflow as tf -except ImportError: - should_run_tf = False -else: - should_run_tf = True - - tflow1 = tf.zeros((3, 2, 5), dtype=tf.int8) - tflow2 = tf.ones((1040, 1040, 3), dtype=tf.int64) - - small_tf_tensor = MessageHandler.build_tensor( - tflow1.numpy(), "c", "int8", list(tflow1.shape) - ) - medium_tf_tensor = MessageHandler.build_tensor( - tflow2.numpy(), "c", "int64", list(tflow2.shape) - ) - - tf_attributes = MessageHandler.build_tf_response_attributes() - - tf_direct_response = MessageHandler.build_response( - "complete", - "Success again!", - [small_tf_tensor, medium_tf_tensor], - tf_attributes, - ) - - -try: - import torch -except ImportError: - should_run_torch = False -else: - should_run_torch = True - - torch1 = torch.zeros((3, 2, 5), dtype=torch.int8) - torch2 = torch.ones((1040, 1040, 3), dtype=torch.int64) - - small_torch_tensor = MessageHandler.build_tensor( - torch1.numpy(), "c", "int8", list(torch1.shape) - ) - medium_torch_tensor = MessageHandler.build_tensor( - torch2.numpy(), "c", "int64", list(torch2.shape) - ) - - torch_attributes = MessageHandler.build_torch_response_attributes() - - torch_direct_response = MessageHandler.build_response( - "complete", - "Success again!", - [small_torch_tensor, medium_torch_tensor], - torch_attributes, - ) - - # The tests in this file belong to the group_a group pytestmark = pytest.mark.group_a @@ -89,86 +35,51 @@ result_key1 = MessageHandler.build_tensor_key("result_key1") result_key2 = MessageHandler.build_tensor_key("result_key2") +torch_attributes = MessageHandler.build_torch_response_attributes() +tf_attributes = MessageHandler.build_tf_response_attributes() -if should_run_tf: - tf_indirect_response = MessageHandler.build_response( - "complete", - "Success!", - [result_key1, result_key2], - tf_attributes, - ) +tensor1 = MessageHandler.build_tensor_descriptor("c", "int8", [1]) +tensor2 = MessageHandler.build_tensor_descriptor("c", "int64", [3, 2]) -if should_run_torch: - torch_indirect_response = MessageHandler.build_response( - "complete", - "Success!", - [result_key1, result_key2], - torch_attributes, - ) +tf_indirect_response = MessageHandler.build_response( + "complete", + "Success!", + [result_key1, result_key2], + tf_attributes, +) -@pytest.mark.skipif(not should_run_torch, reason="Test needs Torch to run") -@pytest.mark.parametrize( - "status, status_message, result, custom_attribute", - [ - pytest.param( - 200, - "Yay, it worked!", - [small_torch_tensor, medium_torch_tensor], - None, - id="tensor list", - ), - pytest.param( - 200, - "Yay, it worked!", - [small_torch_tensor], - torch_attributes, - id="small tensor", - ), - pytest.param( - 200, - "Yay, it worked!", - [result_key1, result_key2], - torch_attributes, - id="tensor key list", - ), - ], +tf_direct_response = MessageHandler.build_response( + "complete", + "Success again!", + [tensor2, tensor1], + tf_attributes, +) + +torch_indirect_response = MessageHandler.build_response( + "complete", + "Success!", + [result_key1, result_key2], + torch_attributes, +) + +torch_direct_response = MessageHandler.build_response( + "complete", + "Success again!", + [tensor1, tensor2], + torch_attributes, ) -def test_build_torch_response_successful( - status, status_message, result, custom_attribute -): - 
response = MessageHandler.build_response( - status=status, - message=status_message, - result=result, - custom_attributes=custom_attribute, - ) - assert response is not None - assert response.status == status - assert response.message == status_message - if response.result.which() == "keys": - assert response.result.keys[0].to_dict() == result[0].to_dict() - else: - assert response.result.data[0].to_dict() == result[0].to_dict() -@pytest.mark.skipif(not should_run_tf, reason="Test needs TF to run") @pytest.mark.parametrize( "status, status_message, result, custom_attribute", [ pytest.param( 200, "Yay, it worked!", - [small_tf_tensor, medium_tf_tensor], + [tensor1, tensor2], None, - id="tensor list", - ), - pytest.param( - 200, - "Yay, it worked!", - [small_tf_tensor], - tf_attributes, - id="small tensor", + id="tensor descriptor list", ), pytest.param( 200, @@ -179,7 +90,7 @@ def test_build_torch_response_successful( ), ], ) -def test_build_tf_response_successful(status, status_message, result, custom_attribute): +def test_build_response_successful(status, status_message, result, custom_attribute): response = MessageHandler.build_response( status=status, message=status_message, @@ -192,25 +103,24 @@ def test_build_tf_response_successful(status, status_message, result, custom_att if response.result.which() == "keys": assert response.result.keys[0].to_dict() == result[0].to_dict() else: - assert response.result.data[0].to_dict() == result[0].to_dict() + assert response.result.descriptors[0].to_dict() == result[0].to_dict() -@pytest.mark.skipif(not should_run_tf, reason="Test needs TF to run") @pytest.mark.parametrize( "status, status_message, result, custom_attribute", [ pytest.param( "bad status", "Yay, it worked!", - [small_tf_tensor, medium_tf_tensor], + [tensor1, tensor2], None, id="bad status", ), pytest.param( "complete", 200, - [small_tf_tensor], - tf_attributes, + [tensor2], + torch_attributes, id="bad status message", ), pytest.param( @@ -230,110 +140,36 @@ def test_build_tf_response_successful(status, status_message, result, custom_att pytest.param( "complete", "Yay, it worked!", - [small_tf_tensor, medium_tf_tensor], - "custom attributes", - id="bad custom attributes", - ), - pytest.param( - "complete", - "Yay, it worked!", - [small_tf_tensor, medium_tf_tensor], - result_key1, - id="bad custom attributes type", - ), - ], -) -def test_build_tf_response_unsuccessful( - status, status_message, result, custom_attribute -): - with pytest.raises(ValueError): - response = MessageHandler.build_response( - status, status_message, result, custom_attribute - ) - - -@pytest.mark.skipif(not should_run_torch, reason="Test needs Torch to run") -@pytest.mark.parametrize( - "status, status_message, result, custom_attribute", - [ - pytest.param( - "bad status", - "Yay, it worked!", - [small_torch_tensor, medium_torch_tensor], - None, - id="bad status", - ), - pytest.param( - "complete", - 200, - [small_torch_tensor], - torch_attributes, - id="bad status message", - ), - pytest.param( - "complete", - "Yay, it worked!", - ["result_key1", "result_key2"], - torch_attributes, - id="bad result", - ), - pytest.param( - "complete", - "Yay, it worked!", - [torch_attributes], - torch_attributes, - id="bad result type", - ), - pytest.param( - "complete", - "Yay, it worked!", - [small_torch_tensor, medium_torch_tensor], + [tensor2, tensor1], "custom attributes", id="bad custom attributes", ), pytest.param( "complete", "Yay, it worked!", - [small_torch_tensor, medium_torch_tensor], + [tensor2, tensor1], 
result_key1, id="bad custom attributes type", ), ], ) -def test_build_torch_response_unsuccessful( - status, status_message, result, custom_attribute -): +def test_build_response_unsuccessful(status, status_message, result, custom_attribute): with pytest.raises(ValueError): response = MessageHandler.build_response( status, status_message, result, custom_attribute ) -@pytest.mark.skipif(not should_run_torch, reason="Test needs Torch to run") @pytest.mark.parametrize( "response", [ pytest.param(torch_indirect_response, id="indirect"), pytest.param(torch_direct_response, id="direct"), + pytest.param(tf_indirect_response, id="tf indirect"), + pytest.param(tf_direct_response, id="tf direct"), ], ) -def test_torch_serialize_response(response): - serialized = MessageHandler.serialize_response(response) - assert type(serialized) == bytes - - deserialized = MessageHandler.deserialize_response(serialized) - assert deserialized.to_dict() == response.to_dict() - - -@pytest.mark.skipif(not should_run_tf, reason="Test needs TF to run") -@pytest.mark.parametrize( - "response", - [ - pytest.param(tf_indirect_response, id="indirect"), - pytest.param(tf_direct_response, id="direct"), - ], -) -def test_tf_serialize_response(response): +def test_serialize_response(response): serialized = MessageHandler.serialize_response(response) assert type(serialized) == bytes From 0e3bd612689e0223721a8c0687c4f4cb85ce399f Mon Sep 17 00:00:00 2001 From: Alyssa Cote Date: Sat, 13 Jul 2024 11:18:55 -0700 Subject: [PATCH 35/40] mli driver runs all the way through --- ex/high_throughput_inference/mli_driver.py | 2 +- ex/high_throughput_inference/mock_app.py | 14 +++++--------- ex/high_throughput_inference/redis_driver.py | 2 +- .../standalone_workermanager.py | 2 +- smartsim/_core/mli/comm/channel/channel.py | 2 +- smartsim/_core/mli/comm/channel/dragonfli.py | 16 +++++++++------- .../mli/infrastructure/control/workermanager.py | 12 ++++++++---- .../mli/infrastructure/worker/torch_worker.py | 5 ++++- tests/mli/test_torch_worker.py | 9 ++++++--- 9 files changed, 36 insertions(+), 28 deletions(-) diff --git a/ex/high_throughput_inference/mli_driver.py b/ex/high_throughput_inference/mli_driver.py index 6da559aa6f..4438261139 100644 --- a/ex/high_throughput_inference/mli_driver.py +++ b/ex/high_throughput_inference/mli_driver.py @@ -10,7 +10,7 @@ import time import typing as t -device = "gpu" +device = "cpu" filedir = os.path.dirname(__file__) worker_manager_script_name = os.path.join(filedir, "standalone_workermanager.py") app_script_name = os.path.join(filedir, "mock_app.py") diff --git a/ex/high_throughput_inference/mock_app.py b/ex/high_throughput_inference/mock_app.py index 9cd59d2206..51f01c3095 100644 --- a/ex/high_throughput_inference/mock_app.py +++ b/ex/high_throughput_inference/mock_app.py @@ -42,6 +42,7 @@ import time import torch import numbers +import typing from collections import OrderedDict from smartsim._core.mli.message_handler import MessageHandler @@ -108,7 +109,7 @@ def print_timings(self, to_file: bool = False): def run_model(self, model: bytes | str, batch: torch.Tensor): - tensors = [batch.numpy()] + tensors: typing.List[numpy.ndarray[typing.Any, numpy.dtype[typing.Any]]]= [batch.numpy()] self.start_timings(batch.shape[0]) # built_tensor = MessageHandler.build_tensor( # batch.numpy(), "c", "float32", list(batch.shape)) @@ -134,7 +135,8 @@ def run_model(self, model: bytes | str, batch: torch.Tensor): with self._to_worker_fli.sendh(timeout=None, stream_channel=self._to_worker_ch) as to_sendh: 
to_sendh.send_bytes(request_bytes) for t in tensors: - to_sendh.send_bytes(t.tobytes()) # NOT FAST ENOUGH!!! + # to_sendh.send_bytes(t.tobytes()) #TODO NOT FAST ENOUGH!!! + to_sendh.send_bytes(bytes(t.data)) logger.info(f"Message size: {len(request_bytes)} bytes") self.measure_time("send") @@ -143,7 +145,7 @@ def run_model(self, model: bytes | str, batch: torch.Tensor): self.measure_time("receive") response = MessageHandler.deserialize_response(resp) self.measure_time("deserialize_response") - # list of data blobs? recv depending on the len(esponse.result.descriptors)? + # list of data blobs? recv depending on the len(response.result.descriptors)? data_blob = from_recvh.recv_bytes(timeout=None) result = torch.from_numpy( numpy.frombuffer( @@ -151,12 +153,6 @@ def run_model(self, model: bytes | str, batch: torch.Tensor): dtype=str(response.result.descriptors[0].dataType), ) ) - # result = torch.from_numpy( - # numpy.frombuffer( - # response.result.data[0].blob, - # dtype=str(response.result.data[0].tensorDescriptor.dataType), - # ) - # ) self.measure_time("deserialize_tensor") self.end_timings() diff --git a/ex/high_throughput_inference/redis_driver.py b/ex/high_throughput_inference/redis_driver.py index ceddba4ef7..5111019099 100644 --- a/ex/high_throughput_inference/redis_driver.py +++ b/ex/high_throughput_inference/redis_driver.py @@ -31,7 +31,7 @@ import time import typing as t -device = "gpu" +device = "cpu" filedir = os.path.dirname(__file__) app_script_name = os.path.join(filedir, "mock_app_redis.py") model_name = os.path.join(filedir, f"resnet50.{device.upper()}.pt") diff --git a/ex/high_throughput_inference/standalone_workermanager.py b/ex/high_throughput_inference/standalone_workermanager.py index c56e11a7c3..7ff706953d 100644 --- a/ex/high_throughput_inference/standalone_workermanager.py +++ b/ex/high_throughput_inference/standalone_workermanager.py @@ -51,7 +51,7 @@ parser.add_argument( "--device", type=str, - default="gpu", + default="cpu", choices="gpu cpu".split(), help="Device on which the inference takes place", ) diff --git a/smartsim/_core/mli/comm/channel/channel.py b/smartsim/_core/mli/comm/channel/channel.py index 2318896a9b..fede10a588 100644 --- a/smartsim/_core/mli/comm/channel/channel.py +++ b/smartsim/_core/mli/comm/channel/channel.py @@ -50,7 +50,7 @@ def recv(self) -> bytes: :returns: the received message""" @property - def descriptor(self) -> bytes: + def descriptor(self) -> t.List[bytes]: """Return the channel descriptor for the underlying dragon channel""" if isinstance(self._descriptor, str): return self._descriptor.encode("utf-8") diff --git a/smartsim/_core/mli/comm/channel/dragonfli.py b/smartsim/_core/mli/comm/channel/dragonfli.py index 75f8fb4bfc..134b00d3df 100644 --- a/smartsim/_core/mli/comm/channel/dragonfli.py +++ b/smartsim/_core/mli/comm/channel/dragonfli.py @@ -57,13 +57,15 @@ def send(self, value: bytes) -> None: with self._fli.sendh(timeout=None, stream_channel=self._channel) as sendh: sendh.send_bytes(value) - def recv(self) -> bytes: + def recv(self) -> t.List[bytes]: """Receieve a message through the underlying communication channel :returns: the received message""" + messages = [] with self._fli.recvh(timeout=None) as recvh: - try: - request_bytes: bytes - request_bytes, _ = recvh.recv_bytes(timeout=None) - return request_bytes - except fli.FLIEOT as exc: - return b"" + while True: + try: + message, _ = recvh.recv_bytes(timeout=None) + messages.append(message) + except fli.FLIEOT as exc: + break + return messages diff --git 
a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py index 1c571dc2f2..73d7a3d141 100644 --- a/smartsim/_core/mli/infrastructure/control/workermanager.py +++ b/smartsim/_core/mli/infrastructure/control/workermanager.py @@ -105,7 +105,7 @@ def deserialize_message( input_keys = [input_key.key for input_key in request.input.keys] elif request.input.which() == "descriptors": # input_bytes = [data.blob for data in request.input.data] - input_meta = [request.input.descriptors] + input_meta = request.input.descriptors inference_request = InferenceRequest( model_key=model_key, @@ -242,7 +242,10 @@ def _on_iteration(self) -> None: timings = [] # timing # perform default deserialization of the message envelope - request_bytes: bytes = self._task_queue.recv() + bytes_list: t.List[bytes] = self._task_queue.recv() + if bytes_list: + request_bytes = bytes_list[0] + tensor_list = bytes_list[1:] interm = time.perf_counter() # timing request = deserialize_message( @@ -250,8 +253,7 @@ def _on_iteration(self) -> None: ) if request.input_meta: - for _ in request.input_meta: - request.raw_inputs.append(self._task_queue.recv()) + request.raw_inputs = tensor_list if not self._validate_request(request): return @@ -353,8 +355,10 @@ def _on_iteration(self) -> None: timings.append(time.perf_counter() - interm) # timing interm = time.perf_counter() # timing if request.callback: + # send serialized response request.callback.send(serialized_resp) if reply.outputs: + # send tensor data after response for output in reply.outputs: request.callback.send(output) diff --git a/smartsim/_core/mli/infrastructure/worker/torch_worker.py b/smartsim/_core/mli/infrastructure/worker/torch_worker.py index a4e725ab99..f8cfa9886f 100644 --- a/smartsim/_core/mli/infrastructure/worker/torch_worker.py +++ b/smartsim/_core/mli/infrastructure/worker/torch_worker.py @@ -98,7 +98,10 @@ def execute( model: torch.nn.Module = load_result.model model.eval() - results = [model(tensor).detach() for tensor in transform_result.transformed] + results = [ + model(tensor).detach().numpy().tobytes() + for tensor in transform_result.transformed + ] # TODO THIS IS BAD execute_result = ExecuteResult(results) return execute_result diff --git a/tests/mli/test_torch_worker.py b/tests/mli/test_torch_worker.py index f159c15a0e..87748ecc68 100644 --- a/tests/mli/test_torch_worker.py +++ b/tests/mli/test_torch_worker.py @@ -156,9 +156,12 @@ def test_execute(mlutils) -> None: execute_result = worker.execute(sample_request, load_model_result, transform_result) - assert all( - result.shape == torch.Size((20, 10)) for result in execute_result.predictions - ) + # assert all( + # result.shape == torch.Size((20, 10)) for result in execute_result.predictions + # ) + + # need to make this test more meaningful, but predictions are bytes string now (potentially will change back) + assert all(type(result) == bytes for result in execute_result.predictions) def test_transform_output(mlutils): From e3f44a5267f9574d4a2df3d623bc298b9af5ec79 Mon Sep 17 00:00:00 2001 From: Alyssa Cote Date: Mon, 15 Jul 2024 15:50:04 -0500 Subject: [PATCH 36/40] weaks --- ex/high_throughput_inference/mock_app.py | 4 ++-- smartsim/_core/mli/comm/channel/channel.py | 4 ++-- .../infrastructure/control/workermanager.py | 15 ++++++++++----- .../mli/infrastructure/worker/torch_worker.py | 19 +++++++++++++------ tests/mli/test_torch_worker.py | 13 ++++++------- 5 files changed, 33 insertions(+), 22 deletions(-) diff --git 
a/ex/high_throughput_inference/mock_app.py b/ex/high_throughput_inference/mock_app.py index 51f01c3095..d686c7d5c9 100644 --- a/ex/high_throughput_inference/mock_app.py +++ b/ex/high_throughput_inference/mock_app.py @@ -135,8 +135,8 @@ def run_model(self, model: bytes | str, batch: torch.Tensor): with self._to_worker_fli.sendh(timeout=None, stream_channel=self._to_worker_ch) as to_sendh: to_sendh.send_bytes(request_bytes) for t in tensors: - # to_sendh.send_bytes(t.tobytes()) #TODO NOT FAST ENOUGH!!! - to_sendh.send_bytes(bytes(t.data)) + to_sendh.send_bytes(t.tobytes()) #TODO NOT FAST ENOUGH!!! + # to_sendh.send_bytes(bytes(t.data)) logger.info(f"Message size: {len(request_bytes)} bytes") self.measure_time("send") diff --git a/smartsim/_core/mli/comm/channel/channel.py b/smartsim/_core/mli/comm/channel/channel.py index fede10a588..a3cce21814 100644 --- a/smartsim/_core/mli/comm/channel/channel.py +++ b/smartsim/_core/mli/comm/channel/channel.py @@ -45,12 +45,12 @@ def send(self, value: bytes) -> None: :param value: The value to send""" @abstractmethod - def recv(self) -> bytes: + def recv(self) -> t.List[bytes]: """Receieve a message through the underlying communication channel :returns: the received message""" @property - def descriptor(self) -> t.List[bytes]: + def descriptor(self) -> bytes: """Return the channel descriptor for the underlying dragon channel""" if isinstance(self._descriptor, str): return self._descriptor.encode("utf-8") diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py index 73d7a3d141..ad2b89f173 100644 --- a/smartsim/_core/mli/infrastructure/control/workermanager.py +++ b/smartsim/_core/mli/infrastructure/control/workermanager.py @@ -58,6 +58,7 @@ from smartsim._core.mli.mli_schemas.model.model_capnp import Model from smartsim._core.mli.mli_schemas.response.response_capnp import StatusEnum + from smartsim._core.mli.mli_schemas.tensor.tensor_capnp import TensorDescriptor logger = get_logger(__name__) @@ -99,13 +100,12 @@ def deserialize_message( None # these will really be tensors already ) - input_meta: t.List[t.Any] = [] + input_meta: t.List[TensorDescriptor] = [] if request.input.which() == "keys": input_keys = [input_key.key for input_key in request.input.keys] elif request.input.which() == "descriptors": - # input_bytes = [data.blob for data in request.input.data] - input_meta = request.input.descriptors + input_meta = request.input.descriptors # type: ignore inference_request = InferenceRequest( model_key=model_key, @@ -141,6 +141,8 @@ def prepare_outputs(reply: InferenceReply) -> t.List[t.Any]: # todo: need to have the output attributes specified in the req? # maybe, add `MessageHandler.dtype_of(tensor)`? # can `build_tensor` do dtype and shape? + + # TODO isn't this what output descriptors are for? 
msg_tensor_desc = MessageHandler.build_tensor_descriptor( "c", "float32", @@ -241,8 +243,11 @@ def _on_iteration(self) -> None: return timings = [] # timing - # perform default deserialization of the message envelope + bytes_list: t.List[bytes] = self._task_queue.recv() + request_bytes: bytes = b"" + tensor_list = [] + if bytes_list: request_bytes = bytes_list[0] tensor_list = bytes_list[1:] @@ -252,7 +257,7 @@ def _on_iteration(self) -> None: request_bytes, self._comm_channel_type, self._device ) - if request.input_meta: + if request.input_meta and tensor_list: request.raw_inputs = tensor_list if not self._validate_request(request): diff --git a/smartsim/_core/mli/infrastructure/worker/torch_worker.py b/smartsim/_core/mli/infrastructure/worker/torch_worker.py index f8cfa9886f..b06874e1cc 100644 --- a/smartsim/_core/mli/infrastructure/worker/torch_worker.py +++ b/smartsim/_core/mli/infrastructure/worker/torch_worker.py @@ -98,10 +98,11 @@ def execute( model: torch.nn.Module = load_result.model model.eval() - results = [ - model(tensor).detach().numpy().tobytes() - for tensor in transform_result.transformed - ] # TODO THIS IS BAD + results = [model(tensor).detach() for tensor in transform_result.transformed] + # results = [ + # model(tensor).detach().numpy().tobytes() + # for tensor in transform_result.transformed + # ] # TODO THIS IS BAD execute_result = ExecuteResult(results) return execute_result @@ -113,10 +114,16 @@ def transform_output( result_device: str, ) -> TransformOutputResult: if result_device != "cpu": - transformed = [item.to("cpu") for item in execute_result.predictions] + transformed = [ + item.to("cpu").numpy().tobytes() for item in execute_result.predictions + ] + # todo: need the shape from latest schemas added here. return TransformOutputResult(transformed, None, "c", "float32") # fixme return TransformOutputResult( - execute_result.predictions, None, "c", "float32" + [item.numpy().tobytes() for item in execute_result.predictions], + None, + "c", + "float32", ) # fixme diff --git a/tests/mli/test_torch_worker.py b/tests/mli/test_torch_worker.py index 87748ecc68..b73e4a31b5 100644 --- a/tests/mli/test_torch_worker.py +++ b/tests/mli/test_torch_worker.py @@ -156,12 +156,9 @@ def test_execute(mlutils) -> None: execute_result = worker.execute(sample_request, load_model_result, transform_result) - # assert all( - # result.shape == torch.Size((20, 10)) for result in execute_result.predictions - # ) - - # need to make this test more meaningful, but predictions are bytes string now (potentially will change back) - assert all(type(result) == bytes for result in execute_result.predictions) + assert all( + result.shape == torch.Size((20, 10)) for result in execute_result.predictions + ) def test_transform_output(mlutils): @@ -171,7 +168,9 @@ def test_transform_output(mlutils): sample_request, execute_result, torch_device[mlutils.get_test_device().lower()] ) - assert transformed_output.outputs == execute_result.predictions + assert transformed_output.outputs == [ + item.numpy().tobytes() for item in execute_result.predictions + ] assert transformed_output.shape == None assert transformed_output.order == "c" assert transformed_output.dtype == "float32" From b57fc8e6718b94bef0794e93d7f6b2e78a7cdbe8 Mon Sep 17 00:00:00 2001 From: Alyssa Cote Date: Mon, 15 Jul 2024 16:15:21 -0500 Subject: [PATCH 37/40] more clean up --- ex/high_throughput_inference/mli_driver.py | 2 +- ex/high_throughput_inference/mock_app.py | 5 +---- ex/high_throughput_inference/redis_driver.py | 2 +- 
ex/high_throughput_inference/standalone_workermanager.py | 2 +- smartsim/_core/mli/infrastructure/worker/torch_worker.py | 4 ---- 5 files changed, 4 insertions(+), 11 deletions(-) diff --git a/ex/high_throughput_inference/mli_driver.py b/ex/high_throughput_inference/mli_driver.py index 4438261139..6da559aa6f 100644 --- a/ex/high_throughput_inference/mli_driver.py +++ b/ex/high_throughput_inference/mli_driver.py @@ -10,7 +10,7 @@ import time import typing as t -device = "cpu" +device = "gpu" filedir = os.path.dirname(__file__) worker_manager_script_name = os.path.join(filedir, "standalone_workermanager.py") app_script_name = os.path.join(filedir, "mock_app.py") diff --git a/ex/high_throughput_inference/mock_app.py b/ex/high_throughput_inference/mock_app.py index d686c7d5c9..e244c93e0f 100644 --- a/ex/high_throughput_inference/mock_app.py +++ b/ex/high_throughput_inference/mock_app.py @@ -42,7 +42,6 @@ import time import torch import numbers -import typing from collections import OrderedDict from smartsim._core.mli.message_handler import MessageHandler @@ -109,10 +108,8 @@ def print_timings(self, to_file: bool = False): def run_model(self, model: bytes | str, batch: torch.Tensor): - tensors: typing.List[numpy.ndarray[typing.Any, numpy.dtype[typing.Any]]]= [batch.numpy()] + tensors = [batch.numpy()] self.start_timings(batch.shape[0]) - # built_tensor = MessageHandler.build_tensor( - # batch.numpy(), "c", "float32", list(batch.shape)) built_tensor_desc = MessageHandler.build_tensor_descriptor( "c", "float32", list(batch.shape)) self.measure_time("build_tensor_descriptor") diff --git a/ex/high_throughput_inference/redis_driver.py b/ex/high_throughput_inference/redis_driver.py index 5111019099..ceddba4ef7 100644 --- a/ex/high_throughput_inference/redis_driver.py +++ b/ex/high_throughput_inference/redis_driver.py @@ -31,7 +31,7 @@ import time import typing as t -device = "cpu" +device = "gpu" filedir = os.path.dirname(__file__) app_script_name = os.path.join(filedir, "mock_app_redis.py") model_name = os.path.join(filedir, f"resnet50.{device.upper()}.pt") diff --git a/ex/high_throughput_inference/standalone_workermanager.py b/ex/high_throughput_inference/standalone_workermanager.py index 7ff706953d..c56e11a7c3 100644 --- a/ex/high_throughput_inference/standalone_workermanager.py +++ b/ex/high_throughput_inference/standalone_workermanager.py @@ -51,7 +51,7 @@ parser.add_argument( "--device", type=str, - default="cpu", + default="gpu", choices="gpu cpu".split(), help="Device on which the inference takes place", ) diff --git a/smartsim/_core/mli/infrastructure/worker/torch_worker.py b/smartsim/_core/mli/infrastructure/worker/torch_worker.py index b06874e1cc..e732ecd2cd 100644 --- a/smartsim/_core/mli/infrastructure/worker/torch_worker.py +++ b/smartsim/_core/mli/infrastructure/worker/torch_worker.py @@ -99,10 +99,6 @@ def execute( model: torch.nn.Module = load_result.model model.eval() results = [model(tensor).detach() for tensor in transform_result.transformed] - # results = [ - # model(tensor).detach().numpy().tobytes() - # for tensor in transform_result.transformed - # ] # TODO THIS IS BAD execute_result = ExecuteResult(results) return execute_result From c1f856b6b0cba341bb2a4b710a390953f4cb969d Mon Sep 17 00:00:00 2001 From: Alyssa Cote Date: Mon, 15 Jul 2024 16:31:10 -0500 Subject: [PATCH 38/40] changelog, mypy --- doc/changelog.md | 1 + smartsim/_core/mli/comm/channel/dragonchannel.py | 5 +++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/doc/changelog.md b/doc/changelog.md 
index ee41fabf88..81c8ac4794 100644 --- a/doc/changelog.md +++ b/doc/changelog.md @@ -13,6 +13,7 @@ Jump to: Description +- Adjust schemas for better performance - Add TorchWorker first implementation and mock inference app example - Add EnvironmentConfigLoader for ML Worker Manager - Add Model schema with model metadata included diff --git a/smartsim/_core/mli/comm/channel/dragonchannel.py b/smartsim/_core/mli/comm/channel/dragonchannel.py index 1409747a91..672fce75b2 100644 --- a/smartsim/_core/mli/comm/channel/dragonchannel.py +++ b/smartsim/_core/mli/comm/channel/dragonchannel.py @@ -25,6 +25,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import sys +import typing as t import smartsim._core.mli.comm.channel.channel as cch from smartsim.log import get_logger @@ -52,9 +53,9 @@ def send(self, value: bytes) -> None: with self._channel.sendh(timeout=None) as sendh: sendh.send_bytes(value) - def recv(self) -> bytes: + def recv(self) -> t.List[bytes]: """Receieve a message through the underlying communication channel :returns: the received message""" with self._channel.recvh(timeout=None) as recvh: message_bytes: bytes = recvh.recv_bytes(timeout=None) - return message_bytes + return [message_bytes] From f1415f23fcab5f3ab1d51b61cca3e6efdf5c8903 Mon Sep 17 00:00:00 2001 From: Alyssa Cote Date: Thu, 18 Jul 2024 12:12:45 -0500 Subject: [PATCH 39/40] pr comments addressed --- smartsim/_core/mli/comm/channel/dragonfli.py | 6 ++-- .../infrastructure/control/workermanager.py | 36 +++++++++---------- smartsim/_core/mli/message_handler.py | 2 +- .../mli/mli_schemas/request/request.capnp | 2 +- .../mli/mli_schemas/request/request_capnp.pyi | 2 +- tests/test_message_handler/test_request.py | 4 +-- 6 files changed, 25 insertions(+), 27 deletions(-) diff --git a/smartsim/_core/mli/comm/channel/dragonfli.py b/smartsim/_core/mli/comm/channel/dragonfli.py index 134b00d3df..7ad28307cd 100644 --- a/smartsim/_core/mli/comm/channel/dragonfli.py +++ b/smartsim/_core/mli/comm/channel/dragonfli.py @@ -61,11 +61,13 @@ def recv(self) -> t.List[bytes]: """Receieve a message through the underlying communication channel :returns: the received message""" messages = [] + eot = False with self._fli.recvh(timeout=None) as recvh: - while True: + while not eot: try: message, _ = recvh.recv_bytes(timeout=None) messages.append(message) except fli.FLIEOT as exc: - break + eot = True return messages + diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py index 781b36b450..27f5bfc971 100644 --- a/smartsim/_core/mli/infrastructure/control/workermanager.py +++ b/smartsim/_core/mli/infrastructure/control/workermanager.py @@ -89,19 +89,18 @@ def deserialize_message( elif request.model.which() == "data": model_bytes = request.model.data - callback_key = request.replyChannel.reply + callback_key = request.replyChannel.descriptor # todo: shouldn't this be `CommChannel.find` instead of `DragonCommChannel` comm_channel = channel_type(callback_key) # comm_channel = DragonCommChannel(request.replyChannel) input_keys: t.Optional[t.List[str]] = None - input_bytes: t.Optional[t.List[bytes]] = ( - None # these will really be tensors already - ) + input_bytes: t.Optional[t.List[bytes]] = None + output_keys: t.Optional[t.List[str]] = None - input_meta: t.List[TensorDescriptor] = [] + input_meta: t.Optional[t.List[TensorDescriptor]] = None if request.input.which() == "keys": input_keys = [input_key.key for input_key in request.input.keys] @@ 
-111,9 +110,6 @@ def deserialize_message( if request.output: output_keys = [tensor_key.key for tensor_key in request.output] - if request.output: - output_keys = [tensor_key.key for tensor_key in request.output] - inference_request = InferenceRequest( model_key=model_key, callback=comm_channel, @@ -146,11 +142,6 @@ def prepare_outputs(reply: InferenceReply) -> t.List[t.Any]: prepared_outputs.append(msg_key) elif reply.outputs: for _ in reply.outputs: - # todo: need to have the output attributes specified in the req? - # maybe, add `MessageHandler.dtype_of(tensor)`? - # can `build_tensor` do dtype and shape? - - # TODO isn't this what output descriptors are for? msg_tensor_desc = MessageHandler.build_tensor_descriptor( "c", "float32", @@ -275,20 +266,25 @@ def _on_iteration(self) -> None: timings = [] # timing bytes_list: t.List[bytes] = self._task_queue.recv() - request_bytes: bytes = b"" - tensor_list = [] - if bytes_list: - request_bytes = bytes_list[0] - tensor_list = bytes_list[1:] + if not bytes_list: + exception_handler( + ValueError("No request data found"), + None, + "No request data found.", + ) + return + + request_bytes = bytes_list[0] + tensor_bytes_list = bytes_list[1:] interm = time.perf_counter() # timing request = deserialize_message( request_bytes, self._comm_channel_type, self._device ) - if request.input_meta and tensor_list: - request.raw_inputs = tensor_list + if request.input_meta and tensor_bytes_list: + request.raw_inputs = tensor_bytes_list if not self._validate_request(request): return diff --git a/smartsim/_core/mli/message_handler.py b/smartsim/_core/mli/message_handler.py index f28bc341f6..00670dce8a 100644 --- a/smartsim/_core/mli/message_handler.py +++ b/smartsim/_core/mli/message_handler.py @@ -233,7 +233,7 @@ def _assign_reply_channel( :raises ValueError: if building fails """ try: - request.replyChannel.reply = reply_channel + request.replyChannel.descriptor = reply_channel except Exception as e: raise ValueError("Error building reply channel portion of request.") from e diff --git a/smartsim/_core/mli/mli_schemas/request/request.capnp b/smartsim/_core/mli/mli_schemas/request/request.capnp index 6d290fb599..4be1cfa215 100644 --- a/smartsim/_core/mli/mli_schemas/request/request.capnp +++ b/smartsim/_core/mli/mli_schemas/request/request.capnp @@ -32,7 +32,7 @@ using DataRef = import "../data/data_references.capnp"; using Models = import "../model/model.capnp"; struct ChannelDescriptor { - reply @0 :Data; + descriptor @0 :Data; } struct Request { diff --git a/smartsim/_core/mli/mli_schemas/request/request_capnp.pyi b/smartsim/_core/mli/mli_schemas/request/request_capnp.pyi index 54dcdcfecc..a4ad631f9f 100644 --- a/smartsim/_core/mli/mli_schemas/request/request_capnp.pyi +++ b/smartsim/_core/mli/mli_schemas/request/request_capnp.pyi @@ -61,7 +61,7 @@ from .request_attributes.request_attributes_capnp import ( ) class ChannelDescriptor: - reply: bytes + descriptor: bytes @staticmethod @contextmanager def from_bytes( diff --git a/tests/test_message_handler/test_request.py b/tests/test_message_handler/test_request.py index 5a8a091d90..4cfc115845 100644 --- a/tests/test_message_handler/test_request.py +++ b/tests/test_message_handler/test_request.py @@ -144,7 +144,7 @@ def test_build_request_indirect_successful( custom_attributes, ) assert built_request is not None - assert built_request.replyChannel.reply == reply_channel + assert built_request.replyChannel.descriptor == reply_channel if built_request.model.which() == "key": assert built_request.model.key.key == 
model.key else: @@ -319,7 +319,7 @@ def test_build_request_direct_successful( custom_attributes, ) assert built_request is not None - assert built_request.replyChannel.reply == reply_channel + assert built_request.replyChannel.descriptor == reply_channel if built_request.model.which() == "key": assert built_request.model.key.key == model.key else: From dafb4df8c7a51921b3687262d46782dea840b7fa Mon Sep 17 00:00:00 2001 From: Alyssa Cote Date: Thu, 18 Jul 2024 12:20:53 -0500 Subject: [PATCH 40/40] style --- smartsim/_core/mli/comm/channel/dragonfli.py | 1 - 1 file changed, 1 deletion(-) diff --git a/smartsim/_core/mli/comm/channel/dragonfli.py b/smartsim/_core/mli/comm/channel/dragonfli.py index 7ad28307cd..28b4c2bf3b 100644 --- a/smartsim/_core/mli/comm/channel/dragonfli.py +++ b/smartsim/_core/mli/comm/channel/dragonfli.py @@ -70,4 +70,3 @@ def recv(self) -> t.List[bytes]: except fli.FLIEOT as exc: eot = True return messages -
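
Taken together, patches 36 through 40 converge on a single framing convention for requests sent over the FLI channel: the serialized request message goes first on a send handle, each tensor's raw bytes follow on the same stream, and the receiving side drains the stream until the FLI raises FLIEOT, so that recv() returns a list whose first element is the request and whose remaining elements are the tensor payloads (bytes_list[0] and bytes_list[1:] in the worker manager). The sketch below condenses that round trip into two helpers; it assumes a running Dragon runtime with an already-attached FLI and stream channel, and the helper and parameter names (send_request, recv_request, to_worker_fli, from_app_fli, stream_channel) are illustrative only, not part of the SmartSim API.

    from dragon import fli  # only importable inside a Dragon runtime


    def send_request(to_worker_fli, stream_channel, request_bytes, tensors):
        # Message 0 is the serialized capnp request; messages 1..N are raw
        # tensor payloads, mirroring run_model() in mock_app.py.
        with to_worker_fli.sendh(timeout=None, stream_channel=stream_channel) as sendh:
            sendh.send_bytes(request_bytes)
            for tensor in tensors:          # numpy arrays in the mock app
                sendh.send_bytes(tensor.tobytes())


    def recv_request(from_app_fli):
        # Counterpart to DragonFLIChannel.recv: collect every message on the
        # stream until the FLI signals end of transmission.
        messages = []
        eot = False
        with from_app_fli.recvh(timeout=None) as recvh:
            while not eot:
                try:
                    message, _ = recvh.recv_bytes(timeout=None)
                    messages.append(message)
                except fli.FLIEOT:
                    eot = True
        if not messages:
            raise ValueError("No request data found")
        return messages[0], messages[1:]  # request bytes, tensor payloads

This split, with only lightweight tensor descriptors (order, dtype, shape) carried inside the capnp request while the bulk tensor data moves as separate messages on the stream, appears to be the schema adjustment recorded in the "Adjust schemas for better performance" changelog entry above.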