From e98e2fe52a8614b1473d8f19847036afd8309445 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Tue, 25 Jun 2024 12:21:53 -0500 Subject: [PATCH 01/84] Initial FLI-based implementation --- .../_core/launcher/dragon/dragonBackend.py | 30 ++++- .../_core/mli/comm/channel/dragonchannel.py | 12 +- smartsim/_core/mli/comm/channel/dragonfli.py | 54 +++++++++ .../infrastructure/control/workermanager.py | 33 +++--- .../_core/mli/infrastructure/worker/worker.py | 106 ++++++++++++++---- smartsim/_core/mli/message_handler.py | 10 +- 6 files changed, 192 insertions(+), 53 deletions(-) create mode 100644 smartsim/_core/mli/comm/channel/dragonfli.py diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py index 2456606623..9ec4cc93e9 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -26,6 +26,7 @@ import collections import functools import itertools +import os import time import typing as t from dataclasses import dataclass, field @@ -38,10 +39,13 @@ # isort: off import dragon.infrastructure.connection as dragon_connection import dragon.infrastructure.policy as dragon_policy +from dragon.infrastructure.process_desc import ProcessOptions +from dragon.data.ddict.ddict import DDict import dragon.native.group_state as dragon_group_state import dragon.native.process as dragon_process import dragon.native.process_group as dragon_process_group import dragon.native.machine as dragon_machine +import multiprocessing as mp # pylint: enable=import-error # isort: on @@ -75,6 +79,9 @@ def __str__(self) -> str: return self.value +mp.set_start_method("dragon") + + @dataclass class ProcessGroupInfo: status: SmartSimStatus @@ -187,6 +194,7 @@ def __init__(self, pid: int) -> None: self._view = DragonBackendView(self) logger.debug(self._view.host_desc) + self._infra_ddict: t.Optional[DDict] = None @property def hosts(self) -> list[str]: @@ -391,6 +399,20 @@ def _stop_steps(self) -> None: self._group_infos[step_id].status = SmartSimStatus.STATUS_CANCELLED self._group_infos[step_id].return_codes = [-9] + @property + def infra_ddict(self) -> str: + """Create a Dragon distributed dictionary and return its + serialized descriptor + """ + if self._infra_ddict is None: + logger.info("Creating DDict") + self._infra_ddict = DDict() # todo: parametrize + logger.info("Created DDict") + self._infra_ddict["creation"] = str(time.time()) + logger.info(self._infra_ddict["creation"]) + + return self._infra_ddict.serialize() + def _start_steps(self) -> None: self._heartbeat() with self._queue_lock: @@ -406,6 +428,7 @@ def _start_steps(self) -> None: placement=dragon_policy.Policy.Placement.HOST_NAME, host_name=hosts[0], ) + options = ProcessOptions(make_inf_channels=True) grp = dragon_process_group.ProcessGroup( restart=False, pmi_enabled=request.pmi_enabled, policy=global_policy ) @@ -421,10 +444,15 @@ def _start_steps(self) -> None: target=request.exe, args=request.exe_args, cwd=request.path, - env={**request.current_env, **request.env}, + env={ + **request.current_env, + **request.env, + "SS_DRG_DDICT": self.infra_ddict, + }, stdout=dragon_process.Popen.PIPE, stderr=dragon_process.Popen.PIPE, policy=local_policy, + options=options, ) grp.add_process(nproc=request.tasks_per_node, template=tmp_proc) diff --git a/smartsim/_core/mli/comm/channel/dragonchannel.py b/smartsim/_core/mli/comm/channel/dragonchannel.py index 4fd26861ca..d4dbfa3ba0 100644 --- a/smartsim/_core/mli/comm/channel/dragonchannel.py +++ 
b/smartsim/_core/mli/comm/channel/dragonchannel.py @@ -24,16 +24,13 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import typing as t import smartsim._core.mli.comm.channel.channel as cch from smartsim.log import get_logger logger = get_logger(__name__) -if t.TYPE_CHECKING: - import dragon.channels as dch - import dragon.utils as du +import dragon.channels as dch class DragonCommChannel(cch.CommChannelBase): @@ -42,11 +39,10 @@ class DragonCommChannel(cch.CommChannelBase): def __init__(self, key: bytes) -> None: """Initialize the DragonCommChannel instance""" super().__init__(key) - # todo: do we need memory pool information to construct the channel correctly? - self._channel: "dch.Channel" = du.get_channel(key) + self._channel: dch.Channel = dch.Channel.attach(key) def send(self, value: bytes) -> None: """Send a message through the underlying communication channel :param value: The value to send""" - logger.debug(f"Channel {self.descriptor.decode('utf-8')} sending message") - self._channel.send_bytes(value) + with self._channel.sendh(timeout=None) as sendh: + sendh.send_bytes(value) diff --git a/smartsim/_core/mli/comm/channel/dragonfli.py b/smartsim/_core/mli/comm/channel/dragonfli.py new file mode 100644 index 0000000000..f601bb2eb8 --- /dev/null +++ b/smartsim/_core/mli/comm/channel/dragonfli.py @@ -0,0 +1,54 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# isort: off +import dragon +from dragon import fli +import dragon.channels as dch + +# isort: on + + +import smartsim._core.mli.comm.channel.channel as cch +from smartsim.log import get_logger + +logger = get_logger(__name__) + + +class DragonFLIChannel(cch.CommChannelBase): + """Passes messages by writing to a Dragon FLI Channel""" + + def __init__(self, fli_desc: bytes) -> None: + """Initialize the DragonFLIChannel instance""" + super().__init__(fli_desc) + # todo: do we need memory pool information to construct the channel correctly?
+ self._channel: "dch.Channel" = fli.FLInterface.attach(fli_desc) + + def send(self, value: bytes) -> None: + """Send a message through the underlying communication channel + :param value: The value to send""" + with self._channel.sendh(timeout=None) as sendh: + sendh.send_bytes(value) diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py index b3b79f7f30..588dc8e28d 100644 --- a/smartsim/_core/mli/infrastructure/control/workermanager.py +++ b/smartsim/_core/mli/infrastructure/control/workermanager.py @@ -24,14 +24,19 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import multiprocessing as mp +# isort: off +import dragon +from dragon import fli + +# isort: on +import time import typing as t import numpy as np from smartsim._core.entrypoints.service import Service from smartsim._core.mli.comm.channel.channel import CommChannelBase -from smartsim._core.mli.comm.channel.dragonchannel import DragonCommChannel +from smartsim._core.mli.comm.channel.dragonfli import DragonFLIChannel from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStore from smartsim._core.mli.infrastructure.worker.worker import ( InferenceReply, @@ -84,12 +89,6 @@ def deserialize_message( None # these will really be tensors already ) - # # client example - # msg = Message() - # t = torch.Tensor() - # msg.inputs = [custom_byte_converter(t)] - # mli_client.request_inference(msg) - # # end client input_meta: t.List[t.Any] = [] if request.input.which() == "inputKeys": @@ -163,12 +162,12 @@ class WorkerManager(Service): def __init__( self, - task_queue: "mp.Queue[bytes]", + file_like_interface: fli.FLInterface, worker: MachineLearningWorkerBase, feature_store: t.Optional[FeatureStore] = None, as_service: bool = False, cooldown: int = 0, - comm_channel_type: t.Type[CommChannelBase] = DragonCommChannel, + comm_channel_type: t.Type[CommChannelBase] = DragonFLIChannel, ) -> None: """Initialize the WorkerManager :param task_queue: The queue to monitor for new tasks @@ -182,7 +181,7 @@ def __init__( super().__init__(as_service, cooldown) """a collection of workers the manager is controlling""" - self._task_queue: "mp.Queue[bytes]" = task_queue + self._task_queue: fli.FLInterface = file_like_interface """the queue the manager monitors for new tasks""" self._feature_store: t.Optional[FeatureStore] = feature_store """a feature store to retrieve models from""" @@ -232,7 +231,12 @@ def _on_iteration(self) -> None: return # perform default deserialization of the message envelope - request_bytes: bytes = self._task_queue.get() + # perform default deserialization of the message envelope + with self._task_queue.recvh(timeout=None) as recvh: + try: + request_bytes, _ = recvh.recv_bytes(timeout=None) + except fli.FLIEOT as exc: + return request = deserialize_message(request_bytes, self._comm_channel_type) if not self._validate_request(request): return @@ -246,17 +250,12 @@ def _on_iteration(self) -> None: fetch_input_result = self._worker.fetch_inputs(request, self._feature_store) transformed_input = self._worker.transform_input(request, fetch_input_result) - # batch: t.Collection[_Datum] = transform_result.transformed_input - # if self._batch_size: - # batch = self._worker.batch_requests(transform_result, self._batch_size) - reply = InferenceReply() try: execute_result = self._worker.execute( request, model_result, transformed_input ) -
transformed_output = self._worker.transform_output(request, execute_result) if request.output_keys: diff --git a/smartsim/_core/mli/infrastructure/worker/worker.py b/smartsim/_core/mli/infrastructure/worker/worker.py index 99b51e178d..8992b2b6ea 100644 --- a/smartsim/_core/mli/infrastructure/worker/worker.py +++ b/smartsim/_core/mli/infrastructure/worker/worker.py @@ -24,12 +24,17 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +import io import typing as t from abc import ABC, abstractmethod +import numpy as np +import torch + import smartsim.error as sse from smartsim._core.mli.comm.channel.channel import CommChannelBase from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStore +from smartsim._core.mli.mli_schemas.tensor import tensor_capnp from smartsim.log import get_logger logger = get_logger(__name__) @@ -106,9 +111,10 @@ def __init__(self, result: t.Any) -> None: class FetchInputResult: """A wrapper around fetched inputs""" - def __init__(self, result: t.List[bytes]) -> None: + def __init__(self, result: t.List[bytes], meta: t.List[t.Any]) -> None: """Initialize the object""" self.inputs = result + self.meta = meta class TransformOutputResult: @@ -122,7 +128,6 @@ def __init__( self.shape = shape self.order = order self.dtype = dtype - # todo: determine if each output must have an individual (shape, order, dtype) class CreateInputBatchResult: @@ -152,8 +157,6 @@ def fetch_model( :param request: The request that triggered the pipeline :param feature_store: The feature store used for persistence :return: Raw bytes of the model""" - if not feature_store: - raise ValueError("Feature store is required for model retrieval") if request.raw_model: # Should we cache model in the feature store? 
@@ -162,6 +165,9 @@ def fetch_model( # short-circuit and return the directly supplied model return FetchModelResult(request.raw_model) + if not feature_store: + raise ValueError("Feature store is required for model retrieval") + if not request.model_key: raise sse.SmartSimError( "Key must be provided to retrieve model from feature store" @@ -185,8 +191,12 @@ def fetch_inputs( :param request: The request that triggered the pipeline :param feature_store: The feature store used for persistence :return: the fetched input""" + + if request.raw_inputs: + return FetchInputResult(request.raw_inputs, request.input_meta) + if not feature_store: - raise ValueError("Feature store is required for input retrieval") + raise ValueError("No input and no feature store provided") if request.input_keys: data: t.List[bytes] = [] @@ -201,9 +211,6 @@ def fetch_inputs( ) from ex return FetchInputResult(data) - if request.raw_inputs: - return FetchInputResult(request.raw_inputs) - raise ValueError("No input source") @staticmethod @@ -250,14 +257,6 @@ class MachineLearningWorkerBase(MachineLearningWorkerCore, ABC): """Abstrct base class providing contract for a machine learning worker implementation.""" - # @staticmethod - # @abstractmethod - # def deserialize(request: InferenceRequest) -> InferenceRequest: - # """Given a collection of data serialized to bytes, convert the bytes - # to a proper representation used by the ML backend - # :param data_blob: inference request as a byte-serialized blob - # :return: InferenceRequest deserialized from the input""" - @staticmethod @abstractmethod def load_model( @@ -303,11 +302,70 @@ def transform_output( :param execute_result: The result of inference wrapped in an ExecuteResult :return:""" - # @staticmethod - # @abstractmethod - # def serialize_reply( - # request: InferenceRequest, results: OutputTransformResult - # ) -> bytes: - # """Given an output, serialize to bytes for transport - # :param reply: The result of the inference pipeline - # :return: a byte-serialized version of the reply""" + +class TorchWorker(MachineLearningWorkerBase): + """A worker that executes a PyTorch model.""" + + @staticmethod + def load_model( + request: InferenceRequest, fetch_result: FetchModelResult + ) -> LoadModelResult: + model_bytes = fetch_result.model_bytes or request.raw_model + if not model_bytes: + raise ValueError("Unable to load model without reference object") + + _device_to_torch = {"cpu": "cpu", "gpu": "cuda"} + device = _device_to_torch[str(request.device)] + model: torch.nn.Module = torch.jit.load(io.BytesIO(model_bytes), map_location=device) # type: ignore[no-untyped-call] + result = LoadModelResult(model) + return result + + @staticmethod + def transform_input( + request: InferenceRequest, fetch_result: FetchInputResult + ) -> TransformInputResult: + result = [] + + _device_to_torch = {"cpu": "cpu", "gpu": "cuda"} + device = _device_to_torch[str(request.device)] + for item, item_meta in zip(fetch_result.inputs, fetch_result.meta): + td: tensor_capnp.TensorDescriptor = item_meta + result.append( + torch.tensor( + np.frombuffer(item, dtype=str(td.dataType)).reshape(td.dimensions) + ).to(device) + ) + return TransformInputResult(result) + # return data # note: this fails copy test! 
+ + @staticmethod + def execute( + request: InferenceRequest, + load_result: LoadModelResult, + transform_result: TransformInputResult, + ) -> ExecuteResult: + if not load_result.model: + raise sse.SmartSimError("Model must be loaded to execute") + + model: torch.nn.Module = load_result.model + model.eval() + results = [model(tensor).detach() for tensor in transform_result.transformed] + + execute_result = ExecuteResult(results) + return execute_result + + @staticmethod + def transform_output( + request: InferenceRequest, + execute_result: ExecuteResult, + ) -> TransformOutputResult: + if str(request.device) != "cpu": + transformed = [ + item.to("cpu").clone() for item in execute_result.predictions + ] + # todo: need the shape from latest schemas added here. + return TransformOutputResult(transformed, None, "c", "float32") # fixme + else: + return TransformOutputResult( + execute_result.predictions, None, "c", "float32" + ) # fixme diff --git a/smartsim/_core/mli/message_handler.py b/smartsim/_core/mli/message_handler.py index 733fa83d98..4a5725bd9e 100644 --- a/smartsim/_core/mli/message_handler.py +++ b/smartsim/_core/mli/message_handler.py @@ -391,7 +391,9 @@ def deserialize_request(request_bytes: t.ByteString) -> request_capnp.Request: :param request_bytes: Bytes to be deserialized into a Request """ - bytes_message = request_capnp.Request.from_bytes(request_bytes) + bytes_message = request_capnp.Request.from_bytes( + request_bytes, traversal_limit_in_words=2**63 + ) with bytes_message as message: return message @@ -484,7 +486,7 @@ def _assign_custom_response_attributes( response.customAttributes.tf = custom_attrs # type: ignore else: raise ValueError("""Invalid custom attribute class name. - Expected 'TensorFlowResponseAttributes' or + Expected 'TensorFlowResponseAttributes' or 'TorchResponseAttributes'.""") except Exception as e: raise ValueError("Error assigning custom attributes to response.") from e @@ -529,7 +531,9 @@ def deserialize_response(response_bytes: t.ByteString) -> response_capnp.Respons """ Deserializes a serialized response message. 
""" - bytes_message = response_capnp.Response.from_bytes(response_bytes) + bytes_message = response_capnp.Response.from_bytes( + response_bytes, traversal_limit_in_words=2**63 + ) with bytes_message as message: return message From 043f0e74e68ad07846ffce9a0013eb6cf1919c09 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Tue, 25 Jun 2024 13:42:44 -0500 Subject: [PATCH 02/84] Add inference example stub --- .../high_throughput_inference/mli_driver.py | 35 +++++ .../high_throughput_inference/mock_app.py | 129 ++++++++++++++++++ .../standalone_workermanager.py | 46 +++++++ 3 files changed, 210 insertions(+) create mode 100644 examples/high_throughput_inference/mli_driver.py create mode 100644 examples/high_throughput_inference/mock_app.py create mode 100644 examples/high_throughput_inference/standalone_workermanager.py diff --git a/examples/high_throughput_inference/mli_driver.py b/examples/high_throughput_inference/mli_driver.py new file mode 100644 index 0000000000..187a7b8214 --- /dev/null +++ b/examples/high_throughput_inference/mli_driver.py @@ -0,0 +1,35 @@ +import sys +from smartsim import Experiment +from smartsim.status import TERMINAL_STATUSES +import time + +worker_manager_script_name = "standalone_workermanager.py" +app_script_name = "mock_app.py" +device = "cpu" + + +exp = Experiment("MLI_proto", launcher="dragon") + +worker_manager_rs = exp.create_run_settings(sys.executable, [worker_manager_script_name]) +worker_manager = exp.create_model("worker_manager", run_settings=worker_manager_rs) +worker_manager.attach_generator_files(to_copy=[worker_manager_script_name]) + + +app_rs = exp.create_run_settings(sys.executable, exe_args = [app_script_name, "--device", device]) +app = exp.create_model("app", run_settings=app_rs) +app.attach_generator_files(to_copy=[app_script_name], to_symlink=[f"resnet50.{device.upper()}.pt"]) + + +exp.generate(worker_manager, app, overwrite=True) +exp.start(worker_manager, app, block=False) + +while True: + if exp.get_status(app)[0] in TERMINAL_STATUSES: + exp.stop(worker_manager) + break + if exp.get_status(worker_manager)[0] in TERMINAL_STATUSES: + exp.stop(app) + break + time.sleep(5) + +print("Exiting.") \ No newline at end of file diff --git a/examples/high_throughput_inference/mock_app.py b/examples/high_throughput_inference/mock_app.py new file mode 100644 index 0000000000..d6f8253b70 --- /dev/null +++ b/examples/high_throughput_inference/mock_app.py @@ -0,0 +1,129 @@ +# isort: off +import dragon +from dragon import fli +from dragon.channels import Channel +import dragon.channels +from dragon.data.ddict.ddict import DDict +from dragon.globalservices.api_setup import connect_to_infrastructure +from dragon.utils import b64decode, b64encode + +# isort: on + +import argparse +import io +import numpy +import os +import tabulate +import time +import torch +import typing as t + +from smartsim._core.mli.message_handler import MessageHandler + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser("Mock application") + parser.add_argument("--device", default="cpu") + + args = parser.parse_args() + + connect_to_infrastructure() + ddict_str = os.environ["SS_DRG_DDICT"] + + ddict = DDict.attach(ddict_str) + + to_worker_fli_str = None + + while to_worker_fli_str is None: + try: + to_worker_fli_str = ddict["to_worker_fli"] + except Exception as e: + time.sleep(1) + + to_worker_fli = fli.FLInterface.attach(b64decode(to_worker_fli_str)) + + batch_size = 32 + model = torch.jit.load(f"resnet50.{args.device.upper()}.pt") + buffer = io.BytesIO() + batch = 
torch.randn((batch_size, 3, 224, 224), dtype=torch.float32) + scripted = torch.jit.trace(model, batch) + torch.jit.save(scripted, buffer) + + total_iterations = 10 + + headers=[ + "batch_size", + "build_tensor", + "build_request", + "serialize_request", + "send", + "receive", + "deserialize_response", + "deserialize_tensor", + ] + + print(",".join(headers)) + + for batch_size in [1, 8, 32, 64, 128]: + + timings = [] + for iteration_number in range(total_iterations + int(batch_size==1)): + + timings.append([batch_size]) + + batch = torch.randn((batch_size, 3, 224, 224), dtype=torch.float32) + + expected_device: t.Literal["cpu", "gpu"] = args.device.lower() + + start = time.perf_counter() + interm = start + built_tensor = MessageHandler.build_tensor( + batch.numpy(), "c", "float32", list(batch.shape) + ) + timings[-1].append(time.perf_counter() - interm) + interm = time.perf_counter() + + from_worker_ch = Channel.make_process_local() + + request = MessageHandler.build_request( + reply_channel=from_worker_ch.serialize(), + model=buffer.getvalue(), + device=expected_device, + inputs=[built_tensor], + outputs=[], + output_descriptors=[], + custom_attributes=None, + ) + + timings[-1].append(time.perf_counter() - interm) + interm = time.perf_counter() + request_bytes = MessageHandler.serialize_request(request) + timings[-1].append(time.perf_counter() - interm) + interm = time.perf_counter() + with to_worker_fli.sendh(timeout=None) as to_sendh: + to_sendh.send_bytes(request_bytes) + timings[-1].append(time.perf_counter() - interm) + interm = time.perf_counter() + + with from_worker_ch.recvh(timeout=None) as from_recvh: + resp = from_recvh.recv_bytes(timeout=None) + timings[-1].append(time.perf_counter() - interm) + interm = time.perf_counter() + response = MessageHandler.deserialize_response(resp) + timings[-1].append(time.perf_counter() - interm) + interm = time.perf_counter() + result = torch.from_numpy( + numpy.frombuffer( + response.result.data[0].blob, + dtype=str(response.result.data[0].tensorDescriptor.dataType), + ) + ) + + timings[-1].append(time.perf_counter() - interm) + interm = time.perf_counter() + + # duration = time.perf_counter() - start + # print(f"{duration:.3f} s") + + print(",".join(str(timing) for timing in timings[-1])) diff --git a/examples/high_throughput_inference/standalone_workermanager.py b/examples/high_throughput_inference/standalone_workermanager.py new file mode 100644 index 0000000000..7ddeff0094 --- /dev/null +++ b/examples/high_throughput_inference/standalone_workermanager.py @@ -0,0 +1,46 @@ +# isort: off +import dragon +from dragon import fli +from dragon.channels import Channel +from dragon.data.ddict.ddict import DDict +from dragon.utils import b64decode, b64encode +from dragon.globalservices.api_setup import connect_to_infrastructure +# isort: on +import logging +import multiprocessing as mp +import os +import pathlib +import shutil +import time + + +from smartsim._core.mli.comm.channel.dragonfli import DragonFLIChannel +from smartsim._core.mli.infrastructure.worker.worker import TorchWorker +from smartsim._core.mli.infrastructure.control.workermanager import ( + DragonCommChannel, + WorkerManager, +) + +if __name__ == "__main__": + connect_to_infrastructure() + mp.set_start_method("dragon") + ddict_str = os.environ["SS_DRG_DDICT"] + ddict = DDict.attach(ddict_str) + + to_worker_channel = Channel.make_process_local() + to_worker_manager_channel = Channel.make_process_local() + channels = [Channel.make_process_local() for _ in range(100)] + to_worker_fli = 
fli.FLInterface(main_ch=to_worker_channel, manager_ch=to_worker_manager_channel, stream_channels=channels) + ddict["to_worker_fli"] = b64encode(to_worker_fli.serialize()) + + torch_worker = TorchWorker() + + worker_manager = WorkerManager( + file_like_interface=to_worker_fli, + worker=torch_worker, + feature_store=None, + as_service=True, + cooldown=10, + comm_channel_type=DragonCommChannel, + ) + worker_manager.execute() From efc9e839d2c317a49662776b710993e43c88f75c Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Tue, 25 Jun 2024 17:09:50 -0500 Subject: [PATCH 03/84] Lint, style, black magic --- .../high_throughput_inference/mli_driver.py | 2 +- .../standalone_workermanager.py | 3 +- .../_core/launcher/dragon/dragonBackend.py | 3 +- .../_core/mli/infrastructure/worker/worker.py | 30 +++++++++++-------- 4 files changed, 21 insertions(+), 17 deletions(-) diff --git a/examples/high_throughput_inference/mli_driver.py b/examples/high_throughput_inference/mli_driver.py index 187a7b8214..833766cbef 100644 --- a/examples/high_throughput_inference/mli_driver.py +++ b/examples/high_throughput_inference/mli_driver.py @@ -5,7 +5,7 @@ worker_manager_script_name = "standalone_workermanager.py" app_script_name = "mock_app.py" -device = "cpu" +device = "gpu" exp = Experiment("MLI_proto", launcher="dragon") diff --git a/examples/high_throughput_inference/standalone_workermanager.py b/examples/high_throughput_inference/standalone_workermanager.py index 7ddeff0094..bb93c613ce 100644 --- a/examples/high_throughput_inference/standalone_workermanager.py +++ b/examples/high_throughput_inference/standalone_workermanager.py @@ -14,10 +14,9 @@ import time -from smartsim._core.mli.comm.channel.dragonfli import DragonFLIChannel +from smartsim._core.mli.comm.channel.dragonchannel import DragonCommChannel from smartsim._core.mli.infrastructure.worker.worker import TorchWorker from smartsim._core.mli.infrastructure.control.workermanager import ( - DragonCommChannel, WorkerManager, ) diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py index 9ec4cc93e9..d103579115 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -26,7 +26,6 @@ import collections import functools import itertools -import os import time import typing as t from dataclasses import dataclass, field @@ -411,7 +410,7 @@ def infra_ddict(self) -> str: self._infra_ddict["creation"] = str(time.time()) logger.info(self._infra_ddict["creation"]) - return self._infra_ddict.serialize() + return str(self._infra_ddict.serialize()) def _start_steps(self) -> None: self._heartbeat() diff --git a/smartsim/_core/mli/infrastructure/worker/worker.py b/smartsim/_core/mli/infrastructure/worker/worker.py index 8992b2b6ea..295b2573c8 100644 --- a/smartsim/_core/mli/infrastructure/worker/worker.py +++ b/smartsim/_core/mli/infrastructure/worker/worker.py @@ -111,7 +111,7 @@ def __init__(self, result: t.Any) -> None: class FetchInputResult: """A wrapper around fetched inputs""" - def __init__(self, result: t.List[bytes], meta: t.List[t.Any]) -> None: + def __init__(self, result: t.List[bytes], meta: t.Optional[t.List[t.Any]]) -> None: """Initialize the object""" self.inputs = result self.meta = meta @@ -121,7 +121,7 @@ class TransformOutputResult: """A wrapper around inference results transformed for transmission""" def __init__( - self, result: t.Any, shape: t.List[int], order: str, dtype: str + self, result: t.Any, shape: t.Optional[t.List[int]], order: 
str, dtype: str ) -> None: """Initialize the OutputTransformResult""" self.outputs = result @@ -209,7 +209,9 @@ def fetch_inputs( raise sse.SmartSimError( f"Model could not be retrieved with key {input_}" ) from ex - return FetchInputResult(data) + return FetchInputResult( + data, None + ) # fixme: need to get both tensor and descriptor raise ValueError("No input source") @@ -316,7 +318,9 @@ def load_model( _device_to_torch = {"cpu": "cpu", "gpu": "cuda"} device = _device_to_torch[str(request.device)] - model: torch.nn.Module = torch.jit.load(io.BytesIO(model_bytes), map_location=device) # type: ignore[no-untyped-call] + buffer = io.BytesIO(model_bytes) + # type: ignore-next[no-untyped-call] + model = torch.jit.load(buffer, map_location=device) result = LoadModelResult(model) return result @@ -328,12 +332,14 @@ def transform_input( _device_to_torch = {"cpu": "cpu", "gpu": "cuda"} device = _device_to_torch[str(request.device)] + if fetch_result.meta is None: + raise ValueError("Cannot reconstruct tensor without meta information") for item, item_meta in zip(fetch_result.inputs, fetch_result.meta): - td: tensor_capnp.TensorDescriptor = item_meta + tensor_desc: tensor_capnp.TensorDescriptor = item_meta result.append( - torch.tensor( - np.frombuffer(item, dtype=str(td.dataType)).reshape(td.dimensions) - ).to(device) + torch.tensor(np.frombuffer(item, dtype=str(tensor_desc.dataType))) + .to(device) + .reshape(tuple(dim for dim in tensor_desc.dimensions)) ) return TransformInputResult(result) # return data # note: this fails copy test! @@ -365,7 +371,7 @@ def transform_output( ] # todo: need the shape from latest schemas added here. return TransformOutputResult(transformed, None, "c", "float32") # fixme - else: - return TransformOutputResult( - execute_result.predictions, None, "c", "float32" - ) # fixme + + return TransformOutputResult( + execute_result.predictions, None, "c", "float32" + ) # fixme From ed3c42a10b812963e2de28c6e89918dfe0efbc07 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Tue, 25 Jun 2024 18:07:56 -0500 Subject: [PATCH 04/84] Bring up to feature branch --- .../infrastructure/control/workermanager.py | 24 +++++++++++++++---- .../_core/mli/infrastructure/worker/worker.py | 24 ++++++++++--------- 2 files changed, 32 insertions(+), 16 deletions(-) diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py index 67b1627bb5..f46ced8756 100644 --- a/smartsim/_core/mli/infrastructure/control/workermanager.py +++ b/smartsim/_core/mli/infrastructure/control/workermanager.py @@ -54,7 +54,9 @@ def deserialize_message( - data_blob: bytes, channel_type: t.Type[CommChannelBase] + data_blob: bytes, + channel_type: t.Type[CommChannelBase], + device: t.Literal["cpu", "gpu"], ) -> InferenceRequest: """Deserialize a message from a byte stream into an InferenceRequest :param data_blob: The byte stream to deserialize""" @@ -166,6 +168,7 @@ def __init__( as_service: bool = False, cooldown: int = 0, comm_channel_type: t.Type[CommChannelBase] = DragonFLIChannel, + device: t.Literal["cpu", "gpu"] = "cpu", ) -> None: """Initialize the WorkerManager :param task_queue: The queue to monitor for new tasks @@ -187,6 +190,8 @@ def __init__( """The ML Worker implementation""" self._comm_channel_type = comm_channel_type """The type of communication channel to construct for callbacks""" + self._device = device + """Device on which workers need to run""" def _validate_request(self, request: InferenceRequest) -> bool: """Ensure the request 
can be processed. @@ -236,17 +241,24 @@ def _on_iteration(self) -> None: except fli.FLIEOT as exc: return - request = deserialize_message(request_bytes, self._comm_channel_type) + request = deserialize_message( + request_bytes, self._comm_channel_type, self._device + ) if not self._validate_request(request): return + # # let the worker perform additional custom deserialization # request = self._worker.deserialize(request_bytes) fetch_model_result = self._worker.fetch_model(request, self._feature_store) - model_result = self._worker.load_model(request, fetch_model_result) + model_result = self._worker.load_model( + request, fetch_model_result, self._device + ) fetch_input_result = self._worker.fetch_inputs(request, self._feature_store) - transformed_input = self._worker.transform_input(request, fetch_input_result) + transformed_input = self._worker.transform_input( + request, fetch_input_result, self._device + ) reply = InferenceReply() @@ -254,7 +266,9 @@ def _on_iteration(self) -> None: execute_result = self._worker.execute( request, model_result, transformed_input ) - transformed_output = self._worker.transform_output(request, execute_result) + transformed_output = self._worker.transform_output( + request, execute_result, self._device + ) if request.output_keys: reply.output_keys = self._worker.place_output( diff --git a/smartsim/_core/mli/infrastructure/worker/worker.py b/smartsim/_core/mli/infrastructure/worker/worker.py index 9b813a9e9b..08c4997554 100644 --- a/smartsim/_core/mli/infrastructure/worker/worker.py +++ b/smartsim/_core/mli/infrastructure/worker/worker.py @@ -260,21 +260,23 @@ class MachineLearningWorkerBase(MachineLearningWorkerCore, ABC): @staticmethod @abstractmethod def load_model( - request: InferenceRequest, fetch_result: FetchModelResult + request: InferenceRequest, fetch_result: FetchModelResult, device: str ) -> LoadModelResult: """Given a loaded MachineLearningModel, ensure it is loaded into device memory :param request: The request that triggered the pipeline + :param device: The device on which the model must be placed :return: ModelLoadResult wrapping the model loaded for the request""" @staticmethod @abstractmethod def transform_input( - request: InferenceRequest, fetch_result: FetchInputResult + request: InferenceRequest, fetch_result: FetchInputResult, device: str ) -> TransformInputResult: """Given a collection of data, perform a transformation on the data :param request: The request that triggered the pipeline :param fetch_result: Raw output from fetching inputs out of a feature store + :param device: The device on which the transformed input must be placed :return: The transformed inputs wrapped in a InputTransformResult""" @staticmethod @@ -293,13 +295,13 @@ def execute( @staticmethod @abstractmethod def transform_output( - request: InferenceRequest, - execute_result: ExecuteResult, + request: InferenceRequest, execute_result: ExecuteResult, result_device: str ) -> TransformOutputResult: """Given inference results, perform transformations required to transmit results to the requestor. 
:param request: The request that triggered the pipeline :param execute_result: The result of inference wrapped in an ExecuteResult + :param result_device: The device on which the result of inference is placed :return:""" @@ -308,28 +310,27 @@ class TorchWorker(MachineLearningWorkerBase): @staticmethod def load_model( - request: InferenceRequest, fetch_result: FetchModelResult + request: InferenceRequest, fetch_result: FetchModelResult, device: str ) -> LoadModelResult: model_bytes = fetch_result.model_bytes or request.raw_model if not model_bytes: raise ValueError("Unable to load model without reference object") _device_to_torch = {"cpu": "cpu", "gpu": "cuda"} - device = _device_to_torch[str(request.device)] + device = _device_to_torch[device] buffer = io.BytesIO(model_bytes) - # type: ignore-next[no-untyped-call] - model = torch.jit.load(buffer, map_location=device) + model = torch.jit.load(buffer, map_location=device) # type: ignore result = LoadModelResult(model) return result @staticmethod def transform_input( - request: InferenceRequest, fetch_result: FetchInputResult + request: InferenceRequest, fetch_result: FetchInputResult, device: str ) -> TransformInputResult: result = [] _device_to_torch = {"cpu": "cpu", "gpu": "cuda"} - device = _device_to_torch[str(request.device)] + device = _device_to_torch[device] if fetch_result.meta is None: raise ValueError("Cannot reconstruct tensor without meta information") for item, item_meta in zip(fetch_result.inputs, fetch_result.meta): @@ -362,8 +363,9 @@ def execute( def transform_output( request: InferenceRequest, execute_result: ExecuteResult, + result_device: str, ) -> TransformOutputResult: - if str(request.device) != "cpu": + if result_device != "cpu": transformed = [ item.to("cpu").clone() for item in execute_result.predictions ] From e5be26bdcd8d55e6b3b9669fa9bd5492ffd89390 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Tue, 25 Jun 2024 18:08:14 -0500 Subject: [PATCH 05/84] Update example --- examples/high_throughput_inference/mli_driver.py | 13 ++++++++----- examples/high_throughput_inference/mock_app.py | 3 --- .../standalone_workermanager.py | 11 +++++------ 3 files changed, 13 insertions(+), 14 deletions(-) diff --git a/examples/high_throughput_inference/mli_driver.py b/examples/high_throughput_inference/mli_driver.py index 833766cbef..d32d88e51b 100644 --- a/examples/high_throughput_inference/mli_driver.py +++ b/examples/high_throughput_inference/mli_driver.py @@ -1,23 +1,26 @@ +import os import sys from smartsim import Experiment from smartsim.status import TERMINAL_STATUSES import time -worker_manager_script_name = "standalone_workermanager.py" -app_script_name = "mock_app.py" device = "gpu" +filedir = os.path.dirname(__file__) +worker_manager_script_name = os.path.join(filedir, "standalone_workermanager.py") +app_script_name = os.path.join(filedir, "mock_app.py") +model_name = os.path.join(filedir, f"resnet50.{device.upper()}.pt") -exp = Experiment("MLI_proto", launcher="dragon") +exp = Experiment("MLI_proto", launcher="dragon", exp_path=os.path.join(filedir, "MLI_proto")) -worker_manager_rs = exp.create_run_settings(sys.executable, [worker_manager_script_name]) +worker_manager_rs = exp.create_run_settings(sys.executable, [worker_manager_script_name, "--device", device]) worker_manager = exp.create_model("worker_manager", run_settings=worker_manager_rs) worker_manager.attach_generator_files(to_copy=[worker_manager_script_name]) app_rs = exp.create_run_settings(sys.executable, exe_args = [app_script_name, "--device", device]) app 
= exp.create_model("app", run_settings=app_rs) -app.attach_generator_files(to_copy=[app_script_name], to_symlink=[f"resnet50.{device.upper()}.pt"]) +app.attach_generator_files(to_copy=[app_script_name], to_symlink=[model_name]) exp.generate(worker_manager, app, overwrite=True) diff --git a/examples/high_throughput_inference/mock_app.py b/examples/high_throughput_inference/mock_app.py index d6f8253b70..afc0c836b8 100644 --- a/examples/high_throughput_inference/mock_app.py +++ b/examples/high_throughput_inference/mock_app.py @@ -74,8 +74,6 @@ batch = torch.randn((batch_size, 3, 224, 224), dtype=torch.float32) - expected_device: t.Literal["cpu", "gpu"] = args.device.lower() - start = time.perf_counter() interm = start built_tensor = MessageHandler.build_tensor( @@ -89,7 +87,6 @@ request = MessageHandler.build_request( reply_channel=from_worker_ch.serialize(), model=buffer.getvalue(), - device=expected_device, inputs=[built_tensor], outputs=[], output_descriptors=[], diff --git a/examples/high_throughput_inference/standalone_workermanager.py b/examples/high_throughput_inference/standalone_workermanager.py index bb93c613ce..32d534f360 100644 --- a/examples/high_throughput_inference/standalone_workermanager.py +++ b/examples/high_throughput_inference/standalone_workermanager.py @@ -6,12 +6,8 @@ from dragon.utils import b64decode, b64encode from dragon.globalservices.api_setup import connect_to_infrastructure # isort: on -import logging -import multiprocessing as mp +import argparse import os -import pathlib -import shutil -import time from smartsim._core.mli.comm.channel.dragonchannel import DragonCommChannel @@ -21,8 +17,10 @@ ) if __name__ == "__main__": + parser = argparse.ArgumentParser("Worker Manager") + parser.add_argument("--device", default="gpu") + args = parser.parse_args() connect_to_infrastructure() - mp.set_start_method("dragon") ddict_str = os.environ["SS_DRG_DDICT"] ddict = DDict.attach(ddict_str) @@ -41,5 +39,6 @@ as_service=True, cooldown=10, comm_channel_type=DragonCommChannel, + device = args.device, ) worker_manager.execute() From a23010fb9726e4c18997bee279a0553bbaa473f0 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Tue, 25 Jun 2024 18:17:30 -0500 Subject: [PATCH 06/84] Change the changelog --- doc/changelog.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/changelog.md b/doc/changelog.md index e86c93de66..d146d1973a 100644 --- a/doc/changelog.md +++ b/doc/changelog.md @@ -17,7 +17,7 @@ Description - Added schemas and MessageHandler class for de/serialization of inference requests and response messages - Removed device from schemas, MessageHandler and tests - +- Add TorchWorker first implementation and mock inference app example ### Development branch From 3c20f464d512c7b3a1ead1981efb96842e7a14bb Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Tue, 25 Jun 2024 18:38:12 -0500 Subject: [PATCH 07/84] Make style --- smartsim/_core/mli/infrastructure/control/workermanager.py | 1 - 1 file changed, 1 deletion(-) diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py index f46ced8756..7a5f168fe4 100644 --- a/smartsim/_core/mli/infrastructure/control/workermanager.py +++ b/smartsim/_core/mli/infrastructure/control/workermanager.py @@ -247,7 +247,6 @@ def _on_iteration(self) -> None: if not self._validate_request(request): return - # # let the worker perform additional custom deserialization # request = self._worker.deserialize(request_bytes) From 
b9ed5ba8baa9fc355640f8c2461a0ce7d16cf56b Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Wed, 26 Jun 2024 09:51:07 -0500 Subject: [PATCH 08/84] Attempt to mitigate import dragon error --- .../_core/mli/infrastructure/control/workermanager.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py index 7a5f168fe4..607f94982d 100644 --- a/smartsim/_core/mli/infrastructure/control/workermanager.py +++ b/smartsim/_core/mli/infrastructure/control/workermanager.py @@ -24,9 +24,15 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +import sys + # isort: off -import dragon -from dragon import fli +try: + import dragon + from dragon import fli +except ImportError as exc: + if not "pytest" in sys.modules: + raise exc from None # isort: on import time From 0de06f3b6c0fa4747b471989a8068e4e609829a0 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Wed, 26 Jun 2024 10:20:27 -0500 Subject: [PATCH 09/84] Import dragon optional --- smartsim/_core/mli/comm/channel/dragonchannel.py | 9 ++++++--- smartsim/_core/mli/comm/channel/dragonfli.py | 12 ++++++++---- .../mli/infrastructure/control/workermanager.py | 2 +- 3 files changed, 15 insertions(+), 8 deletions(-) diff --git a/smartsim/_core/mli/comm/channel/dragonchannel.py b/smartsim/_core/mli/comm/channel/dragonchannel.py index d4dbfa3ba0..e79fd2dfcf 100644 --- a/smartsim/_core/mli/comm/channel/dragonchannel.py +++ b/smartsim/_core/mli/comm/channel/dragonchannel.py @@ -24,14 +24,17 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - +import sys import smartsim._core.mli.comm.channel.channel as cch from smartsim.log import get_logger logger = get_logger(__name__) -import dragon.channels as dch - +try: + import dragon.channels as dch +except ImportError as exc: + if not "pytest" in sys.modules: + raise exc from None class DragonCommChannel(cch.CommChannelBase): """Passes messages by writing to a Dragon channel""" diff --git a/smartsim/_core/mli/comm/channel/dragonfli.py b/smartsim/_core/mli/comm/channel/dragonfli.py index f601bb2eb8..3992241380 100644 --- a/smartsim/_core/mli/comm/channel/dragonfli.py +++ b/smartsim/_core/mli/comm/channel/dragonfli.py @@ -24,11 +24,15 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-# isort: off -import dragon -from dragon import fli -import dragon.channels as dch +import sys +# isort: off +try: + from dragon import fli + import dragon.channels as dch +except ImportError as exc: + if not "pytest" in sys.modules: + raise exc from None # isort: on diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py index 607f94982d..6003869e46 100644 --- a/smartsim/_core/mli/infrastructure/control/workermanager.py +++ b/smartsim/_core/mli/infrastructure/control/workermanager.py @@ -168,7 +168,7 @@ class WorkerManager(Service): def __init__( self, - file_like_interface: fli.FLInterface, + file_like_interface: "fli.FLInterface", worker: MachineLearningWorkerBase, feature_store: t.Optional[FeatureStore] = None, as_service: bool = False, From d051385a963f2c18e55792b30316cd41eb2ca357 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Wed, 26 Jun 2024 10:28:23 -0500 Subject: [PATCH 10/84] isort --- smartsim/_core/mli/comm/channel/dragonchannel.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/smartsim/_core/mli/comm/channel/dragonchannel.py b/smartsim/_core/mli/comm/channel/dragonchannel.py index e79fd2dfcf..872eb32350 100644 --- a/smartsim/_core/mli/comm/channel/dragonchannel.py +++ b/smartsim/_core/mli/comm/channel/dragonchannel.py @@ -25,6 +25,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import sys + import smartsim._core.mli.comm.channel.channel as cch from smartsim.log import get_logger @@ -36,6 +37,7 @@ if not "pytest" in sys.modules: raise exc from None + class DragonCommChannel(cch.CommChannelBase): """Passes messages by writing to a Dragon channel""" From e77b1cd5c9c8359aa7be27b2a3d61c398eaa7d04 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Wed, 26 Jun 2024 11:33:47 -0500 Subject: [PATCH 11/84] Fix imports in dragon backend tests --- smartsim/_core/launcher/dragon/dragonBackend.py | 10 ++++------ tests/test_dragon_backend.py | 10 ++++++++++ 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py index d103579115..f0e450a19c 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -36,15 +36,14 @@ # pylint: disable=import-error # isort: off +import dragon.data.ddict.ddict as dragon_ddict import dragon.infrastructure.connection as dragon_connection import dragon.infrastructure.policy as dragon_policy -from dragon.infrastructure.process_desc import ProcessOptions -from dragon.data.ddict.ddict import DDict +import dragon.infrastructure.process_desc as dragon_process_desc import dragon.native.group_state as dragon_group_state import dragon.native.process as dragon_process import dragon.native.process_group as dragon_process_group import dragon.native.machine as dragon_machine -import multiprocessing as mp # pylint: enable=import-error # isort: on @@ -78,7 +77,6 @@ def __str__(self) -> str: return self.value -mp.set_start_method("dragon") @dataclass @@ -405,7 +403,7 @@ def infra_ddict(self) -> str: """ if self._infra_ddict is None: logger.info("Creating DDict") - self._infra_ddict = DDict() # todo: parametrize + self._infra_ddict = dragon_ddict.DDict() # todo: parametrize logger.info("Created DDict") self._infra_ddict["creation"] = str(time.time()) logger.info(self._infra_ddict["creation"]) @@ -427,7 +425,7 @@ def _start_steps(self) -> None: placement=dragon_policy.Policy.Placement.HOST_NAME, host_name=hosts[0], 
) - options = ProcessOptions(make_inf_channels=True) + options = dragon_process_desc.ProcessOptions(make_inf_channels=True) grp = dragon_process_group.ProcessGroup( restart=False, pmi_enabled=request.pmi_enabled, policy=global_policy ) diff --git a/tests/test_dragon_backend.py b/tests/test_dragon_backend.py index a510f660a5..f284f38d99 100644 --- a/tests/test_dragon_backend.py +++ b/tests/test_dragon_backend.py @@ -103,6 +103,16 @@ def get_mock_backend(monkeypatch: pytest.MonkeyPatch) -> "DragonBackend": "dragon.infrastructure.connection", MagicMock(), ) + monkeypatch.setitem( + sys.modules, + "dragon.infrastructure.process_desc", + MagicMock(), + ) + monkeypatch.setitem( + sys.modules, + "dragon.data.ddict.ddict", + MagicMock(), + ) monkeypatch.setitem( sys.modules, "dragon.infrastructure.policy", From a90888d44d3e9ef2207a97c6b0936418daf4d06c Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Wed, 26 Jun 2024 11:36:26 -0500 Subject: [PATCH 12/84] Style --- smartsim/_core/launcher/dragon/dragonBackend.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py index f0e450a19c..d91f73e3c5 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -77,8 +77,6 @@ def __str__(self) -> str: return self.value - - @dataclass class ProcessGroupInfo: status: SmartSimStatus From b4312215184478186e837ab193cc609fb53f4698 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Wed, 26 Jun 2024 11:40:14 -0500 Subject: [PATCH 13/84] Fix type --- smartsim/_core/launcher/dragon/dragonBackend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py index d91f73e3c5..52f69ec41f 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -189,7 +189,7 @@ def __init__(self, pid: int) -> None: self._view = DragonBackendView(self) logger.debug(self._view.host_desc) - self._infra_ddict: t.Optional[DDict] = None + self._infra_ddict: t.Optional[dragon_ddict.DDict] = None @property def hosts(self) -> list[str]: From 23efebc25027d908703e80e059a3c431d5f7d434 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Wed, 26 Jun 2024 12:38:55 -0500 Subject: [PATCH 14/84] Rename examples dir --- ex/high_throughput_inference/mli_driver.py | 38 ++++++ ex/high_throughput_inference/mock_app.py | 126 ++++++++++++++++++ .../standalone_workermanager.py | 44 ++++++ 3 files changed, 208 insertions(+) create mode 100644 ex/high_throughput_inference/mli_driver.py create mode 100644 ex/high_throughput_inference/mock_app.py create mode 100644 ex/high_throughput_inference/standalone_workermanager.py diff --git a/ex/high_throughput_inference/mli_driver.py b/ex/high_throughput_inference/mli_driver.py new file mode 100644 index 0000000000..7b8db5ed83 --- /dev/null +++ b/ex/high_throughput_inference/mli_driver.py @@ -0,0 +1,38 @@ +import os +import sys +from smartsim import Experiment +from smartsim.status import TERMINAL_STATUSES +import time + +device = "cpu" +filedir = os.path.dirname(__file__) +worker_manager_script_name = os.path.join(filedir, "standalone_workermanager.py") +app_script_name = os.path.join(filedir, "mock_app.py") +model_name = os.path.join(filedir, f"resnet50.{device.upper()}.pt") + + +exp = Experiment("MLI_proto", launcher="dragon", exp_path=os.path.join(filedir, "MLI_proto")) + +worker_manager_rs = 
exp.create_run_settings(sys.executable, [worker_manager_script_name, "--device", device]) +worker_manager = exp.create_model("worker_manager", run_settings=worker_manager_rs) +worker_manager.attach_generator_files(to_copy=[worker_manager_script_name]) + + +app_rs = exp.create_run_settings(sys.executable, exe_args = [app_script_name, "--device", device]) +app = exp.create_model("app", run_settings=app_rs) +app.attach_generator_files(to_copy=[app_script_name], to_symlink=[model_name]) + + +exp.generate(worker_manager, app, overwrite=True) +exp.start(worker_manager, app, block=False) + +while True: + if exp.get_status(app)[0] in TERMINAL_STATUSES: + exp.stop(worker_manager) + break + if exp.get_status(worker_manager)[0] in TERMINAL_STATUSES: + exp.stop(app) + break + time.sleep(5) + +print("Exiting.") \ No newline at end of file diff --git a/ex/high_throughput_inference/mock_app.py b/ex/high_throughput_inference/mock_app.py new file mode 100644 index 0000000000..afc0c836b8 --- /dev/null +++ b/ex/high_throughput_inference/mock_app.py @@ -0,0 +1,126 @@ +# isort: off +import dragon +from dragon import fli +from dragon.channels import Channel +import dragon.channels +from dragon.data.ddict.ddict import DDict +from dragon.globalservices.api_setup import connect_to_infrastructure +from dragon.utils import b64decode, b64encode + +# isort: on + +import argparse +import io +import numpy +import os +import tabulate +import time +import torch +import typing as t + +from smartsim._core.mli.message_handler import MessageHandler + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser("Mock application") + parser.add_argument("--device", default="cpu") + + args = parser.parse_args() + + connect_to_infrastructure() + ddict_str = os.environ["SS_DRG_DDICT"] + + ddict = DDict.attach(ddict_str) + + to_worker_fli_str = None + + while to_worker_fli_str is None: + try: + to_worker_fli_str = ddict["to_worker_fli"] + except Exception as e: + time.sleep(1) + + to_worker_fli = fli.FLInterface.attach(b64decode(to_worker_fli_str)) + + batch_size = 32 + model = torch.jit.load(f"resnet50.{args.device.upper()}.pt") + buffer = io.BytesIO() + batch = torch.randn((batch_size, 3, 224, 224), dtype=torch.float32) + scripted = torch.jit.trace(model, batch) + torch.jit.save(scripted, buffer) + + total_iterations = 10 + + headers=[ + "batch_size", + "build_tensor", + "build_request", + "serialize_request", + "send", + "receive", + "deserialize_response", + "deserialize_tensor", + ] + + print(",".join(headers)) + + for batch_size in [1, 8, 32, 64, 128]: + + timings = [] + for iteration_number in range(total_iterations + int(batch_size==1)): + + timings.append([batch_size]) + + batch = torch.randn((batch_size, 3, 224, 224), dtype=torch.float32) + + start = time.perf_counter() + interm = start + built_tensor = MessageHandler.build_tensor( + batch.numpy(), "c", "float32", list(batch.shape) + ) + timings[-1].append(time.perf_counter() - interm) + interm = time.perf_counter() + + from_worker_ch = Channel.make_process_local() + + request = MessageHandler.build_request( + reply_channel=from_worker_ch.serialize(), + model=buffer.getvalue(), + inputs=[built_tensor], + outputs=[], + output_descriptors=[], + custom_attributes=None, + ) + + timings[-1].append(time.perf_counter() - interm) + interm = time.perf_counter() + request_bytes = MessageHandler.serialize_request(request) + timings[-1].append(time.perf_counter() - interm) + interm = time.perf_counter() + with to_worker_fli.sendh(timeout=None) as to_sendh: + 
to_sendh.send_bytes(request_bytes) + timings[-1].append(time.perf_counter() - interm) + interm = time.perf_counter() + + with from_worker_ch.recvh(timeout=None) as from_recvh: + resp = from_recvh.recv_bytes(timeout=None) + timings[-1].append(time.perf_counter() - interm) + interm = time.perf_counter() + response = MessageHandler.deserialize_response(resp) + timings[-1].append(time.perf_counter() - interm) + interm = time.perf_counter() + result = torch.from_numpy( + numpy.frombuffer( + response.result.data[0].blob, + dtype=str(response.result.data[0].tensorDescriptor.dataType), + ) + ) + + timings[-1].append(time.perf_counter() - interm) + interm = time.perf_counter() + + # duration = time.perf_counter() - start + # print(f"{duration:.3f} s") + + print(",".join(str(timing) for timing in timings[-1])) diff --git a/ex/high_throughput_inference/standalone_workermanager.py b/ex/high_throughput_inference/standalone_workermanager.py new file mode 100644 index 0000000000..32d534f360 --- /dev/null +++ b/ex/high_throughput_inference/standalone_workermanager.py @@ -0,0 +1,44 @@ +# isort: off +import dragon +from dragon import fli +from dragon.channels import Channel +from dragon.data.ddict.ddict import DDict +from dragon.utils import b64decode, b64encode +from dragon.globalservices.api_setup import connect_to_infrastructure +# isort: on +import argparse +import os + + +from smartsim._core.mli.comm.channel.dragonchannel import DragonCommChannel +from smartsim._core.mli.infrastructure.worker.worker import TorchWorker +from smartsim._core.mli.infrastructure.control.workermanager import ( + WorkerManager, +) + +if __name__ == "__main__": + parser = argparse.ArgumentParser("Worker Manager") + parser.add_argument("--device", default="gpu") + args = parser.parse_args() + connect_to_infrastructure() + ddict_str = os.environ["SS_DRG_DDICT"] + ddict = DDict.attach(ddict_str) + + to_worker_channel = Channel.make_process_local() + to_worker_manager_channel = Channel.make_process_local() + channels = [Channel.make_process_local() for _ in range(100)] + to_worker_fli = fli.FLInterface(main_ch=to_worker_channel, manager_ch=to_worker_manager_channel, stream_channels=channels) + ddict["to_worker_fli"] = b64encode(to_worker_fli.serialize()) + + torch_worker = TorchWorker() + + worker_manager = WorkerManager( + file_like_interface=to_worker_fli, + worker=torch_worker, + feature_store=None, + as_service=True, + cooldown=10, + comm_channel_type=DragonCommChannel, + device = args.device, + ) + worker_manager.execute() From 09b9d249c5c2147a062f95356c943c4da8e534b9 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Wed, 26 Jun 2024 12:48:11 -0500 Subject: [PATCH 15/84] Remove old dir --- .../high_throughput_inference/mli_driver.py | 38 ------ .../high_throughput_inference/mock_app.py | 126 ------------------ .../standalone_workermanager.py | 44 ------ 3 files changed, 208 deletions(-) delete mode 100644 examples/high_throughput_inference/mli_driver.py delete mode 100644 examples/high_throughput_inference/mock_app.py delete mode 100644 examples/high_throughput_inference/standalone_workermanager.py diff --git a/examples/high_throughput_inference/mli_driver.py b/examples/high_throughput_inference/mli_driver.py deleted file mode 100644 index d32d88e51b..0000000000 --- a/examples/high_throughput_inference/mli_driver.py +++ /dev/null @@ -1,38 +0,0 @@ -import os -import sys -from smartsim import Experiment -from smartsim.status import TERMINAL_STATUSES -import time - -device = "gpu" -filedir = os.path.dirname(__file__) 
-worker_manager_script_name = os.path.join(filedir, "standalone_workermanager.py") -app_script_name = os.path.join(filedir, "mock_app.py") -model_name = os.path.join(filedir, f"resnet50.{device.upper()}.pt") - - -exp = Experiment("MLI_proto", launcher="dragon", exp_path=os.path.join(filedir, "MLI_proto")) - -worker_manager_rs = exp.create_run_settings(sys.executable, [worker_manager_script_name, "--device", device]) -worker_manager = exp.create_model("worker_manager", run_settings=worker_manager_rs) -worker_manager.attach_generator_files(to_copy=[worker_manager_script_name]) - - -app_rs = exp.create_run_settings(sys.executable, exe_args = [app_script_name, "--device", device]) -app = exp.create_model("app", run_settings=app_rs) -app.attach_generator_files(to_copy=[app_script_name], to_symlink=[model_name]) - - -exp.generate(worker_manager, app, overwrite=True) -exp.start(worker_manager, app, block=False) - -while True: - if exp.get_status(app)[0] in TERMINAL_STATUSES: - exp.stop(worker_manager) - break - if exp.get_status(worker_manager)[0] in TERMINAL_STATUSES: - exp.stop(app) - break - time.sleep(5) - -print("Exiting.") \ No newline at end of file diff --git a/examples/high_throughput_inference/mock_app.py b/examples/high_throughput_inference/mock_app.py deleted file mode 100644 index afc0c836b8..0000000000 --- a/examples/high_throughput_inference/mock_app.py +++ /dev/null @@ -1,126 +0,0 @@ -# isort: off -import dragon -from dragon import fli -from dragon.channels import Channel -import dragon.channels -from dragon.data.ddict.ddict import DDict -from dragon.globalservices.api_setup import connect_to_infrastructure -from dragon.utils import b64decode, b64encode - -# isort: on - -import argparse -import io -import numpy -import os -import tabulate -import time -import torch -import typing as t - -from smartsim._core.mli.message_handler import MessageHandler - - -if __name__ == "__main__": - - parser = argparse.ArgumentParser("Mock application") - parser.add_argument("--device", default="cpu") - - args = parser.parse_args() - - connect_to_infrastructure() - ddict_str = os.environ["SS_DRG_DDICT"] - - ddict = DDict.attach(ddict_str) - - to_worker_fli_str = None - - while to_worker_fli_str is None: - try: - to_worker_fli_str = ddict["to_worker_fli"] - except Exception as e: - time.sleep(1) - - to_worker_fli = fli.FLInterface.attach(b64decode(to_worker_fli_str)) - - batch_size = 32 - model = torch.jit.load(f"resnet50.{args.device.upper()}.pt") - buffer = io.BytesIO() - batch = torch.randn((batch_size, 3, 224, 224), dtype=torch.float32) - scripted = torch.jit.trace(model, batch) - torch.jit.save(scripted, buffer) - - total_iterations = 10 - - headers=[ - "batch_size", - "build_tensor", - "build_request", - "serialize_request", - "send", - "receive", - "deserialize_response", - "deserialize_tensor", - ] - - print(",".join(headers)) - - for batch_size in [1, 8, 32, 64, 128]: - - timings = [] - for iteration_number in range(total_iterations + int(batch_size==1)): - - timings.append([batch_size]) - - batch = torch.randn((batch_size, 3, 224, 224), dtype=torch.float32) - - start = time.perf_counter() - interm = start - built_tensor = MessageHandler.build_tensor( - batch.numpy(), "c", "float32", list(batch.shape) - ) - timings[-1].append(time.perf_counter() - interm) - interm = time.perf_counter() - - from_worker_ch = Channel.make_process_local() - - request = MessageHandler.build_request( - reply_channel=from_worker_ch.serialize(), - model=buffer.getvalue(), - inputs=[built_tensor], - outputs=[], - 
output_descriptors=[], - custom_attributes=None, - ) - - timings[-1].append(time.perf_counter() - interm) - interm = time.perf_counter() - request_bytes = MessageHandler.serialize_request(request) - timings[-1].append(time.perf_counter() - interm) - interm = time.perf_counter() - with to_worker_fli.sendh(timeout=None) as to_sendh: - to_sendh.send_bytes(request_bytes) - timings[-1].append(time.perf_counter() - interm) - interm = time.perf_counter() - - with from_worker_ch.recvh(timeout=None) as from_recvh: - resp = from_recvh.recv_bytes(timeout=None) - timings[-1].append(time.perf_counter() - interm) - interm = time.perf_counter() - response = MessageHandler.deserialize_response(resp) - timings[-1].append(time.perf_counter() - interm) - interm = time.perf_counter() - result = torch.from_numpy( - numpy.frombuffer( - response.result.data[0].blob, - dtype=str(response.result.data[0].tensorDescriptor.dataType), - ) - ) - - timings[-1].append(time.perf_counter() - interm) - interm = time.perf_counter() - - # duration = time.perf_counter() - start - # print(f"{duration:.3f} s") - - print(",".join(str(timing) for timing in timings[-1])) diff --git a/examples/high_throughput_inference/standalone_workermanager.py b/examples/high_throughput_inference/standalone_workermanager.py deleted file mode 100644 index 32d534f360..0000000000 --- a/examples/high_throughput_inference/standalone_workermanager.py +++ /dev/null @@ -1,44 +0,0 @@ -# isort: off -import dragon -from dragon import fli -from dragon.channels import Channel -from dragon.data.ddict.ddict import DDict -from dragon.utils import b64decode, b64encode -from dragon.globalservices.api_setup import connect_to_infrastructure -# isort: on -import argparse -import os - - -from smartsim._core.mli.comm.channel.dragonchannel import DragonCommChannel -from smartsim._core.mli.infrastructure.worker.worker import TorchWorker -from smartsim._core.mli.infrastructure.control.workermanager import ( - WorkerManager, -) - -if __name__ == "__main__": - parser = argparse.ArgumentParser("Worker Manager") - parser.add_argument("--device", default="gpu") - args = parser.parse_args() - connect_to_infrastructure() - ddict_str = os.environ["SS_DRG_DDICT"] - ddict = DDict.attach(ddict_str) - - to_worker_channel = Channel.make_process_local() - to_worker_manager_channel = Channel.make_process_local() - channels = [Channel.make_process_local() for _ in range(100)] - to_worker_fli = fli.FLInterface(main_ch=to_worker_channel, manager_ch=to_worker_manager_channel, stream_channels=channels) - ddict["to_worker_fli"] = b64encode(to_worker_fli.serialize()) - - torch_worker = TorchWorker() - - worker_manager = WorkerManager( - file_like_interface=to_worker_fli, - worker=torch_worker, - feature_store=None, - as_service=True, - cooldown=10, - comm_channel_type=DragonCommChannel, - device = args.device, - ) - worker_manager.execute() From 56d8e50f4f7e9fddb9e4d79ba0b1fe556e400684 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Wed, 26 Jun 2024 18:47:40 -0500 Subject: [PATCH 16/84] Add tests for torch worker --- ex/high_throughput_inference/mock_app.py | 5 +- .../standalone_workermanager.py | 2 +- .../mli/infrastructure/worker/torch_worker.py | 118 ++++++++++++ .../_core/mli/infrastructure/worker/worker.py | 91 +-------- tests/mli/test_torch_worker.py | 173 ++++++++++++++++++ tests/mli/test_worker_manager.py | 12 +- 6 files changed, 309 insertions(+), 92 deletions(-) create mode 100644 smartsim/_core/mli/infrastructure/worker/torch_worker.py create mode 100644 
tests/mli/test_torch_worker.py diff --git a/ex/high_throughput_inference/mock_app.py b/ex/high_throughput_inference/mock_app.py index afc0c836b8..d22792d15b 100644 --- a/ex/high_throughput_inference/mock_app.py +++ b/ex/high_throughput_inference/mock_app.py @@ -5,7 +5,7 @@ import dragon.channels from dragon.data.ddict.ddict import DDict from dragon.globalservices.api_setup import connect_to_infrastructure -from dragon.utils import b64decode, b64encode +from dragon.utils import b64decode # isort: on @@ -13,11 +13,8 @@ import io import numpy import os -import tabulate import time import torch -import typing as t - from smartsim._core.mli.message_handler import MessageHandler diff --git a/ex/high_throughput_inference/standalone_workermanager.py b/ex/high_throughput_inference/standalone_workermanager.py index 32d534f360..40fefcc372 100644 --- a/ex/high_throughput_inference/standalone_workermanager.py +++ b/ex/high_throughput_inference/standalone_workermanager.py @@ -11,7 +11,7 @@ from smartsim._core.mli.comm.channel.dragonchannel import DragonCommChannel -from smartsim._core.mli.infrastructure.worker.worker import TorchWorker +from smartsim._core.mli.infrastructure.worker.torch_worker import TorchWorker from smartsim._core.mli.infrastructure.control.workermanager import ( WorkerManager, ) diff --git a/smartsim/_core/mli/infrastructure/worker/torch_worker.py b/smartsim/_core/mli/infrastructure/worker/torch_worker.py new file mode 100644 index 0000000000..c350499c20 --- /dev/null +++ b/smartsim/_core/mli/infrastructure/worker/torch_worker.py @@ -0,0 +1,118 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import io + +import numpy as np +import torch + +from .....error import SmartSimError +from .....log import get_logger +from ...mli_schemas.tensor import tensor_capnp +from .worker import ( + ExecuteResult, + FetchInputResult, + FetchModelResult, + InferenceRequest, + LoadModelResult, + MachineLearningWorkerBase, + TransformInputResult, + TransformOutputResult, +) + +logger = get_logger(__name__) + + +class TorchWorker(MachineLearningWorkerBase): + """A worker that executes a PyTorch model.""" + + @staticmethod + def load_model( + request: InferenceRequest, fetch_result: FetchModelResult, device: str + ) -> LoadModelResult: + model_bytes = fetch_result.model_bytes or request.raw_model + if not model_bytes: + raise ValueError("Unable to load model without reference object") + + _device_to_torch = {"cpu": "cpu", "gpu": "cuda"} + device = _device_to_torch[device] + buffer = io.BytesIO(model_bytes) + model = torch.jit.load(buffer, map_location=device) # type: ignore + result = LoadModelResult(model) + return result + + @staticmethod + def transform_input( + request: InferenceRequest, fetch_result: FetchInputResult, device: str + ) -> TransformInputResult: + result = [] + + _device_to_torch = {"cpu": "cpu", "gpu": "cuda"} + device = _device_to_torch[device] + if fetch_result.meta is None: + raise ValueError("Cannot reconstruct tensor without meta information") + for item, item_meta in zip(fetch_result.inputs, fetch_result.meta): + tensor_desc: tensor_capnp.TensorDescriptor = item_meta + result.append( + torch.tensor(np.frombuffer(item, dtype=str(tensor_desc.dataType))) + .to(device) + .reshape(tuple(dim for dim in tensor_desc.dimensions)) + ) + return TransformInputResult(result) + # return data # note: this fails copy test! + + @staticmethod + def execute( + request: InferenceRequest, + load_result: LoadModelResult, + transform_result: TransformInputResult, + ) -> ExecuteResult: + if not load_result.model: + raise SmartSimError("Model must be loaded to execute") + + model: torch.nn.Module = load_result.model + model.eval() + results = [model(tensor).detach() for tensor in transform_result.transformed] + + execute_result = ExecuteResult(results) + return execute_result + + @staticmethod + def transform_output( + request: InferenceRequest, + execute_result: ExecuteResult, + result_device: str, + ) -> TransformOutputResult: + if result_device != "cpu": + transformed = [ + item.to("cpu").clone() for item in execute_result.predictions + ] + # todo: need the shape from latest schemas added here. + return TransformOutputResult(transformed, None, "c", "float32") # fixme + + return TransformOutputResult( + execute_result.predictions, None, "c", "float32" + ) # fixme diff --git a/smartsim/_core/mli/infrastructure/worker/worker.py b/smartsim/_core/mli/infrastructure/worker/worker.py index 08c4997554..24dc734d00 100644 --- a/smartsim/_core/mli/infrastructure/worker/worker.py +++ b/smartsim/_core/mli/infrastructure/worker/worker.py @@ -24,18 +24,13 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
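
TorchWorker above breaks inference into load, input transform, execute, and output transform stages that all operate on in-memory bytes. The standalone sketch below walks the same stages with plain torch and numpy on CPU; the tiny linear model, dtype, and shapes are invented for illustration, and only the technique mirrors the worker.

# Standalone sketch of the TorchWorker stages (illustrative model and shapes).
import io

import numpy as np
import torch

# "fetch_model": a TorchScript model serialized to raw bytes, e.g. pulled from a feature store.
scripted = torch.jit.script(torch.nn.Linear(4, 2))
buf = io.BytesIO()
torch.jit.save(scripted, buf)
model_bytes = buf.getvalue()

# "load_model": rebuild the module from bytes on the requested device.
device = "cpu"  # the worker maps {"cpu": "cpu", "gpu": "cuda"}
model = torch.jit.load(io.BytesIO(model_bytes), map_location=device)

# "transform_input": reconstruct a tensor from a raw blob plus dtype/shape metadata.
raw_blob = np.random.rand(3, 4).astype("float32").tobytes()
tensor = (
    torch.tensor(np.frombuffer(raw_blob, dtype="float32"))
    .to(device)
    .reshape((3, 4))
)

# "execute": run the scripted model and detach the result from the autograd graph.
model.eval()
prediction = model(tensor).detach()

# "transform_output": results are moved back to CPU before being serialized.
print(prediction.to("cpu").shape)  # torch.Size([3, 2])
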
-import io import typing as t from abc import ABC, abstractmethod -import numpy as np -import torch - -import smartsim.error as sse -from smartsim._core.mli.comm.channel.channel import CommChannelBase -from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStore -from smartsim._core.mli.mli_schemas.tensor import tensor_capnp -from smartsim.log import get_logger +from .....error import SmartSimError +from .....log import get_logger +from ...comm.channel.channel import CommChannelBase +from ...infrastructure.storage.featurestore import FeatureStore logger = get_logger(__name__) @@ -167,7 +162,7 @@ def fetch_model( raise ValueError("Feature store is required for model retrieval") if not request.model_key: - raise sse.SmartSimError( + raise SmartSimError( "Key must be provided to retrieve model from feature store" ) @@ -176,7 +171,7 @@ def fetch_model( return FetchModelResult(raw_bytes) except FileNotFoundError as ex: logger.exception(ex) - raise sse.SmartSimError( + raise SmartSimError( f"Model could not be retrieved with key {request.model_key}" ) from ex @@ -204,7 +199,7 @@ def fetch_inputs( data.append(tensor_bytes) except KeyError as ex: logger.exception(ex) - raise sse.SmartSimError( + raise SmartSimError( f"Model could not be retrieved with key {input_}" ) from ex return FetchInputResult( @@ -303,75 +298,3 @@ def transform_output( :param execute_result: The result of inference wrapped in an ExecuteResult :param result_device: The device on which the result of inference is placed :return:""" - - -class TorchWorker(MachineLearningWorkerBase): - """A worker that executes a PyTorch model.""" - - @staticmethod - def load_model( - request: InferenceRequest, fetch_result: FetchModelResult, device: str - ) -> LoadModelResult: - model_bytes = fetch_result.model_bytes or request.raw_model - if not model_bytes: - raise ValueError("Unable to load model without reference object") - - _device_to_torch = {"cpu": "cpu", "gpu": "cuda"} - device = _device_to_torch[device] - buffer = io.BytesIO(model_bytes) - model = torch.jit.load(buffer, map_location=device) # type: ignore - result = LoadModelResult(model) - return result - - @staticmethod - def transform_input( - request: InferenceRequest, fetch_result: FetchInputResult, device: str - ) -> TransformInputResult: - result = [] - - _device_to_torch = {"cpu": "cpu", "gpu": "cuda"} - device = _device_to_torch[device] - if fetch_result.meta is None: - raise ValueError("Cannot reconstruct tensor without meta information") - for item, item_meta in zip(fetch_result.inputs, fetch_result.meta): - tensor_desc: tensor_capnp.TensorDescriptor = item_meta - result.append( - torch.tensor(np.frombuffer(item, dtype=str(tensor_desc.dataType))) - .to(device) - .reshape(tuple(dim for dim in tensor_desc.dimensions)) - ) - return TransformInputResult(result) - # return data # note: this fails copy test! 
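
The fetch steps above resolve model and input keys against a FeatureStore and convert missing keys into SmartSim errors. A dict-backed stand-in makes that lookup contract easy to exercise without Dragon; the class and error names below are placeholders, not the SmartSim implementations.

# Illustrative in-memory feature store mirroring the lookup contract used by fetch_model.
import typing as t


class InMemoryFeatureStore:
    """Minimal mapping with the __getitem__/__setitem__/__contains__ surface the workers rely on."""

    def __init__(self) -> None:
        self._storage: t.Dict[str, bytes] = {}

    def __getitem__(self, key: str) -> bytes:
        return self._storage[key]

    def __setitem__(self, key: str, value: bytes) -> None:
        self._storage[key] = value

    def __contains__(self, key: str) -> bool:
        return key in self._storage


class MissingResourceError(RuntimeError):
    """Stand-in for SmartSimError in this sketch."""


def fetch_model_bytes(store: InMemoryFeatureStore, model_key: str) -> bytes:
    # Wrap the raw KeyError so callers see a single, descriptive error type.
    try:
        return store[model_key]
    except KeyError as ex:
        raise MissingResourceError(
            f"Model could not be retrieved with key {model_key}"
        ) from ex


store = InMemoryFeatureStore()
store["model"] = b"\x00serialized-torchscript"
assert fetch_model_bytes(store, "model").startswith(b"\x00")
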
- - @staticmethod - def execute( - request: InferenceRequest, - load_result: LoadModelResult, - transform_result: TransformInputResult, - ) -> ExecuteResult: - if not load_result.model: - raise sse.SmartSimError("Model must be loaded to execute") - - model: torch.nn.Module = load_result.model - model.eval() - results = [model(tensor).detach() for tensor in transform_result.transformed] - - execute_result = ExecuteResult(results) - return execute_result - - @staticmethod - def transform_output( - request: InferenceRequest, - execute_result: ExecuteResult, - result_device: str, - ) -> TransformOutputResult: - if result_device != "cpu": - transformed = [ - item.to("cpu").clone() for item in execute_result.predictions - ] - # todo: need the shape from latest schemas added here. - return TransformOutputResult(transformed, None, "c", "float32") # fixme - - return TransformOutputResult( - execute_result.predictions, None, "c", "float32" - ) # fixme diff --git a/tests/mli/test_torch_worker.py b/tests/mli/test_torch_worker.py new file mode 100644 index 0000000000..0b1cd4ccf3 --- /dev/null +++ b/tests/mli/test_torch_worker.py @@ -0,0 +1,173 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import io + +import numpy as np +import pytest +import torch +from torch import nn +from torch.nn import functional as F + +from smartsim._core.mli.infrastructure.worker.torch_worker import TorchWorker +from smartsim._core.mli.infrastructure.worker.worker import ( + ExecuteResult, + FetchInputResult, + FetchModelResult, + InferenceRequest, + LoadModelResult, + TransformInputResult, +) +from smartsim._core.mli.message_handler import MessageHandler +from smartsim.log import get_logger + +logger = get_logger(__name__) +# The tests in this file belong to the group_a group +pytestmark = pytest.mark.group_a + + +# simple MNIST in PyTorch +class Net(nn.Module): + def __init__(self): + super(Net, self).__init__() + self.conv1 = nn.Conv2d(1, 32, 3, 1) + self.conv2 = nn.Conv2d(32, 64, 3, 1) + self.dropout1 = nn.Dropout(0.25) + self.dropout2 = nn.Dropout(0.5) + self.fc1 = nn.Linear(9216, 128) + self.fc2 = nn.Linear(128, 10) + + def forward(self, x): + x = self.conv1(x) + x = F.relu(x) + x = self.conv2(x) + x = F.relu(x) + x = F.max_pool2d(x, 2) + x = self.dropout1(x) + x = torch.flatten(x, 1) + x = self.fc1(x) + x = F.relu(x) + x = self.dropout2(x) + x = self.fc2(x) + output = F.log_softmax(x, dim=1) + return output + + +torch_device = {"cpu": "cpu", "gpu": "cuda"} + + +def get_batch() -> torch.Tensor: + return torch.rand(20, 1, 28, 28) + + +def create_torch_model(): + n = Net() + example_forward_input = get_batch() + module = torch.jit.trace(n, example_forward_input) + model_buffer = io.BytesIO() + torch.jit.save(module, model_buffer) + return model_buffer.getvalue() + + +def get_request() -> InferenceRequest: + + tensors = [get_batch() for _ in range(2)] + serialized_tensors = [ + MessageHandler.build_tensor(tensor.numpy(), "c", "float32", list(tensor.shape)) + for tensor in tensors + ] + + return InferenceRequest( + model_key="model", + callback=None, + raw_inputs=[s_tensor.blob for s_tensor in serialized_tensors], + input_keys=None, + input_meta=[s_tensor.tensorDescriptor for s_tensor in serialized_tensors], + output_keys=None, + raw_model=create_torch_model(), + batch_size=0, + ) + + +sample_request: InferenceRequest = get_request() +worker = TorchWorker() + + +def test_load_model(mlutils) -> None: + fetch_model_result = FetchModelResult(sample_request.raw_model) + load_model_result = worker.load_model( + sample_request, fetch_model_result, mlutils.get_test_device().lower() + ) + + assert load_model_result.model( + get_batch().to(torch_device[mlutils.get_test_device().lower()]) + ).shape == torch.Size((20, 10)) + + +def test_transform_input(mlutils) -> None: + fetch_input_result = FetchInputResult( + sample_request.raw_inputs, sample_request.input_meta + ) + + transform_input_result = worker.transform_input( + sample_request, fetch_input_result, mlutils.get_test_device().lower() + ) + + assert all( + transformed.shape == get_batch().shape + for transformed in transform_input_result.transformed + ) + + +def test_execute(mlutils) -> None: + load_model_result = LoadModelResult( + Net().to(torch_device[mlutils.get_test_device().lower()]) + ) + transform_result = TransformInputResult( + [ + get_batch().to(torch_device[mlutils.get_test_device().lower()]) + for _ in range(2) + ] + ) + + execute_result = worker.execute(sample_request, load_model_result, transform_result) + + assert all( + result.shape == torch.Size((20, 10)) for result in execute_result.predictions + ) + + +def test_transform_output(mlutils): + execute_result = ExecuteResult([torch.rand((20, 10)) for _ in range(2)]) + + 
transformed_output = worker.transform_output( + sample_request, execute_result, torch_device[mlutils.get_test_device().lower()] + ) + + assert transformed_output.outputs == execute_result.predictions + assert transformed_output.shape == None + assert transformed_output.order == "c" + assert transformed_output.dtype == "float32" diff --git a/tests/mli/test_worker_manager.py b/tests/mli/test_worker_manager.py index 01502ec521..46cae5b2e4 100644 --- a/tests/mli/test_worker_manager.py +++ b/tests/mli/test_worker_manager.py @@ -29,10 +29,14 @@ import multiprocessing as mp import pathlib import time -import typing as t import pytest -import torch + +should_run = True +try: + import torch +except ImportError: + should_run = False from smartsim._core.mli.infrastructure.control.workermanager import WorkerManager from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStore @@ -44,9 +48,11 @@ from .worker import IntegratedTorchWorker logger = get_logger(__name__) -# The tests in this file belong to the group_b group +# The tests in this file belong to the group_a group pytestmark = pytest.mark.group_a +pytest.mark.skipif(not should_run, "Test needs PyTorch to run") + def mock_work(worker_manager_queue: "mp.Queue[bytes]") -> None: """Mock event producer for triggering the inference pipeline""" From 6cec83ea4697761b3d297cc8fd50cd44a568af64 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Thu, 27 Jun 2024 08:14:24 -0500 Subject: [PATCH 17/84] Switch to sender-supplied channels in app example --- ex/high_throughput_inference/mock_app.py | 6 ++++-- ex/high_throughput_inference/standalone_workermanager.py | 3 +-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/ex/high_throughput_inference/mock_app.py b/ex/high_throughput_inference/mock_app.py index d22792d15b..8a00e8f0e4 100644 --- a/ex/high_throughput_inference/mock_app.py +++ b/ex/high_throughput_inference/mock_app.py @@ -62,6 +62,9 @@ print(",".join(headers)) + from_worker_ch = Channel.make_process_local() + to_worker_ch = Channel.make_process_local() + for batch_size in [1, 8, 32, 64, 128]: timings = [] @@ -79,7 +82,6 @@ timings[-1].append(time.perf_counter() - interm) interm = time.perf_counter() - from_worker_ch = Channel.make_process_local() request = MessageHandler.build_request( reply_channel=from_worker_ch.serialize(), @@ -95,7 +97,7 @@ request_bytes = MessageHandler.serialize_request(request) timings[-1].append(time.perf_counter() - interm) interm = time.perf_counter() - with to_worker_fli.sendh(timeout=None) as to_sendh: + with to_worker_fli.sendh(timeout=None, stream_channel=to_worker_ch) as to_sendh: to_sendh.send_bytes(request_bytes) timings[-1].append(time.perf_counter() - interm) interm = time.perf_counter() diff --git a/ex/high_throughput_inference/standalone_workermanager.py b/ex/high_throughput_inference/standalone_workermanager.py index 40fefcc372..cdc97f4c2e 100644 --- a/ex/high_throughput_inference/standalone_workermanager.py +++ b/ex/high_throughput_inference/standalone_workermanager.py @@ -26,8 +26,7 @@ to_worker_channel = Channel.make_process_local() to_worker_manager_channel = Channel.make_process_local() - channels = [Channel.make_process_local() for _ in range(100)] - to_worker_fli = fli.FLInterface(main_ch=to_worker_channel, manager_ch=to_worker_manager_channel, stream_channels=channels) + to_worker_fli = fli.FLInterface(main_ch=to_worker_channel, manager_ch=to_worker_manager_channel) ddict["to_worker_fli"] = b64encode(to_worker_fli.serialize()) torch_worker = TorchWorker() From 
3ad6d445662a611539b40cb72fcba1a0b4ea102f Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Thu, 27 Jun 2024 16:55:59 -0500 Subject: [PATCH 18/84] Add prototype client for mock app --- ex/high_throughput_inference/mli_driver.py | 2 +- ex/high_throughput_inference/mock_app.py | 206 ++++++++++++--------- 2 files changed, 116 insertions(+), 92 deletions(-) diff --git a/ex/high_throughput_inference/mli_driver.py b/ex/high_throughput_inference/mli_driver.py index 7b8db5ed83..d32d88e51b 100644 --- a/ex/high_throughput_inference/mli_driver.py +++ b/ex/high_throughput_inference/mli_driver.py @@ -4,7 +4,7 @@ from smartsim.status import TERMINAL_STATUSES import time -device = "cpu" +device = "gpu" filedir = os.path.dirname(__file__) worker_manager_script_name = os.path.join(filedir, "standalone_workermanager.py") app_script_name = os.path.join(filedir, "mock_app.py") diff --git a/ex/high_throughput_inference/mock_app.py b/ex/high_throughput_inference/mock_app.py index 8a00e8f0e4..aa3aaeb3ee 100644 --- a/ex/high_throughput_inference/mock_app.py +++ b/ex/high_throughput_inference/mock_app.py @@ -15,111 +15,135 @@ import os import time import torch +import numbers + +from collections import OrderedDict from smartsim._core.mli.message_handler import MessageHandler +class ProtoClient: + def __init__(self, timing_on: bool): + connect_to_infrastructure() + ddict_str = os.environ["SS_DRG_DDICT"] + self._ddict = DDict.attach(ddict_str) + to_worker_fli_str = None + while to_worker_fli_str is None: + try: + to_worker_fli_str = self._ddict["to_worker_fli"] + self._to_worker_fli = fli.FLInterface.attach(b64decode(to_worker_fli_str)) + except KeyError: + time.sleep(1) + self._from_worker_ch = Channel.make_process_local() + self._from_worker_ch_serialized = self._from_worker_ch.serialize() + self._to_worker_ch = Channel.make_process_local() + + self._start = None + self._interm = None + self._timings: OrderedDict[str, list[numbers.Number]] = OrderedDict() + self._timing_on = timing_on + + def _add_label_to_timings(self, label: str): + if label not in self._timings: + self._timings[label] = [] + + @staticmethod + def _format_number(number: numbers.Number): + return f"{number:0.4e}" + + def start_timings(self, batch_size: int): + if self._timing_on: + self._add_label_to_timings("batch_size") + self._timings["batch_size"].append(batch_size) + self._start = time.perf_counter() + self._interm = time.perf_counter() + + def end_timings(self): + if self._timing_on: + self._add_label_to_timings("total_time") + self._timings["total_time"].append(self._format_number(time.perf_counter()-self._start)) + + def measure_time(self, label: str): + if self._timing_on: + self._add_label_to_timings(label) + self._timings[label].append(self._format_number(time.perf_counter()-self._interm)) + self._interm = time.perf_counter() + + def print_timings(self, to_file: bool = False): + print(" ".join(self._timings.keys())) + value_array = numpy.array([value for value in self._timings.values()], dtype=float) + value_array = numpy.transpose(value_array) + for i in range(value_array.shape[0]): + print(" ".join(self._format_number(value) for value in value_array[i])) + if to_file: + numpy.save("timings.npy", value_array) + numpy.savetxt("timings.txt", value_array) + + + def run_model(self, model: bytes, batch: torch.Tensor): + self.start_timings(batch.shape[0]) + built_tensor = MessageHandler.build_tensor( + batch.numpy(), "c", "float32", list(batch.shape)) + self.measure_time("build_tensor") + request = MessageHandler.build_request( + 
reply_channel=self._from_worker_ch_serialized, + model=model, + inputs=[built_tensor], + outputs=[], + output_descriptors=[], + custom_attributes=None, + ) + self.measure_time("build_request") + request_bytes = MessageHandler.serialize_request(request) + self.measure_time("serialize_request") + with self._to_worker_fli.sendh(timeout=None, stream_channel=self._to_worker_ch) as to_sendh: + to_sendh.send_bytes(request_bytes) + + self.measure_time("send") + with self._from_worker_ch.recvh(timeout=None) as from_recvh: + resp = from_recvh.recv_bytes(timeout=None) + self.measure_time("receive") + response = MessageHandler.deserialize_response(resp) + self.measure_time("deserialize_response") + result = torch.from_numpy( + numpy.frombuffer( + response.result.data[0].blob, + dtype=str(response.result.data[0].tensorDescriptor.dataType), + ) + ) + self.measure_time("deserialize_tensor") -if __name__ == "__main__": + self.end_timings() + return result - parser = argparse.ArgumentParser("Mock application") - parser.add_argument("--device", default="cpu") - args = parser.parse_args() +class ResNetWrapper(): + def __init__(self, model: str): + self._model = torch.jit.load(model) + buffer = io.BytesIO() + scripted = torch.jit.trace(self._model, self.get_batch()) + torch.jit.save(scripted, buffer) + self._serialized_model = buffer.getvalue() - connect_to_infrastructure() - ddict_str = os.environ["SS_DRG_DDICT"] + def get_batch(self, batch_size: int=32): + return torch.randn((batch_size, 3, 224, 224), dtype=torch.float32) - ddict = DDict.attach(ddict_str) + @property + def model(self): + return self._serialized_model - to_worker_fli_str = None +if __name__ == "__main__": - while to_worker_fli_str is None: - try: - to_worker_fli_str = ddict["to_worker_fli"] - except Exception as e: - time.sleep(1) + parser = argparse.ArgumentParser("Mock application") + parser.add_argument("--device", default="cpu") + args = parser.parse_args() - to_worker_fli = fli.FLInterface.attach(b64decode(to_worker_fli_str)) + resnet = ResNetWrapper(f"resnet50.{args.device.upper()}.pt") - batch_size = 32 - model = torch.jit.load(f"resnet50.{args.device.upper()}.pt") - buffer = io.BytesIO() - batch = torch.randn((batch_size, 3, 224, 224), dtype=torch.float32) - scripted = torch.jit.trace(model, batch) - torch.jit.save(scripted, buffer) + client = ProtoClient(timing_on=True) total_iterations = 10 - headers=[ - "batch_size", - "build_tensor", - "build_request", - "serialize_request", - "send", - "receive", - "deserialize_response", - "deserialize_tensor", - ] - - print(",".join(headers)) - - from_worker_ch = Channel.make_process_local() - to_worker_ch = Channel.make_process_local() - for batch_size in [1, 8, 32, 64, 128]: - - timings = [] for iteration_number in range(total_iterations + int(batch_size==1)): + client.run_model(resnet.model, resnet.get_batch(batch_size)) - timings.append([batch_size]) - - batch = torch.randn((batch_size, 3, 224, 224), dtype=torch.float32) - - start = time.perf_counter() - interm = start - built_tensor = MessageHandler.build_tensor( - batch.numpy(), "c", "float32", list(batch.shape) - ) - timings[-1].append(time.perf_counter() - interm) - interm = time.perf_counter() - - - request = MessageHandler.build_request( - reply_channel=from_worker_ch.serialize(), - model=buffer.getvalue(), - inputs=[built_tensor], - outputs=[], - output_descriptors=[], - custom_attributes=None, - ) - - timings[-1].append(time.perf_counter() - interm) - interm = time.perf_counter() - request_bytes = 
MessageHandler.serialize_request(request) - timings[-1].append(time.perf_counter() - interm) - interm = time.perf_counter() - with to_worker_fli.sendh(timeout=None, stream_channel=to_worker_ch) as to_sendh: - to_sendh.send_bytes(request_bytes) - timings[-1].append(time.perf_counter() - interm) - interm = time.perf_counter() - - with from_worker_ch.recvh(timeout=None) as from_recvh: - resp = from_recvh.recv_bytes(timeout=None) - timings[-1].append(time.perf_counter() - interm) - interm = time.perf_counter() - response = MessageHandler.deserialize_response(resp) - timings[-1].append(time.perf_counter() - interm) - interm = time.perf_counter() - result = torch.from_numpy( - numpy.frombuffer( - response.result.data[0].blob, - dtype=str(response.result.data[0].tensorDescriptor.dataType), - ) - ) - - timings[-1].append(time.perf_counter() - interm) - interm = time.perf_counter() - - # duration = time.perf_counter() - start - # print(f"{duration:.3f} s") - - print(",".join(str(timing) for timing in timings[-1])) + client.print_timings(to_file=True) \ No newline at end of file From bd5f13357b181ee07e2df880b519d8464c8af174 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Fri, 28 Jun 2024 14:55:18 -0500 Subject: [PATCH 19/84] Update mock app --- ex/high_throughput_inference/mli_driver.py | 5 +++-- ex/high_throughput_inference/mock_app.py | 9 +++++++-- ex/high_throughput_inference/standalone_workermanager.py | 3 +-- 3 files changed, 11 insertions(+), 6 deletions(-) diff --git a/ex/high_throughput_inference/mli_driver.py b/ex/high_throughput_inference/mli_driver.py index d32d88e51b..9b899f4124 100644 --- a/ex/high_throughput_inference/mli_driver.py +++ b/ex/high_throughput_inference/mli_driver.py @@ -10,8 +10,9 @@ app_script_name = os.path.join(filedir, "mock_app.py") model_name = os.path.join(filedir, f"resnet50.{device.upper()}.pt") - -exp = Experiment("MLI_proto", launcher="dragon", exp_path=os.path.join(filedir, "MLI_proto")) +exp_path = os.path.join(filedir, "MLI_proto") +os.makedirs(exp_path, exist_ok=True) +exp = Experiment("MLI_proto", launcher="dragon", exp_path=exp_path) worker_manager_rs = exp.create_run_settings(sys.executable, [worker_manager_script_name, "--device", device]) worker_manager = exp.create_model("worker_manager", run_settings=worker_manager_rs) diff --git a/ex/high_throughput_inference/mock_app.py b/ex/high_throughput_inference/mock_app.py index aa3aaeb3ee..666d7fcc91 100644 --- a/ex/high_throughput_inference/mock_app.py +++ b/ex/high_throughput_inference/mock_app.py @@ -19,6 +19,9 @@ from collections import OrderedDict from smartsim._core.mli.message_handler import MessageHandler +from smartsim.log import get_logger + +logger = get_logger("App") class ProtoClient: def __init__(self, timing_on: bool): @@ -140,10 +143,12 @@ def model(self): client = ProtoClient(timing_on=True) - total_iterations = 10 + total_iterations = 100 - for batch_size in [1, 8, 32, 64, 128]: + for batch_size in [1, 2, 4, 8, 16, 32, 64, 128]: + logger.info(f"Batch size: {batch_size}") for iteration_number in range(total_iterations + int(batch_size==1)): + logger.info(f"Iteration: {iteration_number}") client.run_model(resnet.model, resnet.get_batch(batch_size)) client.print_timings(to_file=True) \ No newline at end of file diff --git a/ex/high_throughput_inference/standalone_workermanager.py b/ex/high_throughput_inference/standalone_workermanager.py index cdc97f4c2e..ccefcbf584 100644 --- a/ex/high_throughput_inference/standalone_workermanager.py +++ 
b/ex/high_throughput_inference/standalone_workermanager.py @@ -25,8 +25,7 @@ ddict = DDict.attach(ddict_str) to_worker_channel = Channel.make_process_local() - to_worker_manager_channel = Channel.make_process_local() - to_worker_fli = fli.FLInterface(main_ch=to_worker_channel, manager_ch=to_worker_manager_channel) + to_worker_fli = fli.FLInterface(main_ch=to_worker_channel, manager_ch=None) ddict["to_worker_fli"] = b64encode(to_worker_fli.serialize()) torch_worker = TorchWorker() From 3e343ee5dff7d85646a39db1b56123efa575f387 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Thu, 4 Jul 2024 05:40:59 -0500 Subject: [PATCH 20/84] Changes to feature store --- smartsim/_core/launcher/dragon/dragonBackend.py | 2 +- .../infrastructure/storage/dragonfeaturestore.py | 12 ++++-------- .../mli/infrastructure/worker/torch_worker.py | 2 +- smartsim/_core/mli/infrastructure/worker/worker.py | 14 +++++++++++++- smartsim/_core/mli/message_handler.py | 4 +++- 5 files changed, 22 insertions(+), 12 deletions(-) diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py index 52f69ec41f..856de38030 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -401,7 +401,7 @@ def infra_ddict(self) -> str: """ if self._infra_ddict is None: logger.info("Creating DDict") - self._infra_ddict = dragon_ddict.DDict() # todo: parametrize + self._infra_ddict = dragon_ddict.DDict(n_nodes=len(self._hosts), total_mem=len(self._hosts)*1024**3) # todo: parametrize logger.info("Created DDict") self._infra_ddict["creation"] = str(time.time()) logger.info(self._infra_ddict["creation"]) diff --git a/smartsim/_core/mli/infrastructure/storage/dragonfeaturestore.py b/smartsim/_core/mli/infrastructure/storage/dragonfeaturestore.py index ea8f06977d..53f2f461f8 100644 --- a/smartsim/_core/mli/infrastructure/storage/dragonfeaturestore.py +++ b/smartsim/_core/mli/infrastructure/storage/dragonfeaturestore.py @@ -47,24 +47,20 @@ def __init__(self, storage: "DragonDict") -> None: def __getitem__(self, key: str) -> t.Any: """Retrieve an item using key :param key: Unique key of an item to retrieve from the feature store""" - key_ = key.encode("utf-8") try: - return self._storage[key_] + return self._storage[key] except Exception as ex: # note: explicitly avoid round-trip to check for key existence raise sse.SmartSimError(f"{key} not found in feature store") from ex - def __setitem__(self, key: str, value: bytes) -> None: + def __setitem__(self, key: str, value: str) -> None: """Assign a value using key :param key: Unique key of an item to set in the feature store :param value: Value to persist in the feature store""" - key_ = key.encode("utf-8") - self._storage[key_] = value + self._storage[key] = value - def __contains__(self, key: t.Union[str, bytes]) -> bool: + def __contains__(self, key: t.Union[str]) -> bool: """Membership operator to test for a key existing within the feature store. 
Return `True` if the key is found, `False` otherwise :param key: Unique key of an item to retrieve from the feature store""" - if isinstance(key, str): - key = key.encode("utf-8") return key in self._storage diff --git a/smartsim/_core/mli/infrastructure/worker/torch_worker.py b/smartsim/_core/mli/infrastructure/worker/torch_worker.py index c350499c20..122b9ddf2f 100644 --- a/smartsim/_core/mli/infrastructure/worker/torch_worker.py +++ b/smartsim/_core/mli/infrastructure/worker/torch_worker.py @@ -108,7 +108,7 @@ def transform_output( ) -> TransformOutputResult: if result_device != "cpu": transformed = [ - item.to("cpu").clone() for item in execute_result.predictions + item.to("cpu") for item in execute_result.predictions ] # todo: need the shape from latest schemas added here. return TransformOutputResult(transformed, None, "c", "float32") # fixme diff --git a/smartsim/_core/mli/infrastructure/worker/worker.py b/smartsim/_core/mli/infrastructure/worker/worker.py index 24dc734d00..40696ac22f 100644 --- a/smartsim/_core/mli/infrastructure/worker/worker.py +++ b/smartsim/_core/mli/infrastructure/worker/worker.py @@ -32,6 +32,18 @@ from ...comm.channel.channel import CommChannelBase from ...infrastructure.storage.featurestore import FeatureStore +import sys + +# isort: off +try: + import dragon + from dragon.utils import b64decode +except ImportError as exc: + if not "pytest" in sys.modules: + raise exc from None + +# isort: on + logger = get_logger(__name__) @@ -167,7 +179,7 @@ def fetch_model( ) try: - raw_bytes = feature_store[request.model_key] + raw_bytes = b64decode(feature_store[request.model_key]) return FetchModelResult(raw_bytes) except FileNotFoundError as ex: logger.exception(ex) diff --git a/smartsim/_core/mli/message_handler.py b/smartsim/_core/mli/message_handler.py index fd8f6aeed7..1928db2f7c 100644 --- a/smartsim/_core/mli/message_handler.py +++ b/smartsim/_core/mli/message_handler.py @@ -200,7 +200,9 @@ def _assign_model( if isinstance(model, bytes): request.model.modelData = model else: - request.model.modelKey = model # type: ignore + model_key = data_references_capnp.ModelKey() + model_key.key = model + request.model.modelKey = model_key # type: ignore except Exception as e: raise ValueError("Error building model portion of request.") from e From a2bed267d8dbc1af109cad6708557afb11687d0a Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Fri, 5 Jul 2024 17:45:32 +0200 Subject: [PATCH 21/84] Make style --- smartsim/_core/launcher/dragon/dragonBackend.py | 4 +++- smartsim/_core/mli/infrastructure/worker/torch_worker.py | 4 +--- smartsim/_core/mli/infrastructure/worker/worker.py | 3 +-- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py index 856de38030..dcc5c8392b 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -401,7 +401,9 @@ def infra_ddict(self) -> str: """ if self._infra_ddict is None: logger.info("Creating DDict") - self._infra_ddict = dragon_ddict.DDict(n_nodes=len(self._hosts), total_mem=len(self._hosts)*1024**3) # todo: parametrize + self._infra_ddict = dragon_ddict.DDict( + n_nodes=len(self._hosts), total_mem=len(self._hosts) * 1024**3 + ) # todo: parametrize logger.info("Created DDict") self._infra_ddict["creation"] = str(time.time()) logger.info(self._infra_ddict["creation"]) diff --git a/smartsim/_core/mli/infrastructure/worker/torch_worker.py 
b/smartsim/_core/mli/infrastructure/worker/torch_worker.py index 122b9ddf2f..28237dc422 100644 --- a/smartsim/_core/mli/infrastructure/worker/torch_worker.py +++ b/smartsim/_core/mli/infrastructure/worker/torch_worker.py @@ -107,9 +107,7 @@ def transform_output( result_device: str, ) -> TransformOutputResult: if result_device != "cpu": - transformed = [ - item.to("cpu") for item in execute_result.predictions - ] + transformed = [item.to("cpu") for item in execute_result.predictions] # todo: need the shape from latest schemas added here. return TransformOutputResult(transformed, None, "c", "float32") # fixme diff --git a/smartsim/_core/mli/infrastructure/worker/worker.py b/smartsim/_core/mli/infrastructure/worker/worker.py index 73eff4e8ea..e368935a0d 100644 --- a/smartsim/_core/mli/infrastructure/worker/worker.py +++ b/smartsim/_core/mli/infrastructure/worker/worker.py @@ -24,6 +24,7 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +import sys import typing as t from abc import ABC, abstractmethod @@ -33,8 +34,6 @@ from ...infrastructure.storage.featurestore import FeatureStore from ...mli_schemas.model.model_capnp import Model -import sys - # isort: off try: import dragon From 36e92d9dabcdd013cdba637a2629e19c15896cb5 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Fri, 5 Jul 2024 18:07:31 +0200 Subject: [PATCH 22/84] Fix typing --- .../mli/infrastructure/storage/featurestore.py | 2 +- .../_core/mli/infrastructure/worker/torch_worker.py | 13 ++++++++----- smartsim/_core/mli/infrastructure/worker/worker.py | 2 +- 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/smartsim/_core/mli/infrastructure/storage/featurestore.py b/smartsim/_core/mli/infrastructure/storage/featurestore.py index ec4086b732..e18643e932 100644 --- a/smartsim/_core/mli/infrastructure/storage/featurestore.py +++ b/smartsim/_core/mli/infrastructure/storage/featurestore.py @@ -37,7 +37,7 @@ def __getitem__(self, key: str) -> bytes: :param key: Unique key of an item to retrieve from the feature store""" @abstractmethod - def __setitem__(self, key: str, value: bytes) -> None: + def __setitem__(self, key: str, value: str) -> None: """Assign a value using key :param key: Unique key of an item to set in the feature store :param value: Value to persist in the feature store""" diff --git a/smartsim/_core/mli/infrastructure/worker/torch_worker.py b/smartsim/_core/mli/infrastructure/worker/torch_worker.py index 28237dc422..e21513648b 100644 --- a/smartsim/_core/mli/infrastructure/worker/torch_worker.py +++ b/smartsim/_core/mli/infrastructure/worker/torch_worker.py @@ -53,13 +53,16 @@ class TorchWorker(MachineLearningWorkerBase): def load_model( request: InferenceRequest, fetch_result: FetchModelResult, device: str ) -> LoadModelResult: - model_bytes = fetch_result.model_bytes or request.raw_model - if not model_bytes: + if fetch_result.model_bytes: + model_bytes = fetch_result.model_bytes + elif request.raw_model and request.raw_model.data: + model_bytes = request.raw_model.data + else: raise ValueError("Unable to load model without reference object") - _device_to_torch = {"cpu": "cpu", "gpu": "cuda"} - device = _device_to_torch[device] - buffer = io.BytesIO(model_bytes) + device_to_torch = {"cpu": "cpu", "gpu": "cuda"} + device = device_to_torch[device] + buffer = io.BytesIO(initial_bytes=model_bytes) model = torch.jit.load(buffer, map_location=device) # type: ignore result = LoadModelResult(model) return result diff 
--git a/smartsim/_core/mli/infrastructure/worker/worker.py b/smartsim/_core/mli/infrastructure/worker/worker.py index e368935a0d..fb061348ee 100644 --- a/smartsim/_core/mli/infrastructure/worker/worker.py +++ b/smartsim/_core/mli/infrastructure/worker/worker.py @@ -148,7 +148,7 @@ class FetchModelResult: def __init__(self, result: bytes) -> None: """Initialize the object""" - self.model_bytes = result + self.model_bytes: bytes = result class MachineLearningWorkerCore: From 59840a3be12576eedce2528d93a8b601a768973e Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Fri, 5 Jul 2024 18:17:18 +0200 Subject: [PATCH 23/84] Fix lint --- smartsim/_core/mli/infrastructure/worker/torch_worker.py | 4 ++-- smartsim/_core/mli/infrastructure/worker/worker.py | 4 +--- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/smartsim/_core/mli/infrastructure/worker/torch_worker.py b/smartsim/_core/mli/infrastructure/worker/torch_worker.py index e21513648b..a4e725ab99 100644 --- a/smartsim/_core/mli/infrastructure/worker/torch_worker.py +++ b/smartsim/_core/mli/infrastructure/worker/torch_worker.py @@ -73,8 +73,8 @@ def transform_input( ) -> TransformInputResult: result = [] - _device_to_torch = {"cpu": "cpu", "gpu": "cuda"} - device = _device_to_torch[device] + device_to_torch = {"cpu": "cpu", "gpu": "cuda"} + device = device_to_torch[device] if fetch_result.meta is None: raise ValueError("Cannot reconstruct tensor without meta information") for item, item_meta in zip(fetch_result.inputs, fetch_result.meta): diff --git a/smartsim/_core/mli/infrastructure/worker/worker.py b/smartsim/_core/mli/infrastructure/worker/worker.py index fb061348ee..fe82ea2a3e 100644 --- a/smartsim/_core/mli/infrastructure/worker/worker.py +++ b/smartsim/_core/mli/infrastructure/worker/worker.py @@ -36,12 +36,10 @@ # isort: off try: - import dragon from dragon.utils import b64decode except ImportError as exc: - if not "pytest" in sys.modules: + if "pytest" not in sys.modules: raise exc from None - # isort: on logger = get_logger(__name__) From b35b37dd89bf6f7fd7a93c339e79643046d48abe Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Fri, 5 Jul 2024 18:32:00 +0200 Subject: [PATCH 24/84] Remove duplicated/useless comments --- smartsim/_core/mli/infrastructure/control/workermanager.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py index 4e276d2507..f0cae497a0 100644 --- a/smartsim/_core/mli/infrastructure/control/workermanager.py +++ b/smartsim/_core/mli/infrastructure/control/workermanager.py @@ -240,7 +240,6 @@ def _on_iteration(self) -> None: logger.warning("No queue to check for tasks") return - # perform default deserialization of the message envelope # perform default deserialization of the message envelope with self._task_queue.recvh(timeout=None) as recvh: try: @@ -254,9 +253,6 @@ def _on_iteration(self) -> None: if not self._validate_request(request): return - # # let the worker perform additional custom deserialization - # request = self._worker.deserialize(request_bytes) - fetch_model_result = self._worker.fetch_model(request, self._feature_store) model_result = self._worker.load_model( request, fetch_model_result, self._device @@ -294,7 +290,6 @@ def _on_iteration(self) -> None: response = build_reply(reply) - # serialized = self._worker.serialize_reply(request, transformed_output) serialized_resp = MessageHandler.serialize_response(response) # type: ignore if request.callback: 
request.callback.send(serialized_resp) From 51e0b17bdbf22683759597ece523778b6d7bd953 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Tue, 9 Jul 2024 12:37:22 -0500 Subject: [PATCH 25/84] Bring up to date with new schema --- ex/high_throughput_inference/mli_driver.py | 9 ++- ex/high_throughput_inference/mock_app.py | 30 +++++++++- .../standalone_workermanager.py | 57 +++++++++++++++++-- smartsim/_core/entrypoints/service.py | 3 +- smartsim/_core/mli/comm/channel/channel.py | 7 ++- .../_core/mli/comm/channel/dragonchannel.py | 6 ++ smartsim/_core/mli/comm/channel/dragonfli.py | 29 ++++++---- .../infrastructure/control/workermanager.py | 20 ++----- .../_core/mli/infrastructure/worker/worker.py | 11 ++-- 9 files changed, 128 insertions(+), 44 deletions(-) diff --git a/ex/high_throughput_inference/mli_driver.py b/ex/high_throughput_inference/mli_driver.py index 9b899f4124..4a3dd034e8 100644 --- a/ex/high_throughput_inference/mli_driver.py +++ b/ex/high_throughput_inference/mli_driver.py @@ -1,6 +1,11 @@ + + import os +import base64 +import cloudpickle import sys from smartsim import Experiment +from smartsim._core.mli.infrastructure.worker.torch_worker import TorchWorker from smartsim.status import TERMINAL_STATUSES import time @@ -14,7 +19,9 @@ os.makedirs(exp_path, exist_ok=True) exp = Experiment("MLI_proto", launcher="dragon", exp_path=exp_path) -worker_manager_rs = exp.create_run_settings(sys.executable, [worker_manager_script_name, "--device", device]) +torch_worker_str = base64.b64encode(cloudpickle.dumps(TorchWorker)).decode("ascii") + +worker_manager_rs = exp.create_run_settings(sys.executable, [worker_manager_script_name, "--device", device, "--worker_class", torch_worker_str]) worker_manager = exp.create_model("worker_manager", run_settings=worker_manager_rs) worker_manager.attach_generator_files(to_copy=[worker_manager_script_name]) diff --git a/ex/high_throughput_inference/mock_app.py b/ex/high_throughput_inference/mock_app.py index 666d7fcc91..df0ba55c76 100644 --- a/ex/high_throughput_inference/mock_app.py +++ b/ex/high_throughput_inference/mock_app.py @@ -1,3 +1,29 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
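
The driver change above hands the worker class itself to the standalone worker manager as a command-line argument by cloudpickling the class object and base64-encoding the bytes so they survive as ASCII text. The round-trip sketch below uses a placeholder DemoWorker class rather than TorchWorker.

# Round-trip sketch of the "--worker_class" handoff: class -> cloudpickle -> base64 -> text -> class.
import base64

import cloudpickle


class DemoWorker:  # placeholder for TorchWorker
    def greet(self) -> str:
        return "ready"


# Driver side: serialize the class object and make it argv-safe.
worker_arg = base64.b64encode(cloudpickle.dumps(DemoWorker)).decode("ascii")

# Worker-manager side: decode the argument and instantiate the class it carries.
worker_cls = cloudpickle.loads(base64.b64decode(worker_arg.encode("ascii")))
worker = worker_cls()
print(worker.greet())  # "ready"
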
+ # isort: off import dragon from dragon import fli @@ -32,7 +58,7 @@ def __init__(self, timing_on: bool): while to_worker_fli_str is None: try: to_worker_fli_str = self._ddict["to_worker_fli"] - self._to_worker_fli = fli.FLInterface.attach(b64decode(to_worker_fli_str)) + self._to_worker_fli = fli.FLInterface.attach(to_worker_fli_str) except KeyError: time.sleep(1) self._from_worker_ch = Channel.make_process_local() @@ -88,7 +114,7 @@ def run_model(self, model: bytes, batch: torch.Tensor): self.measure_time("build_tensor") request = MessageHandler.build_request( reply_channel=self._from_worker_ch_serialized, - model=model, + model=MessageHandler.build_model(model, "resnet-50", "1.0"), inputs=[built_tensor], outputs=[], output_descriptors=[], diff --git a/ex/high_throughput_inference/standalone_workermanager.py b/ex/high_throughput_inference/standalone_workermanager.py index ccefcbf584..991e869581 100644 --- a/ex/high_throughput_inference/standalone_workermanager.py +++ b/ex/high_throughput_inference/standalone_workermanager.py @@ -1,3 +1,29 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ # isort: off import dragon from dragon import fli @@ -7,10 +33,12 @@ from dragon.globalservices.api_setup import connect_to_infrastructure # isort: on import argparse +import base64 +import cloudpickle import os - from smartsim._core.mli.comm.channel.dragonchannel import DragonCommChannel +from smartsim._core.mli.comm.channel.dragonfli import DragonFLIChannel from smartsim._core.mli.infrastructure.worker.torch_worker import TorchWorker from smartsim._core.mli.infrastructure.control.workermanager import ( WorkerManager, @@ -18,7 +46,23 @@ if __name__ == "__main__": parser = argparse.ArgumentParser("Worker Manager") - parser.add_argument("--device", default="gpu") + parser.add_argument( + "--device", + type=str, + default="gpu", + choices="gpu cpu".split(), + help="Device on which the inference takes place", + ) + parser.add_argument( + "--worker_class", + type=str, + required=True, + help="Serialized class of worker to run", + ) + parser.add_argument( + "--num_workers", type=int, default=1, help="Number of workers to run" + ) + args = parser.parse_args() connect_to_infrastructure() ddict_str = os.environ["SS_DRG_DDICT"] @@ -26,12 +70,13 @@ to_worker_channel = Channel.make_process_local() to_worker_fli = fli.FLInterface(main_ch=to_worker_channel, manager_ch=None) - ddict["to_worker_fli"] = b64encode(to_worker_fli.serialize()) - - torch_worker = TorchWorker() + to_worker_fli_serialized = to_worker_fli.serialize() + ddict["to_worker_fli"] = to_worker_fli_serialized + torch_worker = cloudpickle.loads(base64.b64decode(args.worker_class.encode('ascii')))() + comm_channel = DragonFLIChannel(to_worker_fli_serialized) worker_manager = WorkerManager( - file_like_interface=to_worker_fli, + task_queue=comm_channel, worker=torch_worker, feature_store=None, as_service=True, diff --git a/smartsim/_core/entrypoints/service.py b/smartsim/_core/entrypoints/service.py index e03df6bea1..6b4ef74b67 100644 --- a/smartsim/_core/entrypoints/service.py +++ b/smartsim/_core/entrypoints/service.py @@ -46,7 +46,8 @@ def __init__( :param as_service: Determines if the host will run until shutdown criteria are met or as a run-once instance :param cooldown: Period of time to allow service to run before automatic - shutdown, in seconds. A non-zero, positive integer.""" + shutdown, in seconds. A non-zero, positive integer. 
+ :param loop_delay: delay between iterations of the event loop""" self._as_service = as_service """If the service should run until shutdown function returns True""" self._cooldown = abs(cooldown) diff --git a/smartsim/_core/mli/comm/channel/channel.py b/smartsim/_core/mli/comm/channel/channel.py index 201ab9deab..2318896a9b 100644 --- a/smartsim/_core/mli/comm/channel/channel.py +++ b/smartsim/_core/mli/comm/channel/channel.py @@ -41,9 +41,14 @@ def __init__(self, descriptor: t.Union[str, bytes]) -> None: @abstractmethod def send(self, value: bytes) -> None: - """Send a message throuh the underlying communication channel + """Send a message through the underlying communication channel :param value: The value to send""" + @abstractmethod + def recv(self) -> bytes: + """Receieve a message through the underlying communication channel + :returns: the received message""" + @property def descriptor(self) -> bytes: """Return the channel descriptor for the underlying dragon channel""" diff --git a/smartsim/_core/mli/comm/channel/dragonchannel.py b/smartsim/_core/mli/comm/channel/dragonchannel.py index 872eb32350..fb1a0c51c1 100644 --- a/smartsim/_core/mli/comm/channel/dragonchannel.py +++ b/smartsim/_core/mli/comm/channel/dragonchannel.py @@ -51,3 +51,9 @@ def send(self, value: bytes) -> None: :param value: The value to send""" with self._channel.sendh(timeout=None) as sendh: sendh.send_bytes(value) + + def recv(self) -> bytes: + """Receieve a message through the underlying communication channel + :returns: the received message""" + with self._channel.recvh(timeout=None) as recvh: + return recvh.recv_bytes(timeout=None) \ No newline at end of file diff --git a/smartsim/_core/mli/comm/channel/dragonfli.py b/smartsim/_core/mli/comm/channel/dragonfli.py index 3992241380..ebf824b7db 100644 --- a/smartsim/_core/mli/comm/channel/dragonfli.py +++ b/smartsim/_core/mli/comm/channel/dragonfli.py @@ -24,18 +24,12 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import sys - # isort: off -try: - from dragon import fli - import dragon.channels as dch -except ImportError as exc: - if not "pytest" in sys.modules: - raise exc from None +from dragon import fli +import dragon.channels as dch # isort: on - +import sys import smartsim._core.mli.comm.channel.channel as cch from smartsim.log import get_logger @@ -45,14 +39,25 @@ class DragonFLIChannel(cch.CommChannelBase): """Passes messages by writing to a Dragon FLI Channel""" - def __init__(self, fli_desc: bytes) -> None: + def __init__(self, fli_desc: str) -> None: """Initialize the DragonFLIChannel instance""" super().__init__(fli_desc) # todo: do we need memory pool information to construct the channel correctly? 
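With recv() added to CommChannelBase alongside send(), both sides of a channel can be exercised without a live Dragon runtime. A possible in-memory stand-in for unit tests, not part of this patch; the class name and queue backing are assumptions:

import queue

import smartsim._core.mli.comm.channel.channel as cch


class InMemoryCommChannel(cch.CommChannelBase):
    """Loopback channel backed by a local queue, for tests without Dragon."""

    def __init__(self, descriptor: bytes = b"in-memory") -> None:
        super().__init__(descriptor)
        self._queue: "queue.Queue[bytes]" = queue.Queue()

    def send(self, value: bytes) -> None:
        """Store the message so a later recv() can return it."""
        self._queue.put(value)

    def recv(self) -> bytes:
        """Return the oldest pending message, or empty bytes if none is waiting."""
        try:
            return self._queue.get_nowait()
        except queue.Empty:
            return b""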
- self._channel: "dch.Channel" = fli.FLInterface.attach(fli_desc) + self._channel: "fli" = fli.FLInterface.attach(fli_desc) def send(self, value: bytes) -> None: - """Send a message throuh the underlying communication channel + """Send a message through the underlying communication channel :param value: The value to send""" with self._channel.sendh(timeout=None) as sendh: sendh.send_bytes(value) + + def recv(self) -> bytes: + """Receieve a message through the underlying communication channel + :returns: the received message""" + with self._channel.recvh(timeout=None) as recvh: + try: + request_bytes: bytes + request_bytes, _ = recvh.recv_bytes(timeout=None) + return request_bytes + except fli.FLIEOT as exc: + return b'' \ No newline at end of file diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py index f0cae497a0..6f31972727 100644 --- a/smartsim/_core/mli/infrastructure/control/workermanager.py +++ b/smartsim/_core/mli/infrastructure/control/workermanager.py @@ -27,14 +27,10 @@ import sys # isort: off -try: - import dragon - from dragon import fli -except ImportError as exc: - if not "pytest" in sys.modules: - raise exc from None - +import dragon +from dragon import fli # isort: on + import time import typing as t @@ -169,7 +165,7 @@ class WorkerManager(Service): def __init__( self, - file_like_interface: "fli.FLInterface", + task_queue: CommChannelBase, worker: MachineLearningWorkerBase, feature_store: t.Optional[FeatureStore] = None, as_service: bool = False, @@ -189,7 +185,7 @@ def __init__( super().__init__(as_service, cooldown) """a collection of workers the manager is controlling""" - self._task_queue: fli.FLInterface = file_like_interface + self._task_queue: CommChannelBase = task_queue """the queue the manager monitors for new tasks""" self._feature_store: t.Optional[FeatureStore] = feature_store """a feature store to retrieve models from""" @@ -241,11 +237,7 @@ def _on_iteration(self) -> None: return # perform default deserialization of the message envelope - with self._task_queue.recvh(timeout=None) as recvh: - try: - request_bytes, _ = recvh.recv_bytes(timeout=None) - except fli.FLIEOT as exc: - return + request_bytes = self._task_queue.recv() request = deserialize_message( request_bytes, self._comm_channel_type, self._device diff --git a/smartsim/_core/mli/infrastructure/worker/worker.py b/smartsim/_core/mli/infrastructure/worker/worker.py index fe82ea2a3e..808c9cf9bf 100644 --- a/smartsim/_core/mli/infrastructure/worker/worker.py +++ b/smartsim/_core/mli/infrastructure/worker/worker.py @@ -24,6 +24,10 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+# isort: off +from dragon.utils import b64decode +# isort: on + import sys import typing as t from abc import ABC, abstractmethod @@ -34,13 +38,6 @@ from ...infrastructure.storage.featurestore import FeatureStore from ...mli_schemas.model.model_capnp import Model -# isort: off -try: - from dragon.utils import b64decode -except ImportError as exc: - if "pytest" not in sys.modules: - raise exc from None -# isort: on logger = get_logger(__name__) From 1fcf17d4456f99a6ad34d6360879e2e2a2b24f12 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Wed, 10 Jul 2024 11:06:08 -0500 Subject: [PATCH 26/84] Add feature store prototype caching --- ex/high_throughput_inference/mli_driver.py | 7 +- ex/high_throughput_inference/mock_app.py | 19 +++- .../standalone_workermanager.py | 10 +- smartsim/_core/entrypoints/service.py | 17 ++++ .../_core/mli/comm/channel/dragonchannel.py | 3 +- smartsim/_core/mli/comm/channel/dragonfli.py | 4 +- .../infrastructure/control/workermanager.py | 96 ++++++++++++++++--- .../storage/dragonfeaturestore.py | 15 ++- .../infrastructure/storage/featurestore.py | 5 +- .../_core/mli/infrastructure/worker/worker.py | 10 +- tests/mli/test_worker_manager.py | 8 +- 11 files changed, 147 insertions(+), 47 deletions(-) diff --git a/ex/high_throughput_inference/mli_driver.py b/ex/high_throughput_inference/mli_driver.py index 4a3dd034e8..4e68fdfbcb 100644 --- a/ex/high_throughput_inference/mli_driver.py +++ b/ex/high_throughput_inference/mli_driver.py @@ -8,6 +8,7 @@ from smartsim._core.mli.infrastructure.worker.torch_worker import TorchWorker from smartsim.status import TERMINAL_STATUSES import time +import typing as t device = "gpu" filedir = os.path.dirname(__file__) @@ -15,7 +16,11 @@ app_script_name = os.path.join(filedir, "mock_app.py") model_name = os.path.join(filedir, f"resnet50.{device.upper()}.pt") -exp_path = os.path.join(filedir, "MLI_proto") +transport: t.Literal["hsta", "tcp"] = "hsta" + +os.environ["SMARTSIM_DRAGON_TRANSPORT"] = transport + +exp_path = os.path.join(filedir, f"MLI_proto_{transport.upper()}") os.makedirs(exp_path, exist_ok=True) exp = Experiment("MLI_proto", launcher="dragon", exp_path=exp_path) diff --git a/ex/high_throughput_inference/mock_app.py b/ex/high_throughput_inference/mock_app.py index df0ba55c76..4ecce58ac7 100644 --- a/ex/high_throughput_inference/mock_app.py +++ b/ex/high_throughput_inference/mock_app.py @@ -31,7 +31,7 @@ import dragon.channels from dragon.data.ddict.ddict import DDict from dragon.globalservices.api_setup import connect_to_infrastructure -from dragon.utils import b64decode +from dragon.utils import b64decode, b64encode # isort: on @@ -107,7 +107,7 @@ def print_timings(self, to_file: bool = False): numpy.savetxt("timings.txt", value_array) - def run_model(self, model: bytes, batch: torch.Tensor): + def run_model(self, model: bytes | str, batch: torch.Tensor): self.start_timings(batch.shape[0]) built_tensor = MessageHandler.build_tensor( batch.numpy(), "c", "float32", list(batch.shape)) @@ -143,10 +143,14 @@ def run_model(self, model: bytes, batch: torch.Tensor): self.end_timings() return result + def set_model(self, key: str, model: bytes): + self._ddict[key] = b64encode(model) + class ResNetWrapper(): - def __init__(self, model: str): + def __init__(self, name: str, model: str): self._model = torch.jit.load(model) + self._name = name buffer = io.BytesIO() scripted = torch.jit.trace(self._model, self.get_batch()) torch.jit.save(scripted, buffer) @@ -159,15 +163,20 @@ def get_batch(self, batch_size: int=32): def model(self): return 
self._serialized_model + @property + def name(self): + return self._name + if __name__ == "__main__": parser = argparse.ArgumentParser("Mock application") parser.add_argument("--device", default="cpu") args = parser.parse_args() - resnet = ResNetWrapper(f"resnet50.{args.device.upper()}.pt") + resnet = ResNetWrapper("resnet50", f"resnet50.{args.device.upper()}.pt") client = ProtoClient(timing_on=True) + client.set_model(resnet.name, resnet.model) total_iterations = 100 @@ -175,6 +184,6 @@ def model(self): logger.info(f"Batch size: {batch_size}") for iteration_number in range(total_iterations + int(batch_size==1)): logger.info(f"Iteration: {iteration_number}") - client.run_model(resnet.model, resnet.get_batch(batch_size)) + client.run_model(resnet.name, resnet.get_batch(batch_size)) client.print_timings(to_file=True) \ No newline at end of file diff --git a/ex/high_throughput_inference/standalone_workermanager.py b/ex/high_throughput_inference/standalone_workermanager.py index 991e869581..f3e8e7c589 100644 --- a/ex/high_throughput_inference/standalone_workermanager.py +++ b/ex/high_throughput_inference/standalone_workermanager.py @@ -38,11 +38,11 @@ import os from smartsim._core.mli.comm.channel.dragonchannel import DragonCommChannel +from smartsim._core.mli.infrastructure.storage.dragonfeaturestore import DragonFeatureStore from smartsim._core.mli.comm.channel.dragonfli import DragonFLIChannel from smartsim._core.mli.infrastructure.worker.torch_worker import TorchWorker -from smartsim._core.mli.infrastructure.control.workermanager import ( - WorkerManager, -) +from smartsim._core.mli.infrastructure.control.workermanager import WorkerManager + if __name__ == "__main__": parser = argparse.ArgumentParser("Worker Manager") @@ -74,11 +74,13 @@ ddict["to_worker_fli"] = to_worker_fli_serialized torch_worker = cloudpickle.loads(base64.b64decode(args.worker_class.encode('ascii')))() + + dfs = DragonFeatureStore(ddict) comm_channel = DragonFLIChannel(to_worker_fli_serialized) worker_manager = WorkerManager( task_queue=comm_channel, worker=torch_worker, - feature_store=None, + feature_store=dfs, as_service=True, cooldown=10, comm_channel_type=DragonCommChannel, diff --git a/smartsim/_core/entrypoints/service.py b/smartsim/_core/entrypoints/service.py index 6b4ef74b67..df9c2bbef6 100644 --- a/smartsim/_core/entrypoints/service.py +++ b/smartsim/_core/entrypoints/service.py @@ -103,6 +103,23 @@ def execute(self) -> None: running = True cooldown_start: t.Optional[datetime.datetime] = None + headers = [ + "batch_size", + "w_deserialize", + "w_fetch_model", + "w_load_model", + "w_fetch_input", + "w_transform_input", + "w_execute", + "w_transform_output", + "w_assign_output", + "w_build_reply", + "w_serialize_resp", + "w_send", + ] + + print(",".join(headers)) + while running: self._on_iteration() diff --git a/smartsim/_core/mli/comm/channel/dragonchannel.py b/smartsim/_core/mli/comm/channel/dragonchannel.py index fb1a0c51c1..1409747a91 100644 --- a/smartsim/_core/mli/comm/channel/dragonchannel.py +++ b/smartsim/_core/mli/comm/channel/dragonchannel.py @@ -56,4 +56,5 @@ def recv(self) -> bytes: """Receieve a message through the underlying communication channel :returns: the received message""" with self._channel.recvh(timeout=None) as recvh: - return recvh.recv_bytes(timeout=None) \ No newline at end of file + message_bytes: bytes = recvh.recv_bytes(timeout=None) + return message_bytes diff --git a/smartsim/_core/mli/comm/channel/dragonfli.py b/smartsim/_core/mli/comm/channel/dragonfli.py index 
ebf824b7db..0c1aba94e3 100644 --- a/smartsim/_core/mli/comm/channel/dragonfli.py +++ b/smartsim/_core/mli/comm/channel/dragonfli.py @@ -27,9 +27,11 @@ # isort: off from dragon import fli import dragon.channels as dch + # isort: on import sys + import smartsim._core.mli.comm.channel.channel as cch from smartsim.log import get_logger @@ -60,4 +62,4 @@ def recv(self) -> bytes: request_bytes, _ = recvh.recv_bytes(timeout=None) return request_bytes except fli.FLIEOT as exc: - return b'' \ No newline at end of file + return b"" diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py index 6f31972727..d3cc2d84ae 100644 --- a/smartsim/_core/mli/infrastructure/control/workermanager.py +++ b/smartsim/_core/mli/infrastructure/control/workermanager.py @@ -29,6 +29,7 @@ # isort: off import dragon from dragon import fli + # isort: on import time @@ -36,18 +37,20 @@ import numpy as np -from smartsim._core.entrypoints.service import Service -from smartsim._core.mli.comm.channel.channel import CommChannelBase -from smartsim._core.mli.comm.channel.dragonfli import DragonFLIChannel -from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStore -from smartsim._core.mli.infrastructure.worker.worker import ( +from .....error import SmartSimError +from .....log import get_logger +from ....entrypoints.service import Service +from ...comm.channel.channel import CommChannelBase +from ...comm.channel.dragonfli import DragonFLIChannel +from ...infrastructure.storage.featurestore import FeatureStore +from ...infrastructure.worker.worker import ( InferenceReply, InferenceRequest, + LoadModelResult, MachineLearningWorkerBase, ) -from smartsim._core.mli.message_handler import MessageHandler -from smartsim._core.mli.mli_schemas.response.response_capnp import Response -from smartsim.log import get_logger +from ...message_handler import MessageHandler +from ...mli_schemas.response.response_capnp import Response if t.TYPE_CHECKING: from smartsim._core.mli.mli_schemas.model.model_capnp import Model @@ -195,6 +198,8 @@ def __init__( """The type of communication channel to construct for callbacks""" self._device = device """Device on which workers need to run""" + self._cached_models: dict[str, t.Any] = {} + """Dictionary of previously loaded models""" def _validate_request(self, request: InferenceRequest) -> bool: """Ensure the request can be processed. 
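The _cached_models dictionary added above supports the fetch-or-load flow implemented in the next hunk: reuse an already-loaded model when the key is known, otherwise retry the feature-store fetch until the producer has stored the bytes. Roughly, as a standalone helper (names and the attempt cap are illustrative; the patch retries indefinitely):

import time
import typing as t


def fetch_or_load(
    key: str,
    cache: t.Dict[str, t.Any],
    fetch: t.Callable[[str], bytes],
    load: t.Callable[[bytes], t.Any],
    retry_delay: float = 0.1,
    max_attempts: int = 50,
) -> t.Any:
    """Return a loaded model, reusing the cache and retrying the fetch until
    the serialized model appears in the feature store."""
    if key in cache:
        return cache[key]
    for _ in range(max_attempts):
        try:
            raw = fetch(key)     # raises KeyError until the producer stores the model
        except KeyError:
            time.sleep(retry_delay)
        else:
            cache[key] = load(raw)
            return cache[key]
    raise KeyError(f"model {key!r} never appeared in the feature store")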
@@ -236,34 +241,84 @@ def _on_iteration(self) -> None: logger.warning("No queue to check for tasks") return + timings = [] # perform default deserialization of the message envelope - request_bytes = self._task_queue.recv() + request_bytes: bytes = self._task_queue.recv() + interm = time.perf_counter() request = deserialize_message( request_bytes, self._comm_channel_type, self._device ) if not self._validate_request(request): return - fetch_model_result = self._worker.fetch_model(request, self._feature_store) - model_result = self._worker.load_model( - request, fetch_model_result, self._device - ) + timings.append(time.perf_counter() - interm) + interm = time.perf_counter() + + if not request.raw_model: + if not request.model_key: + raise SmartSimError("Neither key, nor model provided") + + if request.model_key in self._cached_models: + timings.append(time.perf_counter() - interm) + interm = time.perf_counter() + model_result = LoadModelResult(self._cached_models[request.model_key]) + + else: + fetch_model_result = None + while True: + try: + interm = time.perf_counter() + fetch_model_result = self._worker.fetch_model( + request, self._feature_store + ) + except KeyError: + time.sleep(0.1) + else: + break + + if fetch_model_result is None: + raise SmartSimError("Could not retrieve model from feature store") + timings.append(time.perf_counter() - interm) + interm = time.perf_counter() + model_result = self._worker.load_model( + request, fetch_model_result, self._device + ) + self._cached_models[request.model_key] = model_result.model + else: + fetch_model_result = self._worker.fetch_model(request, None) + model_result = self._worker.load_model( + request, fetch_result=fetch_model_result, device=self._device + ) + + timings.append(time.perf_counter() - interm) + interm = time.perf_counter() fetch_input_result = self._worker.fetch_inputs(request, self._feature_store) + + timings.append(time.perf_counter() - interm) + interm = time.perf_counter() transformed_input = self._worker.transform_input( request, fetch_input_result, self._device ) + timings.append(time.perf_counter() - interm) + interm = time.perf_counter() + reply = InferenceReply() try: execute_result = self._worker.execute( request, model_result, transformed_input ) + + timings.append(time.perf_counter() - interm) + interm = time.perf_counter() transformed_output = self._worker.transform_output( request, execute_result, self._device ) + timings.append(time.perf_counter() - interm) + interm = time.perf_counter() if request.output_keys: reply.output_keys = self._worker.place_output( request, transformed_output, self._feature_store @@ -274,6 +329,9 @@ def _on_iteration(self) -> None: logger.exception("Error executing worker") reply.failed = True + timings.append(time.perf_counter() - interm) + interm = time.perf_counter() + if reply.failed: response = build_failure_reply("fail", "failure-occurred") else: @@ -282,10 +340,22 @@ def _on_iteration(self) -> None: response = build_reply(reply) + timings.append(time.perf_counter() - interm) + interm = time.perf_counter() + + # serialized = self._worker.serialize_reply(request, transformed_output) serialized_resp = MessageHandler.serialize_response(response) # type: ignore + + timings.append(time.perf_counter() - interm) + interm = time.perf_counter() if request.callback: request.callback.send(serialized_resp) + timings.append(time.perf_counter() - interm) + interm = time.perf_counter() + + print(" ".join(str(time) for time in timings)) + def _can_shutdown(self) -> bool: """Return true when the 
criteria to shut down the service are met.""" # todo: determine shutdown criteria diff --git a/smartsim/_core/mli/infrastructure/storage/dragonfeaturestore.py b/smartsim/_core/mli/infrastructure/storage/dragonfeaturestore.py index 53f2f461f8..fbd18438f5 100644 --- a/smartsim/_core/mli/infrastructure/storage/dragonfeaturestore.py +++ b/smartsim/_core/mli/infrastructure/storage/dragonfeaturestore.py @@ -44,22 +44,27 @@ def __init__(self, storage: "DragonDict") -> None: """Initialize the DragonFeatureStore instance""" self._storage = storage - def __getitem__(self, key: str) -> t.Any: + def __getitem__(self, key: str) -> t.Union[str, bytes]: """Retrieve an item using key :param key: Unique key of an item to retrieve from the feature store""" try: - return self._storage[key] + value: t.Union[str, bytes] = self._storage[key] + return value + except KeyError as ex: + raise ex except Exception as ex: # note: explicitly avoid round-trip to check for key existence - raise sse.SmartSimError(f"{key} not found in feature store") from ex + raise sse.SmartSimError( + f"Could not get value for existing key {key}, error:\n{ex}" + ) from ex - def __setitem__(self, key: str, value: str) -> None: + def __setitem__(self, key: str, value: t.Union[str, bytes]) -> None: """Assign a value using key :param key: Unique key of an item to set in the feature store :param value: Value to persist in the feature store""" self._storage[key] = value - def __contains__(self, key: t.Union[str]) -> bool: + def __contains__(self, key: str) -> bool: """Membership operator to test for a key existing within the feature store. Return `True` if the key is found, `False` otherwise :param key: Unique key of an item to retrieve from the feature store""" diff --git a/smartsim/_core/mli/infrastructure/storage/featurestore.py b/smartsim/_core/mli/infrastructure/storage/featurestore.py index e18643e932..553e13b10f 100644 --- a/smartsim/_core/mli/infrastructure/storage/featurestore.py +++ b/smartsim/_core/mli/infrastructure/storage/featurestore.py @@ -24,6 +24,7 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +import typing as t from abc import ABC, abstractmethod @@ -32,12 +33,12 @@ class FeatureStore(ABC): values from a feature store implementation""" @abstractmethod - def __getitem__(self, key: str) -> bytes: + def __getitem__(self, key: str) -> t.Union[str, bytes]: """Retrieve an item using key :param key: Unique key of an item to retrieve from the feature store""" @abstractmethod - def __setitem__(self, key: str, value: str) -> None: + def __setitem__(self, key: str, value: t.Union[str, bytes]) -> None: """Assign a value using key :param key: Unique key of an item to set in the feature store :param value: Value to persist in the feature store""" diff --git a/smartsim/_core/mli/infrastructure/worker/worker.py b/smartsim/_core/mli/infrastructure/worker/worker.py index 808c9cf9bf..900a8241de 100644 --- a/smartsim/_core/mli/infrastructure/worker/worker.py +++ b/smartsim/_core/mli/infrastructure/worker/worker.py @@ -24,11 +24,6 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
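With the FeatureStore interface widened to str-or-bytes values, any mapping-like object can stand in for the Dragon-backed store. A dictionary-backed sketch that may be convenient in unit tests; the class name is an assumption, not something this patch adds:

import typing as t

from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStore


class MemoryFeatureStore(FeatureStore):
    """In-memory stand-in for the Dragon-backed feature store."""

    def __init__(self) -> None:
        self._storage: t.Dict[str, t.Union[str, bytes]] = {}

    def __getitem__(self, key: str) -> t.Union[str, bytes]:
        return self._storage[key]   # missing keys raise KeyError, as callers expect

    def __setitem__(self, key: str, value: t.Union[str, bytes]) -> None:
        self._storage[key] = value

    def __contains__(self, key: str) -> bool:
        return key in self._storage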
-# isort: off -from dragon.utils import b64decode -# isort: on - -import sys import typing as t from abc import ABC, abstractmethod @@ -38,7 +33,6 @@ from ...infrastructure.storage.featurestore import FeatureStore from ...mli_schemas.model.model_capnp import Model - logger = get_logger(__name__) @@ -174,7 +168,7 @@ def fetch_model( ) try: - raw_bytes = b64decode(feature_store[request.model_key]) + raw_bytes: bytes = t.cast(bytes, feature_store[request.model_key]) return FetchModelResult(raw_bytes) except FileNotFoundError as ex: logger.exception(ex) @@ -202,7 +196,7 @@ def fetch_inputs( data: t.List[bytes] = [] for input_ in request.input_keys: try: - tensor_bytes = feature_store[input_] + tensor_bytes = t.cast(bytes, feature_store[input_]) data.append(tensor_bytes) except KeyError as ex: logger.exception(ex) diff --git a/tests/mli/test_worker_manager.py b/tests/mli/test_worker_manager.py index 46cae5b2e4..62bd711ebb 100644 --- a/tests/mli/test_worker_manager.py +++ b/tests/mli/test_worker_manager.py @@ -32,11 +32,7 @@ import pytest -should_run = True -try: - import torch -except ImportError: - should_run = False +pytest.importorskip("torch") from smartsim._core.mli.infrastructure.control.workermanager import WorkerManager from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStore @@ -51,8 +47,6 @@ # The tests in this file belong to the group_a group pytestmark = pytest.mark.group_a -pytest.mark.skipif(not should_run, "Test needs PyTorch to run") - def mock_work(worker_manager_queue: "mp.Queue[bytes]") -> None: """Mock event producer for triggering the inference pipeline""" From d76f88014cebe7a76175b06178d27ca32195841d Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Wed, 10 Jul 2024 13:10:08 -0500 Subject: [PATCH 27/84] Add redis driver, fix FLI --- ex/high_throughput_inference/mock_app.py | 10 ++- .../mock_app_redis.py | 88 +++++++++++++++++++ ex/high_throughput_inference/redis_driver.py | 65 ++++++++++++++ smartsim/_core/mli/comm/channel/dragonfli.py | 12 ++- .../infrastructure/control/workermanager.py | 2 +- 5 files changed, 170 insertions(+), 7 deletions(-) create mode 100644 ex/high_throughput_inference/mock_app_redis.py create mode 100644 ex/high_throughput_inference/redis_driver.py diff --git a/ex/high_throughput_inference/mock_app.py b/ex/high_throughput_inference/mock_app.py index 4ecce58ac7..45246db2e5 100644 --- a/ex/high_throughput_inference/mock_app.py +++ b/ex/high_throughput_inference/mock_app.py @@ -112,9 +112,14 @@ def run_model(self, model: bytes | str, batch: torch.Tensor): built_tensor = MessageHandler.build_tensor( batch.numpy(), "c", "float32", list(batch.shape)) self.measure_time("build_tensor") + built_model = None + if isinstance(model, str): + model_arg = MessageHandler.build_model_key(model) + else: + model_arg = MessageHandler.build_model(model, "resnet-50", "1.0") request = MessageHandler.build_request( reply_channel=self._from_worker_ch_serialized, - model=MessageHandler.build_model(model, "resnet-50", "1.0"), + model= model_arg, inputs=[built_tensor], outputs=[], output_descriptors=[], @@ -125,6 +130,7 @@ def run_model(self, model: bytes | str, batch: torch.Tensor): self.measure_time("serialize_request") with self._to_worker_fli.sendh(timeout=None, stream_channel=self._to_worker_ch) as to_sendh: to_sendh.send_bytes(request_bytes) + logger.info(f"Message size: {len(request_bytes)} bytes") self.measure_time("send") with self._from_worker_ch.recvh(timeout=None) as from_recvh: @@ -144,7 +150,7 @@ def run_model(self, model: bytes | str, batch: 
torch.Tensor): return result def set_model(self, key: str, model: bytes): - self._ddict[key] = b64encode(model) + self._ddict[key] = model class ResNetWrapper(): diff --git a/ex/high_throughput_inference/mock_app_redis.py b/ex/high_throughput_inference/mock_app_redis.py new file mode 100644 index 0000000000..c56b4fb8b4 --- /dev/null +++ b/ex/high_throughput_inference/mock_app_redis.py @@ -0,0 +1,88 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
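set_model() now stores the raw serialized model under a plain key, so run_model() can reference it either by that key or by shipping the bytes inline. The branch it takes, isolated into a helper for clarity (the helper itself is illustrative; the MessageHandler calls mirror the patch):

import typing as t

from smartsim._core.mli.message_handler import MessageHandler


def model_argument(model: t.Union[str, bytes]) -> t.Any:
    """Return the schema object describing the model for build_request()."""
    if isinstance(model, str):
        # key-based: the worker manager fetches (and can cache) the bytes itself
        return MessageHandler.build_model_key(model)
    # inline: ship the serialized TorchScript model with every request
    return MessageHandler.build_model(model, "resnet-50", "1.0")

Key-based requests stay small and let the worker manager cache the loaded model; inline models avoid the feature-store lookup at the cost of resending the weights on every call.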
+ +import argparse +import io +import numpy +import time +import torch +from smartsim.log import get_logger +from smartredis import Client + +logger = get_logger("App") + +class ResNetWrapper(): + def __init__(self, name: str, model: str): + self._model = torch.jit.load(model) + self._name = name + buffer = io.BytesIO() + scripted = torch.jit.trace(self._model, self.get_batch()) + torch.jit.save(scripted, buffer) + self._serialized_model = buffer.getvalue() + + def get_batch(self, batch_size: int=32): + return torch.randn((batch_size, 3, 224, 224), dtype=torch.float32) + + @property + def model(self): + return self._serialized_model + + @property + def name(self): + return self._name + +if __name__ == "__main__": + + parser = argparse.ArgumentParser("Mock application") + parser.add_argument("--device", default="cpu") + args = parser.parse_args() + + resnet = ResNetWrapper("resnet50", f"resnet50.{args.device.upper()}.pt") + + client = Client(cluster=False, address=None) + client.set_model(resnet.name, resnet.model, backend='TORCH', device=args.device.upper()) + + total_iterations = 100 + timings=[] + for batch_size in [1, 2, 4, 8, 16, 32, 64, 128]: + logger.info(f"Batch size: {batch_size}") + for iteration_number in range(total_iterations + int(batch_size==1)): + timing = [batch_size] + logger.info(f"Iteration: {iteration_number}") + start = time.perf_counter() + client.put_tensor(name="batch", data=resnet.get_batch(batch_size).numpy()) + client.run_model(name=resnet.name, inputs=["batch"], outputs=["result"]) + result = client.get_tensor(name="result") + end = time.perf_counter() + timing.append(end-start) + timings.append(timing) + + + + timings_np = numpy.asarray(timings) + numpy.save("timings.npy", timings_np) + for timing in timings: + print(" ".join(str(t) for t in timing)) diff --git a/ex/high_throughput_inference/redis_driver.py b/ex/high_throughput_inference/redis_driver.py new file mode 100644 index 0000000000..ceddba4ef7 --- /dev/null +++ b/ex/high_throughput_inference/redis_driver.py @@ -0,0 +1,65 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
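The Redis baseline above writes one row per iteration to timings.npy, batch size in column 0 and end-to-end latency in column 1. One way the file could be summarized afterwards (only the file name and layout come from the script; the rest is illustrative):

import numpy as np

timings = np.load("timings.npy")                 # shape: (iterations, 2)
for batch_size in np.unique(timings[:, 0]):
    rows = timings[timings[:, 0] == batch_size]
    latencies_ms = rows[:, 1] * 1e3
    print(
        f"batch={int(batch_size):4d}  "
        f"mean={latencies_ms.mean():8.3f} ms  "
        f"p50={np.percentile(latencies_ms, 50):8.3f} ms"
    )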
+ +import os +import sys +from smartsim import Experiment +from smartsim.status import TERMINAL_STATUSES +import time +import typing as t + +device = "gpu" +filedir = os.path.dirname(__file__) +app_script_name = os.path.join(filedir, "mock_app_redis.py") +model_name = os.path.join(filedir, f"resnet50.{device.upper()}.pt") + + +exp_path = os.path.join(filedir, "redis_ai") +os.makedirs(exp_path, exist_ok=True) +exp = Experiment("redis_ai", launcher="slurm", exp_path=exp_path) + +db = exp.create_database(interface="hsn0") + +app_rs = exp.create_run_settings(sys.executable, exe_args = [app_script_name, "--device", device]) +app_rs.set_nodes(1) +app_rs.set_tasks(1) +app = exp.create_model("app", run_settings=app_rs) +app.attach_generator_files(to_copy=[app_script_name], to_symlink=[model_name]) + +exp.generate(db, app, overwrite=True) + +exp.start(db, app, block=False) + +while True: + if exp.get_status(app)[0] in TERMINAL_STATUSES: + exp.stop(db) + break + if exp.get_status(db)[0] in TERMINAL_STATUSES: + exp.stop(app) + break + time.sleep(5) + +print("Exiting.") \ No newline at end of file diff --git a/smartsim/_core/mli/comm/channel/dragonfli.py b/smartsim/_core/mli/comm/channel/dragonfli.py index 0c1aba94e3..eb3175e445 100644 --- a/smartsim/_core/mli/comm/channel/dragonfli.py +++ b/smartsim/_core/mli/comm/channel/dragonfli.py @@ -31,6 +31,7 @@ # isort: on import sys +import typing as t import smartsim._core.mli.comm.channel.channel as cch from smartsim.log import get_logger @@ -41,22 +42,25 @@ class DragonFLIChannel(cch.CommChannelBase): """Passes messages by writing to a Dragon FLI Channel""" - def __init__(self, fli_desc: str) -> None: + def __init__(self, fli_desc: str, sender_supplied: bool = True) -> None: """Initialize the DragonFLIChannel instance""" super().__init__(fli_desc) # todo: do we need memory pool information to construct the channel correctly? 
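The dragonfli change below keeps the attached FLI separate from an optional process-local stream channel that each sender supplies to its send handle. The resulting send path looks roughly like this (names mirror the patch, simplified for illustration):

from dragon import fli
from dragon.channels import Channel


def make_sender(serialized_fli: bytes):
    """Attach to the FLI and bind a sender-supplied stream channel to it."""
    interface = fli.FLInterface.attach(serialized_fli)
    stream = Channel.make_process_local()        # each sender owns its stream channel

    def send(payload: bytes) -> None:
        with interface.sendh(timeout=None, stream_channel=stream) as sendh:
            sendh.send_bytes(payload)

    return send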
- self._channel: "fli" = fli.FLInterface.attach(fli_desc) + self._fli: "fli" = fli.FLInterface.attach(fli_desc) + self._channel: t.Optional["dch"] = ( + dch.Channel.make_process_local() if sender_supplied else None + ) def send(self, value: bytes) -> None: """Send a message through the underlying communication channel :param value: The value to send""" - with self._channel.sendh(timeout=None) as sendh: + with self._fli.sendh(timeout=None, stream_channel=self._channel) as sendh: sendh.send_bytes(value) def recv(self) -> bytes: """Receieve a message through the underlying communication channel :returns: the received message""" - with self._channel.recvh(timeout=None) as recvh: + with self._fli.recvh(timeout=None) as recvh: try: request_bytes: bytes request_bytes, _ = recvh.recv_bytes(timeout=None) diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py index d3cc2d84ae..60e263f337 100644 --- a/smartsim/_core/mli/infrastructure/control/workermanager.py +++ b/smartsim/_core/mli/infrastructure/control/workermanager.py @@ -259,7 +259,7 @@ def _on_iteration(self) -> None: if not request.model_key: raise SmartSimError("Neither key, nor model provided") - if request.model_key in self._cached_models: + if False and (request.model_key in self._cached_models): timings.append(time.perf_counter() - interm) interm = time.perf_counter() model_result = LoadModelResult(self._cached_models[request.model_key]) From 3938ec8dbe9964235e6ed4791600257b08b9f3eb Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Thu, 11 Jul 2024 12:27:34 -0500 Subject: [PATCH 28/84] Update post-merge --- ex/high_throughput_inference/mli_driver.py | 1 - .../standalone_workermanager.py | 11 ++- .../infrastructure/control/workermanager.py | 68 +++++++++---------- .../mli/infrastructure/environmentloader.py | 11 +-- 4 files changed, 49 insertions(+), 42 deletions(-) diff --git a/ex/high_throughput_inference/mli_driver.py b/ex/high_throughput_inference/mli_driver.py index 4e68fdfbcb..6da559aa6f 100644 --- a/ex/high_throughput_inference/mli_driver.py +++ b/ex/high_throughput_inference/mli_driver.py @@ -30,7 +30,6 @@ worker_manager = exp.create_model("worker_manager", run_settings=worker_manager_rs) worker_manager.attach_generator_files(to_copy=[worker_manager_script_name]) - app_rs = exp.create_run_settings(sys.executable, exe_args = [app_script_name, "--device", device]) app = exp.create_model("app", run_settings=app_rs) app.attach_generator_files(to_copy=[app_script_name], to_symlink=[model_name]) diff --git a/ex/high_throughput_inference/standalone_workermanager.py b/ex/high_throughput_inference/standalone_workermanager.py index f3e8e7c589..c56e11a7c3 100644 --- a/ex/high_throughput_inference/standalone_workermanager.py +++ b/ex/high_throughput_inference/standalone_workermanager.py @@ -35,6 +35,7 @@ import argparse import base64 import cloudpickle +import pickle import os from smartsim._core.mli.comm.channel.dragonchannel import DragonCommChannel @@ -42,6 +43,7 @@ from smartsim._core.mli.comm.channel.dragonfli import DragonFLIChannel from smartsim._core.mli.infrastructure.worker.torch_worker import TorchWorker from smartsim._core.mli.infrastructure.control.workermanager import WorkerManager +from smartsim._core.mli.infrastructure.environmentloader import EnvironmentConfigLoader if __name__ == "__main__": @@ -77,10 +79,15 @@ dfs = DragonFeatureStore(ddict) comm_channel = DragonFLIChannel(to_worker_fli_serialized) + + os.environ["SSFeatureStore"] = 
base64.b64encode(pickle.dumps(dfs)).decode("utf-8") + os.environ["SSQueue"] = base64.b64encode(to_worker_fli_serialized).decode("utf-8") + + config_loader = EnvironmentConfigLoader() + worker_manager = WorkerManager( - task_queue=comm_channel, + config_loader=config_loader, worker=torch_worker, - feature_store=dfs, as_service=True, cooldown=10, comm_channel_type=DragonCommChannel, diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py index eaa77bdf3e..8c06351fb5 100644 --- a/smartsim/_core/mli/infrastructure/control/workermanager.py +++ b/smartsim/_core/mli/infrastructure/control/workermanager.py @@ -41,7 +41,7 @@ from .....log import get_logger from ....entrypoints.service import Service from ...comm.channel.channel import CommChannelBase -from ...comm.channel.dragonfli import DragonFLIChannel +from ...comm.channel.dragonchannel import DragonCommChannel from ...infrastructure.environmentloader import EnvironmentConfigLoader from ...infrastructure.storage.featurestore import FeatureStore from ...infrastructure.worker.worker import ( @@ -175,7 +175,7 @@ def __init__( worker: MachineLearningWorkerBase, as_service: bool = False, cooldown: int = 0, - comm_channel_type: t.Type[CommChannelBase] = DragonFLIChannel, + comm_channel_type: t.Type[CommChannelBase] = DragonCommChannel, device: t.Literal["cpu", "gpu"] = "cpu", ) -> None: """Initialize the WorkerManager @@ -244,34 +244,34 @@ def _on_iteration(self) -> None: logger.warning("No queue to check for tasks") return - timings = [] + timings = [] # timing # perform default deserialization of the message envelope request_bytes: bytes = self._task_queue.recv() - interm = time.perf_counter() + interm = time.perf_counter() # timing request = deserialize_message( request_bytes, self._comm_channel_type, self._device ) if not self._validate_request(request): return - timings.append(time.perf_counter() - interm) - interm = time.perf_counter() + timings.append(time.perf_counter() - interm) # timing + interm = time.perf_counter() # timing if not request.raw_model: - if not request.model_key: - raise SmartSimError("Neither key, nor model provided") - - if False and (request.model_key in self._cached_models): - timings.append(time.perf_counter() - interm) - interm = time.perf_counter() + if request.model_key is None: + # A valid request should never get here. 
+ raise ValueError("Could not read model key") + if request.model_key in self._cached_models: + timings.append(time.perf_counter() - interm) # timing + interm = time.perf_counter() # timing model_result = LoadModelResult(self._cached_models[request.model_key]) else: fetch_model_result = None while True: try: - interm = time.perf_counter() + interm = time.perf_counter() # timing fetch_model_result = self._worker.fetch_model( request, self._feature_store ) @@ -282,8 +282,8 @@ def _on_iteration(self) -> None: if fetch_model_result is None: raise SmartSimError("Could not retrieve model from feature store") - timings.append(time.perf_counter() - interm) - interm = time.perf_counter() + timings.append(time.perf_counter() - interm) # timing + interm = time.perf_counter() # timing model_result = self._worker.load_model( request, fetch_model_result, self._device ) @@ -294,18 +294,18 @@ def _on_iteration(self) -> None: request, fetch_result=fetch_model_result, device=self._device ) - timings.append(time.perf_counter() - interm) - interm = time.perf_counter() + timings.append(time.perf_counter() - interm) # timing + interm = time.perf_counter() # timing fetch_input_result = self._worker.fetch_inputs(request, self._feature_store) - timings.append(time.perf_counter() - interm) - interm = time.perf_counter() + timings.append(time.perf_counter() - interm) # timing + interm = time.perf_counter() # timing transformed_input = self._worker.transform_input( request, fetch_input_result, self._device ) - timings.append(time.perf_counter() - interm) - interm = time.perf_counter() + timings.append(time.perf_counter() - interm) # timing + interm = time.perf_counter() # timing reply = InferenceReply() @@ -314,14 +314,14 @@ def _on_iteration(self) -> None: request, model_result, transformed_input ) - timings.append(time.perf_counter() - interm) - interm = time.perf_counter() + timings.append(time.perf_counter() - interm) # timing + interm = time.perf_counter() # timing transformed_output = self._worker.transform_output( request, execute_result, self._device ) - timings.append(time.perf_counter() - interm) - interm = time.perf_counter() + timings.append(time.perf_counter() - interm) # timing + interm = time.perf_counter() # timing if request.output_keys: reply.output_keys = self._worker.place_output( request, transformed_output, self._feature_store @@ -332,8 +332,8 @@ def _on_iteration(self) -> None: logger.exception("Error executing worker") reply.failed = True - timings.append(time.perf_counter() - interm) - interm = time.perf_counter() + timings.append(time.perf_counter() - interm) # timing + interm = time.perf_counter() # timing if reply.failed: response = build_failure_reply("fail", "failure-occurred") @@ -343,21 +343,21 @@ def _on_iteration(self) -> None: response = build_reply(reply) - timings.append(time.perf_counter() - interm) - interm = time.perf_counter() + timings.append(time.perf_counter() - interm) # timing + interm = time.perf_counter() # timing # serialized = self._worker.serialize_reply(request, transformed_output) serialized_resp = MessageHandler.serialize_response(response) # type: ignore - timings.append(time.perf_counter() - interm) - interm = time.perf_counter() + timings.append(time.perf_counter() - interm) # timing + interm = time.perf_counter() # timing if request.callback: request.callback.send(serialized_resp) - timings.append(time.perf_counter() - interm) - interm = time.perf_counter() + timings.append(time.perf_counter() - interm) # timing + interm = time.perf_counter() # timing - 
print(" ".join(str(time) for time in timings)) + print(" ".join(str(time) for time in timings)) # timing def _can_shutdown(self) -> bool: """Return true when the criteria to shut down the service are met.""" diff --git a/smartsim/_core/mli/infrastructure/environmentloader.py b/smartsim/_core/mli/infrastructure/environmentloader.py index 267b668f63..f5e9532103 100644 --- a/smartsim/_core/mli/infrastructure/environmentloader.py +++ b/smartsim/_core/mli/infrastructure/environmentloader.py @@ -32,6 +32,7 @@ from dragon.fli import FLInterface # pylint: disable=all from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStore +from smartsim._core.mli.comm.channel.dragonfli import DragonFLIChannel class EnvironmentConfigLoader: @@ -41,10 +42,10 @@ class EnvironmentConfigLoader: """ def __init__(self) -> None: - self._feature_store_descriptor = os.getenv("SSFeatureStore", None) - self._queue_descriptor = os.getenv("SSQueue", None) + self._feature_store_descriptor: t.Optional[str] = os.getenv("SSFeatureStore", None) + self._queue_descriptor: t.Optional[str] = os.getenv("SSQueue", None) self.feature_store: t.Optional[FeatureStore] = None - self.queue: t.Optional["FLInterface"] = None + self.queue: t.Optional[DragonFLIChannel] = None def get_feature_store(self) -> t.Optional[FeatureStore]: """Loads the Feature Store previously set in SSFeatureStore""" @@ -54,8 +55,8 @@ def get_feature_store(self) -> t.Optional[FeatureStore]: ) return self.feature_store - def get_queue(self) -> t.Optional["FLInterface"]: + def get_queue(self, sender_supplied: bool = True) -> t.Optional[DragonFLIChannel]: """Returns the Queue previously set in SSQueue""" if self._queue_descriptor is not None: - self.queue = FLInterface.attach(base64.b64decode(self._queue_descriptor)) + self.queue = DragonFLIChannel(fli_desc=base64.b64decode(self._queue_descriptor), sender_supplied=sender_supplied) return self.queue From 273a7d952fdcaa89984b654ce4b46c272c1c2bbd Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Thu, 11 Jul 2024 13:15:38 -0500 Subject: [PATCH 29/84] Fix typing --- smartsim/_core/mli/comm/channel/dragonfli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/smartsim/_core/mli/comm/channel/dragonfli.py b/smartsim/_core/mli/comm/channel/dragonfli.py index eb3175e445..75f8fb4bfc 100644 --- a/smartsim/_core/mli/comm/channel/dragonfli.py +++ b/smartsim/_core/mli/comm/channel/dragonfli.py @@ -42,7 +42,7 @@ class DragonFLIChannel(cch.CommChannelBase): """Passes messages by writing to a Dragon FLI Channel""" - def __init__(self, fli_desc: str, sender_supplied: bool = True) -> None: + def __init__(self, fli_desc: bytes, sender_supplied: bool = True) -> None: """Initialize the DragonFLIChannel instance""" super().__init__(fli_desc) # todo: do we need memory pool information to construct the channel correctly? 
From a12d9232914ff9c2cf8def6224a3bb08896b80d9 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Thu, 11 Jul 2024 13:50:35 -0500 Subject: [PATCH 30/84] isort --- .../_core/mli/infrastructure/environmentloader.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/smartsim/_core/mli/infrastructure/environmentloader.py b/smartsim/_core/mli/infrastructure/environmentloader.py index f5e9532103..9f6770623d 100644 --- a/smartsim/_core/mli/infrastructure/environmentloader.py +++ b/smartsim/_core/mli/infrastructure/environmentloader.py @@ -31,8 +31,8 @@ from dragon.fli import FLInterface # pylint: disable=all -from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStore from smartsim._core.mli.comm.channel.dragonfli import DragonFLIChannel +from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStore class EnvironmentConfigLoader: @@ -42,7 +42,9 @@ class EnvironmentConfigLoader: """ def __init__(self) -> None: - self._feature_store_descriptor: t.Optional[str] = os.getenv("SSFeatureStore", None) + self._feature_store_descriptor: t.Optional[str] = os.getenv( + "SSFeatureStore", None + ) self._queue_descriptor: t.Optional[str] = os.getenv("SSQueue", None) self.feature_store: t.Optional[FeatureStore] = None self.queue: t.Optional[DragonFLIChannel] = None @@ -58,5 +60,8 @@ def get_feature_store(self) -> t.Optional[FeatureStore]: def get_queue(self, sender_supplied: bool = True) -> t.Optional[DragonFLIChannel]: """Returns the Queue previously set in SSQueue""" if self._queue_descriptor is not None: - self.queue = DragonFLIChannel(fli_desc=base64.b64decode(self._queue_descriptor), sender_supplied=sender_supplied) + self.queue = DragonFLIChannel( + fli_desc=base64.b64decode(self._queue_descriptor), + sender_supplied=sender_supplied, + ) return self.queue From 38b0de15266288b4a959bbbcb244e131407555ea Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Thu, 11 Jul 2024 14:42:16 -0500 Subject: [PATCH 31/84] Update envloader test --- tests/dragon/test_environment_loader.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tests/dragon/test_environment_loader.py b/tests/dragon/test_environment_loader.py index d339fec885..00db0a9d32 100644 --- a/tests/dragon/test_environment_loader.py +++ b/tests/dragon/test_environment_loader.py @@ -64,10 +64,9 @@ def test_environment_loader_attach_FLI(content, monkeypatch): config = EnvironmentConfigLoader() config_queue = config.get_queue() - new_sender = config_queue.sendh(use_main_as_stream_channel=True) - new_sender.send_bytes(content) + new_sender = config_queue.send(content) - old_recv = queue.recvh(use_main_as_stream_channel=True) + old_recv = queue.recvh() result, _ = old_recv.recv_bytes() assert result == content @@ -81,7 +80,7 @@ def test_environment_loader_serialize_FLI(monkeypatch): config = EnvironmentConfigLoader() config_queue = config.get_queue() - assert config_queue.serialize() == queue.serialize() + assert config_queue._fli.serialize() == queue.serialize() def test_environment_loader_FLI_fails(monkeypatch): From 8223f96e93e716202fa33e3e08b8fc2ecdb29da1 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Mon, 15 Jul 2024 13:16:10 -0500 Subject: [PATCH 32/84] Input not concatenated correctly --- .../_core/mli/comm/channel/dragonchannel.py | 6 +- smartsim/_core/mli/comm/channel/dragonfli.py | 3 +- .../mli/infrastructure/control/__init__.py | 0 .../infrastructure/control/devicemanager.py | 130 ++++++++++ .../control/requestdispatcher.py | 227 ++++++++++++++++++ 
.../infrastructure/control/workermanager.py | 128 ++++------ .../_core/mli/infrastructure/worker/worker.py | 2 +- .../_core/mli/mli_schemas/model/__init__.py | 0 smartsim/_core/mli/mli_schemas/model/utils.py | 41 ++++ 9 files changed, 447 insertions(+), 90 deletions(-) create mode 100644 smartsim/_core/mli/infrastructure/control/__init__.py create mode 100644 smartsim/_core/mli/infrastructure/control/devicemanager.py create mode 100644 smartsim/_core/mli/infrastructure/control/requestdispatcher.py create mode 100644 smartsim/_core/mli/mli_schemas/model/__init__.py create mode 100644 smartsim/_core/mli/mli_schemas/model/utils.py diff --git a/smartsim/_core/mli/comm/channel/dragonchannel.py b/smartsim/_core/mli/comm/channel/dragonchannel.py index 1409747a91..526910b275 100644 --- a/smartsim/_core/mli/comm/channel/dragonchannel.py +++ b/smartsim/_core/mli/comm/channel/dragonchannel.py @@ -31,11 +31,7 @@ logger = get_logger(__name__) -try: - import dragon.channels as dch -except ImportError as exc: - if not "pytest" in sys.modules: - raise exc from None +import dragon.channels as dch class DragonCommChannel(cch.CommChannelBase): diff --git a/smartsim/_core/mli/comm/channel/dragonfli.py b/smartsim/_core/mli/comm/channel/dragonfli.py index 75f8fb4bfc..1c02857eab 100644 --- a/smartsim/_core/mli/comm/channel/dragonfli.py +++ b/smartsim/_core/mli/comm/channel/dragonfli.py @@ -30,7 +30,6 @@ # isort: on -import sys import typing as t import smartsim._core.mli.comm.channel.channel as cch @@ -65,5 +64,5 @@ def recv(self) -> bytes: request_bytes: bytes request_bytes, _ = recvh.recv_bytes(timeout=None) return request_bytes - except fli.FLIEOT as exc: + except fli.FLIEOT: return b"" diff --git a/smartsim/_core/mli/infrastructure/control/__init__.py b/smartsim/_core/mli/infrastructure/control/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/smartsim/_core/mli/infrastructure/control/devicemanager.py b/smartsim/_core/mli/infrastructure/control/devicemanager.py new file mode 100644 index 0000000000..94c2404ead --- /dev/null +++ b/smartsim/_core/mli/infrastructure/control/devicemanager.py @@ -0,0 +1,130 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
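The devicemanager module introduced below is built on a simple pattern: guard each device with an RLock, try a non-blocking acquire on every candidate, and use the first one that succeeds. Stripped of the model bookkeeping, that pattern looks like this (an illustrative stand-in, not the patch code):

import typing as t
from threading import RLock


class GuardedResource:
    """A named resource protected by a reentrant lock."""

    def __init__(self, name: str) -> None:
        self.name = name
        self._lock = RLock()

    def try_acquire(self) -> bool:
        return self._lock.acquire(blocking=False)

    def release(self) -> None:
        self._lock.release()


def first_free(
    resources: t.Sequence["GuardedResource"],
) -> t.Optional["GuardedResource"]:
    """Return the first resource whose lock could be taken, or None if all are busy."""
    for resource in resources:
        if resource.try_acquire():
            return resource      # caller is responsible for release()
    return None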
+ +import typing as t +from contextlib import contextmanager +from threading import RLock +from types import TracebackType + +from ...infrastructure.storage.featurestore import FeatureStore +from ..worker.worker import MachineLearningWorkerBase +from .requestdispatcher import InferenceWork + + +class WorkerDevice: + def __init__(self, name: str) -> None: + """Wrapper around a device to keep track of loaded Models and availability + :param name: name used by the toolkit to identify this device, e.g. ``cuda:0`` + """ + self._name = name + """The name used by the toolkit to identify this device""" + self._lock = RLock() + """Lock to ensure only one thread at the time accesses this device""" + self._models: dict[str, t.Any] = {} + + def acquire(self, blocking: bool = True, timeout: float = -1) -> t.Optional[bool]: + return self._lock.acquire(blocking=blocking, timeout=timeout) + + def release(self) -> None: + self._lock.release() + + def __enter__(self) -> None: + self.acquire() + + @property + def name(self) -> str: + return self._name + + def add_model(self, key: str, model: t.Any) -> None: + self._models[key] = model + + def remove_model(self, key: str) -> None: + self._models.pop(key) + + def get_model(self, key: str) -> t.Any: + return self._models[key] + + def __contains__(self, key: str): + return key in self._models + + def __exit__( + self, + exc_type: t.Optional[t.Type[BaseException]], + exc_val: t.Optional[BaseException], + exc_tb: t.Optional[TracebackType], + ) -> None: + self.release() + + +class DeviceManager: + def __init__(self, devices: list[WorkerDevice]): + self._devices = devices + """Dictionary of model key to devices on which it is loaded""" + + def get_free_device( + self, + worker: MachineLearningWorkerBase, + inference_work: InferenceWork, + feature_store: t.Optional[FeatureStore], + ) -> t.Generator[WorkerDevice, None, None]: + return_device = None + sample_request = inference_work.requests[0] + direct_inference = sample_request.raw_model is not None + while return_device is None: + loaded_devices = [] + if not direct_inference: + # Look up devices to see if any of them already has a copy of the model + for device in self._devices: + if inference_work.model_key in device: + loaded_devices.append(device) + + # If a pre-loaded model is found on a device, try using that device + for device in loaded_devices: + if device.acquire(blocking=False): + return_device = device + + # If the model is not loaded on a free device, load it on another device (if available) + if return_device is None: + for candidate_device in self._devices: + if ( + candidate_device not in loaded_devices + and candidate_device.acquire(blocking=False) + ): + model_bytes = worker.fetch_model(sample_request, feature_store) + loaded_model = worker.load_model( + sample_request, model_bytes, candidate_device.name + ) + candidate_device.add_model( + inference_work.model_key, loaded_model.model + ) + + return_device = candidate_device + + try: + yield return_device + finally: + return_device.remove_model(inference_work.model_key) + return_device.release() diff --git a/smartsim/_core/mli/infrastructure/control/requestdispatcher.py b/smartsim/_core/mli/infrastructure/control/requestdispatcher.py new file mode 100644 index 0000000000..520605c588 --- /dev/null +++ b/smartsim/_core/mli/infrastructure/control/requestdispatcher.py @@ -0,0 +1,227 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +import time +import typing as t +import uuid +from dataclasses import dataclass +from queue import Empty, Full, Queue +from threading import RLock +from types import TracebackType + +from packaging.version import Version + +from ...infrastructure.worker.worker import InferenceRequest +from ...mli_schemas.model.model_capnp import Model + +if t.TYPE_CHECKING: + from dragon.fli import FLInterface + + +@dataclass +class InferenceWork: + model_key: str + requests: list[InferenceRequest] + + +class WorkerDevice: + def __init__(self, name: str) -> None: + """Wrapper around a device to keep track of loaded Models and availability + :param name: name used by the toolkit to identify this device, e.g. 
``cuda:0`` + """ + self._name = name + """The name used by the toolkit to identify this device""" + self._models: dict[str, t.Any] = {} + """Dictionary of model key to model for models stored on this device""" + self._lock = RLock() + """Lock to ensure only one thread at the time accesses this device""" + + def acquire(self, blocking: bool = True, timeout: float = -1) -> t.Optional[bool]: + return self._lock.acquire(blocking=blocking, timeout=timeout) + + def release(self) -> None: + self._lock.release() + + def __enter__(self) -> None: + self.acquire() + + def __exit__( + self, + exc_type: t.Optional[t.Type[BaseException]], + exc_val: t.Optional[BaseException], + exc_tb: t.Optional[TracebackType], + ) -> None: + self.release() + + +class BatchQueue(Queue[InferenceRequest]): + def __init__(self, batch_timeout: float, batch_size: int, model_key: str) -> None: + super().__init__(maxsize=batch_size) + self._batch_timeout = batch_timeout + self._batch_size = batch_size + self._first_put: t.Optional[float] = None + self._disposable = False + self._model_key = model_key + self._flush_lock = RLock() + + def acquire(self, blocking: bool = True, timeout: float = -1) -> t.Optional[bool]: + return self._flush_lock.acquire(blocking=blocking, timeout=timeout) + + def release(self) -> None: + self._flush_lock.release() + + def __enter__(self) -> None: + self.acquire() + + def __exit__( + self, + exc_type: t.Optional[t.Type[BaseException]], + exc_val: t.Optional[BaseException], + exc_tb: t.Optional[TracebackType], + ) -> None: + self.release() + + @property + def model_key(self) -> str: + return self._model_key + + def put( + self, + item: InferenceRequest, + block: bool = False, + timeout: t.Optional[float] = 0.0, + ) -> None: + if not self.acquire(blocking=False) or self.disposable: + raise Full + if self._first_put is None: + self._first_put = time.time() + super().put(item, block=block, timeout=timeout) + + @property + def _waited_time(self) -> float: + if self._first_put is None: + return 0 + return time.time() - self._first_put + + @property + def ready(self) -> bool: + if self.empty(): + return False + + return self.full() or (self._waited_time >= self._batch_timeout) + + def make_disposable(self) -> None: + self._disposable = True + + @property + def disposable(self) -> bool: + return self.empty() and self._disposable + + def flush(self) -> list[t.Any]: + num_items = self.qsize() + self._first_put = None + items = [] + # Avoid (unlikely) race condition error + for _ in range(num_items): + try: + items.append(self.get()) + except Empty: + break + + return items + + def full(self) -> bool: + return self.qsize() >= self._batch_size + + def empty(self) -> bool: + return self.qsize() == 0 + + +class RequestDispatcher: + def __init__( + self, + batch_timeout: float, + batch_size: int, + ) -> None: + self._queues: list[BatchQueue] + self._active_queues: dict[str, BatchQueue] = {} + self._model_last_version: dict[str, Version] = {} + self._model_name_to_key: dict[str, str] = {} + self._batch_timeout = batch_timeout + self._batch_size = batch_size + self._queue_swap_lock = RLock() + + def _swap_queue(self, model_key: str) -> None: + with self._queue_swap_lock: + for queue in self._queues: + if queue.model_key == model_key and not queue.full(): + self._active_queues[model_key] = queue + return + + new_queue = BatchQueue(self._batch_timeout, self._batch_size, model_key) + self._active_queues[model_key] = new_queue + return + + def dispatch(self, request: InferenceRequest) -> None: + if request.raw_model is 
not None: + tmp_id = f"_tmp_{str(uuid.uuid4())}" + tmp_queue: BatchQueue = BatchQueue( + batch_timeout=0, batch_size=1, model_key=tmp_id + ) + self._active_queues[tmp_id] = tmp_queue + tmp_queue.put_nowait(request) + tmp_queue.make_disposable() + return + + if request.model_key: + success = False + while not success: + try: + self._active_queues[request.model_key].put_nowait(request) + success = True + except (Full, KeyError): + self._swap_queue(request.model_key) + + def _update_model_version(self, model: Model) -> None: + if not model.version: + return + if ( + model.name not in self._model_last_version + or Version(model.version) > self._model_last_version[model.name] + ): + self._model_last_version[model.name] = Version(model.version) + return + + def flush_requests(self) -> t.Optional[InferenceWork]: + result = None + for queue in self._queues: + if queue.acquire(blocking=False) and queue.ready: + result = InferenceWork( + model_key=queue.model_key, requests=queue.flush() + ) + queue.release() + break + + return result diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py index 8c06351fb5..9163be4cec 100644 --- a/smartsim/_core/mli/infrastructure/control/workermanager.py +++ b/smartsim/_core/mli/infrastructure/control/workermanager.py @@ -23,22 +23,13 @@ # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import sys - -# isort: off -import dragon -from dragon import fli - -# isort: on - import time import typing as t import numpy as np from .....error import SmartSimError -from .....log import get_logger +from .....log import ContextThread, get_logger from ....entrypoints.service import Service from ...comm.channel.channel import CommChannelBase from ...comm.channel.dragonchannel import DragonCommChannel @@ -52,10 +43,10 @@ ) from ...message_handler import MessageHandler from ...mli_schemas.response.response_capnp import Response +from .devicemanager import DeviceManager, WorkerDevice +from .requestdispatcher import RequestDispatcher if t.TYPE_CHECKING: - from dragon.fli import FLInterface - from smartsim._core.mli.mli_schemas.model.model_capnp import Model from smartsim._core.mli.mli_schemas.response.response_capnp import StatusEnum @@ -65,7 +56,6 @@ def deserialize_message( data_blob: bytes, channel_type: t.Type[CommChannelBase], - device: t.Literal["cpu", "gpu"], ) -> InferenceRequest: """Deserialize a message from a byte stream into an InferenceRequest :param data_blob: The byte stream to deserialize""" @@ -177,6 +167,8 @@ def __init__( cooldown: int = 0, comm_channel_type: t.Type[CommChannelBase] = DragonCommChannel, device: t.Literal["cpu", "gpu"] = "cpu", + batch_timeout: float = 0.0, + batch_size: int = 0, ) -> None: """Initialize the WorkerManager :param config_loader: Environment config loader that loads the task queue and @@ -203,6 +195,35 @@ def __init__( """Device on which workers need to run""" self._cached_models: dict[str, t.Any] = {} """Dictionary of previously loaded models""" + self._request_dispatcher: RequestDispatcher = RequestDispatcher( + batch_timeout=batch_timeout, batch_size=batch_size + ) + """Dispatcher used to batch requests""" + self._dispatcher_threads = 1 + """Number of threads which dispatch requests""" + self._device_manager: DeviceManager = DeviceManager([WorkerDevice("gpu")]) + + def 
_receive_requests(self) -> None: + if self._task_queue is None: + return + while not self._can_shutdown(): + # perform default deserialization of the message envelope + request_bytes: bytes = self._task_queue.recv() + + request = deserialize_message(request_bytes, self._comm_channel_type) + if not self._validate_request(request): + return + + self._request_dispatcher.dispatch(request) + + def _on_start(self) -> None: + for thread_idx in range(self._dispatcher_threads): + dispatcher_thread = ContextThread( + name=f"Dispatcher_{thread_idx}", + target=self._receive_requests, + daemon=True, + ) + dispatcher_thread.start() def _validate_request(self, request: InferenceRequest) -> bool: """Ensure the request can be processed. @@ -244,69 +265,29 @@ def _on_iteration(self) -> None: logger.warning("No queue to check for tasks") return - timings = [] # timing - # perform default deserialization of the message envelope - request_bytes: bytes = self._task_queue.recv() - - interm = time.perf_counter() # timing - request = deserialize_message( - request_bytes, self._comm_channel_type, self._device - ) - if not self._validate_request(request): + inference_work = self._request_dispatcher.flush_requests() + if inference_work is None or 0 == len(inference_work.requests): return - timings.append(time.perf_counter() - interm) # timing - interm = time.perf_counter() # timing + request = inference_work.requests[0] - if not request.raw_model: - if request.model_key is None: - # A valid request should never get here. - raise ValueError("Could not read model key") - if request.model_key in self._cached_models: - timings.append(time.perf_counter() - interm) # timing - interm = time.perf_counter() # timing - model_result = LoadModelResult(self._cached_models[request.model_key]) - - else: - fetch_model_result = None - while True: - try: - interm = time.perf_counter() # timing - fetch_model_result = self._worker.fetch_model( - request, self._feature_store - ) - except KeyError: - time.sleep(0.1) - else: - break - - if fetch_model_result is None: - raise SmartSimError("Could not retrieve model from feature store") - timings.append(time.perf_counter() - interm) # timing - interm = time.perf_counter() # timing - model_result = self._worker.load_model( - request, fetch_model_result, self._device - ) - self._cached_models[request.model_key] = model_result.model - else: - fetch_model_result = self._worker.fetch_model(request, None) - model_result = self._worker.load_model( - request, fetch_result=fetch_model_result, device=self._device + device: WorkerDevice = next( + self._device_manager.get_free_device( + worker=self._worker, + inference_work=inference_work, + feature_store=self._feature_store, ) + ) + + + model_result = device.get_model(inference_work.model_key) - timings.append(time.perf_counter() - interm) # timing - interm = time.perf_counter() # timing fetch_input_result = self._worker.fetch_inputs(request, self._feature_store) - timings.append(time.perf_counter() - interm) # timing - interm = time.perf_counter() # timing transformed_input = self._worker.transform_input( request, fetch_input_result, self._device ) - timings.append(time.perf_counter() - interm) # timing - interm = time.perf_counter() # timing - reply = InferenceReply() try: @@ -314,14 +295,10 @@ def _on_iteration(self) -> None: request, model_result, transformed_input ) - timings.append(time.perf_counter() - interm) # timing - interm = time.perf_counter() # timing transformed_output = self._worker.transform_output( request, execute_result, 
self._device ) - timings.append(time.perf_counter() - interm) # timing - interm = time.perf_counter() # timing if request.output_keys: reply.output_keys = self._worker.place_output( request, transformed_output, self._feature_store @@ -332,9 +309,6 @@ def _on_iteration(self) -> None: logger.exception("Error executing worker") reply.failed = True - timings.append(time.perf_counter() - interm) # timing - interm = time.perf_counter() # timing - if reply.failed: response = build_failure_reply("fail", "failure-occurred") else: @@ -343,22 +317,12 @@ def _on_iteration(self) -> None: response = build_reply(reply) - timings.append(time.perf_counter() - interm) # timing - interm = time.perf_counter() # timing - # serialized = self._worker.serialize_reply(request, transformed_output) serialized_resp = MessageHandler.serialize_response(response) # type: ignore - timings.append(time.perf_counter() - interm) # timing - interm = time.perf_counter() # timing if request.callback: request.callback.send(serialized_resp) - timings.append(time.perf_counter() - interm) # timing - interm = time.perf_counter() # timing - - print(" ".join(str(time) for time in timings)) # timing - def _can_shutdown(self) -> bool: """Return true when the criteria to shut down the service are met.""" # todo: determine shutdown criteria diff --git a/smartsim/_core/mli/infrastructure/worker/worker.py b/smartsim/_core/mli/infrastructure/worker/worker.py index 900a8241de..9dfa974785 100644 --- a/smartsim/_core/mli/infrastructure/worker/worker.py +++ b/smartsim/_core/mli/infrastructure/worker/worker.py @@ -250,7 +250,7 @@ def place_output( class MachineLearningWorkerBase(MachineLearningWorkerCore, ABC): - """Abstrct base class providing contract for a machine learning + """Abstract base class providing contract for a machine learning worker implementation.""" @staticmethod diff --git a/smartsim/_core/mli/mli_schemas/model/__init__.py b/smartsim/_core/mli/mli_schemas/model/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/smartsim/_core/mli/mli_schemas/model/utils.py b/smartsim/_core/mli/mli_schemas/model/utils.py new file mode 100644 index 0000000000..b16dc8f623 --- /dev/null +++ b/smartsim/_core/mli/mli_schemas/model/utils.py @@ -0,0 +1,41 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import typing as t +from collections import namedtuple + +from .model_capnp import Model + +ModelInfo = namedtuple("ModelInfo", ["Name", "Version"]) + + +def make_model_key(model: Model) -> str: + return f"{model.name}_{model.version}" + + +def get_model_name_and_version(key: str) -> t.NamedTuple: + split_key = key.rsplit("_", 1) + return ModelInfo(split_key[0], split_key[1]) From 4a83abe1e1ca1b033b8a224ec5a2d9e058100846 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Mon, 15 Jul 2024 13:17:49 -0500 Subject: [PATCH 33/84] Changes to entrypoint --- ex/high_throughput_inference/standalone_workermanager.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ex/high_throughput_inference/standalone_workermanager.py b/ex/high_throughput_inference/standalone_workermanager.py index c56e11a7c3..f91c2269c6 100644 --- a/ex/high_throughput_inference/standalone_workermanager.py +++ b/ex/high_throughput_inference/standalone_workermanager.py @@ -75,7 +75,9 @@ to_worker_fli_serialized = to_worker_fli.serialize() ddict["to_worker_fli"] = to_worker_fli_serialized - torch_worker = cloudpickle.loads(base64.b64decode(args.worker_class.encode('ascii')))() + torch_worker = cloudpickle.loads( + base64.b64decode(args.worker_class.encode('ascii')) + )() dfs = DragonFeatureStore(ddict) comm_channel = DragonFLIChannel(to_worker_fli_serialized) From 6ea0671e69192434732ad3e3195c019d45b21da8 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Tue, 16 Jul 2024 05:24:45 -0500 Subject: [PATCH 34/84] Use batch where needed --- .../infrastructure/control/devicemanager.py | 19 +-- .../control/requestdispatcher.py | 13 +- .../infrastructure/control/workermanager.py | 77 ++++++------ .../mli/infrastructure/worker/torch_worker.py | 71 +++++++---- .../_core/mli/infrastructure/worker/worker.py | 115 +++++++++--------- 5 files changed, 158 insertions(+), 137 deletions(-) diff --git a/smartsim/_core/mli/infrastructure/control/devicemanager.py b/smartsim/_core/mli/infrastructure/control/devicemanager.py index 94c2404ead..4b3d2a8edb 100644 --- a/smartsim/_core/mli/infrastructure/control/devicemanager.py +++ b/smartsim/_core/mli/infrastructure/control/devicemanager.py @@ -31,7 +31,7 @@ from ...infrastructure.storage.featurestore import FeatureStore from ..worker.worker import MachineLearningWorkerBase -from .requestdispatcher import InferenceWork +from .requestdispatcher import InferenceBatch class WorkerDevice: @@ -67,7 +67,7 @@ def remove_model(self, key: str) -> None: def get_model(self, key: str) -> t.Any: return self._models[key] - def __contains__(self, key: str): + def __contains__(self, key: str) -> bool: return key in self._models def __exit__( @@ -87,18 +87,18 @@ def __init__(self, devices: list[WorkerDevice]): def get_free_device( self, worker: MachineLearningWorkerBase, - inference_work: InferenceWork, + batch: InferenceBatch, feature_store: t.Optional[FeatureStore], ) -> t.Generator[WorkerDevice, None, None]: return_device = None - sample_request = inference_work.requests[0] + 
sample_request = batch.requests[0] direct_inference = sample_request.raw_model is not None while return_device is None: loaded_devices = [] if not direct_inference: # Look up devices to see if any of them already has a copy of the model for device in self._devices: - if inference_work.model_key in device: + if batch.model_key in device: loaded_devices.append(device) # If a pre-loaded model is found on a device, try using that device @@ -113,12 +113,12 @@ def get_free_device( candidate_device not in loaded_devices and candidate_device.acquire(blocking=False) ): - model_bytes = worker.fetch_model(sample_request, feature_store) + model_bytes = worker.fetch_model(batch, feature_store) loaded_model = worker.load_model( - sample_request, model_bytes, candidate_device.name + batch, model_bytes, candidate_device.name ) candidate_device.add_model( - inference_work.model_key, loaded_model.model + batch.model_key, loaded_model.model ) return_device = candidate_device @@ -126,5 +126,6 @@ def get_free_device( try: yield return_device finally: - return_device.remove_model(inference_work.model_key) + if direct_inference: + return_device.remove_model(batch.model_key) return_device.release() diff --git a/smartsim/_core/mli/infrastructure/control/requestdispatcher.py b/smartsim/_core/mli/infrastructure/control/requestdispatcher.py index 520605c588..6592187f1f 100644 --- a/smartsim/_core/mli/infrastructure/control/requestdispatcher.py +++ b/smartsim/_core/mli/infrastructure/control/requestdispatcher.py @@ -26,26 +26,19 @@ import time import typing as t import uuid -from dataclasses import dataclass from queue import Empty, Full, Queue from threading import RLock from types import TracebackType from packaging.version import Version -from ...infrastructure.worker.worker import InferenceRequest +from ...infrastructure.worker.worker import InferenceBatch, InferenceRequest from ...mli_schemas.model.model_capnp import Model if t.TYPE_CHECKING: from dragon.fli import FLInterface -@dataclass -class InferenceWork: - model_key: str - requests: list[InferenceRequest] - - class WorkerDevice: def __init__(self, name: str) -> None: """Wrapper around a device to keep track of loaded Models and availability @@ -214,11 +207,11 @@ def _update_model_version(self, model: Model) -> None: self._model_last_version[model.name] = Version(model.version) return - def flush_requests(self) -> t.Optional[InferenceWork]: + def flush_requests(self) -> t.Optional[InferenceBatch]: result = None for queue in self._queues: if queue.acquire(blocking=False) and queue.ready: - result = InferenceWork( + result = InferenceBatch( model_key=queue.model_key, requests=queue.flush() ) queue.release() diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py index 9163be4cec..af7ceec844 100644 --- a/smartsim/_core/mli/infrastructure/control/workermanager.py +++ b/smartsim/_core/mli/infrastructure/control/workermanager.py @@ -23,12 +23,10 @@ # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
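
# A minimal, self-contained sketch of the checkout pattern implemented by
# DeviceManager.get_free_device above: find a free device, load the model if
# needed, yield the device, then release it -- and, for direct-inference
# requests, evict the model again -- in a `finally` block. ToyDevice and
# checkout_device are hypothetical names used for illustration only; this is
# not the SmartSim API.
import typing as t
from threading import Lock


class ToyDevice:
    """Stand-in for WorkerDevice: a named device guarded by a lock."""

    def __init__(self, name: str) -> None:
        self.name = name
        self.models: dict[str, str] = {}
        self._lock = Lock()

    def acquire(self, blocking: bool = True) -> bool:
        return self._lock.acquire(blocking=blocking)

    def release(self) -> None:
        self._lock.release()


def checkout_device(
    devices: list[ToyDevice], model_key: str, direct_inference: bool
) -> t.Generator[ToyDevice, None, None]:
    """Yield a free device with the model loaded; clean up when the caller
    is done with the generator."""
    chosen: t.Optional[ToyDevice] = None
    while chosen is None:
        for device in devices:
            if device.acquire(blocking=False):
                # a real worker would call load_model() here
                device.models.setdefault(model_key, "<loaded model>")
                chosen = device
                break
    try:
        yield chosen
    finally:
        if direct_inference:
            chosen.models.pop(model_key, None)
        chosen.release()


if __name__ == "__main__":
    pool = [ToyDevice("gpu:0"), ToyDevice("gpu:1")]
    # the worker manager consumes a single item with next(...)
    gen = checkout_device(pool, model_key="my-model", direct_inference=True)
    device = next(gen)
    print(f"running batch on {device.name}")
    # closing the generator runs the `finally` clean-up in this sketch
    gen.close()
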
-import time import typing as t import numpy as np -from .....error import SmartSimError from .....log import ContextThread, get_logger from ....entrypoints.service import Service from ...comm.channel.channel import CommChannelBase @@ -265,63 +263,70 @@ def _on_iteration(self) -> None: logger.warning("No queue to check for tasks") return - inference_work = self._request_dispatcher.flush_requests() - if inference_work is None or 0 == len(inference_work.requests): + batch = self._request_dispatcher.flush_requests() + if batch is None or 0 == len(batch.requests): return - request = inference_work.requests[0] + # sample_request = inference_work.requests[0] device: WorkerDevice = next( self._device_manager.get_free_device( worker=self._worker, - inference_work=inference_work, + batch=batch, feature_store=self._feature_store, ) ) + model_result = LoadModelResult(device.get_model(batch.model_key)) - model_result = device.get_model(inference_work.model_key) - - fetch_input_result = self._worker.fetch_inputs(request, self._feature_store) + fetch_input_results = self._worker.fetch_inputs(batch, self._feature_store) transformed_input = self._worker.transform_input( - request, fetch_input_result, self._device + batch, fetch_input_results, self._device ) - reply = InferenceReply() + replies: list[InferenceReply] = [InferenceReply() for _ in range(len(batch.requests))] try: execute_result = self._worker.execute( - request, model_result, transformed_input + batch, model_result, transformed_input ) - - transformed_output = self._worker.transform_output( - request, execute_result, self._device + transformed_outputs = self._worker.transform_output( + batch, execute_result, self._device ) - - if request.output_keys: - reply.output_keys = self._worker.place_output( - request, transformed_output, self._feature_store - ) - else: - reply.outputs = transformed_output.outputs except Exception: logger.exception("Error executing worker") - reply.failed = True - - if reply.failed: - response = build_failure_reply("fail", "failure-occurred") + for reply in replies: + reply.failed = True else: - if reply.outputs is None or not reply.outputs: - response = build_failure_reply("fail", "no-results") - - response = build_reply(reply) - - # serialized = self._worker.serialize_reply(request, transformed_output) - serialized_resp = MessageHandler.serialize_response(response) # type: ignore - - if request.callback: - request.callback.send(serialized_resp) + for reply_idx, (request, transformed_output) in enumerate(zip( + batch.requests, transformed_outputs + )): + reply = replies[reply_idx] + try: + if request.output_keys: + reply.output_keys = self._worker.place_output( + request, transformed_output, self._feature_store + ) + else: + reply.outputs = transformed_output.outputs + except Exception: + logger.exception("Error executing worker") + reply.failed = True + + if reply.failed: + response = build_failure_reply("fail", "failure-occurred") + else: + if reply.outputs is None or not reply.outputs: + response = build_failure_reply("fail", "no-results") + + response = build_reply(reply) + + # serialized = self._worker.serialize_reply(request, transformed_output) + serialized_resp = MessageHandler.serialize_response(response) # type: ignore + + if request.callback: + request.callback.send(serialized_resp) def _can_shutdown(self) -> bool: """Return true when the criteria to shut down the service are met.""" diff --git a/smartsim/_core/mli/infrastructure/worker/torch_worker.py 
b/smartsim/_core/mli/infrastructure/worker/torch_worker.py index a4e725ab99..25c762c6a2 100644 --- a/smartsim/_core/mli/infrastructure/worker/torch_worker.py +++ b/smartsim/_core/mli/infrastructure/worker/torch_worker.py @@ -36,6 +36,7 @@ ExecuteResult, FetchInputResult, FetchModelResult, + InferenceBatch, InferenceRequest, LoadModelResult, MachineLearningWorkerBase, @@ -51,8 +52,9 @@ class TorchWorker(MachineLearningWorkerBase): @staticmethod def load_model( - request: InferenceRequest, fetch_result: FetchModelResult, device: str + batch: InferenceBatch, fetch_result: FetchModelResult, device: str ) -> LoadModelResult: + request = batch.requests[0] if fetch_result.model_bytes: model_bytes = fetch_result.model_bytes elif request.raw_model and request.raw_model.data: @@ -69,27 +71,45 @@ def load_model( @staticmethod def transform_input( - request: InferenceRequest, fetch_result: FetchInputResult, device: str + batch: InferenceBatch, fetch_results: list[FetchInputResult], device: str ) -> TransformInputResult: - result = [] + results: list[list[torch.Tensor]] = [] + start = 0 + slices: list[slice] = [] device_to_torch = {"cpu": "cpu", "gpu": "cuda"} - device = device_to_torch[device] - if fetch_result.meta is None: - raise ValueError("Cannot reconstruct tensor without meta information") - for item, item_meta in zip(fetch_result.inputs, fetch_result.meta): - tensor_desc: tensor_capnp.TensorDescriptor = item_meta + for old, new in device_to_torch.items(): + device.replace(old, new) + + for fetch_result in fetch_results: + partial_result = [] + if fetch_result.meta is None: + raise ValueError("Cannot reconstruct tensor without meta information") + for item, item_meta in zip(fetch_result.inputs, fetch_result.meta): + tensor_desc: tensor_capnp.TensorDescriptor = item_meta + partial_result.append( + torch.tensor(np.frombuffer(item, dtype=str(tensor_desc.dataType))) + .to(device) + .reshape(tuple(dim for dim in tensor_desc.dimensions)) + ) + results.append(partial_result) + num_samples = fetch_result.meta[0].dimensions[0] + slices.append(slice(start, start + num_samples)) + start = start + num_samples + + result: list[torch.Tensor] = [] + for t_idx in range(len(results[0])): result.append( - torch.tensor(np.frombuffer(item, dtype=str(tensor_desc.dataType))) - .to(device) - .reshape(tuple(dim for dim in tensor_desc.dimensions)) + torch.concatenate([partial_result[t_idx] for partial_result in results]) ) - return TransformInputResult(result) + + return TransformInputResult(result, slices) # return data # note: this fails copy test! + # pylint: disable-next=unused-argument @staticmethod def execute( - request: InferenceRequest, + batch: InferenceBatch, load_result: LoadModelResult, transform_result: TransformInputResult, ) -> ExecuteResult: @@ -100,20 +120,23 @@ def execute( model.eval() results = [model(tensor).detach() for tensor in transform_result.transformed] - execute_result = ExecuteResult(results) + execute_result = ExecuteResult(results, transform_result.slices) return execute_result @staticmethod def transform_output( - request: InferenceRequest, + batch: InferenceBatch, execute_result: ExecuteResult, result_device: str, - ) -> TransformOutputResult: - if result_device != "cpu": - transformed = [item.to("cpu") for item in execute_result.predictions] - # todo: need the shape from latest schemas added here. 
- return TransformOutputResult(transformed, None, "c", "float32") # fixme - - return TransformOutputResult( - execute_result.predictions, None, "c", "float32" - ) # fixme + ) -> list[TransformOutputResult]: + transformed_list: list[TransformOutputResult] = [] + for result_slice in execute_result.slices: + if result_device != "cpu": + transformed = [item.to("cpu") for item in execute_result.predictions[result_slice]] + # todo: need the shape from latest schemas added here. + transformed_list.append(TransformOutputResult(transformed, None, "c", "float32")) # fixme + + transformed_list.append(TransformOutputResult( + execute_result.predictions[result_slice], None, "c", "float32" + )) # fixme + return transformed_list \ No newline at end of file diff --git a/smartsim/_core/mli/infrastructure/worker/worker.py b/smartsim/_core/mli/infrastructure/worker/worker.py index 9dfa974785..2fa03b1297 100644 --- a/smartsim/_core/mli/infrastructure/worker/worker.py +++ b/smartsim/_core/mli/infrastructure/worker/worker.py @@ -26,6 +26,7 @@ import typing as t from abc import ABC, abstractmethod +from dataclasses import dataclass from .....error import SmartSimError from .....log import get_logger @@ -63,6 +64,12 @@ def __init__( self.batch_size = batch_size +@dataclass +class InferenceBatch: + model_key: str + requests: list[InferenceRequest] + + class InferenceReply: """Internal representation of the reply to a client request for inference""" @@ -87,19 +94,21 @@ def __init__(self, model: t.Any) -> None: class TransformInputResult: - """A wrapper around a transformed input""" + """A wrapper around a transformed batchinput""" - def __init__(self, result: t.Any) -> None: + def __init__(self, result: t.Any, slices: list[slice]) -> None: """Initialize the object""" self.transformed = result + self.slices = slices class ExecuteResult: """A wrapper around inference results""" - def __init__(self, result: t.Any) -> None: + def __init__(self, result: t.Any, slices: list[slice]) -> None: """Initialize the object""" self.predictions = result + self.slices = slices class FetchInputResult: @@ -145,82 +154,72 @@ class MachineLearningWorkerCore: @staticmethod def fetch_model( - request: InferenceRequest, feature_store: t.Optional[FeatureStore] + batch: InferenceBatch, feature_store: t.Optional[FeatureStore] ) -> FetchModelResult: """Given a resource key, retrieve the raw model from a feature store - :param request: The request that triggered the pipeline + :param batc: The batch of requests that triggered the pipeline :param feature_store: The feature store used for persistence :return: Raw bytes of the model""" - if request.raw_model: - # Should we cache model in the feature store? 
- # model_key = hash(request.raw_model) - # feature_store[model_key] = request.raw_model - # short-circuit and return the directly supplied model - return FetchModelResult(request.raw_model.data) + # All requests in the same batch share the model + sample_request = batch.requests[0] + if sample_request.raw_model: + return FetchModelResult(sample_request.raw_model.data) if not feature_store: raise ValueError("Feature store is required for model retrieval") - if not request.model_key: + if not sample_request.model_key: raise SmartSimError( "Key must be provided to retrieve model from feature store" ) try: - raw_bytes: bytes = t.cast(bytes, feature_store[request.model_key]) + raw_bytes: bytes = t.cast(bytes, feature_store[sample_request.model_key]) return FetchModelResult(raw_bytes) except FileNotFoundError as ex: logger.exception(ex) raise SmartSimError( - f"Model could not be retrieved with key {request.model_key}" + f"Model could not be retrieved with key {sample_request.model_key}" ) from ex @staticmethod def fetch_inputs( - request: InferenceRequest, feature_store: t.Optional[FeatureStore] - ) -> FetchInputResult: + batch: InferenceBatch, feature_store: t.Optional[FeatureStore] + ) -> t.List[FetchInputResult]: """Given a collection of ResourceKeys, identify the physical location and input metadata :param request: The request that triggered the pipeline :param feature_store: The feature store used for persistence :return: the fetched input""" - - if request.raw_inputs: - return FetchInputResult(request.raw_inputs, request.input_meta) - - if not feature_store: - raise ValueError("No input and no feature store provided") - - if request.input_keys: - data: t.List[bytes] = [] - for input_ in request.input_keys: - try: - tensor_bytes = t.cast(bytes, feature_store[input_]) - data.append(tensor_bytes) - except KeyError as ex: - logger.exception(ex) - raise SmartSimError( - f"Model could not be retrieved with key {input_}" - ) from ex - return FetchInputResult( - data, None - ) # fixme: need to get both tensor and descriptor - - raise ValueError("No input source") - - @staticmethod - def batch_requests( - request: InferenceRequest, transform_result: TransformInputResult - ) -> CreateInputBatchResult: - """Create a batch of requests. Return the batch when batch_size datum have been - collected or a configured batch duration has elapsed. 
- :param request: The request that triggered the pipeline - :param transform_result: Transformed inputs ready for batching - :return: `None` if batch size has not been reached and timeout not exceeded.""" - if transform_result is not None or request.batch_size: - raise NotImplementedError("Batching is not yet supported") - return CreateInputBatchResult(None) + fetch_results = [] + for request in batch.requests: + if request.raw_inputs: + fetch_results.append( + FetchInputResult(request.raw_inputs, request.input_meta) + ) + + if not feature_store: + raise ValueError("No input and no feature store provided") + + if request.input_keys: + data: t.List[bytes] = [] + for input_ in request.input_keys: + try: + tensor_bytes = t.cast(bytes, feature_store[input_]) + data.append(tensor_bytes) + except KeyError as ex: + logger.exception(ex) + raise SmartSimError( + f"Input tensor could not be retrieved with key {input_}" + ) from ex + fetch_results.append( + FetchInputResult(data, None) + ) # fixme: need to get both tensor and descriptor + + raise ValueError("No input source") + + return fetch_results @staticmethod def place_output( @@ -256,7 +255,7 @@ class MachineLearningWorkerBase(MachineLearningWorkerCore, ABC): @staticmethod @abstractmethod def load_model( - request: InferenceRequest, fetch_result: FetchModelResult, device: str + batch: InferenceBatch, fetch_result: FetchModelResult, device: str ) -> LoadModelResult: """Given a loaded MachineLearningModel, ensure it is loaded into device memory @@ -267,18 +266,18 @@ def load_model( @staticmethod @abstractmethod def transform_input( - request: InferenceRequest, fetch_result: FetchInputResult, device: str + batch: InferenceBatch, fetch_results: list[FetchInputResult], device: str ) -> TransformInputResult: """Given a collection of data, perform a transformation on the data :param request: The request that triggered the pipeline - :param fetch_result: Raw output from fetching inputs out of a feature store + :param fetch_result: Raw outputs from fetching inputs out of a feature store :param device: The device on which the transformed input must be placed :return: The transformed inputs wrapped in a InputTransformResult""" @staticmethod @abstractmethod def execute( - request: InferenceRequest, + batch: InferenceBatch, load_result: LoadModelResult, transform_result: TransformInputResult, ) -> ExecuteResult: @@ -291,8 +290,8 @@ def execute( @staticmethod @abstractmethod def transform_output( - request: InferenceRequest, execute_result: ExecuteResult, result_device: str - ) -> TransformOutputResult: + batch: InferenceBatch, execute_result: ExecuteResult, result_device: str + ) -> t.List[TransformOutputResult]: """Given inference results, perform transformations required to transmit results to the requestor. 
:param request: The request that triggered the pipeline From d26e5f0b19bea99f4263990bc69ca1f7a6fce6b5 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Tue, 16 Jul 2024 18:09:57 -0500 Subject: [PATCH 35/84] Adjustments, get back to one thread --- smartsim/_core/entrypoints/service.py | 17 --- .../infrastructure/control/devicemanager.py | 1 + .../control/requestdispatcher.py | 47 ++++++-- .../infrastructure/control/workermanager.py | 112 ++++++++++++++---- .../mli/infrastructure/worker/torch_worker.py | 14 ++- .../_core/mli/infrastructure/worker/worker.py | 2 + 6 files changed, 137 insertions(+), 56 deletions(-) diff --git a/smartsim/_core/entrypoints/service.py b/smartsim/_core/entrypoints/service.py index df9c2bbef6..6b4ef74b67 100644 --- a/smartsim/_core/entrypoints/service.py +++ b/smartsim/_core/entrypoints/service.py @@ -103,23 +103,6 @@ def execute(self) -> None: running = True cooldown_start: t.Optional[datetime.datetime] = None - headers = [ - "batch_size", - "w_deserialize", - "w_fetch_model", - "w_load_model", - "w_fetch_input", - "w_transform_input", - "w_execute", - "w_transform_output", - "w_assign_output", - "w_build_reply", - "w_serialize_resp", - "w_send", - ] - - print(",".join(headers)) - while running: self._on_iteration() diff --git a/smartsim/_core/mli/infrastructure/control/devicemanager.py b/smartsim/_core/mli/infrastructure/control/devicemanager.py index 4b3d2a8edb..8d284c1262 100644 --- a/smartsim/_core/mli/infrastructure/control/devicemanager.py +++ b/smartsim/_core/mli/infrastructure/control/devicemanager.py @@ -105,6 +105,7 @@ def get_free_device( for device in loaded_devices: if device.acquire(blocking=False): return_device = device + break # If the model is not loaded on a free device, load it on another device (if available) if return_device is None: diff --git a/smartsim/_core/mli/infrastructure/control/requestdispatcher.py b/smartsim/_core/mli/infrastructure/control/requestdispatcher.py index 6592187f1f..19f3256cef 100644 --- a/smartsim/_core/mli/infrastructure/control/requestdispatcher.py +++ b/smartsim/_core/mli/infrastructure/control/requestdispatcher.py @@ -32,12 +32,14 @@ from packaging.version import Version +from .....log import get_logger from ...infrastructure.worker.worker import InferenceBatch, InferenceRequest from ...mli_schemas.model.model_capnp import Model if t.TYPE_CHECKING: from dragon.fli import FLInterface +logger = get_logger("Request Dispatcher") class WorkerDevice: def __init__(self, name: str) -> None: @@ -78,6 +80,11 @@ def __init__(self, batch_timeout: float, batch_size: int, model_key: str) -> Non self._disposable = False self._model_key = model_key self._flush_lock = RLock() + self._id = str(uuid.uuid4()) + + @property + def id(self): + return self._id def acquire(self, blocking: bool = True, timeout: float = -1) -> t.Optional[bool]: return self._flush_lock.acquire(blocking=blocking, timeout=timeout) @@ -106,11 +113,17 @@ def put( block: bool = False, timeout: t.Optional[float] = 0.0, ) -> None: - if not self.acquire(blocking=False) or self.disposable: + if not self.acquire(blocking=False): + logger.error(f"Could not acquire queue {self._id} to put") raise Full - if self._first_put is None: - self._first_put = time.time() - super().put(item, block=block, timeout=timeout) + try: + if self.full(): + raise Full + if self._first_put is None: + self._first_put = time.time() + super().put(item, block=block, timeout=timeout) + finally: + self.release() @property def _waited_time(self) -> float: @@ -146,6 +159,10 @@ def flush(self) -> 
list[t.Any]: return items def full(self) -> bool: + if self._disposable: + return True + if self._batch_size <= 0: + return False return self.qsize() >= self._batch_size def empty(self) -> bool: @@ -158,7 +175,7 @@ def __init__( batch_timeout: float, batch_size: int, ) -> None: - self._queues: list[BatchQueue] + self._queues: list[BatchQueue] = [] self._active_queues: dict[str, BatchQueue] = {} self._model_last_version: dict[str, Version] = {} self._model_name_to_key: dict[str, str] = {} @@ -170,15 +187,19 @@ def _swap_queue(self, model_key: str) -> None: with self._queue_swap_lock: for queue in self._queues: if queue.model_key == model_key and not queue.full(): + logger.info("Found queue, swapping") self._active_queues[model_key] = queue return + logger.info("Creating new queue") new_queue = BatchQueue(self._batch_timeout, self._batch_size, model_key) + self._queues.append(new_queue) self._active_queues[model_key] = new_queue return def dispatch(self, request: InferenceRequest) -> None: if request.raw_model is not None: + logger.info("Direct inference requested, creating tmp queue") tmp_id = f"_tmp_{str(uuid.uuid4())}" tmp_queue: BatchQueue = BatchQueue( batch_timeout=0, batch_size=1, model_key=tmp_id @@ -189,12 +210,14 @@ def dispatch(self, request: InferenceRequest) -> None: return if request.model_key: + logger.info("Indirect inference requested, dispatching it to existing queue") success = False while not success: try: self._active_queues[request.model_key].put_nowait(request) success = True except (Full, KeyError): + logger.info("Could not find non-full queue, swapping") self._swap_queue(request.model_key) def _update_model_version(self, model: Model) -> None: @@ -210,11 +233,15 @@ def _update_model_version(self, model: Model) -> None: def flush_requests(self) -> t.Optional[InferenceBatch]: result = None for queue in self._queues: - if queue.acquire(blocking=False) and queue.ready: - result = InferenceBatch( - model_key=queue.model_key, requests=queue.flush() - ) - queue.release() + # logger.info("Acquiring queue to flush") + if queue.ready and queue.acquire(blocking=False): + try: + logger.info(f"Acquired queue {queue.id}") + result = InferenceBatch( + model_key=queue.model_key, requests=queue.flush() + ) + finally: + queue.release() break return result diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py index af7ceec844..674bfc93a1 100644 --- a/smartsim/_core/mli/infrastructure/control/workermanager.py +++ b/smartsim/_core/mli/infrastructure/control/workermanager.py @@ -24,8 +24,11 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
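
# A simplified, self-contained sketch of the readiness rule the BatchQueue
# above implements: a non-empty queue is flushed either once it holds
# `batch_size` items or once `batch_timeout` seconds have passed since the
# first put. ToyBatchQueue is an illustrative stand-in built on the standard
# library only; it is not the dispatcher's actual queue class.
import time
import typing as t
from queue import Empty, Queue


class ToyBatchQueue(Queue):
    def __init__(self, batch_timeout: float, batch_size: int) -> None:
        super().__init__(maxsize=batch_size)
        self._batch_timeout = batch_timeout
        self._batch_size = batch_size
        self._first_put: t.Optional[float] = None

    def put(
        self, item: t.Any, block: bool = False, timeout: t.Optional[float] = None
    ) -> None:
        if self._first_put is None:
            self._first_put = time.time()
        super().put(item, block=block, timeout=timeout)

    @property
    def ready(self) -> bool:
        if self.empty():
            return False
        waited = time.time() - (self._first_put or time.time())
        return self.qsize() >= self._batch_size or waited >= self._batch_timeout

    def flush(self) -> t.List[t.Any]:
        self._first_put = None
        items = []
        while True:
            try:
                items.append(self.get_nowait())
            except Empty:
                return items


if __name__ == "__main__":
    queue = ToyBatchQueue(batch_timeout=0.05, batch_size=4)
    queue.put("request-1")
    print(queue.ready)    # False: not full and timeout not reached yet
    time.sleep(0.06)
    print(queue.ready)    # True: the batch timeout elapsed
    print(queue.flush())  # ['request-1']
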
import typing as t - +import time import numpy as np +import numbers + +from collections import OrderedDict from .....log import ContextThread, get_logger from ....entrypoints.service import Service @@ -166,7 +169,7 @@ def __init__( comm_channel_type: t.Type[CommChannelBase] = DragonCommChannel, device: t.Literal["cpu", "gpu"] = "cpu", batch_timeout: float = 0.0, - batch_size: int = 0, + batch_size: int = 1, ) -> None: """Initialize the WorkerManager :param config_loader: Environment config loader that loads the task queue and @@ -200,28 +203,73 @@ def __init__( self._dispatcher_threads = 1 """Number of threads which dispatch requests""" self._device_manager: DeviceManager = DeviceManager([WorkerDevice("gpu")]) + self._start = None + self._interm = None + self._timings: OrderedDict[str, list[numbers.Number]] = OrderedDict() + self._timing_on = True + + def _add_label_to_timings(self, label: str): + if label not in self._timings: + self._timings[label] = [] + + @staticmethod + def _format_number(number: numbers.Number): + return f"{number:0.4e}" + + def start_timings(self): + if self._timing_on: + # self._add_label_to_timings("batch_size") + # self._timings["batch_size"].append(batch_size) + self._start = time.perf_counter() + self._interm = time.perf_counter() + + def end_timings(self): + if self._timing_on: + self._add_label_to_timings("total_time") + self._timings["total_time"].append(self._format_number(time.perf_counter()-self._start)) + + def measure_time(self, label: str): + if self._timing_on: + self._add_label_to_timings(label) + self._timings[label].append(self._format_number(time.perf_counter()-self._interm)) + self._interm = time.perf_counter() + + def print_timings(self, to_file: bool = False): + print(" ".join(self._timings.keys())) + value_array = np.array([value for value in self._timings.values()], dtype=float) + value_array = np.transpose(value_array) + for i in range(value_array.shape[0]): + print(" ".join(self._format_number(value) for value in value_array[i])) + if to_file: + np.save("timings.npy", value_array) + np.savetxt("timings.txt", value_array) + def _receive_requests(self) -> None: if self._task_queue is None: return - while not self._can_shutdown(): - # perform default deserialization of the message envelope - request_bytes: bytes = self._task_queue.recv() - - request = deserialize_message(request_bytes, self._comm_channel_type) - if not self._validate_request(request): - return + # while not self._can_shutdown(): + # perform default deserialization of the message envelope + request_bytes: bytes = self._task_queue.recv() + + self.start_timings() + request = deserialize_message(request_bytes, self._comm_channel_type) + self.measure_time("w_deserialize") + if not self._validate_request(request): + return - self._request_dispatcher.dispatch(request) + self._request_dispatcher.dispatch(request) + self.measure_time("w_dispatch") def _on_start(self) -> None: - for thread_idx in range(self._dispatcher_threads): - dispatcher_thread = ContextThread( - name=f"Dispatcher_{thread_idx}", - target=self._receive_requests, - daemon=True, - ) - dispatcher_thread.start() + # for thread_idx in range(self._dispatcher_threads): + # dispatcher_thread = ContextThread( + # name=f"Dispatcher_{thread_idx}", + # target=self._receive_requests, + # daemon=True, + # ) + # dispatcher_thread.start() + pass def _validate_request(self, request: InferenceRequest) -> bool: """Ensure the request can be processed. 
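
# The timing hooks above follow a simple pattern: one list of measured
# durations per label, appended in pipeline order and printed as columns.
# Below is a minimal, standard-library-only sketch of that bookkeeping;
# ToyTimer is illustrative and is not the PerfTimer utility introduced
# later in this series.
import time
from collections import OrderedDict


class ToyTimer:
    def __init__(self) -> None:
        self._timings: "OrderedDict[str, list[float]]" = OrderedDict()
        self._start = self._interm = time.perf_counter()

    def start(self) -> None:
        self._start = self._interm = time.perf_counter()

    def measure(self, label: str) -> None:
        now = time.perf_counter()
        self._timings.setdefault(label, []).append(now - self._interm)
        self._interm = now

    def end(self) -> None:
        self._timings.setdefault("total_time", []).append(
            time.perf_counter() - self._start
        )

    def print_timings(self) -> None:
        print(" ".join(self._timings.keys()))
        for row in zip(*self._timings.values()):
            print(" ".join(f"{value:0.4e}" for value in row))


if __name__ == "__main__":
    timer = ToyTimer()
    for _ in range(3):
        timer.start()
        time.sleep(0.01)
        timer.measure("w_fetch_input")
        time.sleep(0.02)
        timer.measure("w_execute")
        timer.end()
    timer.print_timings()
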
@@ -259,16 +307,15 @@ def _on_iteration(self) -> None: the inference pipeline""" logger.debug("executing worker manager pipeline") - if self._task_queue is None: - logger.warning("No queue to check for tasks") - return + self._receive_requests() + # logger.info("Getting request batch") batch = self._request_dispatcher.flush_requests() if batch is None or 0 == len(batch.requests): return - # sample_request = inference_work.requests[0] - + self.measure_time("w_flush_requests") + # logger.info(f"Got batch of {len(batch.requests)} requests, acquiring device") device: WorkerDevice = next( self._device_manager.get_free_device( worker=self._worker, @@ -276,24 +323,32 @@ def _on_iteration(self) -> None: feature_store=self._feature_store, ) ) + self.measure_time("w_fetch_model") + + # logger.info(f"Acquired device {device.name}") model_result = LoadModelResult(device.get_model(batch.model_key)) + self.measure_time("w_load_model") fetch_input_results = self._worker.fetch_inputs(batch, self._feature_store) + self.measure_time("w_fetch_input") transformed_input = self._worker.transform_input( batch, fetch_input_results, self._device ) + self.measure_time("w_transform_input") - replies: list[InferenceReply] = [InferenceReply() for _ in range(len(batch.requests))] + replies = [InferenceReply() for _ in range(len(batch.requests))] try: execute_result = self._worker.execute( batch, model_result, transformed_input ) + self.measure_time("w_execute") transformed_outputs = self._worker.transform_output( batch, execute_result, self._device ) + self.measure_time("w_transform_output") except Exception: logger.exception("Error executing worker") for reply in replies: @@ -310,10 +365,12 @@ def _on_iteration(self) -> None: ) else: reply.outputs = transformed_output.outputs + self.measure_time("w_assign_output") except Exception: logger.exception("Error executing worker") reply.failed = True + if reply.failed: response = build_failure_reply("fail", "failure-occurred") else: @@ -321,12 +378,21 @@ def _on_iteration(self) -> None: response = build_failure_reply("fail", "no-results") response = build_reply(reply) + self.measure_time("w_build_reply") # serialized = self._worker.serialize_reply(request, transformed_output) serialized_resp = MessageHandler.serialize_response(response) # type: ignore + self.measure_time("w_serialize_resp") + if request.callback: request.callback.send(serialized_resp) + self.measure_time("w_send") + + self.end_timings() + + if len(self._timings["w_send"]) == 801: + self.print_timings(True) def _can_shutdown(self) -> bool: """Return true when the criteria to shut down the service are met.""" diff --git a/smartsim/_core/mli/infrastructure/worker/torch_worker.py b/smartsim/_core/mli/infrastructure/worker/torch_worker.py index 25c762c6a2..3156b587a7 100644 --- a/smartsim/_core/mli/infrastructure/worker/torch_worker.py +++ b/smartsim/_core/mli/infrastructure/worker/torch_worker.py @@ -37,7 +37,6 @@ FetchInputResult, FetchModelResult, InferenceBatch, - InferenceRequest, LoadModelResult, MachineLearningWorkerBase, TransformInputResult, @@ -79,7 +78,7 @@ def transform_input( device_to_torch = {"cpu": "cpu", "gpu": "cuda"} for old, new in device_to_torch.items(): - device.replace(old, new) + device = device.replace(old, new) for fetch_result in fetch_results: partial_result = [] @@ -98,10 +97,13 @@ def transform_input( start = start + num_samples result: list[torch.Tensor] = [] - for t_idx in range(len(results[0])): - result.append( - torch.concatenate([partial_result[t_idx] for partial_result in 
results]) - ) + if len(batch.requests) > 1: + for t_idx in range(len(results[0])): + result.append( + torch.concatenate([partial_result[t_idx] for partial_result in results]) + ) + else: + result = results[0] return TransformInputResult(result, slices) # return data # note: this fails copy test! diff --git a/smartsim/_core/mli/infrastructure/worker/worker.py b/smartsim/_core/mli/infrastructure/worker/worker.py index 2fa03b1297..adc6b6edee 100644 --- a/smartsim/_core/mli/infrastructure/worker/worker.py +++ b/smartsim/_core/mli/infrastructure/worker/worker.py @@ -198,6 +198,7 @@ def fetch_inputs( fetch_results.append( FetchInputResult(request.raw_inputs, request.input_meta) ) + continue if not feature_store: raise ValueError("No input and no feature store provided") @@ -216,6 +217,7 @@ def fetch_inputs( fetch_results.append( FetchInputResult(data, None) ) # fixme: need to get both tensor and descriptor + continue raise ValueError("No input source") From 293e9777e81183afc14e33f0b857f05fcdc0639a Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Wed, 17 Jul 2024 02:04:42 +0200 Subject: [PATCH 36/84] Move timing --- smartsim/_core/mli/infrastructure/control/workermanager.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py index 674bfc93a1..9b59e7144e 100644 --- a/smartsim/_core/mli/infrastructure/control/workermanager.py +++ b/smartsim/_core/mli/infrastructure/control/workermanager.py @@ -254,9 +254,9 @@ def _receive_requests(self) -> None: self.start_timings() request = deserialize_message(request_bytes, self._comm_channel_type) - self.measure_time("w_deserialize") if not self._validate_request(request): return + self.measure_time("w_deserialize") self._request_dispatcher.dispatch(request) self.measure_time("w_dispatch") @@ -365,10 +365,10 @@ def _on_iteration(self) -> None: ) else: reply.outputs = transformed_output.outputs - self.measure_time("w_assign_output") except Exception: logger.exception("Error executing worker") reply.failed = True + self.measure_time("w_assign_output") if reply.failed: @@ -380,7 +380,6 @@ def _on_iteration(self) -> None: response = build_reply(reply) self.measure_time("w_build_reply") - # serialized = self._worker.serialize_reply(request, transformed_output) serialized_resp = MessageHandler.serialize_response(response) # type: ignore self.measure_time("w_serialize_resp") From 40c047133897d31bf9f0d6f1f8450527fb56b62f Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Wed, 17 Jul 2024 16:00:15 -0500 Subject: [PATCH 37/84] multiprocess solution --- smartsim/_core/mli/comm/channel/dragonfli.py | 4 +- .../infrastructure/control/devicemanager.py | 4 +- .../control/requestdispatcher.py | 179 ++++++++++++++-- .../infrastructure/control/workermanager.py | 196 ++++-------------- .../mli/infrastructure/worker/torch_worker.py | 41 ++-- smartsim/_core/utils/timings.py | 89 ++++++++ 6 files changed, 322 insertions(+), 191 deletions(-) create mode 100644 smartsim/_core/utils/timings.py diff --git a/smartsim/_core/mli/comm/channel/dragonfli.py b/smartsim/_core/mli/comm/channel/dragonfli.py index 1c02857eab..319875db2c 100644 --- a/smartsim/_core/mli/comm/channel/dragonfli.py +++ b/smartsim/_core/mli/comm/channel/dragonfli.py @@ -59,10 +59,10 @@ def send(self, value: bytes) -> None: def recv(self) -> bytes: """Receieve a message through the underlying communication channel :returns: the received message""" - with self._fli.recvh(timeout=None) as 
recvh: + with self._fli.recvh() as recvh: try: request_bytes: bytes - request_bytes, _ = recvh.recv_bytes(timeout=None) + request_bytes, _ = recvh.recv_bytes() return request_bytes except fli.FLIEOT: return b"" diff --git a/smartsim/_core/mli/infrastructure/control/devicemanager.py b/smartsim/_core/mli/infrastructure/control/devicemanager.py index 8d284c1262..1a2a860aa9 100644 --- a/smartsim/_core/mli/infrastructure/control/devicemanager.py +++ b/smartsim/_core/mli/infrastructure/control/devicemanager.py @@ -118,9 +118,7 @@ def get_free_device( loaded_model = worker.load_model( batch, model_bytes, candidate_device.name ) - candidate_device.add_model( - batch.model_key, loaded_model.model - ) + candidate_device.add_model(batch.model_key, loaded_model.model) return_device = candidate_device diff --git a/smartsim/_core/mli/infrastructure/control/requestdispatcher.py b/smartsim/_core/mli/infrastructure/control/requestdispatcher.py index 19f3256cef..2a8ed9e39f 100644 --- a/smartsim/_core/mli/infrastructure/control/requestdispatcher.py +++ b/smartsim/_core/mli/infrastructure/control/requestdispatcher.py @@ -23,24 +23,93 @@ # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +# isort: off +# pylint: disable-next=unused-import +import dragon +from dragon.mpbridge.queues import DragonQueue +# isort: on + +import multiprocessing as mp import time import typing as t import uuid from queue import Empty, Full, Queue -from threading import RLock +from threading import Lock from types import TracebackType from packaging.version import Version +from .....error import SmartSimError from .....log import get_logger +from ....utils.timings import PerfTimer +from ...comm.channel.channel import CommChannelBase +from ...comm.channel.dragonchannel import DragonCommChannel +from ...infrastructure.storage.featurestore import FeatureStore from ...infrastructure.worker.worker import InferenceBatch, InferenceRequest +from ...message_handler import MessageHandler from ...mli_schemas.model.model_capnp import Model -if t.TYPE_CHECKING: - from dragon.fli import FLInterface - logger = get_logger("Request Dispatcher") + +def deserialize_message( + data_blob: bytes, + channel_type: t.Type[CommChannelBase], +) -> InferenceRequest: + """Deserialize a message from a byte stream into an InferenceRequest + :param data_blob: The byte stream to deserialize""" + # todo: consider moving to XxxCore and only making + # workers implement the inputs and model conversion? + + # alternatively, consider passing the capnproto models + # to this method instead of the data_blob... + + # something is definitely wrong here... 
client shouldn't have to touch + # callback (or batch size) + + request = MessageHandler.deserialize_request(data_blob) + # return request + model_key: t.Optional[str] = None + model_bytes: t.Optional[Model] = None + + if request.model.which() == "key": + model_key = request.model.key.key + elif request.model.which() == "data": + model_bytes = request.model.data + + callback_key = request.replyChannel.reply + + # todo: shouldn't this be `CommChannel.find` instead of `DragonCommChannel` + comm_channel = channel_type(callback_key) + # comm_channel = DragonCommChannel(request.replyChannel) + + input_keys: t.Optional[t.List[str]] = None + input_bytes: t.Optional[t.List[bytes]] = ( + None # these will really be tensors already + ) + + input_meta: t.List[t.Any] = [] + + if request.input.which() == "keys": + input_keys = [input_key.key for input_key in request.input.keys] + elif request.input.which() == "data": + input_bytes = [data.blob for data in request.input.data] + input_meta = [data.tensorDescriptor for data in request.input.data] + + inference_request = InferenceRequest( + model_key=model_key, + callback=comm_channel, + raw_inputs=input_bytes, + input_meta=input_meta, + input_keys=input_keys, + raw_model=model_bytes, + batch_size=0, + ) + return inference_request + + class WorkerDevice: def __init__(self, name: str) -> None: """Wrapper around a device to keep track of loaded Models and availability @@ -50,7 +119,7 @@ def __init__(self, name: str) -> None: """The name used by the toolkit to identify this device""" self._models: dict[str, t.Any] = {} """Dictionary of model key to model for models stored on this device""" - self._lock = RLock() + self._lock = Lock() """Lock to ensure only one thread at the time accesses this device""" def acquire(self, blocking: bool = True, timeout: float = -1) -> t.Optional[bool]: @@ -79,11 +148,11 @@ def __init__(self, batch_timeout: float, batch_size: int, model_key: str) -> Non self._first_put: t.Optional[float] = None self._disposable = False self._model_key = model_key - self._flush_lock = RLock() + self._flush_lock = Lock() self._id = str(uuid.uuid4()) @property - def id(self): + def id(self) -> str: return self._id def acquire(self, blocking: bool = True, timeout: float = -1) -> t.Optional[bool]: @@ -114,7 +183,6 @@ def put( timeout: t.Optional[float] = 0.0, ) -> None: if not self.acquire(blocking=False): - logger.error(f"Could not acquire queue {self._id} to put") raise Full try: if self.full(): @@ -174,24 +242,98 @@ def __init__( self, batch_timeout: float, batch_size: int, + incoming_channel: t.Optional[CommChannelBase], + comm_channel_type: t.Type[CommChannelBase] = DragonCommChannel, + feature_store: t.Optional[FeatureStore] = None, ) -> None: + mp.set_start_method("dragon") self._queues: list[BatchQueue] = [] self._active_queues: dict[str, BatchQueue] = {} self._model_last_version: dict[str, Version] = {} self._model_name_to_key: dict[str, str] = {} self._batch_timeout = batch_timeout self._batch_size = batch_size - self._queue_swap_lock = RLock() + self._queue_swap_lock: t.Optional[Lock] = None + self._incoming_channel = incoming_channel + self._outgoing_queue: DragonQueue = mp.Queue(maxsize=0) + self._feature_store = feature_store + self._comm_channel_type = comm_channel_type + self._perf_timer = PerfTimer(prefix="r_") + + def _validate_request(self, request: InferenceRequest) -> bool: + """Ensure the request can be processed. 
+ :param request: The request to validate + :return: True if the request is valid, False otherwise""" + if not self._feature_store: + if request.model_key: + logger.error("Unable to load model by key without feature store") + return False + + if request.input_keys: + logger.error("Unable to load inputs by key without feature store") + return False + + if request.output_keys: + logger.error("Unable to persist outputs by key without feature store") + return False + + if not request.model_key and not request.raw_model: + logger.error("Unable to continue without model bytes or feature store key") + return False + + if not request.input_keys and not request.raw_inputs: + logger.error("Unable to continue without input bytes or feature store keys") + return False + + if request.callback is None: + logger.error("No callback channel provided in request") + return False + + return True + + def run(self) -> None: + self._queue_swap_lock = Lock() + if self._incoming_channel is None: + raise SmartSimError("No incoming channel for dispatcher") + while True: + try: + request_bytes: bytes = self._incoming_channel.recv() + except Exception: + pass + else: + self._perf_timer.start_timings() + request = deserialize_message(request_bytes, self._comm_channel_type) + self._perf_timer.measure_time("deserialize_message") + if not self._validate_request(request): + return + self._perf_timer.measure_time("validate_request") + self.dispatch(request) + self._perf_timer.measure_time("dispatch") + finally: + self.flush_requests() + self._perf_timer.measure_time("flush_requests") + # TODO: implement this + # self.remove_queues() + + self._perf_timer.end_timings() + + # pylint: disable-next=protected-access + if len(self._perf_timer._timings["r_dispatch"]) == 801: + self._perf_timer.print_timings(True) + + @property + def task_queue(self) -> DragonQueue: + return self._outgoing_queue def _swap_queue(self, model_key: str) -> None: + if self._queue_swap_lock is None: + raise SmartSimError("Queue was not locked") with self._queue_swap_lock: for queue in self._queues: if queue.model_key == model_key and not queue.full(): - logger.info("Found queue, swapping") self._active_queues[model_key] = queue return - logger.info("Creating new queue") new_queue = BatchQueue(self._batch_timeout, self._batch_size, model_key) self._queues.append(new_queue) self._active_queues[model_key] = new_queue @@ -210,14 +352,12 @@ def dispatch(self, request: InferenceRequest) -> None: return if request.model_key: - logger.info("Indirect inference requested, dispatching it to existing queue") success = False while not success: try: self._active_queues[request.model_key].put_nowait(request) success = True except (Full, KeyError): - logger.info("Could not find non-full queue, swapping") self._swap_queue(request.model_key) def _update_model_version(self, model: Model) -> None: @@ -230,18 +370,15 @@ def _update_model_version(self, model: Model) -> None: self._model_last_version[model.name] = Version(model.version) return - def flush_requests(self) -> t.Optional[InferenceBatch]: - result = None + def flush_requests(self) -> None: for queue in self._queues: - # logger.info("Acquiring queue to flush") if queue.ready and queue.acquire(blocking=False): try: - logger.info(f"Acquired queue {queue.id}") - result = InferenceBatch( - model_key=queue.model_key, requests=queue.flush() + self._outgoing_queue.put( + InferenceBatch( + model_key=queue.model_key, requests=queue.flush() + ) ) finally: queue.release() break - - return result diff --git 
a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py index 674bfc93a1..76e9ecc659 100644 --- a/smartsim/_core/mli/infrastructure/control/workermanager.py +++ b/smartsim/_core/mli/infrastructure/control/workermanager.py @@ -23,14 +23,18 @@ # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import typing as t -import time -import numpy as np -import numbers +import multiprocessing as mp +import numbers +import time +import typing as t from collections import OrderedDict -from .....log import ContextThread, get_logger +import dragon +import numpy as np + +from ....utils.timings import PerfTimer +from .....log import get_logger from ....entrypoints.service import Service from ...comm.channel.channel import CommChannelBase from ...comm.channel.dragonchannel import DragonCommChannel @@ -54,62 +58,6 @@ logger = get_logger(__name__) -def deserialize_message( - data_blob: bytes, - channel_type: t.Type[CommChannelBase], -) -> InferenceRequest: - """Deserialize a message from a byte stream into an InferenceRequest - :param data_blob: The byte stream to deserialize""" - # todo: consider moving to XxxCore and only making - # workers implement the inputs and model conversion? - - # alternatively, consider passing the capnproto models - # to this method instead of the data_blob... - - # something is definitely wrong here... client shouldn't have to touch - # callback (or batch size) - - request = MessageHandler.deserialize_request(data_blob) - # return request - model_key: t.Optional[str] = None - model_bytes: t.Optional[Model] = None - - if request.model.which() == "key": - model_key = request.model.key.key - elif request.model.which() == "data": - model_bytes = request.model.data - - callback_key = request.replyChannel.reply - - # todo: shouldn't this be `CommChannel.find` instead of `DragonCommChannel` - comm_channel = channel_type(callback_key) - # comm_channel = DragonCommChannel(request.replyChannel) - - input_keys: t.Optional[t.List[str]] = None - input_bytes: t.Optional[t.List[bytes]] = ( - None # these will really be tensors already - ) - - input_meta: t.List[t.Any] = [] - - if request.input.which() == "keys": - input_keys = [input_key.key for input_key in request.input.keys] - elif request.input.which() == "data": - input_bytes = [data.blob for data in request.input.data] - input_meta = [data.tensorDescriptor for data in request.input.data] - - inference_request = InferenceRequest( - model_key=model_key, - callback=comm_channel, - raw_inputs=input_bytes, - input_meta=input_meta, - input_keys=input_keys, - raw_model=model_bytes, - batch_size=0, - ) - return inference_request - - def build_failure_reply(status: "StatusEnum", message: str) -> Response: return MessageHandler.build_response( status=status, # todo: need to indicate correct status @@ -197,79 +145,30 @@ def __init__( self._cached_models: dict[str, t.Any] = {} """Dictionary of previously loaded models""" self._request_dispatcher: RequestDispatcher = RequestDispatcher( - batch_timeout=batch_timeout, batch_size=batch_size + batch_timeout=batch_timeout, + batch_size=batch_size, + incoming_channel=self._task_queue, + comm_channel_type=comm_channel_type, + feature_store=self._feature_store, ) """Dispatcher used to batch requests""" - self._dispatcher_threads = 1 - """Number of threads which 
dispatch requests""" self._device_manager: DeviceManager = DeviceManager([WorkerDevice("gpu")]) - self._start = None - self._interm = None - self._timings: OrderedDict[str, list[numbers.Number]] = OrderedDict() - self._timing_on = True - - def _add_label_to_timings(self, label: str): - if label not in self._timings: - self._timings[label] = [] - - @staticmethod - def _format_number(number: numbers.Number): - return f"{number:0.4e}" - - def start_timings(self): - if self._timing_on: - # self._add_label_to_timings("batch_size") - # self._timings["batch_size"].append(batch_size) - self._start = time.perf_counter() - self._interm = time.perf_counter() - - def end_timings(self): - if self._timing_on: - self._add_label_to_timings("total_time") - self._timings["total_time"].append(self._format_number(time.perf_counter()-self._start)) - - def measure_time(self, label: str): - if self._timing_on: - self._add_label_to_timings(label) - self._timings[label].append(self._format_number(time.perf_counter()-self._interm)) - self._interm = time.perf_counter() - - def print_timings(self, to_file: bool = False): - print(" ".join(self._timings.keys())) - value_array = np.array([value for value in self._timings.values()], dtype=float) - value_array = np.transpose(value_array) - for i in range(value_array.shape[0]): - print(" ".join(self._format_number(value) for value in value_array[i])) - if to_file: - np.save("timings.npy", value_array) - np.savetxt("timings.txt", value_array) - - - def _receive_requests(self) -> None: - if self._task_queue is None: - return - # while not self._can_shutdown(): - # perform default deserialization of the message envelope - request_bytes: bytes = self._task_queue.recv() - - self.start_timings() - request = deserialize_message(request_bytes, self._comm_channel_type) - self.measure_time("w_deserialize") - if not self._validate_request(request): - return - self._request_dispatcher.dispatch(request) - self.measure_time("w_dispatch") + self._perf_timer = PerfTimer(prefix="w_") + + try: + mp.set_start_method("dragon") + except RuntimeError: + pass + self._dispatcher_process = mp.Process( + target=self._request_dispatcher.run, name="Dispatcher" + ) def _on_start(self) -> None: - # for thread_idx in range(self._dispatcher_threads): - # dispatcher_thread = ContextThread( - # name=f"Dispatcher_{thread_idx}", - # target=self._receive_requests, - # daemon=True, - # ) - # dispatcher_thread.start() - pass + self._dispatcher_process.start() + + def _on_shutdown(self) -> None: + self._dispatcher_process.join() def _validate_request(self, request: InferenceRequest) -> bool: """Ensure the request can be processed. 
@@ -307,14 +206,12 @@ def _on_iteration(self) -> None: the inference pipeline""" logger.debug("executing worker manager pipeline") - self._receive_requests() - - # logger.info("Getting request batch") - batch = self._request_dispatcher.flush_requests() + batch = self._request_dispatcher.task_queue.get() + self._perf_timer.start_timings() if batch is None or 0 == len(batch.requests): return - self.measure_time("w_flush_requests") + self._perf_timer.measure_time("flush_requests") # logger.info(f"Got batch of {len(batch.requests)} requests, acquiring device") device: WorkerDevice = next( self._device_manager.get_free_device( @@ -323,20 +220,20 @@ def _on_iteration(self) -> None: feature_store=self._feature_store, ) ) - self.measure_time("w_fetch_model") + self._perf_timer.measure_time("fetch_model") # logger.info(f"Acquired device {device.name}") model_result = LoadModelResult(device.get_model(batch.model_key)) - self.measure_time("w_load_model") + self._perf_timer.measure_time("load_model") fetch_input_results = self._worker.fetch_inputs(batch, self._feature_store) - self.measure_time("w_fetch_input") + self._perf_timer.measure_time("fetch_input") transformed_input = self._worker.transform_input( batch, fetch_input_results, self._device ) - self.measure_time("w_transform_input") + self._perf_timer.measure_time("transform_input") replies = [InferenceReply() for _ in range(len(batch.requests))] @@ -344,19 +241,19 @@ def _on_iteration(self) -> None: execute_result = self._worker.execute( batch, model_result, transformed_input ) - self.measure_time("w_execute") + self._perf_timer.measure_time("execute") transformed_outputs = self._worker.transform_output( batch, execute_result, self._device ) - self.measure_time("w_transform_output") + self._perf_timer.measure_time("transform_output") except Exception: logger.exception("Error executing worker") for reply in replies: reply.failed = True else: - for reply_idx, (request, transformed_output) in enumerate(zip( - batch.requests, transformed_outputs - )): + for reply_idx, (request, transformed_output) in enumerate( + zip(batch.requests, transformed_outputs) + ): reply = replies[reply_idx] try: if request.output_keys: @@ -365,12 +262,11 @@ def _on_iteration(self) -> None: ) else: reply.outputs = transformed_output.outputs - self.measure_time("w_assign_output") + self._perf_timer.measure_time("assign_output") except Exception: logger.exception("Error executing worker") reply.failed = True - if reply.failed: response = build_failure_reply("fail", "failure-occurred") else: @@ -378,21 +274,21 @@ def _on_iteration(self) -> None: response = build_failure_reply("fail", "no-results") response = build_reply(reply) - self.measure_time("w_build_reply") + self._perf_timer.measure_time("build_reply") # serialized = self._worker.serialize_reply(request, transformed_output) serialized_resp = MessageHandler.serialize_response(response) # type: ignore - self.measure_time("w_serialize_resp") + self._perf_timer.measure_time("serialize_resp") if request.callback: request.callback.send(serialized_resp) - self.measure_time("w_send") + self._perf_timer.measure_time("send") - self.end_timings() + self._perf_timer.end_timings() - if len(self._timings["w_send"]) == 801: - self.print_timings(True) + if len(self._perf_timer._timings["w_send"]) == 801: + self._perf_timer.print_timings(True) def _can_shutdown(self) -> bool: """Return true when the criteria to shut down the service are met.""" diff --git a/smartsim/_core/mli/infrastructure/worker/torch_worker.py 
b/smartsim/_core/mli/infrastructure/worker/torch_worker.py index 3156b587a7..4eedc18299 100644 --- a/smartsim/_core/mli/infrastructure/worker/torch_worker.py +++ b/smartsim/_core/mli/infrastructure/worker/torch_worker.py @@ -84,26 +84,31 @@ def transform_input( partial_result = [] if fetch_result.meta is None: raise ValueError("Cannot reconstruct tensor without meta information") - for item, item_meta in zip(fetch_result.inputs, fetch_result.meta): + for idx, (item, item_meta) in enumerate( + zip(fetch_result.inputs, fetch_result.meta) + ): tensor_desc: tensor_capnp.TensorDescriptor = item_meta partial_result.append( - torch.tensor(np.frombuffer(item, dtype=str(tensor_desc.dataType))) - .to(device) - .reshape(tuple(dim for dim in tensor_desc.dimensions)) + torch.tensor( + np.frombuffer(item, dtype=str(tensor_desc.dataType)) + ).reshape(tuple(dim for dim in tensor_desc.dimensions)) ) + if idx == 0: + num_samples = tensor_desc.dimensions[0] + slices.append(slice(start, start + num_samples)) + start = start + num_samples results.append(partial_result) - num_samples = fetch_result.meta[0].dimensions[0] - slices.append(slice(start, start + num_samples)) - start = start + num_samples result: list[torch.Tensor] = [] if len(batch.requests) > 1: for t_idx in range(len(results[0])): result.append( - torch.concatenate([partial_result[t_idx] for partial_result in results]) + torch.concatenate( + [partial_result[t_idx] for partial_result in results] + ).to(device) ) else: - result = results[0] + result = [tensor.to(device) for tensor in results[0]] return TransformInputResult(result, slices) # return data # note: this fails copy test! @@ -134,11 +139,17 @@ def transform_output( transformed_list: list[TransformOutputResult] = [] for result_slice in execute_result.slices: if result_device != "cpu": - transformed = [item.to("cpu") for item in execute_result.predictions[result_slice]] + transformed = [ + item.to("cpu") for item in execute_result.predictions[result_slice] + ] # todo: need the shape from latest schemas added here. - transformed_list.append(TransformOutputResult(transformed, None, "c", "float32")) # fixme + transformed_list.append( + TransformOutputResult(transformed, None, "c", "float32") + ) # fixme - transformed_list.append(TransformOutputResult( - execute_result.predictions[result_slice], None, "c", "float32" - )) # fixme - return transformed_list \ No newline at end of file + transformed_list.append( + TransformOutputResult( + execute_result.predictions[result_slice], None, "c", "float32" + ) + ) # fixme + return transformed_list diff --git a/smartsim/_core/utils/timings.py b/smartsim/_core/utils/timings.py new file mode 100644 index 0000000000..7fa2af04a6 --- /dev/null +++ b/smartsim/_core/utils/timings.py @@ -0,0 +1,89 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import time +import typing as t +from collections import OrderedDict + +import numpy as np + + +class PerfTimer: + def __init__(self, filename: str = "timings", prefix: str = ""): + self._start: t.Optional[float] = None + self._interm: t.Optional[float] = None + self._timings: OrderedDict[str, list[t.Union[float, int, str]]] = OrderedDict() + self._timing_on = True + self._filename = filename + self._prefix = prefix + + def _add_label_to_timings(self, label: str) -> None: + if label not in self._timings: + self._timings[label] = [] + + @staticmethod + def _format_number(number: float | int) -> str: + return f"{number:0.4e}" + + def start_timings( + self, + first_label: t.Optional[str] = None, + first_value: t.Optional[float | int] = None, + ) -> None: + if self._timing_on: + if first_label is not None and first_value is not None: + self._add_label_to_timings(self._make_label(first_label)) + self._timings[self._make_label(first_label)].append(first_value) + self._start = time.perf_counter() + self._interm = time.perf_counter() + + def end_timings(self) -> None: + if self._timing_on and self._start is not None: + self._add_label_to_timings(self._make_label("total_time")) + self._timings[self._make_label("total_time")].append( + self._format_number(time.perf_counter() - self._start) + ) + self._interm = None + + def _make_label(self, label: str) -> str: + return self._prefix + label + + def measure_time(self, label: str) -> None: + if self._timing_on and self._interm is not None: + self._add_label_to_timings(self._make_label(label)) + self._timings[self._make_label(label)].append( + self._format_number(time.perf_counter() - self._interm) + ) + self._interm = time.perf_counter() + + def print_timings(self, to_file: bool = False) -> None: + print(" ".join(self._timings.keys())) + value_array = np.array(list(self._timings.values()), dtype=float) + value_array = np.transpose(value_array) + for i in range(value_array.shape[0]): + print(" ".join(self._format_number(value) for value in value_array[i])) + if to_file: + np.save(self._prefix + self._filename + ".npy", value_array) From 0bb14879e06e622749fbf9347f6f50c1238592c4 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Wed, 17 Jul 2024 18:19:51 -0500 Subject: [PATCH 38/84] Constrain torch threads in worker --- .../_core/mli/infrastructure/control/requestdispatcher.py | 5 +++-- smartsim/_core/mli/infrastructure/worker/torch_worker.py | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/smartsim/_core/mli/infrastructure/control/requestdispatcher.py b/smartsim/_core/mli/infrastructure/control/requestdispatcher.py index 2a8ed9e39f..babbd3fe56 100644 --- a/smartsim/_core/mli/infrastructure/control/requestdispatcher.py +++ 
b/smartsim/_core/mli/infrastructure/control/requestdispatcher.py @@ -203,7 +203,6 @@ def _waited_time(self) -> float: def ready(self) -> bool: if self.empty(): return False - return self.full() or (self._waited_time >= self._batch_timeout) def make_disposable(self) -> None: @@ -311,7 +310,6 @@ def run(self) -> None: self._perf_timer.measure_time("dispatch") finally: self.flush_requests() - self._perf_timer.measure_time("flush_requests") # TODO: implement this # self.remove_queues() @@ -374,11 +372,14 @@ def flush_requests(self) -> None: for queue in self._queues: if queue.ready and queue.acquire(blocking=False): try: + + self._perf_timer.measure_time("find_queue") self._outgoing_queue.put( InferenceBatch( model_key=queue.model_key, requests=queue.flush() ) ) + self._perf_timer.measure_time("flush_requests") finally: queue.release() break diff --git a/smartsim/_core/mli/infrastructure/worker/torch_worker.py b/smartsim/_core/mli/infrastructure/worker/torch_worker.py index 4eedc18299..f55d6d13d7 100644 --- a/smartsim/_core/mli/infrastructure/worker/torch_worker.py +++ b/smartsim/_core/mli/infrastructure/worker/torch_worker.py @@ -28,7 +28,6 @@ import numpy as np import torch - from .....error import SmartSimError from .....log import get_logger from ...mli_schemas.tensor import tensor_capnp @@ -43,6 +42,7 @@ TransformOutputResult, ) +torch.set_num_threads(1) logger = get_logger(__name__) From 7b9e00ce6b0395bfa259b7c08ef4e1eaf5dbeae4 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Thu, 18 Jul 2024 09:46:48 -0500 Subject: [PATCH 39/84] Affinity and correct process --- .../_core/launcher/dragon/dragonBackend.py | 4 ++ .../infrastructure/control/workermanager.py | 56 ++++++++++++++++--- .../mli/infrastructure/worker/torch_worker.py | 1 + 3 files changed, 53 insertions(+), 8 deletions(-) diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py index dcc5c8392b..545dbfaa6b 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -424,6 +424,8 @@ def _start_steps(self) -> None: global_policy = dragon_policy.Policy( placement=dragon_policy.Policy.Placement.HOST_NAME, host_name=hosts[0], + affinity = dragon_policy.Policy.Affinity.SPECIFIC, + cpu_affinity=list(range(32))+list(range(64,64+32)), ) options = dragon_process_desc.ProcessOptions(make_inf_channels=True) grp = dragon_process_group.ProcessGroup( @@ -435,6 +437,8 @@ def _start_steps(self) -> None: local_policy = dragon_policy.Policy( placement=dragon_policy.Policy.Placement.HOST_NAME, host_name=node_name, + affinity = dragon_policy.Policy.Affinity.SPECIFIC, + cpu_affinity=list(range(32))+list(range(64,64+32)), ) policies.extend([local_policy] * request.tasks_per_node) tmp_proc = dragon_process.ProcessTemplate( diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py index 21460186d0..3dde086367 100644 --- a/smartsim/_core/mli/infrastructure/control/workermanager.py +++ b/smartsim/_core/mli/infrastructure/control/workermanager.py @@ -24,14 +24,20 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
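# The changes in this commit pin the worker manager and its dispatcher process to
# disjoint CPU sets (the dispatcher gets the last 8 cores). A minimal, Linux-only
# sketch of that split using only the standard library; the function name and the
# 8-core default are illustrative placeholders, not code from this patch:
import os

def split_affinity(reserve_for_dispatcher: int = 8) -> tuple[list[int], list[int]]:
    """Shrink this process's CPU affinity and return (manager_cpus, dispatcher_cpus)."""
    all_cpus = sorted(os.sched_getaffinity(0))          # CPUs this process may use now
    reserve = min(reserve_for_dispatcher, len(all_cpus) - 1)
    manager_cpus = all_cpus[:-reserve] if reserve > 0 else all_cpus
    dispatcher_cpus = all_cpus[-reserve:] if reserve > 0 else all_cpus
    os.sched_setaffinity(0, manager_cpus)               # keep the bulk for this process
    return manager_cpus, dispatcher_cpus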
+import dragon +import dragon.data.ddict.ddict as dragon_ddict +import dragon.infrastructure.connection as dragon_connection +import dragon.infrastructure.policy as dragon_policy +import dragon.infrastructure.process_desc as dragon_process_desc +import dragon.native.group_state as dragon_group_state +import dragon.native.process as dragon_process +import dragon.native.process_group as dragon_process_group +import dragon.native.machine as dragon_machine + import multiprocessing as mp -import numbers -import time +import os +import socket import typing as t -from collections import OrderedDict - -import dragon -import numpy as np from ....utils.timings import PerfTimer from .....log import get_logger @@ -160,9 +166,43 @@ def __init__( mp.set_start_method("dragon") except RuntimeError: pass - self._dispatcher_process = mp.Process( - target=self._request_dispatcher.run, name="Dispatcher" + # self._dispatcher_process = mp.Process( + # target=self._request_dispatcher.run, name="Dispatcher" + # ) + self._dispatcher_process = self._create_local_dispatcher_process() + + def _create_local_dispatcher_process(self): + self_affinity = list(os.sched_getaffinity(os.getpid())) + os.sched_setaffinity(os.getpid(), self_affinity[:-8]) + global_policy = dragon_policy.Policy( + placement=dragon_policy.Policy.Placement.HOST_NAME, + host_name=socket.gethostname(), + affinity = dragon_policy.Policy.Affinity.SPECIFIC, + cpu_affinity=self_affinity[-8:], + device=dragon_policy.Policy.Device.CPU, + distribution = dragon_policy.Policy.Distribution.BLOCK, + ) + options = dragon_process_desc.ProcessOptions(make_inf_channels=True) + grp = dragon_process_group.ProcessGroup( + restart=False, pmi_enabled=True, policy=global_policy + ) + local_policy = dragon_policy.Policy( + placement=dragon_policy.Policy.Placement.HOST_NAME, + host_name=socket.gethostname(), + affinity = dragon_policy.Policy.Affinity.SPECIFIC, + cpu_affinity=self_affinity[-8:], + device=dragon_policy.Policy.Device.CPU, + ) + tmp_proc = dragon_process.ProcessTemplate( + target=self._request_dispatcher.run, + args=[], + cwd=os.getcwd(), + policy=local_policy, + options=options, ) + grp.add_process(nproc=1, template=tmp_proc) + grp.init() + return grp def _on_start(self) -> None: self._dispatcher_process.start() diff --git a/smartsim/_core/mli/infrastructure/worker/torch_worker.py b/smartsim/_core/mli/infrastructure/worker/torch_worker.py index f55d6d13d7..84bcec0887 100644 --- a/smartsim/_core/mli/infrastructure/worker/torch_worker.py +++ b/smartsim/_core/mli/infrastructure/worker/torch_worker.py @@ -43,6 +43,7 @@ ) torch.set_num_threads(1) +torch.set_num_interop_threads(16) logger = get_logger(__name__) From 94a526336479f784aaaa87ae10771b2535211a87 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Thu, 18 Jul 2024 09:48:39 -0500 Subject: [PATCH 40/84] Fixes to example --- ex/high_throughput_inference/mli_driver.py | 8 +++++--- ex/high_throughput_inference/mock_app.py | 2 -- ex/high_throughput_inference/standalone_workermanager.py | 6 ++++++ 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/ex/high_throughput_inference/mli_driver.py b/ex/high_throughput_inference/mli_driver.py index 6da559aa6f..1d4b121365 100644 --- a/ex/high_throughput_inference/mli_driver.py +++ b/ex/high_throughput_inference/mli_driver.py @@ -7,6 +7,7 @@ from smartsim import Experiment from smartsim._core.mli.infrastructure.worker.torch_worker import TorchWorker from smartsim.status import TERMINAL_STATUSES +from smartsim.settings import DragonRunSettings import time import 
typing as t @@ -20,13 +21,13 @@ os.environ["SMARTSIM_DRAGON_TRANSPORT"] = transport -exp_path = os.path.join(filedir, f"MLI_proto_{transport.upper()}") +exp_path = os.path.join(filedir, f"MLI_proto_batch_{transport.upper()}") os.makedirs(exp_path, exist_ok=True) exp = Experiment("MLI_proto", launcher="dragon", exp_path=exp_path) torch_worker_str = base64.b64encode(cloudpickle.dumps(TorchWorker)).decode("ascii") -worker_manager_rs = exp.create_run_settings(sys.executable, [worker_manager_script_name, "--device", device, "--worker_class", torch_worker_str]) +worker_manager_rs: DragonRunSettings = exp.create_run_settings(sys.executable, [worker_manager_script_name, "--device", device, "--worker_class", torch_worker_str]) worker_manager = exp.create_model("worker_manager", run_settings=worker_manager_rs) worker_manager.attach_generator_files(to_copy=[worker_manager_script_name]) @@ -40,11 +41,12 @@ while True: if exp.get_status(app)[0] in TERMINAL_STATUSES: + time.sleep(10) exp.stop(worker_manager) break if exp.get_status(worker_manager)[0] in TERMINAL_STATUSES: + time.sleep(10) exp.stop(app) break - time.sleep(5) print("Exiting.") \ No newline at end of file diff --git a/ex/high_throughput_inference/mock_app.py b/ex/high_throughput_inference/mock_app.py index 45246db2e5..76969e6a4c 100644 --- a/ex/high_throughput_inference/mock_app.py +++ b/ex/high_throughput_inference/mock_app.py @@ -112,7 +112,6 @@ def run_model(self, model: bytes | str, batch: torch.Tensor): built_tensor = MessageHandler.build_tensor( batch.numpy(), "c", "float32", list(batch.shape)) self.measure_time("build_tensor") - built_model = None if isinstance(model, str): model_arg = MessageHandler.build_model_key(model) else: @@ -130,7 +129,6 @@ def run_model(self, model: bytes | str, batch: torch.Tensor): self.measure_time("serialize_request") with self._to_worker_fli.sendh(timeout=None, stream_channel=self._to_worker_ch) as to_sendh: to_sendh.send_bytes(request_bytes) - logger.info(f"Message size: {len(request_bytes)} bytes") self.measure_time("send") with self._from_worker_ch.recvh(timeout=None) as from_recvh: diff --git a/ex/high_throughput_inference/standalone_workermanager.py b/ex/high_throughput_inference/standalone_workermanager.py index f91c2269c6..f781444d81 100644 --- a/ex/high_throughput_inference/standalone_workermanager.py +++ b/ex/high_throughput_inference/standalone_workermanager.py @@ -45,6 +45,12 @@ from smartsim._core.mli.infrastructure.control.workermanager import WorkerManager from smartsim._core.mli.infrastructure.environmentloader import EnvironmentConfigLoader +import os +import socket +pid = 0 +affinity = os.sched_getaffinity(pid) +print("Entry point:", socket.gethostname(), affinity) +print("CPUS:", os.cpu_count()) if __name__ == "__main__": parser = argparse.ArgumentParser("Worker Manager") From a7b52626f5be31eeadd1c3658c440cddd6abe715 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Fri, 19 Jul 2024 01:56:52 +0200 Subject: [PATCH 41/84] Add request dispatcher post-merge changes --- .../control/requestdispatcher.py | 63 +++++++++++++++---- 1 file changed, 52 insertions(+), 11 deletions(-) diff --git a/smartsim/_core/mli/infrastructure/control/requestdispatcher.py b/smartsim/_core/mli/infrastructure/control/requestdispatcher.py index babbd3fe56..8684bc7b6e 100644 --- a/smartsim/_core/mli/infrastructure/control/requestdispatcher.py +++ b/smartsim/_core/mli/infrastructure/control/requestdispatcher.py @@ -57,6 +57,7 @@ def deserialize_message( data_blob: bytes, channel_type: t.Type[CommChannelBase], + 
device: t.Literal["cpu", "gpu"], ) -> InferenceRequest: """Deserialize a message from a byte stream into an InferenceRequest :param data_blob: The byte stream to deserialize""" @@ -79,31 +80,34 @@ def deserialize_message( elif request.model.which() == "data": model_bytes = request.model.data - callback_key = request.replyChannel.reply + callback_key = request.replyChannel.descriptor # todo: shouldn't this be `CommChannel.find` instead of `DragonCommChannel` comm_channel = channel_type(callback_key) # comm_channel = DragonCommChannel(request.replyChannel) input_keys: t.Optional[t.List[str]] = None - input_bytes: t.Optional[t.List[bytes]] = ( - None # these will really be tensors already - ) + input_bytes: t.Optional[t.List[bytes]] = None + + output_keys: t.Optional[t.List[str]] = None - input_meta: t.List[t.Any] = [] + input_meta: t.Optional[t.List[TensorDescriptor]] = None if request.input.which() == "keys": input_keys = [input_key.key for input_key in request.input.keys] - elif request.input.which() == "data": - input_bytes = [data.blob for data in request.input.data] - input_meta = [data.tensorDescriptor for data in request.input.data] + elif request.input.which() == "descriptors": + input_meta = request.input.descriptors # type: ignore + + if request.output: + output_keys = [tensor_key.key for tensor_key in request.output] inference_request = InferenceRequest( model_key=model_key, callback=comm_channel, raw_inputs=input_bytes, - input_meta=input_meta, input_keys=input_keys, + input_meta=input_meta, + output_keys=output_keys, raw_model=model_bytes, batch_size=0, ) @@ -235,7 +239,26 @@ def full(self) -> bool: def empty(self) -> bool: return self.qsize() == 0 - +def exception_handler( + exc: Exception, reply_channel: t.Optional[CommChannelBase], failure_message: str +) -> None: + """ + Logs exceptions and sends a failure response. 
+ + :param exc: The exception to be logged + :param reply_channel: The channel used to send replies + :param failure_message: Failure message to log and send back + """ + logger.exception( + f"{failure_message}\n" + f"Exception type: {type(exc).__name__}\n" + f"Exception message: {str(exc)}" + ) + serialized_resp = MessageHandler.serialize_response( + build_failure_reply("fail", failure_message) + ) + if reply_channel: + reply_channel.send(serialized_resp) class RequestDispatcher: def __init__( self, @@ -296,10 +319,28 @@ def run(self) -> None: raise SmartSimError("No incoming channel for dispatcher") while True: try: - request_bytes: bytes = self._incoming_channel.recv() + bytes_list: t.List[bytes] = self._incoming_channel.recv() + + if not bytes_list: + exception_handler( + ValueError("No request data found"), + None, + "No request data found.", + ) + return + + except Exception: pass else: + request_bytes = bytes_list[0] + tensor_bytes_list = bytes_list[1:] + + request = deserialize_message( + request_bytes, self._comm_channel_type, self._device + ) + if request.input_meta and tensor_bytes_list: + request.raw_inputs = tensor_bytes_list self._perf_timer.start_timings() request = deserialize_message(request_bytes, self._comm_channel_type) self._perf_timer.measure_time("deserialize_message") From 717ef8866eec89705d9fd9e4ca4d42a29a9e7535 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Fri, 19 Jul 2024 02:21:14 +0200 Subject: [PATCH 42/84] Misc fixes --- .../_core/launcher/dragon/dragonBackend.py | 4 +- smartsim/_core/mli/comm/channel/channel.py | 1 - .../_core/mli/comm/channel/dragonchannel.py | 1 - .../control/requestdispatcher.py | 37 ++--------- .../infrastructure/control/workermanager.py | 62 +++++++++---------- .../mli/infrastructure/worker/torch_worker.py | 1 + .../_core/mli/infrastructure/worker/worker.py | 1 + 7 files changed, 36 insertions(+), 71 deletions(-) diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py index fff62fcdde..a6a8700ab0 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -504,8 +504,8 @@ def _start_steps(self) -> None: global_policy = dragon_policy.Policy( placement=dragon_policy.Policy.Placement.HOST_NAME, host_name=hosts[0], - affinity = dragon_policy.Policy.Affinity.SPECIFIC, - cpu_affinity=list(range(32))+list(range(64,64+32)), + affinity=dragon_policy.Policy.Affinity.SPECIFIC, + cpu_affinity=list(range(32)) + list(range(64, 64 + 32)), ) options = dragon_process_desc.ProcessOptions(make_inf_channels=True) grp = dragon_process_group.ProcessGroup( diff --git a/smartsim/_core/mli/comm/channel/channel.py b/smartsim/_core/mli/comm/channel/channel.py index df4872af1a..a3cce21814 100644 --- a/smartsim/_core/mli/comm/channel/channel.py +++ b/smartsim/_core/mli/comm/channel/channel.py @@ -41,7 +41,6 @@ def __init__(self, descriptor: t.Union[str, bytes]) -> None: @abstractmethod def send(self, value: bytes) -> None: - """Send a message through the underlying communication channel """Send a message through the underlying communication channel :param value: The value to send""" diff --git a/smartsim/_core/mli/comm/channel/dragonchannel.py b/smartsim/_core/mli/comm/channel/dragonchannel.py index ab98261409..a45adaee33 100644 --- a/smartsim/_core/mli/comm/channel/dragonchannel.py +++ b/smartsim/_core/mli/comm/channel/dragonchannel.py @@ -24,7 +24,6 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, 
EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import sys import sys import typing as t diff --git a/smartsim/_core/mli/infrastructure/control/requestdispatcher.py b/smartsim/_core/mli/infrastructure/control/requestdispatcher.py index 8684bc7b6e..10279c01d7 100644 --- a/smartsim/_core/mli/infrastructure/control/requestdispatcher.py +++ b/smartsim/_core/mli/infrastructure/control/requestdispatcher.py @@ -29,6 +29,7 @@ # pylint: disable-next=unused-import import dragon from dragon.mpbridge.queues import DragonQueue + # isort: on import multiprocessing as mp @@ -50,6 +51,7 @@ from ...infrastructure.worker.worker import InferenceBatch, InferenceRequest from ...message_handler import MessageHandler from ...mli_schemas.model.model_capnp import Model +from ...mli_schemas.tensor.tensor_capnp import TensorDescriptor logger = get_logger("Request Dispatcher") @@ -57,7 +59,6 @@ def deserialize_message( data_blob: bytes, channel_type: t.Type[CommChannelBase], - device: t.Literal["cpu", "gpu"], ) -> InferenceRequest: """Deserialize a message from a byte stream into an InferenceRequest :param data_blob: The byte stream to deserialize""" @@ -239,26 +240,7 @@ def full(self) -> bool: def empty(self) -> bool: return self.qsize() == 0 -def exception_handler( - exc: Exception, reply_channel: t.Optional[CommChannelBase], failure_message: str -) -> None: - """ - Logs exceptions and sends a failure response. - - :param exc: The exception to be logged - :param reply_channel: The channel used to send replies - :param failure_message: Failure message to log and send back - """ - logger.exception( - f"{failure_message}\n" - f"Exception type: {type(exc).__name__}\n" - f"Exception message: {str(exc)}" - ) - serialized_resp = MessageHandler.serialize_response( - build_failure_reply("fail", failure_message) - ) - if reply_channel: - reply_channel.send(serialized_resp) + class RequestDispatcher: def __init__( self, @@ -321,24 +303,13 @@ def run(self) -> None: try: bytes_list: t.List[bytes] = self._incoming_channel.recv() - if not bytes_list: - exception_handler( - ValueError("No request data found"), - None, - "No request data found.", - ) - return - - except Exception: pass else: request_bytes = bytes_list[0] tensor_bytes_list = bytes_list[1:] - request = deserialize_message( - request_bytes, self._comm_channel_type, self._device - ) + request = deserialize_message(request_bytes, self._comm_channel_type) if request.input_meta and tensor_bytes_list: request.raw_inputs = tensor_bytes_list self._perf_timer.start_timings() diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py index d2cce15440..140ad9bc70 100644 --- a/smartsim/_core/mli/infrastructure/control/workermanager.py +++ b/smartsim/_core/mli/infrastructure/control/workermanager.py @@ -24,24 +24,25 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
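# The dispatcher hands finished batches to the worker manager through a
# multiprocessing queue (backed by Dragon once mp.set_start_method("dragon") is
# active). A standard-library sketch of that handoff under that assumption;
# SimpleBatch is a placeholder standing in for InferenceBatch:
import multiprocessing as mp
from dataclasses import dataclass, field

@dataclass
class SimpleBatch:
    model_key: str
    requests: list = field(default_factory=list)

def dispatch(outgoing: "mp.Queue[SimpleBatch]") -> None:
    # dispatcher side: publish a batch for the worker manager to consume
    outgoing.put(SimpleBatch(model_key="resnet", requests=["req-0", "req-1"]))

if __name__ == "__main__":
    queue: "mp.Queue[SimpleBatch]" = mp.Queue(maxsize=0)
    producer = mp.Process(target=dispatch, args=(queue,), name="Dispatcher")
    producer.start()
    batch = queue.get()                     # worker-manager side blocks here
    print(batch.model_key, len(batch.requests))
    producer.join()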
+import multiprocessing as mp +import os +import socket +import sys +import typing as t + import dragon import dragon.data.ddict.ddict as dragon_ddict import dragon.infrastructure.connection as dragon_connection import dragon.infrastructure.policy as dragon_policy import dragon.infrastructure.process_desc as dragon_process_desc import dragon.native.group_state as dragon_group_state +import dragon.native.machine as dragon_machine import dragon.native.process as dragon_process import dragon.native.process_group as dragon_process_group -import dragon.native.machine as dragon_machine - -import multiprocessing as mp -import os -import socket -import typing as t -from ....utils.timings import PerfTimer from .....log import get_logger from ....entrypoints.service import Service +from ....utils.timings import PerfTimer from ...comm.channel.channel import CommChannelBase from ...comm.channel.dragonchannel import DragonCommChannel from ...infrastructure.environmentloader import EnvironmentConfigLoader @@ -53,7 +54,7 @@ MachineLearningWorkerBase, ) from ...message_handler import MessageHandler -from ...mli_schemas.response.response_capnp import Response, ResponseBuilder +from ...mli_schemas.response.response_capnp import ResponseBuilder from .devicemanager import DeviceManager, WorkerDevice from .requestdispatcher import RequestDispatcher @@ -187,16 +188,18 @@ def __init__( # ) self._dispatcher_process = self._create_local_dispatcher_process() - def _create_local_dispatcher_process(self): - self_affinity = list(os.sched_getaffinity(os.getpid())) - os.sched_setaffinity(os.getpid(), self_affinity[:-8]) + def _create_local_dispatcher_process(self) -> dragon_process_group.ProcessGroup: + if sys.platform != "darwin": + self_affinity: list[int] = list(os.sched_getaffinity(os.getpid())) + os.sched_setaffinity(os.getpid(), self_affinity[:-8]) + else: + self_affinity: list[int] = [] global_policy = dragon_policy.Policy( placement=dragon_policy.Policy.Placement.HOST_NAME, host_name=socket.gethostname(), - affinity = dragon_policy.Policy.Affinity.SPECIFIC, + affinity=dragon_policy.Policy.Affinity.SPECIFIC, cpu_affinity=self_affinity[-8:], device=dragon_policy.Policy.Device.CPU, - distribution = dragon_policy.Policy.Distribution.BLOCK, ) options = dragon_process_desc.ProcessOptions(make_inf_channels=True) grp = dragon_process_group.ProcessGroup( @@ -205,7 +208,7 @@ def _create_local_dispatcher_process(self): local_policy = dragon_policy.Policy( placement=dragon_policy.Policy.Placement.HOST_NAME, host_name=socket.gethostname(), - affinity = dragon_policy.Policy.Affinity.SPECIFIC, + affinity=dragon_policy.Policy.Affinity.SPECIFIC, cpu_affinity=self_affinity[-8:], device=dragon_policy.Policy.Device.CPU, ) @@ -278,8 +281,6 @@ def _on_iteration(self) -> None: ) self._perf_timer.measure_time("fetch_model") - # logger.info(f"Acquired device {device.name}") - model_result = LoadModelResult(device.get_model(batch.model_key)) self._perf_timer.measure_time("load_model") @@ -304,8 +305,7 @@ def _on_iteration(self) -> None: self._perf_timer.measure_time("transform_output") except Exception: logger.exception("Error executing worker") - for reply in replies: - reply.failed = True + else: for reply_idx, (request, transformed_output) in enumerate( zip(batch.requests, transformed_outputs) @@ -321,33 +321,27 @@ def _on_iteration(self) -> None: self._perf_timer.measure_time("assign_output") except Exception: logger.exception("Error executing worker") - reply.failed = True - if reply.outputs is None or not reply.outputs: - response 
= build_failure_reply("fail", "Outputs not found.") - else: - reply.status_enum = "complete" - reply.message = "Success" - response = build_reply(reply) - if reply.failed: - response = build_failure_reply("fail", "failure-occurred") + if reply.outputs is None or not reply.outputs: + response = build_failure_reply("fail", "Outputs not found.") else: - if reply.outputs is None or not reply.outputs: - response = build_failure_reply("fail", "no-results") + reply.status_enum = "complete" + reply.message = "Success" + response = build_reply(reply) response = build_reply(reply) self._perf_timer.measure_time("build_reply") - serialized_resp = MessageHandler.serialize_response(response) # type: ignore + serialized_resp = MessageHandler.serialize_response(response) self._perf_timer.measure_time("serialize_resp") if request.callback: request.callback.send(serialized_resp) - if reply.outputs: - # send tensor data after response - for output in reply.outputs: - request.callback.send(output) + if reply.outputs: + # send tensor data after response + for output in reply.outputs: + request.callback.send(output) self._perf_timer.measure_time("send") self._perf_timer.end_timings() diff --git a/smartsim/_core/mli/infrastructure/worker/torch_worker.py b/smartsim/_core/mli/infrastructure/worker/torch_worker.py index 84bcec0887..2cb79767f9 100644 --- a/smartsim/_core/mli/infrastructure/worker/torch_worker.py +++ b/smartsim/_core/mli/infrastructure/worker/torch_worker.py @@ -28,6 +28,7 @@ import numpy as np import torch + from .....error import SmartSimError from .....log import get_logger from ...mli_schemas.tensor import tensor_capnp diff --git a/smartsim/_core/mli/infrastructure/worker/worker.py b/smartsim/_core/mli/infrastructure/worker/worker.py index 31e189d1dd..7448fdfb79 100644 --- a/smartsim/_core/mli/infrastructure/worker/worker.py +++ b/smartsim/_core/mli/infrastructure/worker/worker.py @@ -26,6 +26,7 @@ import typing as t from abc import ABC, abstractmethod +from dataclasses import dataclass from .....error import SmartSimError from .....log import get_logger From 05b49f3bd2b9d883210c118dff7c6b15fda00fb6 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Fri, 19 Jul 2024 14:57:12 +0200 Subject: [PATCH 43/84] Correct exception_handler behavior on batch --- .../infrastructure/control/workermanager.py | 87 +++++++++++-------- 1 file changed, 50 insertions(+), 37 deletions(-) diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py index 140ad9bc70..3e39b1731c 100644 --- a/smartsim/_core/mli/infrastructure/control/workermanager.py +++ b/smartsim/_core/mli/infrastructure/control/workermanager.py @@ -292,57 +292,70 @@ def _on_iteration(self) -> None: ) self._perf_timer.measure_time("transform_input") - replies = [InferenceReply() for _ in range(len(batch.requests))] try: execute_result = self._worker.execute( batch, model_result, transformed_input ) - self._perf_timer.measure_time("execute") + except Exception as e: + for request in batch.requests: + exception_handler( + e, request.callback, "Error executing worker." + ) + return + self._perf_timer.measure_time("execute") + + try: transformed_outputs = self._worker.transform_output( batch, execute_result, self._device ) - self._perf_timer.measure_time("transform_output") - except Exception: - logger.exception("Error executing worker") + except Exception as e: + for request in batch.requests: + exception_handler( + e, request.callback, "Failed while transforming the output." 
+ ) + return + self._perf_timer.measure_time("transform_output") - else: - for reply_idx, (request, transformed_output) in enumerate( - zip(batch.requests, transformed_outputs) - ): - reply = replies[reply_idx] + for request, transformed_output in zip(batch.requests, transformed_outputs): + reply = InferenceReply() + if request.output_keys: try: - if request.output_keys: - reply.output_keys = self._worker.place_output( - request, transformed_output, self._feature_store - ) - else: - reply.outputs = transformed_output.outputs - self._perf_timer.measure_time("assign_output") - except Exception: - logger.exception("Error executing worker") - - if reply.outputs is None or not reply.outputs: - response = build_failure_reply("fail", "Outputs not found.") - else: - reply.status_enum = "complete" - reply.message = "Success" - response = build_reply(reply) - + reply.output_keys = self._worker.place_output( + request, + transformed_output, + self._feature_store, + ) + except Exception as e: + exception_handler( + e, request.callback, "Failed while placing the output." + ) + continue + else: + reply.outputs = transformed_output.outputs + self._perf_timer.measure_time("assign_output") + + + if reply.outputs is None: + response = build_failure_reply("fail", "Outputs not found.") + else: + reply.status_enum = "complete" + reply.message = "Success" response = build_reply(reply) - self._perf_timer.measure_time("build_reply") - serialized_resp = MessageHandler.serialize_response(response) + self._perf_timer.measure_time("build_reply") + + serialized_resp = MessageHandler.serialize_response(response) - self._perf_timer.measure_time("serialize_resp") + self._perf_timer.measure_time("serialize_resp") - if request.callback: - request.callback.send(serialized_resp) - if reply.outputs: - # send tensor data after response - for output in reply.outputs: - request.callback.send(output) - self._perf_timer.measure_time("send") + if request.callback: + request.callback.send(serialized_resp) + if reply.outputs: + # send tensor data after response + for output in reply.outputs: + request.callback.send(output) + self._perf_timer.measure_time("send") self._perf_timer.end_timings() From 14c3e9fec155e561696f310cc42fde060c63c6f5 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Fri, 19 Jul 2024 15:20:54 +0200 Subject: [PATCH 44/84] Style --- .../infrastructure/control/devicemanager.py | 4 +-- .../control/requestdispatcher.py | 13 +++---- .../infrastructure/control/workermanager.py | 35 +++++++++---------- smartsim/_core/utils/timings.py | 6 ++++ 4 files changed, 31 insertions(+), 27 deletions(-) diff --git a/smartsim/_core/mli/infrastructure/control/devicemanager.py b/smartsim/_core/mli/infrastructure/control/devicemanager.py index 1a2a860aa9..14b83a5044 100644 --- a/smartsim/_core/mli/infrastructure/control/devicemanager.py +++ b/smartsim/_core/mli/infrastructure/control/devicemanager.py @@ -25,7 +25,6 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
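# WorkerDevice and BatchQueue both guard themselves with a lock and rely on
# non-blocking acquire so callers skip busy resources instead of waiting on them.
# An illustrative, self-contained sketch of that pattern (names are placeholders):
import threading

class GuardedResource:
    def __init__(self, name: str) -> None:
        self.name = name
        self._lock = threading.RLock()

    def acquire(self, blocking: bool = True, timeout: float = -1) -> bool:
        return self._lock.acquire(blocking=blocking, timeout=timeout)

    def release(self) -> None:
        self._lock.release()

device = GuardedResource("gpu")
if device.acquire(blocking=False):          # returns False immediately if busy
    try:
        pass                                # use the resource here
    finally:
        device.release()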
import typing as t -from contextlib import contextmanager from threading import RLock from types import TracebackType @@ -107,7 +106,8 @@ def get_free_device( return_device = device break - # If the model is not loaded on a free device, load it on another device (if available) + # If the model is not loaded on a free device, + # load it on another device (if available) if return_device is None: for candidate_device in self._devices: if ( diff --git a/smartsim/_core/mli/infrastructure/control/requestdispatcher.py b/smartsim/_core/mli/infrastructure/control/requestdispatcher.py index 10279c01d7..b63cdcc9ee 100644 --- a/smartsim/_core/mli/infrastructure/control/requestdispatcher.py +++ b/smartsim/_core/mli/infrastructure/control/requestdispatcher.py @@ -24,12 +24,13 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -# isort: off +# pylint: disable=import-error # pylint: disable-next=unused-import import dragon from dragon.mpbridge.queues import DragonQueue +# pylint: enable=import-error +# isort: off # isort: on import multiprocessing as mp @@ -157,7 +158,7 @@ def __init__(self, batch_timeout: float, batch_size: int, model_key: str) -> Non self._id = str(uuid.uuid4()) @property - def id(self) -> str: + def queue_id(self) -> str: return self._id def acquire(self, blocking: bool = True, timeout: float = -1) -> t.Optional[bool]: @@ -327,9 +328,9 @@ def run(self) -> None: self._perf_timer.end_timings() - # pylint: disable-next=protected-access - if len(self._perf_timer._timings["r_dispatch"]) == 801: - self._perf_timer.print_timings(True) + + if self._perf_timer.max_length == 801: + self._perf_timer.print_timings(True) @property def task_queue(self) -> DragonQueue: diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py index 3e39b1731c..b5667293d6 100644 --- a/smartsim/_core/mli/infrastructure/control/workermanager.py +++ b/smartsim/_core/mli/infrastructure/control/workermanager.py @@ -24,22 +24,25 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
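# The worker manager and dispatcher record per-stage timings with the PerfTimer
# added earlier in this series. A small usage sketch, assuming the module path
# introduced by this patch; the label and the sleep are made up for illustration:
import time
from smartsim._core.utils.timings import PerfTimer

timer = PerfTimer(prefix="w_")
for _ in range(3):
    timer.start_timings()
    time.sleep(0.01)                   # stand-in for one pipeline stage
    timer.measure_time("execute")      # elapsed time since the previous mark
    timer.end_timings()                # records w_total_time for this iteration
timer.print_timings(to_file=False)     # one column per label, one row per iteration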
-import multiprocessing as mp -import os -import socket -import sys -import typing as t - +# pylint: disable=import-error +# pylint: disable-next=unused-import import dragon -import dragon.data.ddict.ddict as dragon_ddict -import dragon.infrastructure.connection as dragon_connection import dragon.infrastructure.policy as dragon_policy import dragon.infrastructure.process_desc as dragon_process_desc -import dragon.native.group_state as dragon_group_state -import dragon.native.machine as dragon_machine import dragon.native.process as dragon_process import dragon.native.process_group as dragon_process_group +# pylint: enable=import-error + +# isort: off +# isort: on + +import multiprocessing as mp +import os +import socket +import sys +import typing as t + from .....log import get_logger from ....entrypoints.service import Service from ....utils.timings import PerfTimer @@ -263,8 +266,6 @@ def _validate_request(self, request: InferenceRequest) -> bool: def _on_iteration(self) -> None: """Executes calls to the machine learning worker implementation to complete the inference pipeline""" - logger.debug("executing worker manager pipeline") - batch = self._request_dispatcher.task_queue.get() self._perf_timer.start_timings() if batch is None or 0 == len(batch.requests): @@ -292,17 +293,14 @@ def _on_iteration(self) -> None: ) self._perf_timer.measure_time("transform_input") - try: execute_result = self._worker.execute( batch, model_result, transformed_input ) except Exception as e: for request in batch.requests: - exception_handler( - e, request.callback, "Error executing worker." - ) - return + exception_handler(e, request.callback, "Error executing worker.") + return self._perf_timer.measure_time("execute") try: @@ -335,7 +333,6 @@ def _on_iteration(self) -> None: reply.outputs = transformed_output.outputs self._perf_timer.measure_time("assign_output") - if reply.outputs is None: response = build_failure_reply("fail", "Outputs not found.") else: @@ -359,7 +356,7 @@ def _on_iteration(self) -> None: self._perf_timer.end_timings() - if len(self._perf_timer._timings["w_send"]) == 801: + if self._perf_timer.max_length == 801: self._perf_timer.print_timings(True) def _can_shutdown(self) -> bool: diff --git a/smartsim/_core/utils/timings.py b/smartsim/_core/utils/timings.py index 7fa2af04a6..1d35570e65 100644 --- a/smartsim/_core/utils/timings.py +++ b/smartsim/_core/utils/timings.py @@ -79,6 +79,12 @@ def measure_time(self, label: str) -> None: ) self._interm = time.perf_counter() + @property + def max_length(self) -> int: + if len(self._timings) == 0: + return 0 + return max(len(value) for value in self._timings.values()) + def print_timings(self, to_file: bool = False) -> None: print(" ".join(self._timings.keys())) value_array = np.array(list(self._timings.values()), dtype=float) From f93522f6b83cf86e2d9502efcde0561d4b6a6a9f Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Fri, 19 Jul 2024 15:05:58 -0500 Subject: [PATCH 45/84] Working post-merge version --- ex/high_throughput_inference/mock_app.py | 3 + smartsim/_core/entrypoints/service.py | 17 ---- .../_core/launcher/dragon/dragonBackend.py | 2 - .../control/requestdispatcher.py | 81 +++++++++++++++---- .../infrastructure/control/workermanager.py | 47 ++--------- .../mli/infrastructure/worker/torch_worker.py | 42 +++++----- .../_core/mli/infrastructure/worker/worker.py | 17 ++-- smartsim/_core/utils/timings.py | 38 ++++++--- 8 files changed, 132 insertions(+), 115 deletions(-) diff --git a/ex/high_throughput_inference/mock_app.py 
b/ex/high_throughput_inference/mock_app.py index e244c93e0f..eef653791f 100644 --- a/ex/high_throughput_inference/mock_app.py +++ b/ex/high_throughput_inference/mock_app.py @@ -47,6 +47,9 @@ from smartsim._core.mli.message_handler import MessageHandler from smartsim.log import get_logger +torch.set_num_interop_threads(16) +torch.set_num_threads(1) + logger = get_logger("App") class ProtoClient: diff --git a/smartsim/_core/entrypoints/service.py b/smartsim/_core/entrypoints/service.py index df9c2bbef6..6b4ef74b67 100644 --- a/smartsim/_core/entrypoints/service.py +++ b/smartsim/_core/entrypoints/service.py @@ -103,23 +103,6 @@ def execute(self) -> None: running = True cooldown_start: t.Optional[datetime.datetime] = None - headers = [ - "batch_size", - "w_deserialize", - "w_fetch_model", - "w_load_model", - "w_fetch_input", - "w_transform_input", - "w_execute", - "w_transform_output", - "w_assign_output", - "w_build_reply", - "w_serialize_resp", - "w_send", - ] - - print(",".join(headers)) - while running: self._on_iteration() diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py index a6a8700ab0..445538f20e 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -504,8 +504,6 @@ def _start_steps(self) -> None: global_policy = dragon_policy.Policy( placement=dragon_policy.Policy.Placement.HOST_NAME, host_name=hosts[0], - affinity=dragon_policy.Policy.Affinity.SPECIFIC, - cpu_affinity=list(range(32)) + list(range(64, 64 + 32)), ) options = dragon_process_desc.ProcessOptions(make_inf_channels=True) grp = dragon_process_group.ProcessGroup( diff --git a/smartsim/_core/mli/infrastructure/control/requestdispatcher.py b/smartsim/_core/mli/infrastructure/control/requestdispatcher.py index b63cdcc9ee..c930d7d42c 100644 --- a/smartsim/_core/mli/infrastructure/control/requestdispatcher.py +++ b/smartsim/_core/mli/infrastructure/control/requestdispatcher.py @@ -28,6 +28,7 @@ # pylint: disable-next=unused-import import dragon from dragon.mpbridge.queues import DragonQueue + # pylint: enable=import-error # isort: off @@ -49,11 +50,16 @@ from ...comm.channel.channel import CommChannelBase from ...comm.channel.dragonchannel import DragonCommChannel from ...infrastructure.storage.featurestore import FeatureStore +from ...infrastructure.worker.torch_worker import TorchWorker from ...infrastructure.worker.worker import InferenceBatch, InferenceRequest from ...message_handler import MessageHandler from ...mli_schemas.model.model_capnp import Model +from ...mli_schemas.response.response_capnp import ResponseBuilder from ...mli_schemas.tensor.tensor_capnp import TensorDescriptor +if t.TYPE_CHECKING: + from smartsim._core.mli.mli_schemas.response.response_capnp import Status + logger = get_logger("Request Dispatcher") @@ -86,7 +92,6 @@ def deserialize_message( # todo: shouldn't this be `CommChannel.find` instead of `DragonCommChannel` comm_channel = channel_type(callback_key) - # comm_channel = DragonCommChannel(request.replyChannel) input_keys: t.Optional[t.List[str]] = None input_bytes: t.Optional[t.List[bytes]] = None @@ -116,6 +121,37 @@ def deserialize_message( return inference_request +def build_failure_reply(status: "Status", message: str) -> ResponseBuilder: + return MessageHandler.build_response( + status=status, + message=message, + result=[], + custom_attributes=None, + ) + + +def exception_handler( + exc: Exception, reply_channel: t.Optional[CommChannelBase], failure_message: str +) -> 
None: + """ + Logs exceptions and sends a failure response. + + :param exc: The exception to be logged + :param reply_channel: The channel used to send replies + :param failure_message: Failure message to log and send back + """ + logger.exception( + f"{failure_message}\n" + f"Exception type: {type(exc).__name__}\n" + f"Exception message: {str(exc)}" + ) + serialized_resp = MessageHandler.serialize_response( + build_failure_reply("fail", failure_message) + ) + if reply_channel: + reply_channel.send(serialized_resp) + + class WorkerDevice: def __init__(self, name: str) -> None: """Wrapper around a device to keep track of loaded Models and availability @@ -263,7 +299,8 @@ def __init__( self._outgoing_queue: DragonQueue = mp.Queue(maxsize=0) self._feature_store = feature_store self._comm_channel_type = comm_channel_type - self._perf_timer = PerfTimer(prefix="r_") + self._perf_timer = PerfTimer(prefix="r_", debug=False) + self._worker = TorchWorker() def _validate_request(self, request: InferenceRequest) -> bool: """Ensure the request can be processed. @@ -303,21 +340,26 @@ def run(self) -> None: while True: try: bytes_list: t.List[bytes] = self._incoming_channel.recv() - except Exception: pass else: + if not bytes_list: + exception_handler( + ValueError("No request data found"), + None, + "No request data found.", + ) + request_bytes = bytes_list[0] tensor_bytes_list = bytes_list[1:] + self._perf_timer.start_timings() request = deserialize_message(request_bytes, self._comm_channel_type) if request.input_meta and tensor_bytes_list: request.raw_inputs = tensor_bytes_list - self._perf_timer.start_timings() - request = deserialize_message(request_bytes, self._comm_channel_type) self._perf_timer.measure_time("deserialize_message") if not self._validate_request(request): - return + continue self._perf_timer.measure_time("validate_request") self.dispatch(request) self._perf_timer.measure_time("dispatch") @@ -328,7 +370,6 @@ def run(self) -> None: self._perf_timer.end_timings() - if self._perf_timer.max_length == 801: self._perf_timer.print_timings(True) @@ -384,15 +425,25 @@ def _update_model_version(self, model: Model) -> None: def flush_requests(self) -> None: for queue in self._queues: if queue.ready and queue.acquire(blocking=False): + self._perf_timer.measure_time("find_queue") try: - - self._perf_timer.measure_time("find_queue") - self._outgoing_queue.put( - InferenceBatch( - model_key=queue.model_key, requests=queue.flush() - ) + batch = InferenceBatch( + model_key=queue.model_key, requests=queue.flush(), inputs=None ) - self._perf_timer.measure_time("flush_requests") finally: + self._perf_timer.measure_time("flush_requests") queue.release() - break + fetch_results = self._worker.fetch_inputs( + batch=batch, feature_store=self._feature_store + ) + self._perf_timer.measure_time("fetch_input") + transformed_inputs = self._worker.transform_input( + batch=batch, fetch_results=fetch_results + ) + self._perf_timer.measure_time("transform_input") + batch.inputs = transformed_inputs + for request in batch.requests: + request.raw_inputs = [] + request.input_meta = [] + self._outgoing_queue.put(batch) + self._perf_timer.measure_time("put") diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py index b5667293d6..d41a09a0d8 100644 --- a/smartsim/_core/mli/infrastructure/control/workermanager.py +++ b/smartsim/_core/mli/infrastructure/control/workermanager.py @@ -232,41 +232,12 @@ def _on_start(self) -> None: def 
_on_shutdown(self) -> None: self._dispatcher_process.join() - def _validate_request(self, request: InferenceRequest) -> bool: - """Ensure the request can be processed. - :param request: The request to validate - :return: True if the request is valid, False otherwise""" - if not self._feature_store: - if request.model_key: - logger.error("Unable to load model by key without feature store") - return False - - if request.input_keys: - logger.error("Unable to load inputs by key without feature store") - return False - - if request.output_keys: - logger.error("Unable to persist outputs by key without feature store") - return False - - if not request.model_key and not request.raw_model: - logger.error("Unable to continue without model bytes or feature store key") - return False - - if not request.input_keys and not request.raw_inputs: - logger.error("Unable to continue without input bytes or feature store keys") - return False - - if request.callback is None: - logger.error("No callback channel provided in request") - return False - - return True - def _on_iteration(self) -> None: """Executes calls to the machine learning worker implementation to complete the inference pipeline""" - batch = self._request_dispatcher.task_queue.get() + + batch: InferenceRequest = self._request_dispatcher.task_queue.get() + self._perf_timer.start_timings() if batch is None or 0 == len(batch.requests): return @@ -285,17 +256,11 @@ def _on_iteration(self) -> None: model_result = LoadModelResult(device.get_model(batch.model_key)) self._perf_timer.measure_time("load_model") - fetch_input_results = self._worker.fetch_inputs(batch, self._feature_store) - self._perf_timer.measure_time("fetch_input") - - transformed_input = self._worker.transform_input( - batch, fetch_input_results, self._device - ) - self._perf_timer.measure_time("transform_input") + transformed_input = batch.inputs try: execute_result = self._worker.execute( - batch, model_result, transformed_input + batch, model_result, transformed_input, device.name ) except Exception as e: for request in batch.requests: @@ -305,7 +270,7 @@ def _on_iteration(self) -> None: try: transformed_outputs = self._worker.transform_output( - batch, execute_result, self._device + batch, execute_result, device.name ) except Exception as e: for request in batch.requests: diff --git a/smartsim/_core/mli/infrastructure/worker/torch_worker.py b/smartsim/_core/mli/infrastructure/worker/torch_worker.py index 2cb79767f9..45c9caadb3 100644 --- a/smartsim/_core/mli/infrastructure/worker/torch_worker.py +++ b/smartsim/_core/mli/infrastructure/worker/torch_worker.py @@ -64,7 +64,9 @@ def load_model( raise ValueError("Unable to load model without reference object") device_to_torch = {"cpu": "cpu", "gpu": "cuda"} - device = device_to_torch[device] + for old, new in device_to_torch.items(): + device = device.replace(old, new) + buffer = io.BytesIO(initial_bytes=model_bytes) model = torch.jit.load(buffer, map_location=device) # type: ignore result = LoadModelResult(model) @@ -72,16 +74,12 @@ def load_model( @staticmethod def transform_input( - batch: InferenceBatch, fetch_results: list[FetchInputResult], device: str + batch: InferenceBatch, fetch_results: list[FetchInputResult] ) -> TransformInputResult: results: list[list[torch.Tensor]] = [] start = 0 slices: list[slice] = [] - device_to_torch = {"cpu": "cpu", "gpu": "cuda"} - for old, new in device_to_torch.items(): - device = device.replace(old, new) - for fetch_result in fetch_results: partial_result = [] if fetch_result.meta is None: @@ 
-107,10 +105,10 @@ def transform_input( result.append( torch.concatenate( [partial_result[t_idx] for partial_result in results] - ).to(device) + ) ) else: - result = [tensor.to(device) for tensor in results[0]] + result = results[0] return TransformInputResult(result, slices) # return data # note: this fails copy test! @@ -121,13 +119,18 @@ def execute( batch: InferenceBatch, load_result: LoadModelResult, transform_result: TransformInputResult, + device: str, ) -> ExecuteResult: if not load_result.model: raise SmartSimError("Model must be loaded to execute") - + device_to_torch = {"cpu": "cpu", "gpu": "cuda"} + for old, new in device_to_torch.items(): + device = device.replace(old, new) model: torch.nn.Module = load_result.model model.eval() - results = [model(tensor).detach() for tensor in transform_result.transformed] + results = [ + model(tensor.to(device)).detach() for tensor in transform_result.transformed + ] execute_result = ExecuteResult(results, transform_result.slices) return execute_result @@ -140,18 +143,13 @@ def transform_output( ) -> list[TransformOutputResult]: transformed_list: list[TransformOutputResult] = [] for result_slice in execute_result.slices: - if result_device != "cpu": - transformed = [ - item.to("cpu") for item in execute_result.predictions[result_slice] - ] - # todo: need the shape from latest schemas added here. - transformed_list.append( - TransformOutputResult(transformed, None, "c", "float32") - ) # fixme - + transformed = [ + item.to("cpu").numpy().tobytes() + for item in execute_result.predictions[result_slice] + ] + # todo: need the shape from latest schemas added here. transformed_list.append( - TransformOutputResult( - execute_result.predictions[result_slice], None, "c", "float32" - ) + TransformOutputResult(transformed, None, "c", "float32") ) # fixme + return transformed_list diff --git a/smartsim/_core/mli/infrastructure/worker/worker.py b/smartsim/_core/mli/infrastructure/worker/worker.py index 7448fdfb79..ae0a847aea 100644 --- a/smartsim/_core/mli/infrastructure/worker/worker.py +++ b/smartsim/_core/mli/infrastructure/worker/worker.py @@ -67,12 +67,6 @@ def __init__( self.batch_size = batch_size -@dataclass -class InferenceBatch: - model_key: str - requests: list[InferenceRequest] - - class InferenceReply: """Internal representation of the reply to a client request for inference""" @@ -154,6 +148,13 @@ def __init__(self, result: bytes) -> None: self.model_bytes: bytes = result +@dataclass +class InferenceBatch: + model_key: str + requests: t.Optional[list[InferenceRequest]] + inputs: t.Optional[list[TransformInputResult]] + + class MachineLearningWorkerCore: """Basic functionality of ML worker that is shared across all worker types""" @@ -274,12 +275,11 @@ def load_model( @staticmethod @abstractmethod def transform_input( - batch: InferenceBatch, fetch_results: list[FetchInputResult], device: str + batch: InferenceBatch, fetch_results: list[FetchInputResult] ) -> TransformInputResult: """Given a collection of data, perform a transformation on the data :param request: The request that triggered the pipeline :param fetch_result: Raw outputs from fetching inputs out of a feature store - :param device: The device on which the transformed input must be placed :return: The transformed inputs wrapped in a InputTransformResult""" @staticmethod @@ -288,6 +288,7 @@ def execute( batch: InferenceBatch, load_result: LoadModelResult, transform_result: TransformInputResult, + device: str, ) -> ExecuteResult: """Execute an ML model on inputs transformed for use 
by the model :param request: The request that triggered the pipeline diff --git a/smartsim/_core/utils/timings.py b/smartsim/_core/utils/timings.py index 1d35570e65..0ac13662a6 100644 --- a/smartsim/_core/utils/timings.py +++ b/smartsim/_core/utils/timings.py @@ -30,15 +30,22 @@ import numpy as np +from ...log import get_logger + +logger = get_logger("PerfTimer") + class PerfTimer: - def __init__(self, filename: str = "timings", prefix: str = ""): + def __init__( + self, filename: str = "timings", prefix: str = "", debug: bool = False + ): self._start: t.Optional[float] = None self._interm: t.Optional[float] = None self._timings: OrderedDict[str, list[t.Union[float, int, str]]] = OrderedDict() self._timing_on = True self._filename = filename self._prefix = prefix + self._debug = debug def _add_label_to_timings(self, label: str) -> None: if label not in self._timings: @@ -55,30 +62,40 @@ def start_timings( ) -> None: if self._timing_on: if first_label is not None and first_value is not None: + self._log(f"{first_label}: {first_value}") self._add_label_to_timings(self._make_label(first_label)) - self._timings[self._make_label(first_label)].append(first_value) + self._timings[self._make_label(first_label)].append( + self._format_number(first_value) + ) self._start = time.perf_counter() self._interm = time.perf_counter() def end_timings(self) -> None: if self._timing_on and self._start is not None: self._add_label_to_timings(self._make_label("total_time")) - self._timings[self._make_label("total_time")].append( - self._format_number(time.perf_counter() - self._start) - ) + delta = self._format_number(time.perf_counter() - self._start) + self._timings[self._make_label("total_time")].append(delta) + self._log(f"total_time: {delta}") self._interm = None def _make_label(self, label: str) -> str: return self._prefix + label + def _get_delta(self) -> float | int: + return time.perf_counter() - self._interm + def measure_time(self, label: str) -> None: if self._timing_on and self._interm is not None: self._add_label_to_timings(self._make_label(label)) - self._timings[self._make_label(label)].append( - self._format_number(time.perf_counter() - self._interm) - ) + delta = self._format_number(self._get_delta()) + self._timings[self._make_label(label)].append(delta) + self._log(f"{label}: {delta}") self._interm = time.perf_counter() + def _log(self, msg: str) -> None: + if self._debug: + logger.info(msg) + @property def max_length(self) -> int: if len(self._timings) == 0: @@ -89,7 +106,8 @@ def print_timings(self, to_file: bool = False) -> None: print(" ".join(self._timings.keys())) value_array = np.array(list(self._timings.values()), dtype=float) value_array = np.transpose(value_array) - for i in range(value_array.shape[0]): - print(" ".join(self._format_number(value) for value in value_array[i])) + if self._debug: + for i in range(value_array.shape[0]): + print(" ".join(self._format_number(value) for value in value_array[i])) if to_file: np.save(self._prefix + self._filename + ".npy", value_array) From 1bd73883243651452da2d26430c932787382e197 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Sat, 20 Jul 2024 12:52:54 -0500 Subject: [PATCH 46/84] Fix indexing in multi-output --- ex/high_throughput_inference/mli_driver.py | 3 +- ex/high_throughput_inference/mock_app.py | 79 ++++++------------- .../mock_app_redis.py | 14 +++- ex/high_throughput_inference/redis_driver.py | 15 ++-- .../standalone_workermanager.py | 2 + .../control/requestdispatcher.py | 6 +- .../infrastructure/control/workermanager.py | 17 
++-- .../mli/infrastructure/worker/torch_worker.py | 9 ++- .../_core/mli/infrastructure/worker/worker.py | 3 +- smartsim/_core/utils/timings.py | 26 +++--- 10 files changed, 77 insertions(+), 97 deletions(-) diff --git a/ex/high_throughput_inference/mli_driver.py b/ex/high_throughput_inference/mli_driver.py index 1d4b121365..a03f391b60 100644 --- a/ex/high_throughput_inference/mli_driver.py +++ b/ex/high_throughput_inference/mli_driver.py @@ -31,7 +31,8 @@ worker_manager = exp.create_model("worker_manager", run_settings=worker_manager_rs) worker_manager.attach_generator_files(to_copy=[worker_manager_script_name]) -app_rs = exp.create_run_settings(sys.executable, exe_args = [app_script_name, "--device", device]) +app_rs: DragonRunSettings = exp.create_run_settings(sys.executable, exe_args = [app_script_name, "--device", device]) +app_rs.set_tasks_per_node(4) app = exp.create_model("app", run_settings=app_rs) app.attach_generator_files(to_copy=[app_script_name], to_symlink=[model_name]) diff --git a/ex/high_throughput_inference/mock_app.py b/ex/high_throughput_inference/mock_app.py index eef653791f..545c18b509 100644 --- a/ex/high_throughput_inference/mock_app.py +++ b/ex/high_throughput_inference/mock_app.py @@ -41,11 +41,11 @@ import os import time import torch -import numbers -from collections import OrderedDict +from mpi4py import MPI from smartsim._core.mli.message_handler import MessageHandler from smartsim.log import get_logger +from smartsim._core.utils.timings import PerfTimer torch.set_num_interop_threads(16) torch.set_num_threads(1) @@ -54,6 +54,8 @@ class ProtoClient: def __init__(self, timing_on: bool): + comm = MPI.COMM_WORLD + rank = comm.Get_rank() connect_to_infrastructure() ddict_str = os.environ["SS_DRG_DDICT"] self._ddict = DDict.attach(ddict_str) @@ -70,53 +72,14 @@ def __init__(self, timing_on: bool): self._start = None self._interm = None - self._timings: OrderedDict[str, list[numbers.Number]] = OrderedDict() - self._timing_on = timing_on - - def _add_label_to_timings(self, label: str): - if label not in self._timings: - self._timings[label] = [] - - @staticmethod - def _format_number(number: numbers.Number): - return f"{number:0.4e}" - - def start_timings(self, batch_size: int): - if self._timing_on: - self._add_label_to_timings("batch_size") - self._timings["batch_size"].append(batch_size) - self._start = time.perf_counter() - self._interm = time.perf_counter() - - def end_timings(self): - if self._timing_on: - self._add_label_to_timings("total_time") - self._timings["total_time"].append(self._format_number(time.perf_counter()-self._start)) - - def measure_time(self, label: str): - if self._timing_on: - self._add_label_to_timings(label) - self._timings[label].append(self._format_number(time.perf_counter()-self._interm)) - self._interm = time.perf_counter() - - def print_timings(self, to_file: bool = False): - print(" ".join(self._timings.keys())) - value_array = numpy.array([value for value in self._timings.values()], dtype=float) - value_array = numpy.transpose(value_array) - for i in range(value_array.shape[0]): - print(" ".join(self._format_number(value) for value in value_array[i])) - if to_file: - numpy.save("timings.npy", value_array) - numpy.savetxt("timings.txt", value_array) - + self._perf_timer: PerfTimer = PerfTimer(debug=False, timing_on=timing_on, prefix=f"a{rank}_") def run_model(self, model: bytes | str, batch: torch.Tensor): tensors = [batch.numpy()] - self.start_timings(batch.shape[0]) + self._perf_timer.start_timings("batch_size", batch.shape[0]) 
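The hand-rolled timing code above is replaced by the shared PerfTimer, so every run_model call produces one row of measurements keyed by the batch size. A minimal sketch of that start/measure/end pattern, assuming the PerfTimer class from smartsim/_core/utils/timings.py as extended in this series (prefix, timing_on and debug keyword arguments) is importable:

# Sketch of the PerfTimer usage pattern followed by ProtoClient.
# The prefix "a0_" stands in for the per-rank prefix f"a{rank}_".
from smartsim._core.utils.timings import PerfTimer

timer = PerfTimer(prefix="a0_", timing_on=True, debug=False)

for batch_size in (1, 2, 4):
    timer.start_timings("batch_size", batch_size)  # first column of the row
    # ... build the request, send it, wait for the reply ...
    timer.measure_time("send")
    timer.measure_time("receive")
    timer.end_timings()  # records total_time and closes the row

timer.print_timings(to_file=True)  # writes <prefix>timings.npy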
built_tensor_desc = MessageHandler.build_tensor_descriptor( "c", "float32", list(batch.shape)) - self.measure_time("build_tensor_descriptor") - built_model = None + self._perf_timer.measure_time("build_tensor_descriptor") if isinstance(model, str): model_arg = MessageHandler.build_model_key(model) else: @@ -129,22 +92,21 @@ def run_model(self, model: bytes | str, batch: torch.Tensor): output_descriptors=[], custom_attributes=None, ) - self.measure_time("build_request") + self._perf_timer.measure_time("build_request") request_bytes = MessageHandler.serialize_request(request) - self.measure_time("serialize_request") + self._perf_timer.measure_time("serialize_request") with self._to_worker_fli.sendh(timeout=None, stream_channel=self._to_worker_ch) as to_sendh: to_sendh.send_bytes(request_bytes) for t in tensors: to_sendh.send_bytes(t.tobytes()) #TODO NOT FAST ENOUGH!!! # to_sendh.send_bytes(bytes(t.data)) - logger.info(f"Message size: {len(request_bytes)} bytes") - self.measure_time("send") + self._perf_timer.measure_time("send") with self._from_worker_ch.recvh(timeout=None) as from_recvh: resp = from_recvh.recv_bytes(timeout=None) - self.measure_time("receive") + self._perf_timer.measure_time("receive") response = MessageHandler.deserialize_response(resp) - self.measure_time("deserialize_response") + self._perf_timer.measure_time("deserialize_response") # list of data blobs? recv depending on the len(response.result.descriptors)? data_blob = from_recvh.recv_bytes(timeout=None) result = torch.from_numpy( @@ -153,14 +115,17 @@ def run_model(self, model: bytes | str, batch: torch.Tensor): dtype=str(response.result.descriptors[0].dataType), ) ) - self.measure_time("deserialize_tensor") + self._perf_timer.measure_time("deserialize_tensor") - self.end_timings() + self._perf_timer.end_timings() return result def set_model(self, key: str, model: bytes): self._ddict[key] = model + def print_timings(self, to_file: bool): + self._perf_timer.print_timings(to_file) + class ResNetWrapper(): def __init__(self, name: str, model: str): @@ -193,12 +158,12 @@ def name(self): client = ProtoClient(timing_on=True) client.set_model(resnet.name, resnet.model) - total_iterations = 100 + TOTAL_ITERATIONS = 100 - for batch_size in [1, 2, 4, 8, 16, 32, 64, 128]: - logger.info(f"Batch size: {batch_size}") - for iteration_number in range(total_iterations + int(batch_size==1)): + for b_size in [1, 2, 4, 8, 16, 32, 64, 128]: + logger.info(f"Batch size: {b_size}") + for iteration_number in range(TOTAL_ITERATIONS + int(b_size==1)): logger.info(f"Iteration: {iteration_number}") - client.run_model(resnet.name, resnet.get_batch(batch_size)) + client.run_model(resnet.name, resnet.get_batch(b_size)) client.print_timings(to_file=True) \ No newline at end of file diff --git a/ex/high_throughput_inference/mock_app_redis.py b/ex/high_throughput_inference/mock_app_redis.py index c56b4fb8b4..c0e67f82df 100644 --- a/ex/high_throughput_inference/mock_app_redis.py +++ b/ex/high_throughput_inference/mock_app_redis.py @@ -29,6 +29,7 @@ import numpy import time import torch +from mpi4py import MPI from smartsim.log import get_logger from smartredis import Client @@ -56,6 +57,9 @@ def name(self): if __name__ == "__main__": + comm = MPI.COMM_WORLD + rank = comm.Get_rank() + parser = argparse.ArgumentParser("Mock application") parser.add_argument("--device", default="cpu") args = parser.parse_args() @@ -73,9 +77,11 @@ def name(self): timing = [batch_size] logger.info(f"Iteration: {iteration_number}") start = time.perf_counter() - 
client.put_tensor(name="batch", data=resnet.get_batch(batch_size).numpy()) - client.run_model(name=resnet.name, inputs=["batch"], outputs=["result"]) - result = client.get_tensor(name="result") + input_name = f"batch_{rank}" + output_name = f"result_{rank}" + client.put_tensor(name=input_name, data=resnet.get_batch(batch_size).numpy()) + client.run_model(name=resnet.name, inputs=[input_name], outputs=[output_name]) + result = client.get_tensor(name=output_name) end = time.perf_counter() timing.append(end-start) timings.append(timing) @@ -83,6 +89,6 @@ def name(self): timings_np = numpy.asarray(timings) - numpy.save("timings.npy", timings_np) + numpy.save(f"timings_{rank}.npy", timings_np) for timing in timings: print(" ".join(str(t) for t in timing)) diff --git a/ex/high_throughput_inference/redis_driver.py b/ex/high_throughput_inference/redis_driver.py index ceddba4ef7..6a8b00c2a8 100644 --- a/ex/high_throughput_inference/redis_driver.py +++ b/ex/high_throughput_inference/redis_driver.py @@ -29,23 +29,24 @@ from smartsim import Experiment from smartsim.status import TERMINAL_STATUSES import time -import typing as t -device = "gpu" +DEVICE = "gpu" filedir = os.path.dirname(__file__) app_script_name = os.path.join(filedir, "mock_app_redis.py") -model_name = os.path.join(filedir, f"resnet50.{device.upper()}.pt") +model_name = os.path.join(filedir, f"resnet50.{DEVICE.upper()}.pt") -exp_path = os.path.join(filedir, "redis_ai") +exp_path = os.path.join(filedir, "redis_ai_multi") os.makedirs(exp_path, exist_ok=True) -exp = Experiment("redis_ai", launcher="slurm", exp_path=exp_path) +exp = Experiment("redis_ai_multi", launcher="slurm", exp_path=exp_path) db = exp.create_database(interface="hsn0") -app_rs = exp.create_run_settings(sys.executable, exe_args = [app_script_name, "--device", device]) +app_rs = exp.create_run_settings( + sys.executable, exe_args = [app_script_name, "--device", DEVICE] + ) app_rs.set_nodes(1) -app_rs.set_tasks(1) +app_rs.set_tasks(4) app = exp.create_model("app", run_settings=app_rs) app.attach_generator_files(to_copy=[app_script_name], to_symlink=[model_name]) diff --git a/ex/high_throughput_inference/standalone_workermanager.py b/ex/high_throughput_inference/standalone_workermanager.py index f781444d81..7ccfdc21c4 100644 --- a/ex/high_throughput_inference/standalone_workermanager.py +++ b/ex/high_throughput_inference/standalone_workermanager.py @@ -100,5 +100,7 @@ cooldown=10, comm_channel_type=DragonCommChannel, device = args.device, + batch_size=4, + batch_timeout=0.1, ) worker_manager.execute() diff --git a/smartsim/_core/mli/infrastructure/control/requestdispatcher.py b/smartsim/_core/mli/infrastructure/control/requestdispatcher.py index c930d7d42c..c45edb33f2 100644 --- a/smartsim/_core/mli/infrastructure/control/requestdispatcher.py +++ b/smartsim/_core/mli/infrastructure/control/requestdispatcher.py @@ -299,7 +299,7 @@ def __init__( self._outgoing_queue: DragonQueue = mp.Queue(maxsize=0) self._feature_store = feature_store self._comm_channel_type = comm_channel_type - self._perf_timer = PerfTimer(prefix="r_", debug=False) + self._perf_timer = PerfTimer(prefix="r_", debug=True) self._worker = TorchWorker() def _validate_request(self, request: InferenceRequest) -> bool: @@ -370,8 +370,8 @@ def run(self) -> None: self._perf_timer.end_timings() - if self._perf_timer.max_length == 801: - self._perf_timer.print_timings(True) + if self._perf_timer.max_length == 4*801: + self._perf_timer.print_timings(False) @property def task_queue(self) -> DragonQueue: diff --git 
a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py index d41a09a0d8..159ce10478 100644 --- a/smartsim/_core/mli/infrastructure/control/workermanager.py +++ b/smartsim/_core/mli/infrastructure/control/workermanager.py @@ -180,7 +180,7 @@ def __init__( """Dispatcher used to batch requests""" self._device_manager: DeviceManager = DeviceManager([WorkerDevice("gpu")]) - self._perf_timer = PerfTimer(prefix="w_") + self._perf_timer = PerfTimer(prefix="w_", debug=False) try: mp.set_start_method("dragon") @@ -192,17 +192,17 @@ def __init__( self._dispatcher_process = self._create_local_dispatcher_process() def _create_local_dispatcher_process(self) -> dragon_process_group.ProcessGroup: + dispatcher_cpus = 2 if sys.platform != "darwin": self_affinity: list[int] = list(os.sched_getaffinity(os.getpid())) - os.sched_setaffinity(os.getpid(), self_affinity[:-8]) + os.sched_setaffinity(os.getpid(), self_affinity[:-dispatcher_cpus]) else: self_affinity: list[int] = [] global_policy = dragon_policy.Policy( placement=dragon_policy.Policy.Placement.HOST_NAME, host_name=socket.gethostname(), affinity=dragon_policy.Policy.Affinity.SPECIFIC, - cpu_affinity=self_affinity[-8:], - device=dragon_policy.Policy.Device.CPU, + cpu_affinity=self_affinity[-dispatcher_cpus:], ) options = dragon_process_desc.ProcessOptions(make_inf_channels=True) grp = dragon_process_group.ProcessGroup( @@ -212,8 +212,7 @@ def _create_local_dispatcher_process(self) -> dragon_process_group.ProcessGroup: placement=dragon_policy.Policy.Placement.HOST_NAME, host_name=socket.gethostname(), affinity=dragon_policy.Policy.Affinity.SPECIFIC, - cpu_affinity=self_affinity[-8:], - device=dragon_policy.Policy.Device.CPU, + cpu_affinity=self_affinity[-dispatcher_cpus:], ) tmp_proc = dragon_process.ProcessTemplate( target=self._request_dispatcher.run, @@ -243,7 +242,6 @@ def _on_iteration(self) -> None: return self._perf_timer.measure_time("flush_requests") - # logger.info(f"Got batch of {len(batch.requests)} requests, acquiring device") device: WorkerDevice = next( self._device_manager.get_free_device( worker=self._worker, @@ -270,7 +268,7 @@ def _on_iteration(self) -> None: try: transformed_outputs = self._worker.transform_output( - batch, execute_result, device.name + batch, execute_result ) except Exception as e: for request in batch.requests: @@ -281,6 +279,7 @@ def _on_iteration(self) -> None: self._perf_timer.measure_time("transform_output") for request, transformed_output in zip(batch.requests, transformed_outputs): + print(len(transformed_output.outputs), flush=True) reply = InferenceReply() if request.output_keys: try: @@ -321,7 +320,7 @@ def _on_iteration(self) -> None: self._perf_timer.end_timings() - if self._perf_timer.max_length == 801: + if self._perf_timer.max_length == 4*801: self._perf_timer.print_timings(True) def _can_shutdown(self) -> bool: diff --git a/smartsim/_core/mli/infrastructure/worker/torch_worker.py b/smartsim/_core/mli/infrastructure/worker/torch_worker.py index 45c9caadb3..cc70c9451c 100644 --- a/smartsim/_core/mli/infrastructure/worker/torch_worker.py +++ b/smartsim/_core/mli/infrastructure/worker/torch_worker.py @@ -27,6 +27,7 @@ import io import numpy as np +import pickle import torch from .....error import SmartSimError @@ -44,7 +45,7 @@ ) torch.set_num_threads(1) -torch.set_num_interop_threads(16) +torch.set_num_interop_threads(2) logger = get_logger(__name__) @@ -139,13 +140,13 @@ def execute( def transform_output( batch: InferenceBatch, 
execute_result: ExecuteResult, - result_device: str, ) -> list[TransformOutputResult]: transformed_list: list[TransformOutputResult] = [] for result_slice in execute_result.slices: + print(result_slice, flush=True) transformed = [ - item.to("cpu").numpy().tobytes() - for item in execute_result.predictions[result_slice] + item[result_slice].to("cpu").numpy().tobytes() + for item in execute_result.predictions ] # todo: need the shape from latest schemas added here. transformed_list.append( diff --git a/smartsim/_core/mli/infrastructure/worker/worker.py b/smartsim/_core/mli/infrastructure/worker/worker.py index ae0a847aea..f0074e474e 100644 --- a/smartsim/_core/mli/infrastructure/worker/worker.py +++ b/smartsim/_core/mli/infrastructure/worker/worker.py @@ -299,11 +299,10 @@ def execute( @staticmethod @abstractmethod def transform_output( - batch: InferenceBatch, execute_result: ExecuteResult, result_device: str + batch: InferenceBatch, execute_result: ExecuteResult ) -> t.List[TransformOutputResult]: """Given inference results, perform transformations required to transmit results to the requestor. :param request: The request that triggered the pipeline :param execute_result: The result of inference wrapped in an ExecuteResult - :param result_device: The device on which the result of inference is placed :return:""" diff --git a/smartsim/_core/utils/timings.py b/smartsim/_core/utils/timings.py index 0ac13662a6..154ebb67b8 100644 --- a/smartsim/_core/utils/timings.py +++ b/smartsim/_core/utils/timings.py @@ -37,7 +37,7 @@ class PerfTimer: def __init__( - self, filename: str = "timings", prefix: str = "", debug: bool = False + self, filename: str = "timings", prefix: str = "", timing_on: bool = True, debug: bool = False ): self._start: t.Optional[float] = None self._interm: t.Optional[float] = None @@ -62,34 +62,40 @@ def start_timings( ) -> None: if self._timing_on: if first_label is not None and first_value is not None: - self._log(f"{first_label}: {first_value}") - self._add_label_to_timings(self._make_label(first_label)) - self._timings[self._make_label(first_label)].append( - self._format_number(first_value) + mod_label = self._make_label(first_label) + value = self._format_number(first_value) + self._log(f"Started timing: {first_label}: {value}") + self._add_label_to_timings(mod_label) + self._timings[mod_label].append( + value ) self._start = time.perf_counter() self._interm = time.perf_counter() def end_timings(self) -> None: if self._timing_on and self._start is not None: - self._add_label_to_timings(self._make_label("total_time")) + mod_label = self._make_label("total_time") + self._add_label_to_timings(mod_label) delta = self._format_number(time.perf_counter() - self._start) self._timings[self._make_label("total_time")].append(delta) - self._log(f"total_time: {delta}") + self._log(f"Finished timing: {mod_label}: {delta}") self._interm = None def _make_label(self, label: str) -> str: return self._prefix + label def _get_delta(self) -> float | int: + if self._interm is None: + return 0 return time.perf_counter() - self._interm def measure_time(self, label: str) -> None: if self._timing_on and self._interm is not None: - self._add_label_to_timings(self._make_label(label)) + mod_label = self._make_label(label) + self._add_label_to_timings(mod_label) delta = self._format_number(self._get_delta()) - self._timings[self._make_label(label)].append(delta) - self._log(f"{label}: {delta}") + self._timings[mod_label].append(delta) + self._log(f"{mod_label}: {delta}") self._interm = 
time.perf_counter() def _log(self, msg: str) -> None: From d1e9639260010d706512dbb86020a3b441e45468 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Sun, 21 Jul 2024 18:24:46 -0500 Subject: [PATCH 47/84] Almost good results --- ex/high_throughput_inference/mli_driver.py | 21 ++- ex/high_throughput_inference/mock_app.py | 4 +- .../standalone_workermanager.py | 2 +- .../_core/mli/comm/channel/dragonchannel.py | 5 +- smartsim/_core/mli/comm/channel/dragonfli.py | 6 +- .../control/requestdispatcher.py | 24 ++-- .../infrastructure/control/workermanager.py | 35 +++-- .../mli/infrastructure/worker/torch_worker.py | 135 +++++++++++++----- .../_core/mli/infrastructure/worker/worker.py | 4 +- 9 files changed, 159 insertions(+), 77 deletions(-) diff --git a/ex/high_throughput_inference/mli_driver.py b/ex/high_throughput_inference/mli_driver.py index a03f391b60..c7c5445b8a 100644 --- a/ex/high_throughput_inference/mli_driver.py +++ b/ex/high_throughput_inference/mli_driver.py @@ -1,5 +1,3 @@ - - import os import base64 import cloudpickle @@ -27,16 +25,27 @@ torch_worker_str = base64.b64encode(cloudpickle.dumps(TorchWorker)).decode("ascii") -worker_manager_rs: DragonRunSettings = exp.create_run_settings(sys.executable, [worker_manager_script_name, "--device", device, "--worker_class", torch_worker_str]) +worker_manager_rs: DragonRunSettings = exp.create_run_settings( + sys.executable, + [ + worker_manager_script_name, + "--device", + device, + "--worker_class", + torch_worker_str, + ], +) worker_manager = exp.create_model("worker_manager", run_settings=worker_manager_rs) worker_manager.attach_generator_files(to_copy=[worker_manager_script_name]) -app_rs: DragonRunSettings = exp.create_run_settings(sys.executable, exe_args = [app_script_name, "--device", device]) +app_rs: DragonRunSettings = exp.create_run_settings( + sys.executable, + exe_args=[app_script_name, "--device", device], +) app_rs.set_tasks_per_node(4) app = exp.create_model("app", run_settings=app_rs) app.attach_generator_files(to_copy=[app_script_name], to_symlink=[model_name]) - exp.generate(worker_manager, app, overwrite=True) exp.start(worker_manager, app, block=False) @@ -50,4 +59,4 @@ exp.stop(app) break -print("Exiting.") \ No newline at end of file +print("Exiting.") diff --git a/ex/high_throughput_inference/mock_app.py b/ex/high_throughput_inference/mock_app.py index 545c18b509..e497c1fdee 100644 --- a/ex/high_throughput_inference/mock_app.py +++ b/ex/high_throughput_inference/mock_app.py @@ -160,10 +160,12 @@ def name(self): TOTAL_ITERATIONS = 100 - for b_size in [1, 2, 4, 8, 16, 32, 64, 128]: + for log2_bsize in range(7): + b_size: int = 2**log2_bsize logger.info(f"Batch size: {b_size}") for iteration_number in range(TOTAL_ITERATIONS + int(b_size==1)): logger.info(f"Iteration: {iteration_number}") client.run_model(resnet.name, resnet.get_batch(b_size)) + logger.info(client._perf_timer.get_last("total_time")) client.print_timings(to_file=True) \ No newline at end of file diff --git a/ex/high_throughput_inference/standalone_workermanager.py b/ex/high_throughput_inference/standalone_workermanager.py index 7ccfdc21c4..8c870d1b95 100644 --- a/ex/high_throughput_inference/standalone_workermanager.py +++ b/ex/high_throughput_inference/standalone_workermanager.py @@ -101,6 +101,6 @@ comm_channel_type=DragonCommChannel, device = args.device, batch_size=4, - batch_timeout=0.1, + batch_timeout=0.0005, # 1e-3 is the best with ResNet50 ) worker_manager.execute() diff --git a/smartsim/_core/mli/comm/channel/dragonchannel.py 
b/smartsim/_core/mli/comm/channel/dragonchannel.py index a45adaee33..1370c57452 100644 --- a/smartsim/_core/mli/comm/channel/dragonchannel.py +++ b/smartsim/_core/mli/comm/channel/dragonchannel.py @@ -24,7 +24,6 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import sys import typing as t import smartsim._core.mli.comm.channel.channel as cch @@ -52,6 +51,6 @@ def send(self, value: bytes) -> None: def recv(self) -> t.List[bytes]: """Receieve a message through the underlying communication channel :returns: the received message""" - with self._channel.recvh(timeout=None) as recvh: - message_bytes: bytes = recvh.recv_bytes(timeout=None) + with self._channel.recvh(timeout=0.01) as recvh: + message_bytes: bytes = recvh.recv_bytes(timeout=1) return [message_bytes] diff --git a/smartsim/_core/mli/comm/channel/dragonfli.py b/smartsim/_core/mli/comm/channel/dragonfli.py index 28b4c2bf3b..3d1ed3a1f6 100644 --- a/smartsim/_core/mli/comm/channel/dragonfli.py +++ b/smartsim/_core/mli/comm/channel/dragonfli.py @@ -62,11 +62,11 @@ def recv(self) -> t.List[bytes]: :returns: the received message""" messages = [] eot = False - with self._fli.recvh(timeout=None) as recvh: + with self._fli.recvh(timeout=0.01) as recvh: while not eot: try: - message, _ = recvh.recv_bytes(timeout=None) + message, _ = recvh.recv_bytes(timeout=1) messages.append(message) - except fli.FLIEOT as exc: + except fli.FLIEOT: eot = True return messages diff --git a/smartsim/_core/mli/infrastructure/control/requestdispatcher.py b/smartsim/_core/mli/infrastructure/control/requestdispatcher.py index c45edb33f2..d8ac4e2b7c 100644 --- a/smartsim/_core/mli/infrastructure/control/requestdispatcher.py +++ b/smartsim/_core/mli/infrastructure/control/requestdispatcher.py @@ -38,14 +38,15 @@ import time import typing as t import uuid +from concurrent.futures import Future, ThreadPoolExecutor from queue import Empty, Full, Queue -from threading import Lock +from threading import RLock from types import TracebackType from packaging.version import Version from .....error import SmartSimError -from .....log import get_logger +from .....log import ContextThread, get_logger from ....utils.timings import PerfTimer from ...comm.channel.channel import CommChannelBase from ...comm.channel.dragonchannel import DragonCommChannel @@ -161,7 +162,7 @@ def __init__(self, name: str) -> None: """The name used by the toolkit to identify this device""" self._models: dict[str, t.Any] = {} """Dictionary of model key to model for models stored on this device""" - self._lock = Lock() + self._lock = RLock() """Lock to ensure only one thread at the time accesses this device""" def acquire(self, blocking: bool = True, timeout: float = -1) -> t.Optional[bool]: @@ -190,7 +191,7 @@ def __init__(self, batch_timeout: float, batch_size: int, model_key: str) -> Non self._first_put: t.Optional[float] = None self._disposable = False self._model_key = model_key - self._flush_lock = Lock() + self._flush_lock = RLock() self._id = str(uuid.uuid4()) @property @@ -294,12 +295,12 @@ def __init__( self._model_name_to_key: dict[str, str] = {} self._batch_timeout = batch_timeout self._batch_size = batch_size - self._queue_swap_lock: t.Optional[Lock] = None + self._queue_swap_lock: t.Optional[RLock] = None self._incoming_channel = incoming_channel self._outgoing_queue: DragonQueue = mp.Queue(maxsize=0) self._feature_store = feature_store self._comm_channel_type = comm_channel_type - 
self._perf_timer = PerfTimer(prefix="r_", debug=True) + self._perf_timer = PerfTimer(prefix="r_", debug=False, timing_on=False) self._worker = TorchWorker() def _validate_request(self, request: InferenceRequest) -> bool: @@ -334,7 +335,7 @@ def _validate_request(self, request: InferenceRequest) -> bool: return True def run(self) -> None: - self._queue_swap_lock = Lock() + self._queue_swap_lock = RLock() if self._incoming_channel is None: raise SmartSimError("No incoming channel for dispatcher") while True: @@ -357,11 +358,14 @@ def run(self) -> None: request = deserialize_message(request_bytes, self._comm_channel_type) if request.input_meta and tensor_bytes_list: request.raw_inputs = tensor_bytes_list + self._perf_timer.measure_time("deserialize_message") if not self._validate_request(request): continue + self._perf_timer.measure_time("validate_request") self.dispatch(request) + self._perf_timer.measure_time("dispatch") finally: self.flush_requests() @@ -370,9 +374,6 @@ def run(self) -> None: self._perf_timer.end_timings() - if self._perf_timer.max_length == 4*801: - self._perf_timer.print_timings(False) - @property def task_queue(self) -> DragonQueue: return self._outgoing_queue @@ -425,6 +426,7 @@ def _update_model_version(self, model: Model) -> None: def flush_requests(self) -> None: for queue in self._queues: if queue.ready and queue.acquire(blocking=False): + self._perf_timer.start_timings() self._perf_timer.measure_time("find_queue") try: batch = InferenceBatch( @@ -445,5 +447,7 @@ def flush_requests(self) -> None: for request in batch.requests: request.raw_inputs = [] request.input_meta = [] + self._outgoing_queue.put(batch) self._perf_timer.measure_time("put") + self._perf_timer.end_timings() diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py index 159ce10478..65111fe482 100644 --- a/smartsim/_core/mli/infrastructure/control/workermanager.py +++ b/smartsim/_core/mli/infrastructure/control/workermanager.py @@ -51,8 +51,8 @@ from ...infrastructure.environmentloader import EnvironmentConfigLoader from ...infrastructure.storage.featurestore import FeatureStore from ...infrastructure.worker.worker import ( + InferenceBatch, InferenceReply, - InferenceRequest, LoadModelResult, MachineLearningWorkerBase, ) @@ -178,21 +178,21 @@ def __init__( feature_store=self._feature_store, ) """Dispatcher used to batch requests""" - self._device_manager: DeviceManager = DeviceManager([WorkerDevice("gpu")]) - - self._perf_timer = PerfTimer(prefix="w_", debug=False) + self._device_manager: DeviceManager = DeviceManager( + [WorkerDevice(f"gpu:{idx}") for idx in range(4)] + ) + self._device_idx: int = 0 + self._perf_timer = PerfTimer(prefix="w_", debug=False, timing_on=False) try: mp.set_start_method("dragon") except RuntimeError: pass - # self._dispatcher_process = mp.Process( - # target=self._request_dispatcher.run, name="Dispatcher" - # ) + self._dispatcher_process = self._create_local_dispatcher_process() def _create_local_dispatcher_process(self) -> dragon_process_group.ProcessGroup: - dispatcher_cpus = 2 + dispatcher_cpus = 16 if sys.platform != "darwin": self_affinity: list[int] = list(os.sched_getaffinity(os.getpid())) os.sched_setaffinity(os.getpid(), self_affinity[:-dispatcher_cpus]) @@ -235,7 +235,7 @@ def _on_iteration(self) -> None: """Executes calls to the machine learning worker implementation to complete the inference pipeline""" - batch: InferenceRequest = self._request_dispatcher.task_queue.get() + batch: 
InferenceBatch = self._request_dispatcher.task_queue.get() self._perf_timer.start_timings() if batch is None or 0 == len(batch.requests): @@ -254,6 +254,14 @@ def _on_iteration(self) -> None: model_result = LoadModelResult(device.get_model(batch.model_key)) self._perf_timer.measure_time("load_model") + if batch.inputs is None: + for request in batch.requests: + exception_handler( + ValueError("Error batching inputs"), + request.callback, + "Error batching inputs.", + ) + return transformed_input = batch.inputs try: @@ -267,9 +275,7 @@ def _on_iteration(self) -> None: self._perf_timer.measure_time("execute") try: - transformed_outputs = self._worker.transform_output( - batch, execute_result - ) + transformed_outputs = self._worker.transform_output(batch, execute_result) except Exception as e: for request in batch.requests: exception_handler( @@ -279,7 +285,6 @@ def _on_iteration(self) -> None: self._perf_timer.measure_time("transform_output") for request, transformed_output in zip(batch.requests, transformed_outputs): - print(len(transformed_output.outputs), flush=True) reply = InferenceReply() if request.output_keys: try: @@ -320,8 +325,8 @@ def _on_iteration(self) -> None: self._perf_timer.end_timings() - if self._perf_timer.max_length == 4*801: - self._perf_timer.print_timings(True) + # if self._perf_timer.max_length == 4 * 801: + # self._perf_timer.print_timings(True) def _can_shutdown(self) -> bool: """Return true when the criteria to shut down the service are met.""" diff --git a/smartsim/_core/mli/infrastructure/worker/torch_worker.py b/smartsim/_core/mli/infrastructure/worker/torch_worker.py index cc70c9451c..52a2698467 100644 --- a/smartsim/_core/mli/infrastructure/worker/torch_worker.py +++ b/smartsim/_core/mli/infrastructure/worker/torch_worker.py @@ -25,9 +25,9 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
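The transform_input rewrite below batches requests by concatenating their raw byte payloads and rebuilding one tensor per input, while recording a slice per request so the batched result can be split again after execution. A self-contained sketch of that idea using plain numpy/torch and illustrative names (batch_raw_inputs is not part of the smartsim API):

# Batching sketch: join raw byte payloads from several requests into a single
# tensor and keep per-request slices along the first (sample) dimension.
import numpy as np
import torch


def batch_raw_inputs(payloads: list[bytes], dims: list[list[int]], dtype: str = "float32"):
    slices: list[slice] = []
    total = 0
    for shape in dims:
        # assumption: the first dimension of every payload is the sample count
        slices.append(slice(total, total + shape[0]))
        total += shape[0]
    # one join of the byte buffers instead of stacking per-request tensors
    all_bytes = b"".join(payloads)
    batched = torch.from_numpy(
        np.frombuffer(all_bytes, dtype=dtype).reshape([total, *dims[0][1:]])
    )
    return batched, slices


# Two requests with 2 and 3 samples of shape (4,) become one (5, 4) tensor
reqs = [np.random.rand(2, 4).astype("float32"), np.random.rand(3, 4).astype("float32")]
batched, slices = batch_raw_inputs([r.tobytes() for r in reqs], [list(r.shape) for r in reqs])
assert batched.shape == (5, 4)
assert [s.stop - s.start for s in slices] == [2, 3]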
import io +from concurrent.futures import Future, ThreadPoolExecutor import numpy as np -import pickle import torch from .....error import SmartSimError @@ -44,7 +44,7 @@ TransformOutputResult, ) -torch.set_num_threads(1) +torch.set_num_threads(4) torch.set_num_interop_threads(2) logger = get_logger(__name__) @@ -70,6 +70,7 @@ def load_model( buffer = io.BytesIO(initial_bytes=model_bytes) model = torch.jit.load(buffer, map_location=device) # type: ignore + model.eval() result = LoadModelResult(model) return result @@ -77,42 +78,99 @@ def load_model( def transform_input( batch: InferenceBatch, fetch_results: list[FetchInputResult] ) -> TransformInputResult: - results: list[list[torch.Tensor]] = [] - start = 0 + results: list[torch.Tensor] = [] + total_samples = 0 slices: list[slice] = [] - for fetch_result in fetch_results: - partial_result = [] - if fetch_result.meta is None: - raise ValueError("Cannot reconstruct tensor without meta information") - for idx, (item, item_meta) in enumerate( - zip(fetch_result.inputs, fetch_result.meta) + all_dims: list[list[int]] = [] + all_dtypes: list[str] = [] + if fetch_results[0].meta is None: + raise ValueError("Cannot reconstruct tensor without meta information") + # Traverse inputs to get total number of samples and compute slices + # Assumption: first dimension is samples, all tensors in the same input + # have same number of samples + # thus we only look at the first tensor for each input + for res_idx, fetch_result in enumerate(fetch_results): + if fetch_result.meta is None or any( + item_meta is None for item_meta in fetch_result.meta ): - tensor_desc: tensor_capnp.TensorDescriptor = item_meta - partial_result.append( - torch.tensor( - np.frombuffer(item, dtype=str(tensor_desc.dataType)) - ).reshape(tuple(dim for dim in tensor_desc.dimensions)) - ) - if idx == 0: - num_samples = tensor_desc.dimensions[0] - slices.append(slice(start, start + num_samples)) - start = start + num_samples - results.append(partial_result) - - result: list[torch.Tensor] = [] - if len(batch.requests) > 1: - for t_idx in range(len(results[0])): - result.append( - torch.concatenate( - [partial_result[t_idx] for partial_result in results] - ) + raise ValueError("Cannot reconstruct tensor without meta information") + first_tensor_desc: tensor_capnp.TensorDescriptor = fetch_result.meta[0] + num_samples = first_tensor_desc.dimensions[0] + slices.append(slice(total_samples, total_samples + num_samples)) + total_samples = total_samples + num_samples + + if res_idx == len(fetch_results)-1: + # For each tensor in the last input, get remaining dimensions + # Assumptions: all inputs have the same number of tensors and + # last N-1 dimensions match across inputs for corresponding tensors + # thus: resulting array will be of size (num_samples, all_other_dims) + for item_meta in fetch_result.meta: + tensor_desc: tensor_capnp.TensorDescriptor = item_meta + tensor_dims = list(tensor_desc.dimensions) + all_dims.append([total_samples, *tensor_dims[1:]]) + all_dtypes.append(str(tensor_desc.dataType)) + + for result_tensor_idx, (dims, dtype) in enumerate(zip(all_dims, all_dtypes)): + # List comprehension concatenation can be faster sometimes + all_bytes = b"".join( + [ + fetch_result.inputs[result_tensor_idx] + for fetch_result in fetch_results + ] + ) + + results.append( + torch.from_numpy( + np.frombuffer( + all_bytes, + dtype=dtype, + ).reshape(dims) ) - else: - result = results[0] - - return TransformInputResult(result, slices) - # return data # note: this fails copy test! 
+ ) + + return TransformInputResult(results, slices) + + # @staticmethod + # def _transform_input( + # batch: InferenceBatch, fetch_results: list[FetchInputResult] + # ) -> TransformInputResult: + # results: list[list[torch.Tensor]] = [] + # start = 0 + # slices: list[slice] = [] + + # for fetch_result in fetch_results: + # partial_result = [] + # if fetch_result.meta is None: + # raise ValueError("Cannot reconstruct tensor without meta information") + # for idx, (item, item_meta) in enumerate( + # zip(fetch_result.inputs, fetch_result.meta) + # ): + # tensor_desc: tensor_capnp.TensorDescriptor = item_meta + # partial_result.append( + # torch.tensor( + # np.frombuffer(item, dtype=str(tensor_desc.dataType)) + # ).reshape(tuple(dim for dim in tensor_desc.dimensions)) + # ) + # if idx == 0: + # num_samples = tensor_desc.dimensions[0] + # slices.append(slice(start, start + num_samples)) + # start = start + num_samples + # results.append(partial_result) + + # result: list[torch.Tensor] = [] + # if len(batch.requests) > 1: + # for t_idx in range(len(results[0])): + # result.append( + # torch.concatenate( + # [partial_result[t_idx] for partial_result in results] + # ) + # ) + # else: + # result = results[0] + + # return TransformInputResult(result, slices) + # return data # note: this fails copy test! # pylint: disable-next=unused-argument @staticmethod @@ -129,10 +187,14 @@ def execute( device = device.replace(old, new) model: torch.nn.Module = load_result.model model.eval() + # print([tensor.shape for tensor in transform_result.transformed]) + # torch.cuda.empty_cache() results = [ model(tensor.to(device)).detach() for tensor in transform_result.transformed ] + transform_result.transformed = [] + execute_result = ExecuteResult(results, transform_result.slices) return execute_result @@ -143,9 +205,8 @@ def transform_output( ) -> list[TransformOutputResult]: transformed_list: list[TransformOutputResult] = [] for result_slice in execute_result.slices: - print(result_slice, flush=True) transformed = [ - item[result_slice].to("cpu").numpy().tobytes() + item[result_slice].cpu().numpy().tobytes() for item in execute_result.predictions ] # todo: need the shape from latest schemas added here. 
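transform_output is the inverse of the batching step above: each recorded slice selects one request's rows from every batched prediction tensor, moves them to the CPU and serializes them to raw bytes. A minimal sketch of that unbatching step (split_outputs is an illustrative name, not part of the worker API):

# Unbatching sketch: one slice per request, applied to every output tensor of
# the batched forward pass, serialized to bytes for the reply.
import torch


def split_outputs(predictions: list[torch.Tensor], slices: list[slice]) -> list[list[bytes]]:
    per_request: list[list[bytes]] = []
    for result_slice in slices:
        per_request.append(
            [tensor[result_slice].cpu().numpy().tobytes() for tensor in predictions]
        )
    return per_request


# One output tensor for a batch of 5 samples, split back into 2 + 3 samples
preds = [torch.arange(10.0).reshape(5, 2)]
parts = split_outputs(preds, [slice(0, 2), slice(2, 5)])
assert len(parts) == 2
assert parts[0][0] == preds[0][:2].numpy().tobytes()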
@@ -153,4 +214,6 @@ def transform_output( TransformOutputResult(transformed, None, "c", "float32") ) # fixme + execute_result.predictions = [] + return transformed_list diff --git a/smartsim/_core/mli/infrastructure/worker/worker.py b/smartsim/_core/mli/infrastructure/worker/worker.py index f0074e474e..a7dc6811da 100644 --- a/smartsim/_core/mli/infrastructure/worker/worker.py +++ b/smartsim/_core/mli/infrastructure/worker/worker.py @@ -151,8 +151,8 @@ def __init__(self, result: bytes) -> None: @dataclass class InferenceBatch: model_key: str - requests: t.Optional[list[InferenceRequest]] - inputs: t.Optional[list[TransformInputResult]] + requests: list[InferenceRequest] + inputs: t.Optional[TransformInputResult] class MachineLearningWorkerCore: From 91ffaee5d802c4a72e801cd7617f4d65ab2bb1b4 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Sun, 21 Jul 2024 18:24:59 -0500 Subject: [PATCH 48/84] New timings API --- smartsim/_core/utils/timings.py | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/smartsim/_core/utils/timings.py b/smartsim/_core/utils/timings.py index 154ebb67b8..c8f6c71003 100644 --- a/smartsim/_core/utils/timings.py +++ b/smartsim/_core/utils/timings.py @@ -37,12 +37,16 @@ class PerfTimer: def __init__( - self, filename: str = "timings", prefix: str = "", timing_on: bool = True, debug: bool = False + self, + filename: str = "timings", + prefix: str = "", + timing_on: bool = True, + debug: bool = False, ): self._start: t.Optional[float] = None self._interm: t.Optional[float] = None self._timings: OrderedDict[str, list[t.Union[float, int, str]]] = OrderedDict() - self._timing_on = True + self._timing_on = timing_on self._filename = filename self._prefix = prefix self._debug = debug @@ -66,9 +70,7 @@ def start_timings( value = self._format_number(first_value) self._log(f"Started timing: {first_label}: {value}") self._add_label_to_timings(mod_label) - self._timings[mod_label].append( - value - ) + self._timings[mod_label].append(value) self._start = time.perf_counter() self._interm = time.perf_counter() @@ -89,6 +91,15 @@ def _get_delta(self) -> float | int: return 0 return time.perf_counter() - self._interm + def get_last(self, label: str) -> str: + mod_label = self._make_label(label) + if mod_label in self._timings: + value = self._timings[mod_label][-1] + if value: + return f"{label}: {value}" + + return "Not measured yet" + def measure_time(self, label: str) -> None: if self._timing_on and self._interm is not None: mod_label = self._make_label(label) From b9e9796f7065501cc77c8aebebcc421a9fac9f00 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Tue, 23 Jul 2024 16:51:32 -0500 Subject: [PATCH 49/84] Pre-cleanup, best results so far --- ex/high_throughput_inference/mli_driver.py | 11 +- ex/high_throughput_inference/mock_app.py | 25 ++- .../standalone_workermanager.py | 4 +- .../_core/launcher/dragon/dragonBackend.py | 5 +- .../_core/mli/comm/channel/dragonchannel.py | 4 +- smartsim/_core/mli/comm/channel/dragonfli.py | 4 +- .../control/requestdispatcher.py | 15 +- .../infrastructure/control/workermanager.py | 41 +++-- .../mli/infrastructure/worker/torch_worker.py | 155 +++++++++--------- .../_core/mli/infrastructure/worker/worker.py | 14 +- 10 files changed, 159 insertions(+), 119 deletions(-) diff --git a/ex/high_throughput_inference/mli_driver.py b/ex/high_throughput_inference/mli_driver.py index c7c5445b8a..1d1642567c 100644 --- a/ex/high_throughput_inference/mli_driver.py +++ b/ex/high_throughput_inference/mli_driver.py @@ -19,7 +19,7 @@ 
os.environ["SMARTSIM_DRAGON_TRANSPORT"] = transport -exp_path = os.path.join(filedir, f"MLI_proto_batch_{transport.upper()}") +exp_path = os.path.join(filedir, f"MLI_proto_{transport.upper()}") os.makedirs(exp_path, exist_ok=True) exp = Experiment("MLI_proto", launcher="dragon", exp_path=exp_path) @@ -35,6 +35,11 @@ torch_worker_str, ], ) +aff = [] +for i in range(32): + aff.append(i) + # aff.append(i+64) +worker_manager_rs.set_cpu_affinity(aff) worker_manager = exp.create_model("worker_manager", run_settings=worker_manager_rs) worker_manager.attach_generator_files(to_copy=[worker_manager_script_name]) @@ -42,7 +47,9 @@ sys.executable, exe_args=[app_script_name, "--device", device], ) -app_rs.set_tasks_per_node(4) +app_rs.set_tasks_per_node(1) + + app = exp.create_model("app", run_settings=app_rs) app.attach_generator_files(to_copy=[app_script_name], to_symlink=[model_name]) diff --git a/ex/high_throughput_inference/mock_app.py b/ex/high_throughput_inference/mock_app.py index e497c1fdee..2a76fdbe9d 100644 --- a/ex/high_throughput_inference/mock_app.py +++ b/ex/high_throughput_inference/mock_app.py @@ -52,6 +52,8 @@ logger = get_logger("App") +CHECK_RESULTS_AND_MAKE_ALL_SLOWER = False + class ProtoClient: def __init__(self, timing_on: bool): comm = MPI.COMM_WORLD @@ -70,8 +72,6 @@ def __init__(self, timing_on: bool): self._from_worker_ch_serialized = self._from_worker_ch.serialize() self._to_worker_ch = Channel.make_process_local() - self._start = None - self._interm = None self._perf_timer: PerfTimer = PerfTimer(debug=False, timing_on=timing_on, prefix=f"a{rank}_") def run_model(self, model: bytes | str, batch: torch.Tensor): @@ -95,10 +95,13 @@ def run_model(self, model: bytes | str, batch: torch.Tensor): self._perf_timer.measure_time("build_request") request_bytes = MessageHandler.serialize_request(request) self._perf_timer.measure_time("serialize_request") + tensor_bytes = [bytes(tensor.data) for tensor in tensors] + # tensor_bytes = [tensor.reshape(-1).view(numpy.uint8).data for tensor in tensors] + self._perf_timer.measure_time("serialize_tensor") with self._to_worker_fli.sendh(timeout=None, stream_channel=self._to_worker_ch) as to_sendh: to_sendh.send_bytes(request_bytes) - for t in tensors: - to_sendh.send_bytes(t.tobytes()) #TODO NOT FAST ENOUGH!!! + for tb in tensor_bytes: + to_sendh.send_bytes(tb) #TODO NOT FAST ENOUGH!!! 
# to_sendh.send_bytes(bytes(t.data)) self._perf_timer.measure_time("send") @@ -158,14 +161,24 @@ def name(self): client = ProtoClient(timing_on=True) client.set_model(resnet.name, resnet.model) + pt_model = torch.jit.load(io.BytesIO(initial_bytes=(resnet.model))).to("cuda:0") + TOTAL_ITERATIONS = 100 - for log2_bsize in range(7): + for log2_bsize in range(8): b_size: int = 2**log2_bsize logger.info(f"Batch size: {b_size}") for iteration_number in range(TOTAL_ITERATIONS + int(b_size==1)): logger.info(f"Iteration: {iteration_number}") - client.run_model(resnet.name, resnet.get_batch(b_size)) + batch = resnet.get_batch(b_size) + remote_result = client.run_model(resnet.name, batch) logger.info(client._perf_timer.get_last("total_time")) + if CHECK_RESULTS_AND_MAKE_ALL_SLOWER: + local_res = pt_model(batch.to("cuda:0")) + err_norm = torch.linalg.vector_norm(torch.flatten(remote_result).to("cuda:0")-torch.flatten(local_res), ord=1).cpu() + res_norm = torch.linalg.vector_norm(remote_result, ord=1).item() + local_res_norm = torch.linalg.vector_norm(local_res, ord=1).item() + logger.info(f"Avg norm of error {err_norm.item()/b_size} compared to result norm of {res_norm/b_size}:{local_res_norm/b_size}") + torch.cuda.synchronize() client.print_timings(to_file=True) \ No newline at end of file diff --git a/ex/high_throughput_inference/standalone_workermanager.py b/ex/high_throughput_inference/standalone_workermanager.py index 8c870d1b95..89f5eedd0d 100644 --- a/ex/high_throughput_inference/standalone_workermanager.py +++ b/ex/high_throughput_inference/standalone_workermanager.py @@ -100,7 +100,7 @@ cooldown=10, comm_channel_type=DragonCommChannel, device = args.device, - batch_size=4, - batch_timeout=0.0005, # 1e-3 is the best with ResNet50 + batch_size=1, + batch_timeout=0.001, # 1e-3 is the best with ResNet50 for bs>32 ) worker_manager.execute() diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py index 445538f20e..344a57bc34 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -501,10 +501,7 @@ def _start_steps(self) -> None: logger.debug(f"Step id {step_id} allocated on {hosts}") - global_policy = dragon_policy.Policy( - placement=dragon_policy.Policy.Placement.HOST_NAME, - host_name=hosts[0], - ) + global_policy = self.create_run_policy(request, hosts[0]) options = dragon_process_desc.ProcessOptions(make_inf_channels=True) grp = dragon_process_group.ProcessGroup( restart=False, pmi_enabled=request.pmi_enabled, policy=global_policy diff --git a/smartsim/_core/mli/comm/channel/dragonchannel.py b/smartsim/_core/mli/comm/channel/dragonchannel.py index 1370c57452..e09f2f628c 100644 --- a/smartsim/_core/mli/comm/channel/dragonchannel.py +++ b/smartsim/_core/mli/comm/channel/dragonchannel.py @@ -51,6 +51,6 @@ def send(self, value: bytes) -> None: def recv(self) -> t.List[bytes]: """Receieve a message through the underlying communication channel :returns: the received message""" - with self._channel.recvh(timeout=0.01) as recvh: - message_bytes: bytes = recvh.recv_bytes(timeout=1) + with self._channel.recvh(timeout=None) as recvh: + message_bytes: bytes = recvh.recv_bytes(timeout=None) return [message_bytes] diff --git a/smartsim/_core/mli/comm/channel/dragonfli.py b/smartsim/_core/mli/comm/channel/dragonfli.py index 3d1ed3a1f6..9f5d628d5f 100644 --- a/smartsim/_core/mli/comm/channel/dragonfli.py +++ b/smartsim/_core/mli/comm/channel/dragonfli.py @@ -62,10 +62,10 @@ def recv(self) -> 
t.List[bytes]: :returns: the received message""" messages = [] eot = False - with self._fli.recvh(timeout=0.01) as recvh: + with self._fli.recvh(timeout=None) as recvh: while not eot: try: - message, _ = recvh.recv_bytes(timeout=1) + message, _ = recvh.recv_bytes(timeout=None) messages.append(message) except fli.FLIEOT: eot = True diff --git a/smartsim/_core/mli/infrastructure/control/requestdispatcher.py b/smartsim/_core/mli/infrastructure/control/requestdispatcher.py index d8ac4e2b7c..018e094e0b 100644 --- a/smartsim/_core/mli/infrastructure/control/requestdispatcher.py +++ b/smartsim/_core/mli/infrastructure/control/requestdispatcher.py @@ -27,6 +27,7 @@ # pylint: disable=import-error # pylint: disable-next=unused-import import dragon +from dragon.managed_memory import MemoryAlloc, MemoryPool from dragon.mpbridge.queues import DragonQueue # pylint: enable=import-error @@ -38,7 +39,6 @@ import time import typing as t import uuid -from concurrent.futures import Future, ThreadPoolExecutor from queue import Empty, Full, Queue from threading import RLock from types import TracebackType @@ -46,7 +46,7 @@ from packaging.version import Version from .....error import SmartSimError -from .....log import ContextThread, get_logger +from .....log import get_logger from ....utils.timings import PerfTimer from ...comm.channel.channel import CommChannelBase from ...comm.channel.dragonchannel import DragonCommChannel @@ -284,6 +284,7 @@ def __init__( self, batch_timeout: float, batch_size: int, + mem_pool: MemoryPool, incoming_channel: t.Optional[CommChannelBase], comm_channel_type: t.Type[CommChannelBase] = DragonCommChannel, feature_store: t.Optional[FeatureStore] = None, @@ -300,8 +301,9 @@ def __init__( self._outgoing_queue: DragonQueue = mp.Queue(maxsize=0) self._feature_store = feature_store self._comm_channel_type = comm_channel_type - self._perf_timer = PerfTimer(prefix="r_", debug=False, timing_on=False) + self._perf_timer = PerfTimer(prefix="r_", debug=False, timing_on=True) self._worker = TorchWorker() + self._mem_pool = mem_pool def _validate_request(self, request: InferenceRequest) -> bool: """Ensure the request can be processed. 
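The FLI receive handle above drains one transmission into a list of byte blobs; elsewhere in this patch the dispatcher takes the first blob as the serialized request and the remainder as raw tensor payloads. A hypothetical helper (split_message is not part of the patch) stating that convention explicitly:

import typing as t

def split_message(bytes_list: t.List[bytes]) -> t.Tuple[bytes, t.List[bytes]]:
    """Separate the serialized request from the raw tensor blobs that follow
    it in a single FLI transmission."""
    if not bytes_list:
        raise ValueError("No request data found")
    # bytes_list[0] feeds MessageHandler.deserialize_request; the tail becomes
    # request.raw_inputs when the request carries input metadata.
    return bytes_list[0], bytes_list[1:]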
@@ -374,6 +376,9 @@ def run(self) -> None: self._perf_timer.end_timings() + if self._perf_timer.max_length == 801: + self._perf_timer.print_timings(True) + @property def task_queue(self) -> DragonQueue: return self._outgoing_queue @@ -426,7 +431,6 @@ def _update_model_version(self, model: Model) -> None: def flush_requests(self) -> None: for queue in self._queues: if queue.ready and queue.acquire(blocking=False): - self._perf_timer.start_timings() self._perf_timer.measure_time("find_queue") try: batch = InferenceBatch( @@ -440,7 +444,7 @@ def flush_requests(self) -> None: ) self._perf_timer.measure_time("fetch_input") transformed_inputs = self._worker.transform_input( - batch=batch, fetch_results=fetch_results + batch=batch, fetch_results=fetch_results, mem_pool=self._mem_pool ) self._perf_timer.measure_time("transform_input") batch.inputs = transformed_inputs @@ -450,4 +454,3 @@ def flush_requests(self) -> None: self._outgoing_queue.put(batch) self._perf_timer.measure_time("put") - self._perf_timer.end_timings() diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py index 65111fe482..4d351f9bff 100644 --- a/smartsim/_core/mli/infrastructure/control/workermanager.py +++ b/smartsim/_core/mli/infrastructure/control/workermanager.py @@ -31,6 +31,7 @@ import dragon.infrastructure.process_desc as dragon_process_desc import dragon.native.process as dragon_process import dragon.native.process_group as dragon_process_group +from dragon.managed_memory import MemoryAlloc, MemoryPool # pylint: enable=import-error @@ -41,6 +42,7 @@ import os import socket import sys +import time import typing as t from .....log import get_logger @@ -170,19 +172,21 @@ def __init__( """Device on which workers need to run""" self._cached_models: dict[str, t.Any] = {} """Dictionary of previously loaded models""" + self._mem_pool = MemoryPool(size=1024**3, fname="wm_mempool", uid=123458) self._request_dispatcher: RequestDispatcher = RequestDispatcher( batch_timeout=batch_timeout, batch_size=batch_size, incoming_channel=self._task_queue, comm_channel_type=comm_channel_type, feature_store=self._feature_store, + mem_pool=self._mem_pool, ) """Dispatcher used to batch requests""" self._device_manager: DeviceManager = DeviceManager( - [WorkerDevice(f"gpu:{idx}") for idx in range(4)] + [WorkerDevice(f"gpu:{idx}") for idx in [3]] ) self._device_idx: int = 0 - self._perf_timer = PerfTimer(prefix="w_", debug=False, timing_on=False) + self._perf_timer = PerfTimer(prefix="w_", debug=False, timing_on=True) try: mp.set_start_method("dragon") @@ -192,17 +196,19 @@ def __init__( self._dispatcher_process = self._create_local_dispatcher_process() def _create_local_dispatcher_process(self) -> dragon_process_group.ProcessGroup: - dispatcher_cpus = 16 + wm_cpus = 0 if sys.platform != "darwin": self_affinity: list[int] = list(os.sched_getaffinity(os.getpid())) - os.sched_setaffinity(os.getpid(), self_affinity[:-dispatcher_cpus]) + wm_cpus = len(self_affinity) // 2 + os.sched_setaffinity(os.getpid(), self_affinity[:wm_cpus]) else: self_affinity: list[int] = [] + disp_affinity = self_affinity[wm_cpus:] global_policy = dragon_policy.Policy( placement=dragon_policy.Policy.Placement.HOST_NAME, host_name=socket.gethostname(), affinity=dragon_policy.Policy.Affinity.SPECIFIC, - cpu_affinity=self_affinity[-dispatcher_cpus:], + cpu_affinity=disp_affinity, ) options = dragon_process_desc.ProcessOptions(make_inf_channels=True) grp = dragon_process_group.ProcessGroup( @@ -212,7 
+218,7 @@ def _create_local_dispatcher_process(self) -> dragon_process_group.ProcessGroup: placement=dragon_policy.Policy.Placement.HOST_NAME, host_name=socket.gethostname(), affinity=dragon_policy.Policy.Affinity.SPECIFIC, - cpu_affinity=self_affinity[-dispatcher_cpus:], + cpu_affinity=disp_affinity, ) tmp_proc = dragon_process.ProcessTemplate( target=self._request_dispatcher.run, @@ -235,13 +241,21 @@ def _on_iteration(self) -> None: """Executes calls to the machine learning worker implementation to complete the inference pipeline""" - batch: InferenceBatch = self._request_dispatcher.task_queue.get() + pre_batch_time = time.perf_counter() + try: + batch: InferenceBatch = self._request_dispatcher.task_queue.get( + timeout=0.001 + ) + except Exception: + return + + self._perf_timer.start_timings( + "flush_requests", time.perf_counter() - pre_batch_time + ) - self._perf_timer.start_timings() if batch is None or 0 == len(batch.requests): return - self._perf_timer.measure_time("flush_requests") device: WorkerDevice = next( self._device_manager.get_free_device( worker=self._worker, @@ -275,14 +289,15 @@ def _on_iteration(self) -> None: self._perf_timer.measure_time("execute") try: - transformed_outputs = self._worker.transform_output(batch, execute_result) + transformed_outputs = self._worker.transform_output( + batch, execute_result, self._perf_timer + ) except Exception as e: for request in batch.requests: exception_handler( e, request.callback, "Failed while transforming the output." ) return - self._perf_timer.measure_time("transform_output") for request, transformed_output in zip(batch.requests, transformed_outputs): reply = InferenceReply() @@ -325,8 +340,8 @@ def _on_iteration(self) -> None: self._perf_timer.end_timings() - # if self._perf_timer.max_length == 4 * 801: - # self._perf_timer.print_timings(True) + if self._perf_timer.max_length == 801: + self._perf_timer.print_timings(True) def _can_shutdown(self) -> bool: """Return true when the criteria to shut down the service are met.""" diff --git a/smartsim/_core/mli/infrastructure/worker/torch_worker.py b/smartsim/_core/mli/infrastructure/worker/torch_worker.py index 52a2698467..0e8273dd56 100644 --- a/smartsim/_core/mli/infrastructure/worker/torch_worker.py +++ b/smartsim/_core/mli/infrastructure/worker/torch_worker.py @@ -25,13 +25,14 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
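The worker manager iteration above switches from a blocking get to a short-timeout poll on the dispatcher queue and charges the wait to the "flush_requests" timing. A stdlib-only sketch of that poll-and-time pattern, with the queue contents and timeout value purely illustrative:

import queue
import time

def poll_once(task_queue: queue.Queue) -> None:
    pre_batch_time = time.perf_counter()
    try:
        # Returning on an empty queue keeps the service loop responsive
        # instead of blocking the manager indefinitely.
        batch = task_queue.get(timeout=0.001)
    except queue.Empty:
        return
    waited = time.perf_counter() - pre_batch_time
    print(f"waited {waited:.6f}s for a batch of {len(batch)} requests")

q: queue.Queue = queue.Queue()
poll_once(q)                  # empty: returns after roughly 1 ms
q.put(["req-0", "req-1"])
poll_once(q)                  # prints the wait time and batch size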
import io -from concurrent.futures import Future, ThreadPoolExecutor import numpy as np import torch +from dragon.managed_memory import MemoryAlloc, MemoryPool from .....error import SmartSimError from .....log import get_logger +from ....utils.timings import PerfTimer from ...mli_schemas.tensor import tensor_capnp from .worker import ( ExecuteResult, @@ -44,8 +45,8 @@ TransformOutputResult, ) -torch.set_num_threads(4) -torch.set_num_interop_threads(2) +torch.set_num_threads(1) +torch.set_num_interop_threads(4) logger = get_logger(__name__) @@ -69,14 +70,17 @@ def load_model( device = device.replace(old, new) buffer = io.BytesIO(initial_bytes=model_bytes) - model = torch.jit.load(buffer, map_location=device) # type: ignore - model.eval() + with torch.no_grad(): + model = torch.jit.load(buffer, map_location=device) # type: ignore + model.eval() result = LoadModelResult(model) return result @staticmethod def transform_input( - batch: InferenceBatch, fetch_results: list[FetchInputResult] + batch: InferenceBatch, + fetch_results: list[FetchInputResult], + mem_pool: MemoryPool, ) -> TransformInputResult: results: list[torch.Tensor] = [] total_samples = 0 @@ -100,7 +104,7 @@ def transform_input( slices.append(slice(total_samples, total_samples + num_samples)) total_samples = total_samples + num_samples - if res_idx == len(fetch_results)-1: + if res_idx == len(fetch_results) - 1: # For each tensor in the last input, get remaining dimensions # Assumptions: all inputs have the same number of tensors and # last N-1 dimensions match across inputs for corresponding tensors @@ -112,65 +116,32 @@ def transform_input( all_dtypes.append(str(tensor_desc.dataType)) for result_tensor_idx, (dims, dtype) in enumerate(zip(all_dims, all_dtypes)): - # List comprehension concatenation can be faster sometimes - all_bytes = b"".join( - [ - fetch_result.inputs[result_tensor_idx] - for fetch_result in fetch_results - ] - ) - - results.append( - torch.from_numpy( - np.frombuffer( - all_bytes, - dtype=dtype, - ).reshape(dims) + itemsize = np.empty((1), dtype=dtype).itemsize + alloc_size = int(np.prod(dims) * itemsize) + try: + mem_alloc = mem_pool.alloc(alloc_size) + mem_view = mem_alloc.get_memview() + mem_view[:alloc_size] = b"".join( + [ + fetch_result.inputs[result_tensor_idx] + for fetch_result in fetch_results + ] ) - ) - - return TransformInputResult(results, slices) - - # @staticmethod - # def _transform_input( - # batch: InferenceBatch, fetch_results: list[FetchInputResult] - # ) -> TransformInputResult: - # results: list[list[torch.Tensor]] = [] - # start = 0 - # slices: list[slice] = [] - - # for fetch_result in fetch_results: - # partial_result = [] - # if fetch_result.meta is None: - # raise ValueError("Cannot reconstruct tensor without meta information") - # for idx, (item, item_meta) in enumerate( - # zip(fetch_result.inputs, fetch_result.meta) - # ): - # tensor_desc: tensor_capnp.TensorDescriptor = item_meta - # partial_result.append( - # torch.tensor( - # np.frombuffer(item, dtype=str(tensor_desc.dataType)) - # ).reshape(tuple(dim for dim in tensor_desc.dimensions)) - # ) - # if idx == 0: - # num_samples = tensor_desc.dimensions[0] - # slices.append(slice(start, start + num_samples)) - # start = start + num_samples - # results.append(partial_result) - - # result: list[torch.Tensor] = [] - # if len(batch.requests) > 1: - # for t_idx in range(len(results[0])): - # result.append( - # torch.concatenate( - # [partial_result[t_idx] for partial_result in results] - # ) - # ) - # else: - # result = results[0] - 
- # return TransformInputResult(result, slices) - # return data # note: this fails copy test! + except Exception as e: + print(e) + raise e + # results.append( + # torch.from_numpy( + # np.frombuffer( + # all_bytes, + # dtype=dtype, + # ).reshape(dims) + # ) + # ) + + results.append(mem_alloc.serialize()) + + return TransformInputResult(results, slices, all_dims) # pylint: disable-next=unused-argument @staticmethod @@ -185,34 +156,60 @@ def execute( device_to_torch = {"cpu": "cpu", "gpu": "cuda"} for old, new in device_to_torch.items(): device = device.replace(old, new) + + tensors = [] + mem_allocs = [] + for transformed, dims in zip( + transform_result.transformed, transform_result.dims + ): + mem_alloc = MemoryAlloc.attach(transformed) + mem_allocs.append(mem_alloc) + tensors.append( + torch.from_numpy( + np.frombuffer( + mem_alloc.get_memview()[0 : np.prod(dims) * 4], dtype=np.float32 + ).reshape(dims) + ) + ) + model: torch.nn.Module = load_result.model - model.eval() - # print([tensor.shape for tensor in transform_result.transformed]) - # torch.cuda.empty_cache() - results = [ - model(tensor.to(device)).detach() for tensor in transform_result.transformed - ] + with torch.no_grad(): + model.eval() + results = [ + model(tensor.to(device, non_blocking=True)).detach() + for tensor in tensors + ] + + torch.cuda.synchronize(3) transform_result.transformed = [] execute_result = ExecuteResult(results, transform_result.slices) + for mem_alloc in mem_allocs: + mem_alloc.free() return execute_result @staticmethod def transform_output( batch: InferenceBatch, execute_result: ExecuteResult, + perf_timer: PerfTimer, ) -> list[TransformOutputResult]: transformed_list: list[TransformOutputResult] = [] + cpu_predictions = [ + prediction.cpu() for prediction in execute_result.predictions + ] + perf_timer.measure_time("to_cpu") for result_slice in execute_result.slices: - transformed = [ - item[result_slice].cpu().numpy().tobytes() - for item in execute_result.predictions - ] - # todo: need the shape from latest schemas added here. - transformed_list.append( - TransformOutputResult(transformed, None, "c", "float32") - ) # fixme + transformed = [] + for cpu_item in cpu_predictions: + transformed.append(cpu_item[result_slice].numpy().tobytes()) + perf_timer.measure_time("serialize_tensor") + + # todo: need the shape from latest schemas added here. 
+ transformed_list.append( + TransformOutputResult(transformed, None, "c", "float32") + ) # fixme execute_result.predictions = [] diff --git a/smartsim/_core/mli/infrastructure/worker/worker.py b/smartsim/_core/mli/infrastructure/worker/worker.py index a7dc6811da..068e47b2fd 100644 --- a/smartsim/_core/mli/infrastructure/worker/worker.py +++ b/smartsim/_core/mli/infrastructure/worker/worker.py @@ -28,8 +28,11 @@ from abc import ABC, abstractmethod from dataclasses import dataclass +from dragon.managed_memory import MemoryAlloc, MemoryPool + from .....error import SmartSimError from .....log import get_logger +from ....utils.timings import PerfTimer from ...comm.channel.channel import CommChannelBase from ...infrastructure.storage.featurestore import FeatureStore from ...mli_schemas.model.model_capnp import Model @@ -95,10 +98,13 @@ def __init__(self, model: t.Any) -> None: class TransformInputResult: """A wrapper around a transformed batchinput""" - def __init__(self, result: t.Any, slices: list[slice]) -> None: + def __init__( + self, result: t.Any, slices: list[slice], dims: list[list[int]] + ) -> None: """Initialize the object""" self.transformed = result self.slices = slices + self.dims = dims class ExecuteResult: @@ -275,7 +281,9 @@ def load_model( @staticmethod @abstractmethod def transform_input( - batch: InferenceBatch, fetch_results: list[FetchInputResult] + batch: InferenceBatch, + fetch_results: list[FetchInputResult], + mem_pool: MemoryPool, ) -> TransformInputResult: """Given a collection of data, perform a transformation on the data :param request: The request that triggered the pipeline @@ -299,7 +307,7 @@ def execute( @staticmethod @abstractmethod def transform_output( - batch: InferenceBatch, execute_result: ExecuteResult + batch: InferenceBatch, execute_result: ExecuteResult, perf_timer: PerfTimer ) -> t.List[TransformOutputResult]: """Given inference results, perform transformations required to transmit results to the requestor. From 8958aa14c28cbea40cfc09f717ee5eef382316c6 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Wed, 24 Jul 2024 17:07:37 -0500 Subject: [PATCH 50/84] Make dispatcher a service and refactor --- ex/high_throughput_inference/mli_driver.py | 3 +- .../standalone_workermanager.py | 166 +++++++++++++++--- .../control/requestdispatcher.py | 97 +++++----- .../infrastructure/control/workermanager.py | 93 ++-------- .../mli/infrastructure/worker/torch_worker.py | 2 + .../_core/mli/infrastructure/worker/worker.py | 7 +- 6 files changed, 220 insertions(+), 148 deletions(-) diff --git a/ex/high_throughput_inference/mli_driver.py b/ex/high_throughput_inference/mli_driver.py index 1d1642567c..effdc567d9 100644 --- a/ex/high_throughput_inference/mli_driver.py +++ b/ex/high_throughput_inference/mli_driver.py @@ -38,7 +38,8 @@ aff = [] for i in range(32): aff.append(i) - # aff.append(i+64) + aff.append(i+64) + worker_manager_rs.set_cpu_affinity(aff) worker_manager = exp.create_model("worker_manager", run_settings=worker_manager_rs) worker_manager.attach_generator_files(to_copy=[worker_manager_script_name]) diff --git a/ex/high_throughput_inference/standalone_workermanager.py b/ex/high_throughput_inference/standalone_workermanager.py index 89f5eedd0d..d26493fa1e 100644 --- a/ex/high_throughput_inference/standalone_workermanager.py +++ b/ex/high_throughput_inference/standalone_workermanager.py @@ -24,34 +24,120 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
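The transform_input changes in the torch worker above size one pooled allocation per batched tensor from np.prod(dims) * itemsize and copy the concatenated per-request payloads into it. A pool-free sketch of the same bookkeeping that swaps the Dragon MemoryPool for a plain bytearray (payload shapes are illustrative; no Dragon APIs are used):

import numpy as np

dims = [8, 3, 224, 224]   # batched dimensions; the first axis is the sample count
dtype = "float32"

itemsize = np.empty((1,), dtype=dtype).itemsize
alloc_size = int(np.prod(dims) * itemsize)

# Stand-in for mem_pool.alloc(alloc_size).get_memview()
buffer = bytearray(alloc_size)

# Two requests contributing 5 and 3 samples respectively.
fetched = [
    np.zeros((5, 3, 224, 224), dtype=dtype),
    np.ones((3, 3, 224, 224), dtype=dtype),
]
buffer[:alloc_size] = b"".join(arr.tobytes() for arr in fetched)

# The executing worker later rebuilds the batched tensor from the shared buffer.
batched = np.frombuffer(buffer, dtype=dtype).reshape(dims)
assert batched.shape[0] == 8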
-# isort: off + import dragon + +# pylint disable=import-error +import dragon.globalservices.pool as dragon_gs_pool +import dragon.infrastructure.policy as dragon_policy +import dragon.infrastructure.process_desc as dragon_process_desc +import dragon.native.process as dragon_process from dragon import fli from dragon.channels import Channel from dragon.data.ddict.ddict import DDict -from dragon.utils import b64decode, b64encode from dragon.globalservices.api_setup import connect_to_infrastructure +from dragon.managed_memory import MemoryPool +from dragon.utils import b64decode, b64encode +# pylint enable=import-error + +# isort: off # isort: on + import argparse import base64 -import cloudpickle -import pickle +import multiprocessing as mp import os +import pickle +import socket +import sys +import time +import typing as t + +import cloudpickle +from smartsim._core.entrypoints.service import Service +from smartsim._core.mli.comm.channel.channel import CommChannelBase from smartsim._core.mli.comm.channel.dragonchannel import DragonCommChannel -from smartsim._core.mli.infrastructure.storage.dragonfeaturestore import DragonFeatureStore from smartsim._core.mli.comm.channel.dragonfli import DragonFLIChannel -from smartsim._core.mli.infrastructure.worker.torch_worker import TorchWorker +from smartsim._core.mli.infrastructure.control.requestdispatcher import ( + RequestDispatcher, +) from smartsim._core.mli.infrastructure.control.workermanager import WorkerManager from smartsim._core.mli.infrastructure.environmentloader import EnvironmentConfigLoader +from smartsim._core.mli.infrastructure.storage.dragonfeaturestore import ( + DragonFeatureStore, +) +from smartsim._core.mli.infrastructure.worker.worker import MachineLearningWorkerBase -import os -import socket -pid = 0 +mp.set_start_method("dragon") + +pid = os.getpid() affinity = os.sched_getaffinity(pid) print("Entry point:", socket.gethostname(), affinity) print("CPUS:", os.cpu_count()) + +def create_request_dispatcher( + batch_size: int, + batch_timeout: float, + comm_channel_type: t.Type[CommChannelBase], + worker_type: t.Type[MachineLearningWorkerBase], + config_loader: EnvironmentConfigLoader, +) -> RequestDispatcher: + mem_pool = MemoryPool.attach(dragon_gs_pool.create(2 * 1024**3).sdesc) + + return RequestDispatcher( + batch_timeout=batch_timeout, + batch_size=batch_size, + config_loader=config_loader, + comm_channel_type=comm_channel_type, + mem_pool=mem_pool, + worker_type=worker_type, + ) + + +def create_worker_manager( + worker_type: t.Type[MachineLearningWorkerBase], + config_loader: EnvironmentConfigLoader, + device: str, + dispatcher_queue: mp.Queue, +) -> WorkerManager: + return WorkerManager( + config_loader=config_loader, + worker_type=worker_type, + as_service=True, + cooldown=10, + comm_channel_type=DragonCommChannel, + device=device, + task_queue=dispatcher_queue, + ) + + +def service_as_dragon_proc( + service: Service, cpu_affinity: list[int], gpu_affinity: list[int] +) -> dragon_process.Process: + + options = dragon_process_desc.ProcessOptions(make_inf_channels=True) + local_policy = dragon_policy.Policy( + placement=dragon_policy.Policy.Placement.HOST_NAME, + host_name=socket.gethostname(), + affinity=dragon_policy.Policy.Affinity.SPECIFIC, + cpu_affinity=cpu_affinity, + gpu_affinity=gpu_affinity, + ) + proc = dragon_process.Process( + target=service.execute, + args=[], + cwd=os.getcwd(), + policy=local_policy, + options=options, + stderr=dragon_process.Popen.PIPE, + stdout=dragon_process.Popen.STDOUT, + ) + + return proc 
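Further down, the __main__ section splits the node's cores between the worker manager and the request dispatcher before handing each service to service_as_dragon_proc. A Linux-only sketch of that partitioning using only the standard library (the 3/4 : 1/4 split mirrors the script; the core list is whatever the node reports):

import os
import sys

wm_affinity: list[int] = []
disp_affinity: list[int] = []

if sys.platform != "darwin":
    # os.sched_getaffinity is available on Linux but not on macOS.
    curr_affinity = sorted(os.sched_getaffinity(os.getpid()))
    wm_cpus = 3 * len(curr_affinity) // 4
    wm_affinity = curr_affinity[:wm_cpus]     # bulk of the cores to the worker manager
    disp_affinity = curr_affinity[wm_cpus:]   # remainder to the dispatcher

print("worker manager cores:", wm_affinity)
print("dispatcher cores:", disp_affinity)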
+ + if __name__ == "__main__": parser = argparse.ArgumentParser("Worker Manager") parser.add_argument( @@ -70,8 +156,20 @@ parser.add_argument( "--num_workers", type=int, default=1, help="Number of workers to run" ) - + parser.add_argument( + "--batch_size", + type=int, + default=1, + help="How many requests the workers will try to aggregate before processing them", + ) + parser.add_argument( + "--batch_timeout", + type=float, + default=0.001, + help="How much time (in seconds) should be waited before processing an incomplete aggregated request", + ) args = parser.parse_args() + connect_to_infrastructure() ddict_str = os.environ["SS_DRG_DDICT"] ddict = DDict.attach(ddict_str) @@ -81,9 +179,9 @@ to_worker_fli_serialized = to_worker_fli.serialize() ddict["to_worker_fli"] = to_worker_fli_serialized - torch_worker = cloudpickle.loads( - base64.b64decode(args.worker_class.encode('ascii')) - )() + arg_worker_type = cloudpickle.loads( + base64.b64decode(args.worker_class.encode("ascii")) + ) dfs = DragonFeatureStore(ddict) comm_channel = DragonFLIChannel(to_worker_fli_serialized) @@ -91,16 +189,38 @@ os.environ["SSFeatureStore"] = base64.b64encode(pickle.dumps(dfs)).decode("utf-8") os.environ["SSQueue"] = base64.b64encode(to_worker_fli_serialized).decode("utf-8") - config_loader = EnvironmentConfigLoader() + ss_config_loader = EnvironmentConfigLoader() - worker_manager = WorkerManager( - config_loader=config_loader, - worker=torch_worker, - as_service=True, - cooldown=10, + dispatcher = create_request_dispatcher( + batch_size=args.batch_size, + batch_timeout=args.batch_timeout, comm_channel_type=DragonCommChannel, - device = args.device, - batch_size=1, - batch_timeout=0.001, # 1e-3 is the best with ResNet50 for bs>32 + worker_type=arg_worker_type, + config_loader=ss_config_loader, ) - worker_manager.execute() + + worker_manager = create_worker_manager( + worker_type=arg_worker_type, + config_loader=ss_config_loader, + device=args.device, + dispatcher_queue=dispatcher.task_queue, + ) + + wm_affinity: list[int] = [] + disp_affinity: list[int] = [] + if sys.platform != "darwin": + curr_affinity: list[int] = list(os.sched_getaffinity(os.getpid())) + wm_cpus = 3 * len(curr_affinity) // 4 + disp_affinity = curr_affinity[wm_cpus:] + wm_affinity = curr_affinity[:wm_cpus] + + dispatcher_proc = service_as_dragon_proc(dispatcher, cpu_affinity=disp_affinity, gpu_affinity=[]) + worker_manager_proc = service_as_dragon_proc( + worker_manager, cpu_affinity=wm_affinity, gpu_affinity=[] + ) + + dispatcher_proc.start() + worker_manager_proc.start() + + while all(proc.is_alive for proc in [dispatcher_proc, worker_manager_proc]): + time.sleep(1) diff --git a/smartsim/_core/mli/infrastructure/control/requestdispatcher.py b/smartsim/_core/mli/infrastructure/control/requestdispatcher.py index 018e094e0b..b5925f70c6 100644 --- a/smartsim/_core/mli/infrastructure/control/requestdispatcher.py +++ b/smartsim/_core/mli/infrastructure/control/requestdispatcher.py @@ -27,7 +27,7 @@ # pylint: disable=import-error # pylint: disable-next=unused-import import dragon -from dragon.managed_memory import MemoryAlloc, MemoryPool +from dragon.managed_memory import MemoryPool from dragon.mpbridge.queues import DragonQueue # pylint: enable=import-error @@ -45,14 +45,19 @@ from packaging.version import Version +from smartsim._core.entrypoints.service import Service from .....error import SmartSimError from .....log import get_logger from ....utils.timings import PerfTimer from ...comm.channel.channel import CommChannelBase from 
...comm.channel.dragonchannel import DragonCommChannel +from ...infrastructure.environmentloader import EnvironmentConfigLoader from ...infrastructure.storage.featurestore import FeatureStore -from ...infrastructure.worker.torch_worker import TorchWorker -from ...infrastructure.worker.worker import InferenceBatch, InferenceRequest +from ...infrastructure.worker.worker import ( + InferenceBatch, + InferenceRequest, + MachineLearningWorkerBase, +) from ...message_handler import MessageHandler from ...mli_schemas.model.model_capnp import Model from ...mli_schemas.response.response_capnp import ResponseBuilder @@ -279,17 +284,17 @@ def empty(self) -> bool: return self.qsize() == 0 -class RequestDispatcher: +class RequestDispatcher(Service): def __init__( self, batch_timeout: float, batch_size: int, mem_pool: MemoryPool, - incoming_channel: t.Optional[CommChannelBase], + config_loader: EnvironmentConfigLoader, + worker_type: t.Type[MachineLearningWorkerBase], comm_channel_type: t.Type[CommChannelBase] = DragonCommChannel, - feature_store: t.Optional[FeatureStore] = None, ) -> None: - mp.set_start_method("dragon") + super().__init__(as_service=True, cooldown=1) self._queues: list[BatchQueue] = [] self._active_queues: dict[str, BatchQueue] = {} self._model_last_version: dict[str, Version] = {} @@ -297,12 +302,16 @@ def __init__( self._batch_timeout = batch_timeout self._batch_size = batch_size self._queue_swap_lock: t.Optional[RLock] = None - self._incoming_channel = incoming_channel + self._incoming_channel = config_loader.get_queue() + """the queue the manager monitors for new tasks""" self._outgoing_queue: DragonQueue = mp.Queue(maxsize=0) - self._feature_store = feature_store + self._feature_store: t.Optional[FeatureStore] = ( + config_loader.get_feature_store() + ) + """a feature store to retrieve models from""" self._comm_channel_type = comm_channel_type self._perf_timer = PerfTimer(prefix="r_", debug=False, timing_on=True) - self._worker = TorchWorker() + self._worker = worker_type() self._mem_pool = mem_pool def _validate_request(self, request: InferenceRequest) -> bool: @@ -336,48 +345,49 @@ def _validate_request(self, request: InferenceRequest) -> bool: return True - def run(self) -> None: + def _on_start(self) -> None: self._queue_swap_lock = RLock() if self._incoming_channel is None: raise SmartSimError("No incoming channel for dispatcher") - while True: - try: - bytes_list: t.List[bytes] = self._incoming_channel.recv() - except Exception: - pass - else: - if not bytes_list: - exception_handler( - ValueError("No request data found"), - None, - "No request data found.", - ) - request_bytes = bytes_list[0] - tensor_bytes_list = bytes_list[1:] - self._perf_timer.start_timings() + def _on_iteration(self) -> None: + try: + bytes_list: t.List[bytes] = self._incoming_channel.recv() + except Exception: + pass + else: + if not bytes_list: + exception_handler( + ValueError("No request data found"), + None, + "No request data found.", + ) - request = deserialize_message(request_bytes, self._comm_channel_type) - if request.input_meta and tensor_bytes_list: - request.raw_inputs = tensor_bytes_list + request_bytes = bytes_list[0] + tensor_bytes_list = bytes_list[1:] + self._perf_timer.start_timings() - self._perf_timer.measure_time("deserialize_message") - if not self._validate_request(request): - continue + request = deserialize_message(request_bytes, self._comm_channel_type) + if request.input_meta and tensor_bytes_list: + request.raw_inputs = tensor_bytes_list - 
self._perf_timer.measure_time("validate_request") - self.dispatch(request) + self._perf_timer.measure_time("deserialize_message") + if not self._validate_request(request): + return - self._perf_timer.measure_time("dispatch") - finally: - self.flush_requests() - # TODO: implement this - # self.remove_queues() + self._perf_timer.measure_time("validate_request") + self.dispatch(request) - self._perf_timer.end_timings() + self._perf_timer.measure_time("dispatch") + finally: + self.flush_requests() + # TODO: implement this + # self.remove_queues() + + self._perf_timer.end_timings() - if self._perf_timer.max_length == 801: - self._perf_timer.print_timings(True) + if self._perf_timer.max_length == 801: + self._perf_timer.print_timings(True) @property def task_queue(self) -> DragonQueue: @@ -454,3 +464,6 @@ def flush_requests(self) -> None: self._outgoing_queue.put(batch) self._perf_timer.measure_time("put") + + def _can_shutdown(self) -> bool: + return False \ No newline at end of file diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py index 4d351f9bff..9626506a41 100644 --- a/smartsim/_core/mli/infrastructure/control/workermanager.py +++ b/smartsim/_core/mli/infrastructure/control/workermanager.py @@ -27,11 +27,6 @@ # pylint: disable=import-error # pylint: disable-next=unused-import import dragon -import dragon.infrastructure.policy as dragon_policy -import dragon.infrastructure.process_desc as dragon_process_desc -import dragon.native.process as dragon_process -import dragon.native.process_group as dragon_process_group -from dragon.managed_memory import MemoryAlloc, MemoryPool # pylint: enable=import-error @@ -39,9 +34,6 @@ # isort: on import multiprocessing as mp -import os -import socket -import sys import time import typing as t @@ -61,7 +53,6 @@ from ...message_handler import MessageHandler from ...mli_schemas.response.response_capnp import ResponseBuilder from .devicemanager import DeviceManager, WorkerDevice -from .requestdispatcher import RequestDispatcher if t.TYPE_CHECKING: from smartsim._core.mli.mli_schemas.model.model_capnp import Model @@ -109,7 +100,6 @@ def build_reply(reply: InferenceReply) -> ResponseBuilder: custom_attributes=None, ) - def exception_handler( exc: Exception, reply_channel: t.Optional[CommChannelBase], failure_message: str ) -> None: @@ -139,13 +129,12 @@ class WorkerManager(Service): def __init__( self, config_loader: EnvironmentConfigLoader, - worker: MachineLearningWorkerBase, + worker_type: t.Type[MachineLearningWorkerBase], + task_queue: "mp.Queue[InferenceBatch]", as_service: bool = False, cooldown: int = 0, comm_channel_type: t.Type[CommChannelBase] = DragonCommChannel, device: t.Literal["cpu", "gpu"] = "cpu", - batch_timeout: float = 0.0, - batch_size: int = 1, ) -> None: """Initialize the WorkerManager :param config_loader: Environment config loader that loads the task queue and @@ -158,84 +147,26 @@ def __init__( """ super().__init__(as_service, cooldown) - self._task_queue: t.Optional[CommChannelBase] = config_loader.get_queue() + self._task_queue = task_queue """the queue the manager monitors for new tasks""" self._feature_store: t.Optional[FeatureStore] = ( config_loader.get_feature_store() ) """a feature store to retrieve models from""" - self._worker = worker + self._worker = worker_type() """The ML Worker implementation""" self._comm_channel_type = comm_channel_type """The type of communication channel to construct for callbacks""" self._device = device """Device on 
which workers need to run""" - self._cached_models: dict[str, t.Any] = {} - """Dictionary of previously loaded models""" - self._mem_pool = MemoryPool(size=1024**3, fname="wm_mempool", uid=123458) - self._request_dispatcher: RequestDispatcher = RequestDispatcher( - batch_timeout=batch_timeout, - batch_size=batch_size, - incoming_channel=self._task_queue, - comm_channel_type=comm_channel_type, - feature_store=self._feature_store, - mem_pool=self._mem_pool, - ) - """Dispatcher used to batch requests""" - self._device_manager: DeviceManager = DeviceManager( - [WorkerDevice(f"gpu:{idx}") for idx in [3]] - ) - self._device_idx: int = 0 - self._perf_timer = PerfTimer(prefix="w_", debug=False, timing_on=True) - try: - mp.set_start_method("dragon") - except RuntimeError: - pass - - self._dispatcher_process = self._create_local_dispatcher_process() - - def _create_local_dispatcher_process(self) -> dragon_process_group.ProcessGroup: - wm_cpus = 0 - if sys.platform != "darwin": - self_affinity: list[int] = list(os.sched_getaffinity(os.getpid())) - wm_cpus = len(self_affinity) // 2 - os.sched_setaffinity(os.getpid(), self_affinity[:wm_cpus]) - else: - self_affinity: list[int] = [] - disp_affinity = self_affinity[wm_cpus:] - global_policy = dragon_policy.Policy( - placement=dragon_policy.Policy.Placement.HOST_NAME, - host_name=socket.gethostname(), - affinity=dragon_policy.Policy.Affinity.SPECIFIC, - cpu_affinity=disp_affinity, - ) - options = dragon_process_desc.ProcessOptions(make_inf_channels=True) - grp = dragon_process_group.ProcessGroup( - restart=False, pmi_enabled=True, policy=global_policy - ) - local_policy = dragon_policy.Policy( - placement=dragon_policy.Policy.Placement.HOST_NAME, - host_name=socket.gethostname(), - affinity=dragon_policy.Policy.Affinity.SPECIFIC, - cpu_affinity=disp_affinity, - ) - tmp_proc = dragon_process.ProcessTemplate( - target=self._request_dispatcher.run, - args=[], - cwd=os.getcwd(), - policy=local_policy, - options=options, - ) - grp.add_process(nproc=1, template=tmp_proc) - grp.init() - return grp + self._device_manager: t.Optional[DeviceManager] = None + self._perf_timer = PerfTimer(prefix="w_", debug=False, timing_on=True) def _on_start(self) -> None: - self._dispatcher_process.start() - - def _on_shutdown(self) -> None: - self._dispatcher_process.join() + self._device_manager = DeviceManager( + [WorkerDevice(f"gpu:{idx}") for idx in [3]] + ) def _on_iteration(self) -> None: """Executes calls to the machine learning worker implementation to complete @@ -243,9 +174,7 @@ def _on_iteration(self) -> None: pre_batch_time = time.perf_counter() try: - batch: InferenceBatch = self._request_dispatcher.task_queue.get( - timeout=0.001 - ) + batch: InferenceBatch = self._task_queue.get(timeout=0.0001) except Exception: return @@ -256,6 +185,8 @@ def _on_iteration(self) -> None: if batch is None or 0 == len(batch.requests): return + if self._device_manager is None: + raise ValueError("No Device Manager available: did you call _on_start()") device: WorkerDevice = next( self._device_manager.get_free_device( worker=self._worker, diff --git a/smartsim/_core/mli/infrastructure/worker/torch_worker.py b/smartsim/_core/mli/infrastructure/worker/torch_worker.py index 0e8273dd56..37b8b7e843 100644 --- a/smartsim/_core/mli/infrastructure/worker/torch_worker.py +++ b/smartsim/_core/mli/infrastructure/worker/torch_worker.py @@ -28,6 +28,8 @@ import numpy as np import torch + +# pylint: disable=import-error from dragon.managed_memory import MemoryAlloc, MemoryPool from .....error import 
SmartSimError diff --git a/smartsim/_core/mli/infrastructure/worker/worker.py b/smartsim/_core/mli/infrastructure/worker/worker.py index 068e47b2fd..bc96633204 100644 --- a/smartsim/_core/mli/infrastructure/worker/worker.py +++ b/smartsim/_core/mli/infrastructure/worker/worker.py @@ -24,11 +24,16 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# pylint: disable=import-error +from dragon.managed_memory import MemoryPool + +# isort: off +# isort: on + import typing as t from abc import ABC, abstractmethod from dataclasses import dataclass -from dragon.managed_memory import MemoryAlloc, MemoryPool from .....error import SmartSimError from .....log import get_logger From 79eb936ba79168604499b60fd8781d1cf6fafede Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Wed, 24 Jul 2024 18:03:16 -0500 Subject: [PATCH 51/84] Fixes for batched requests --- ex/high_throughput_inference/mli_driver.py | 15 ++++++++++----- ex/high_throughput_inference/mock_app.py | 3 ++- smartsim/_core/mli/comm/channel/dragonfli.py | 2 +- .../infrastructure/control/requestdispatcher.py | 3 ++- .../mli/infrastructure/control/workermanager.py | 2 +- smartsim/_core/utils/timings.py | 6 +++++- 6 files changed, 21 insertions(+), 10 deletions(-) diff --git a/ex/high_throughput_inference/mli_driver.py b/ex/high_throughput_inference/mli_driver.py index effdc567d9..c965a6d9a6 100644 --- a/ex/high_throughput_inference/mli_driver.py +++ b/ex/high_throughput_inference/mli_driver.py @@ -9,11 +9,12 @@ import time import typing as t -device = "gpu" +DEVICE = "gpu" +NUM_RANKS = 4 filedir = os.path.dirname(__file__) worker_manager_script_name = os.path.join(filedir, "standalone_workermanager.py") app_script_name = os.path.join(filedir, "mock_app.py") -model_name = os.path.join(filedir, f"resnet50.{device.upper()}.pt") +model_name = os.path.join(filedir, f"resnet50.{DEVICE.upper()}.pt") transport: t.Literal["hsta", "tcp"] = "hsta" @@ -30,9 +31,13 @@ [ worker_manager_script_name, "--device", - device, + DEVICE, "--worker_class", torch_worker_str, + "--batch_size", + str(NUM_RANKS), + "--batch_timeout", + str(0.001), ], ) aff = [] @@ -46,9 +51,9 @@ app_rs: DragonRunSettings = exp.create_run_settings( sys.executable, - exe_args=[app_script_name, "--device", device], + exe_args=[app_script_name, "--device", DEVICE, "--log_max_batchsize", str(7)], ) -app_rs.set_tasks_per_node(1) +app_rs.set_tasks_per_node(NUM_RANKS) app = exp.create_model("app", run_settings=app_rs) diff --git a/ex/high_throughput_inference/mock_app.py b/ex/high_throughput_inference/mock_app.py index 2a76fdbe9d..28b8a3d98a 100644 --- a/ex/high_throughput_inference/mock_app.py +++ b/ex/high_throughput_inference/mock_app.py @@ -154,6 +154,7 @@ def name(self): parser = argparse.ArgumentParser("Mock application") parser.add_argument("--device", default="cpu") + parser.add_argument("--log_max_batchsize", default=8, type=int) args = parser.parse_args() resnet = ResNetWrapper("resnet50", f"resnet50.{args.device.upper()}.pt") @@ -165,7 +166,7 @@ def name(self): TOTAL_ITERATIONS = 100 - for log2_bsize in range(8): + for log2_bsize in range(args.log_max_batchsize): b_size: int = 2**log2_bsize logger.info(f"Batch size: {b_size}") for iteration_number in range(TOTAL_ITERATIONS + int(b_size==1)): diff --git a/smartsim/_core/mli/comm/channel/dragonfli.py b/smartsim/_core/mli/comm/channel/dragonfli.py index 9f5d628d5f..555d9104d9 100644 --- a/smartsim/_core/mli/comm/channel/dragonfli.py +++ 
b/smartsim/_core/mli/comm/channel/dragonfli.py @@ -62,7 +62,7 @@ def recv(self) -> t.List[bytes]: :returns: the received message""" messages = [] eot = False - with self._fli.recvh(timeout=None) as recvh: + with self._fli.recvh(timeout=0.001) as recvh: while not eot: try: message, _ = recvh.recv_bytes(timeout=None) diff --git a/smartsim/_core/mli/infrastructure/control/requestdispatcher.py b/smartsim/_core/mli/infrastructure/control/requestdispatcher.py index b5925f70c6..9b3d8cabdb 100644 --- a/smartsim/_core/mli/infrastructure/control/requestdispatcher.py +++ b/smartsim/_core/mli/infrastructure/control/requestdispatcher.py @@ -253,6 +253,7 @@ def ready(self) -> bool: return False return self.full() or (self._waited_time >= self._batch_timeout) + def make_disposable(self) -> None: self._disposable = True @@ -310,7 +311,7 @@ def __init__( ) """a feature store to retrieve models from""" self._comm_channel_type = comm_channel_type - self._perf_timer = PerfTimer(prefix="r_", debug=False, timing_on=True) + self._perf_timer = PerfTimer(prefix="r_", debug=False, timing_on=False) self._worker = worker_type() self._mem_pool = mem_pool diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py index 9626506a41..0ee146ef1b 100644 --- a/smartsim/_core/mli/infrastructure/control/workermanager.py +++ b/smartsim/_core/mli/infrastructure/control/workermanager.py @@ -161,7 +161,7 @@ def __init__( """Device on which workers need to run""" self._device_manager: t.Optional[DeviceManager] = None - self._perf_timer = PerfTimer(prefix="w_", debug=False, timing_on=True) + self._perf_timer = PerfTimer(prefix="w_", debug=False, timing_on=False) def _on_start(self) -> None: self._device_manager = DeviceManager( diff --git a/smartsim/_core/utils/timings.py b/smartsim/_core/utils/timings.py index c8f6c71003..286bd4f4a8 100644 --- a/smartsim/_core/utils/timings.py +++ b/smartsim/_core/utils/timings.py @@ -121,7 +121,11 @@ def max_length(self) -> int: def print_timings(self, to_file: bool = False) -> None: print(" ".join(self._timings.keys())) - value_array = np.array(list(self._timings.values()), dtype=float) + try: + value_array = np.array(list(self._timings.values()), dtype=float) + except Exception as e: + logger.exception(e) + return value_array = np.transpose(value_array) if self._debug: for i in range(value_array.shape[0]): From 8759e9f56468b984a934e77df787959663edfd9b Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Thu, 25 Jul 2024 16:18:53 -0500 Subject: [PATCH 52/84] Pre-PR --- ex/high_throughput_inference/mli_driver.py | 14 +-- ex/high_throughput_inference/mock_app.py | 7 +- .../standalone_workermanager.py | 57 +++++++---- .../control/requestdispatcher.py | 99 ++++++++++++++----- .../infrastructure/control/workermanager.py | 23 +++-- .../mli/infrastructure/worker/torch_worker.py | 34 ++----- .../_core/mli/infrastructure/worker/worker.py | 25 +++-- 7 files changed, 162 insertions(+), 97 deletions(-) diff --git a/ex/high_throughput_inference/mli_driver.py b/ex/high_throughput_inference/mli_driver.py index c965a6d9a6..6d852ec6c3 100644 --- a/ex/high_throughput_inference/mli_driver.py +++ b/ex/high_throughput_inference/mli_driver.py @@ -10,7 +10,8 @@ import typing as t DEVICE = "gpu" -NUM_RANKS = 4 +NUM_RANKS = 1 +NUM_WORKERS = 1 filedir = os.path.dirname(__file__) worker_manager_script_name = os.path.join(filedir, "standalone_workermanager.py") app_script_name = os.path.join(filedir, "mock_app.py") @@ -35,17 +36,18 @@ "--worker_class", 
torch_worker_str, "--batch_size", - str(NUM_RANKS), + str(NUM_RANKS//NUM_WORKERS), "--batch_timeout", - str(0.001), + str(0.002), + "--num_workers", + str(NUM_WORKERS) ], ) + aff = [] -for i in range(32): - aff.append(i) - aff.append(i+64) worker_manager_rs.set_cpu_affinity(aff) + worker_manager = exp.create_model("worker_manager", run_settings=worker_manager_rs) worker_manager.attach_generator_files(to_copy=[worker_manager_script_name]) diff --git a/ex/high_throughput_inference/mock_app.py b/ex/high_throughput_inference/mock_app.py index 28b8a3d98a..2440aa87c4 100644 --- a/ex/high_throughput_inference/mock_app.py +++ b/ex/high_throughput_inference/mock_app.py @@ -107,11 +107,12 @@ def run_model(self, model: bytes | str, batch: torch.Tensor): self._perf_timer.measure_time("send") with self._from_worker_ch.recvh(timeout=None) as from_recvh: resp = from_recvh.recv_bytes(timeout=None) - self._perf_timer.measure_time("receive") + self._perf_timer.measure_time("receive_response") response = MessageHandler.deserialize_response(resp) self._perf_timer.measure_time("deserialize_response") # list of data blobs? recv depending on the len(response.result.descriptors)? - data_blob = from_recvh.recv_bytes(timeout=None) + data_blob: bytes = from_recvh.recv_bytes(timeout=None) + self._perf_timer.measure_time("receive_tensor") result = torch.from_numpy( numpy.frombuffer( data_blob, @@ -166,7 +167,7 @@ def name(self): TOTAL_ITERATIONS = 100 - for log2_bsize in range(args.log_max_batchsize): + for log2_bsize in range(args.log_max_batchsize+1): b_size: int = 2**log2_bsize logger.info(f"Batch size: {b_size}") for iteration_number in range(TOTAL_ITERATIONS + int(b_size==1)): diff --git a/ex/high_throughput_inference/standalone_workermanager.py b/ex/high_throughput_inference/standalone_workermanager.py index d26493fa1e..72e2bd20f0 100644 --- a/ex/high_throughput_inference/standalone_workermanager.py +++ b/ex/high_throughput_inference/standalone_workermanager.py @@ -125,17 +125,17 @@ def service_as_dragon_proc( cpu_affinity=cpu_affinity, gpu_affinity=gpu_affinity, ) - proc = dragon_process.Process( + return dragon_process.Process( target=service.execute, args=[], cwd=os.getcwd(), policy=local_policy, options=options, - stderr=dragon_process.Popen.PIPE, + stderr=dragon_process.Popen.STDOUT, stdout=dragon_process.Popen.STDOUT, ) - return proc + if __name__ == "__main__": @@ -199,28 +199,45 @@ def service_as_dragon_proc( config_loader=ss_config_loader, ) - worker_manager = create_worker_manager( - worker_type=arg_worker_type, - config_loader=ss_config_loader, - device=args.device, - dispatcher_queue=dispatcher.task_queue, - ) + wms = [] + worker_device = args.device + for wm_idx in range(args.num_workers): + # if args.num_workers > 0: + # worker_device = f"{args.device}:{wm_idx}" + worker_manager = create_worker_manager( + worker_type=arg_worker_type, + config_loader=ss_config_loader, + device=worker_device, + dispatcher_queue=dispatcher.task_queue, + ) + wms.append(worker_manager) wm_affinity: list[int] = [] disp_affinity: list[int] = [] - if sys.platform != "darwin": - curr_affinity: list[int] = list(os.sched_getaffinity(os.getpid())) - wm_cpus = 3 * len(curr_affinity) // 4 - disp_affinity = curr_affinity[wm_cpus:] - wm_affinity = curr_affinity[:wm_cpus] + + # This is hardcoded for a specific type of node! 
+ gpu_to_cpu_aff: dict[int, list[int]] = {} + gpu_to_cpu_aff[0] = list(range(48,64)) + list(range(112,128)) + gpu_to_cpu_aff[1] = list(range(32,48)) + list(range(96,112)) + gpu_to_cpu_aff[2] = list(range(16,32)) + list(range(80,96)) + gpu_to_cpu_aff[3] = list(range(0,16)) + list(range(64,80)) + + worker_manager_procs = [] + for worker_idx in range(args.num_workers): + wm_cpus = len(gpu_to_cpu_aff[worker_idx]) - 4 + wm_affinity = gpu_to_cpu_aff[worker_idx][:wm_cpus] + disp_affinity.extend(gpu_to_cpu_aff[worker_idx][wm_cpus:]) + worker_manager_procs.append(service_as_dragon_proc( + worker_manager, cpu_affinity=wm_affinity, gpu_affinity=[worker_idx] + )) dispatcher_proc = service_as_dragon_proc(dispatcher, cpu_affinity=disp_affinity, gpu_affinity=[]) - worker_manager_proc = service_as_dragon_proc( - worker_manager, cpu_affinity=wm_affinity, gpu_affinity=[] - ) - dispatcher_proc.start() - worker_manager_proc.start() + # TODO: use ProcessGroup and restart=True? + all_procs = [dispatcher_proc, *worker_manager_procs] + + for proc in all_procs: + proc.start() - while all(proc.is_alive for proc in [dispatcher_proc, worker_manager_proc]): + while all(proc.is_alive for proc in all_procs): time.sleep(1) diff --git a/smartsim/_core/mli/infrastructure/control/requestdispatcher.py b/smartsim/_core/mli/infrastructure/control/requestdispatcher.py index 9b3d8cabdb..6a6f811fc9 100644 --- a/smartsim/_core/mli/infrastructure/control/requestdispatcher.py +++ b/smartsim/_core/mli/infrastructure/control/requestdispatcher.py @@ -43,9 +43,8 @@ from threading import RLock from types import TracebackType -from packaging.version import Version - from smartsim._core.entrypoints.service import Service + from .....error import SmartSimError from .....log import get_logger from ....utils.timings import PerfTimer @@ -190,23 +189,46 @@ def __exit__( class BatchQueue(Queue[InferenceRequest]): def __init__(self, batch_timeout: float, batch_size: int, model_key: str) -> None: + """Queue used to store inference requests waiting to be batched and + sent to Worker Managers. + :param batch_timeout: Time in seconds that has to be waited before flushing a + non-full queue. The time of the firt item put is 0 seconds. + :param batch_size: Total capacity of the queue. + :param model_key: Key of the model which needs to be executed on the queued + requests + """ super().__init__(maxsize=batch_size) self._batch_timeout = batch_timeout + """Time in seconds that has to be waited before flushing a non-full queue. + The time of the firt item put is 0 seconds.""" self._batch_size = batch_size + """Total capacity of the queue.""" self._first_put: t.Optional[float] = None + """Time at which the first item was put on the queue""" self._disposable = False + """Whether the queue will not be used again and can be deleted. 
+ A disposable queue is always full.""" self._model_key = model_key + """Key of the model which needs to be executed on the queued requets""" self._flush_lock = RLock() + """Lock used to make sure only one process can flush the queue (unused now)""" self._id = str(uuid.uuid4()) + """Id of queue""" @property def queue_id(self) -> str: + """ID of this queue""" return self._id def acquire(self, blocking: bool = True, timeout: float = -1) -> t.Optional[bool]: + """Acquire queue lock to flush + :param blocking: whether to block on lock acquisition + :param timeout: Time to wait if blocking, before raising exception + """ return self._flush_lock.acquire(blocking=blocking, timeout=timeout) def release(self) -> None: + """Release queue lock""" self._flush_lock.release() def __enter__(self) -> None: @@ -222,6 +244,7 @@ def __exit__( @property def model_key(self) -> str: + """Key of the model which needs to be run on the queued requests""" return self._model_key def put( @@ -230,6 +253,11 @@ def put( block: bool = False, timeout: t.Optional[float] = 0.0, ) -> None: + """Put an inference request in the queue + :param item: The request + :param block: Whether to block when trying to put the item + :param timeout: Time to wait if block==True + """ if not self.acquire(blocking=False): raise Full try: @@ -249,19 +277,24 @@ def _waited_time(self) -> float: @property def ready(self) -> bool: + """True if the queue can be flushed""" if self.empty(): return False return self.full() or (self._waited_time >= self._batch_timeout) - def make_disposable(self) -> None: + """Set this queue as disposable, and never use it again after it gets flushed""" self._disposable = True @property def disposable(self) -> bool: + """Whether this queue can be used to put items or should be deleted""" return self.empty() and self._disposable def flush(self) -> list[t.Any]: + """Get all requests from queue + :return: Requests waiting to be executed + """ num_items = self.qsize() self._first_put = None items = [] @@ -275,6 +308,7 @@ def flush(self) -> list[t.Any]: return items def full(self) -> bool: + """Return True if the queue has reached its maximum capacity""" if self._disposable: return True if self._batch_size <= 0: @@ -282,6 +316,7 @@ def full(self) -> bool: return self.qsize() >= self._batch_size def empty(self) -> bool: + """Return True if the queue has 0 elements""" return self.qsize() == 0 @@ -295,25 +330,46 @@ def __init__( worker_type: t.Type[MachineLearningWorkerBase], comm_channel_type: t.Type[CommChannelBase] = DragonCommChannel, ) -> None: - super().__init__(as_service=True, cooldown=1) + """The RquestDispatcher intercepts inference requests, stages them in + queues and batches them together before making them available to Worker + Managers. + :param batch_timeout: Time in seconds that has to be waited before flushing a + non-full queue after having put at least one item on it. + :param batch_size: Total capacity of each batch queue. 
+ :param mem_pool: Memory pool used to share batched input tensors with worker + managers + :param config_loader: Object to load configuration from environment + :param worker_type: Type of worker to instantiate to batch inputs + :param comm_channel_type: Type of channel used to get requests + """ + super().__init__(as_service=False, cooldown=1) self._queues: list[BatchQueue] = [] + """All batch queues""" self._active_queues: dict[str, BatchQueue] = {} - self._model_last_version: dict[str, Version] = {} - self._model_name_to_key: dict[str, str] = {} + """Mapping telling which queue is the recipient of requets for a given model + key""" self._batch_timeout = batch_timeout + """Time in seconds that has to be waited before flushing a non-full queue""" self._batch_size = batch_size + """Total capacity of each batch queue.""" self._queue_swap_lock: t.Optional[RLock] = None + """Lock used to swap the active queue for a key""" self._incoming_channel = config_loader.get_queue() - """the queue the manager monitors for new tasks""" + """The channel the dispatcher monitors for new tasks""" self._outgoing_queue: DragonQueue = mp.Queue(maxsize=0) + """The queue on which batched inference requests are placed""" self._feature_store: t.Optional[FeatureStore] = ( config_loader.get_feature_store() ) - """a feature store to retrieve models from""" + """A feature store to retrieve models from""" self._comm_channel_type = comm_channel_type - self._perf_timer = PerfTimer(prefix="r_", debug=False, timing_on=False) + """The type of the channel used to receive requests""" self._worker = worker_type() + """The worker used to batch inputs""" self._mem_pool = mem_pool + """Memory pool used to share batched input tensors with the Worker Managers""" + self._perf_timer = PerfTimer(prefix="r_", debug=False, timing_on=False) + """Performance timer""" def _validate_request(self, request: InferenceRequest) -> bool: """Ensure the request can be processed. 
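The batching rule documented above reduces to: a queue becomes ready once it is full or once batch_timeout seconds have passed since its first put, and flushing drains it and resets the clock. A simplified, stdlib-only model of that rule, with locking and model keys deliberately left out:

import time
import typing as t
from queue import Empty, Queue


class MiniBatchQueue(Queue):
    """Reduced stand-in for BatchQueue: capacity- or timeout-driven readiness."""

    def __init__(self, batch_timeout: float, batch_size: int) -> None:
        super().__init__(maxsize=batch_size)
        self._batch_timeout = batch_timeout
        self._first_put: t.Optional[float] = None

    def put(self, item: t.Any, block: bool = False, timeout: t.Optional[float] = 0.0) -> None:
        super().put(item, block=block, timeout=timeout)
        if self._first_put is None:
            self._first_put = time.time()

    @property
    def ready(self) -> bool:
        if self.empty():
            return False
        waited = 0.0 if self._first_put is None else time.time() - self._first_put
        return self.full() or waited >= self._batch_timeout

    def flush(self) -> list:
        """Drain everything currently queued and reset the timeout clock."""
        self._first_put = None
        items = []
        while True:
            try:
                items.append(self.get_nowait())
            except Empty:
                break
        return items


q = MiniBatchQueue(batch_timeout=0.002, batch_size=4)
q.put("request-1")
time.sleep(0.003)
assert q.ready          # a partial batch that has timed out is ready
print(q.flush())        # ['request-1']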
@@ -348,10 +404,12 @@ def _validate_request(self, request: InferenceRequest) -> bool: def _on_start(self) -> None: self._queue_swap_lock = RLock() + + def _on_iteration(self) -> None: + if self._incoming_channel is None: raise SmartSimError("No incoming channel for dispatcher") - def _on_iteration(self) -> None: try: bytes_list: t.List[bytes] = self._incoming_channel.recv() except Exception: @@ -392,11 +450,12 @@ def _on_iteration(self) -> None: @property def task_queue(self) -> DragonQueue: + """The queue on which batched requests are placed""" return self._outgoing_queue def _swap_queue(self, model_key: str) -> None: if self._queue_swap_lock is None: - raise SmartSimError("Queue was not locked") + raise SmartSimError("Queues were not locked") with self._queue_swap_lock: for queue in self._queues: if queue.model_key == model_key and not queue.full(): @@ -409,6 +468,9 @@ def _swap_queue(self, model_key: str) -> None: return def dispatch(self, request: InferenceRequest) -> None: + """Assign a request to a batch queue + :param request: the request to place + """ if request.raw_model is not None: logger.info("Direct inference requested, creating tmp queue") tmp_id = f"_tmp_{str(uuid.uuid4())}" @@ -429,17 +491,10 @@ def dispatch(self, request: InferenceRequest) -> None: except (Full, KeyError): self._swap_queue(request.model_key) - def _update_model_version(self, model: Model) -> None: - if not model.version: - return - if ( - model.name not in self._model_last_version - or Version(model.version) > self._model_last_version[model.name] - ): - self._model_last_version[model.name] = Version(model.version) - return - def flush_requests(self) -> None: + """Get all requests from queues which are ready to be flushed. Place all + aviable request batches in the outgoing queue. + """ for queue in self._queues: if queue.ready and queue.acquire(blocking=False): self._perf_timer.measure_time("find_queue") @@ -467,4 +522,4 @@ def flush_requests(self) -> None: self._perf_timer.measure_time("put") def _can_shutdown(self) -> bool: - return False \ No newline at end of file + return False diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py index 0ee146ef1b..74b36c3454 100644 --- a/smartsim/_core/mli/infrastructure/control/workermanager.py +++ b/smartsim/_core/mli/infrastructure/control/workermanager.py @@ -100,6 +100,7 @@ def build_reply(reply: InferenceReply) -> ResponseBuilder: custom_attributes=None, ) + def exception_handler( exc: Exception, reply_channel: t.Optional[CommChannelBase], failure_message: str ) -> None: @@ -139,20 +140,23 @@ def __init__( """Initialize the WorkerManager :param config_loader: Environment config loader that loads the task queue and feature store - :param workers: A worker to manage + :param worker_type: The type of worker to manage + :param task_queue: Queue from witch the batched requests have to be pulled :param as_service: Specifies run-once or run-until-complete behavior of service :param cooldown: Number of seconds to wait before shutting down after shutdown criteria are met :param comm_channel_type: The type of communication channel used for callbacks + :param device: The device on which the Worker should run. Every worker manager + is assigned one single GPU (if available), thus the device should have no index. 
""" super().__init__(as_service, cooldown) self._task_queue = task_queue - """the queue the manager monitors for new tasks""" + """The dispatcher queue the manager monitors for new tasks""" self._feature_store: t.Optional[FeatureStore] = ( config_loader.get_feature_store() ) - """a feature store to retrieve models from""" + """A feature store to retrieve models from""" self._worker = worker_type() """The ML Worker implementation""" self._comm_channel_type = comm_channel_type @@ -161,13 +165,14 @@ def __init__( """Device on which workers need to run""" self._device_manager: t.Optional[DeviceManager] = None - self._perf_timer = PerfTimer(prefix="w_", debug=False, timing_on=False) + """Object responsible for model caching and device access""" + self._perf_timer = PerfTimer(prefix="w_", debug=False, timing_on=True) + """Performance timer""" def _on_start(self) -> None: - self._device_manager = DeviceManager( - [WorkerDevice(f"gpu:{idx}") for idx in [3]] - ) + self._device_manager = DeviceManager([WorkerDevice(self._device)]) + # pylint: disable-next=too-many-statements def _on_iteration(self) -> None: """Executes calls to the machine learning worker implementation to complete the inference pipeline""" @@ -220,9 +225,7 @@ def _on_iteration(self) -> None: self._perf_timer.measure_time("execute") try: - transformed_outputs = self._worker.transform_output( - batch, execute_result, self._perf_timer - ) + transformed_outputs = self._worker.transform_output(batch, execute_result) except Exception as e: for request in batch.requests: exception_handler( diff --git a/smartsim/_core/mli/infrastructure/worker/torch_worker.py b/smartsim/_core/mli/infrastructure/worker/torch_worker.py index 37b8b7e843..6723573cfb 100644 --- a/smartsim/_core/mli/infrastructure/worker/torch_worker.py +++ b/smartsim/_core/mli/infrastructure/worker/torch_worker.py @@ -34,7 +34,6 @@ from .....error import SmartSimError from .....log import get_logger -from ....utils.timings import PerfTimer from ...mli_schemas.tensor import tensor_capnp from .worker import ( ExecuteResult, @@ -120,26 +119,14 @@ def transform_input( for result_tensor_idx, (dims, dtype) in enumerate(zip(all_dims, all_dtypes)): itemsize = np.empty((1), dtype=dtype).itemsize alloc_size = int(np.prod(dims) * itemsize) - try: - mem_alloc = mem_pool.alloc(alloc_size) - mem_view = mem_alloc.get_memview() - mem_view[:alloc_size] = b"".join( - [ - fetch_result.inputs[result_tensor_idx] - for fetch_result in fetch_results - ] - ) - except Exception as e: - print(e) - raise e - # results.append( - # torch.from_numpy( - # np.frombuffer( - # all_bytes, - # dtype=dtype, - # ).reshape(dims) - # ) - # ) + mem_alloc = mem_pool.alloc(alloc_size) + mem_view = mem_alloc.get_memview() + mem_view[:alloc_size] = b"".join( + [ + fetch_result.inputs[result_tensor_idx] + for fetch_result in fetch_results + ] + ) results.append(mem_alloc.serialize()) @@ -182,8 +169,6 @@ def execute( for tensor in tensors ] - torch.cuda.synchronize(3) - transform_result.transformed = [] execute_result = ExecuteResult(results, transform_result.slices) @@ -195,18 +180,15 @@ def execute( def transform_output( batch: InferenceBatch, execute_result: ExecuteResult, - perf_timer: PerfTimer, ) -> list[TransformOutputResult]: transformed_list: list[TransformOutputResult] = [] cpu_predictions = [ prediction.cpu() for prediction in execute_result.predictions ] - perf_timer.measure_time("to_cpu") for result_slice in execute_result.slices: transformed = [] for cpu_item in cpu_predictions: 
transformed.append(cpu_item[result_slice].numpy().tobytes()) - perf_timer.measure_time("serialize_tensor") # todo: need the shape from latest schemas added here. transformed_list.append( diff --git a/smartsim/_core/mli/infrastructure/worker/worker.py b/smartsim/_core/mli/infrastructure/worker/worker.py index bc96633204..01e2db6c86 100644 --- a/smartsim/_core/mli/infrastructure/worker/worker.py +++ b/smartsim/_core/mli/infrastructure/worker/worker.py @@ -34,10 +34,8 @@ from abc import ABC, abstractmethod from dataclasses import dataclass - from .....error import SmartSimError from .....log import get_logger -from ....utils.timings import PerfTimer from ...comm.channel.channel import CommChannelBase from ...infrastructure.storage.featurestore import FeatureStore from ...mli_schemas.model.model_capnp import Model @@ -101,15 +99,19 @@ def __init__(self, model: t.Any) -> None: class TransformInputResult: - """A wrapper around a transformed batchinput""" + """A wrapper around a transformed batch of input tensors""" def __init__( self, result: t.Any, slices: list[slice], dims: list[list[int]] ) -> None: """Initialize the object""" self.transformed = result + """List of Dragon MemoryAlloc objects on which the tensors are stored""" self.slices = slices + """Each slice represents which portion of the input tensors belongs to + which request""" self.dims = dims + """Dimension of the transformed tensors""" class ExecuteResult: @@ -174,7 +176,7 @@ def fetch_model( batch: InferenceBatch, feature_store: t.Optional[FeatureStore] ) -> FetchModelResult: """Given a resource key, retrieve the raw model from a feature store - :param batc: The batch of requests that triggered the pipeline + :param batch: The batch of requests that triggered the pipeline :param feature_store: The feature store used for persistence :return: Raw bytes of the model""" @@ -206,7 +208,7 @@ def fetch_inputs( ) -> t.List[FetchInputResult]: """Given a collection of ResourceKeys, identify the physical location and input metadata - :param request: The request that triggered the pipeline + :param batch: The batch of requests that triggered the pipeline :param feature_store: The feature store used for persistence :return: the fetched input""" fetch_results = [] @@ -290,9 +292,11 @@ def transform_input( fetch_results: list[FetchInputResult], mem_pool: MemoryPool, ) -> TransformInputResult: - """Given a collection of data, perform a transformation on the data + """Given a collection of data, perform a transformation on the data and put + the raw tensor data on a MemoryPool allocation. 
:param request: The request that triggered the pipeline :param fetch_result: Raw outputs from fetching inputs out of a feature store + :param mem_pool: The memory pool used to access batched input tensors :return: The transformed inputs wrapped in a InputTransformResult""" @staticmethod @@ -304,18 +308,19 @@ def execute( device: str, ) -> ExecuteResult: """Execute an ML model on inputs transformed for use by the model - :param request: The request that triggered the pipeline + :param batch: The batch of requests that triggered the pipeline :param load_result: The result of loading the model onto device memory :param transform_result: The result of transforming inputs for model consumption + :param device: The device on which the model will be executed :return: The result of inference wrapped in an ExecuteResult""" @staticmethod @abstractmethod def transform_output( - batch: InferenceBatch, execute_result: ExecuteResult, perf_timer: PerfTimer + batch: InferenceBatch, execute_result: ExecuteResult ) -> t.List[TransformOutputResult]: """Given inference results, perform transformations required to transmit results to the requestor. - :param request: The request that triggered the pipeline + :param batch: The batch of requests that triggered the pipeline :param execute_result: The result of inference wrapped in an ExecuteResult - :return:""" + :return: A list of transformed outputs""" From 63a0f31ecf58e03bc3cbefc7293e911c61b969a1 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Thu, 25 Jul 2024 16:27:48 -0500 Subject: [PATCH 53/84] Remove unused fake versioning function --- smartsim/_core/mli/mli_schemas/model/utils.py | 41 ------------------- 1 file changed, 41 deletions(-) delete mode 100644 smartsim/_core/mli/mli_schemas/model/utils.py diff --git a/smartsim/_core/mli/mli_schemas/model/utils.py b/smartsim/_core/mli/mli_schemas/model/utils.py deleted file mode 100644 index b16dc8f623..0000000000 --- a/smartsim/_core/mli/mli_schemas/model/utils.py +++ /dev/null @@ -1,41 +0,0 @@ -# BSD 2-Clause License -# -# Copyright (c) 2021-2024, Hewlett Packard Enterprise -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
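# --- Editor's note (not part of the patch) ----------------------------------
# For reference, the abstract methods declared in worker.py above compose into
# the inference pipeline roughly as sketched here. In this patch set the
# dispatcher runs fetch_inputs/transform_input while the worker manager runs
# the remaining steps; the driver below, its name, and its argument list are
# purely illustrative.
def run_pipeline_sketch(worker, batch, feature_store, mem_pool, device):
    fetched_model = worker.fetch_model(batch, feature_store)
    loaded = worker.load_model(batch, fetched_model, device)
    fetched_inputs = worker.fetch_inputs(batch, feature_store)
    transformed = worker.transform_input(batch, fetched_inputs, mem_pool)
    executed = worker.execute(batch, loaded, transformed, device)
    return worker.transform_output(batch, executed)
# -----------------------------------------------------------------------------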
- -import typing as t -from collections import namedtuple - -from .model_capnp import Model - -ModelInfo = namedtuple("ModelInfo", ["Name", "Version"]) - - -def make_model_key(model: Model) -> str: - return f"{model.name}_{model.version}" - - -def get_model_name_and_version(key: str) -> t.NamedTuple: - split_key = key.rsplit("_", 1) - return ModelInfo(split_key[0], split_key[1]) From 6fb3efddfb12db46aee42e485100a2ba62bfea57 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Thu, 25 Jul 2024 16:41:23 -0500 Subject: [PATCH 54/84] Fix --- .../_core/mli/infrastructure/control/requestdispatcher.py | 6 +++--- smartsim/_core/mli/infrastructure/control/workermanager.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/smartsim/_core/mli/infrastructure/control/requestdispatcher.py b/smartsim/_core/mli/infrastructure/control/requestdispatcher.py index 6a6f811fc9..a43290bf56 100644 --- a/smartsim/_core/mli/infrastructure/control/requestdispatcher.py +++ b/smartsim/_core/mli/infrastructure/control/requestdispatcher.py @@ -342,7 +342,7 @@ def __init__( :param worker_type: Type of worker to instantiate to batch inputs :param comm_channel_type: Type of channel used to get requests """ - super().__init__(as_service=False, cooldown=1) + super().__init__(as_service=True, cooldown=1) self._queues: list[BatchQueue] = [] """All batch queues""" self._active_queues: dict[str, BatchQueue] = {} @@ -368,7 +368,7 @@ def __init__( """The worker used to batch inputs""" self._mem_pool = mem_pool """Memory pool used to share batched input tensors with the Worker Managers""" - self._perf_timer = PerfTimer(prefix="r_", debug=False, timing_on=False) + self._perf_timer = PerfTimer(prefix="r_", debug=True, timing_on=True) """Performance timer""" def _validate_request(self, request: InferenceRequest) -> bool: @@ -413,7 +413,7 @@ def _on_iteration(self) -> None: try: bytes_list: t.List[bytes] = self._incoming_channel.recv() except Exception: - pass + self._perf_timer.start_timings() else: if not bytes_list: exception_handler( diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py index 74b36c3454..12a7891914 100644 --- a/smartsim/_core/mli/infrastructure/control/workermanager.py +++ b/smartsim/_core/mli/infrastructure/control/workermanager.py @@ -166,7 +166,7 @@ def __init__( self._device_manager: t.Optional[DeviceManager] = None """Object responsible for model caching and device access""" - self._perf_timer = PerfTimer(prefix="w_", debug=False, timing_on=True) + self._perf_timer = PerfTimer(prefix="w_", debug=True, timing_on=True) """Performance timer""" def _on_start(self) -> None: From a0cd4ab44af718a493a03a7b87ca14507abcb456 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Tue, 13 Aug 2024 19:22:58 +0200 Subject: [PATCH 55/84] Address review --- ex/high_throughput_inference/mock_app.py | 45 +++-- ex/high_throughput_inference/redis_driver.py | 2 +- .../standalone_workermanager.py | 61 ++---- .../mli/infrastructure/control/commons.py | 65 ++++++ .../infrastructure/control/devicemanager.py | 114 +++++------ .../control/requestdispatcher.py | 188 +++++++++--------- .../infrastructure/control/workermanager.py | 54 ++--- .../mli/infrastructure/worker/torch_worker.py | 19 +- .../_core/mli/infrastructure/worker/worker.py | 29 ++- tests/dragon/test_error_handling.py | 4 +- 10 files changed, 299 insertions(+), 282 deletions(-) create mode 100644 smartsim/_core/mli/infrastructure/control/commons.py diff --git 
a/ex/high_throughput_inference/mock_app.py b/ex/high_throughput_inference/mock_app.py index 2440aa87c4..69ff6afeac 100644 --- a/ex/high_throughput_inference/mock_app.py +++ b/ex/high_throughput_inference/mock_app.py @@ -72,14 +72,14 @@ def __init__(self, timing_on: bool): self._from_worker_ch_serialized = self._from_worker_ch.serialize() self._to_worker_ch = Channel.make_process_local() - self._perf_timer: PerfTimer = PerfTimer(debug=False, timing_on=timing_on, prefix=f"a{rank}_") + self.perf_timer: PerfTimer = PerfTimer(debug=False, timing_on=timing_on, prefix=f"a{rank}_") def run_model(self, model: bytes | str, batch: torch.Tensor): tensors = [batch.numpy()] - self._perf_timer.start_timings("batch_size", batch.shape[0]) + self.perf_timer.start_timings("batch_size", batch.shape[0]) built_tensor_desc = MessageHandler.build_tensor_descriptor( "c", "float32", list(batch.shape)) - self._perf_timer.measure_time("build_tensor_descriptor") + self.perf_timer.measure_time("build_tensor_descriptor") if isinstance(model, str): model_arg = MessageHandler.build_model_key(model) else: @@ -92,43 +92,41 @@ def run_model(self, model: bytes | str, batch: torch.Tensor): output_descriptors=[], custom_attributes=None, ) - self._perf_timer.measure_time("build_request") + self.perf_timer.measure_time("build_request") request_bytes = MessageHandler.serialize_request(request) - self._perf_timer.measure_time("serialize_request") + self.perf_timer.measure_time("serialize_request") tensor_bytes = [bytes(tensor.data) for tensor in tensors] # tensor_bytes = [tensor.reshape(-1).view(numpy.uint8).data for tensor in tensors] - self._perf_timer.measure_time("serialize_tensor") + self.perf_timer.measure_time("serialize_tensor") with self._to_worker_fli.sendh(timeout=None, stream_channel=self._to_worker_ch) as to_sendh: to_sendh.send_bytes(request_bytes) for tb in tensor_bytes: to_sendh.send_bytes(tb) #TODO NOT FAST ENOUGH!!! # to_sendh.send_bytes(bytes(t.data)) - self._perf_timer.measure_time("send") + self.perf_timer.measure_time("send") with self._from_worker_ch.recvh(timeout=None) as from_recvh: resp = from_recvh.recv_bytes(timeout=None) - self._perf_timer.measure_time("receive_response") + self.perf_timer.measure_time("receive_response") response = MessageHandler.deserialize_response(resp) - self._perf_timer.measure_time("deserialize_response") + self.perf_timer.measure_time("deserialize_response") # list of data blobs? recv depending on the len(response.result.descriptors)? 
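# Editor's sketch (not part of the patch): one way to answer the question in
# the comment above is to receive one blob per entry in
# response.result.descriptors and rebuild each tensor from its advertised
# dataType; only the single-blob path below is what the patch actually does.
#     data_blobs = [
#         from_recvh.recv_bytes(timeout=None)
#         for _ in response.result.descriptors
#     ]
#     results = [
#         torch.from_numpy(numpy.frombuffer(blob, dtype=str(desc.dataType)))
#         for blob, desc in zip(data_blobs, response.result.descriptors)
#     ]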
data_blob: bytes = from_recvh.recv_bytes(timeout=None) - self._perf_timer.measure_time("receive_tensor") + self.perf_timer.measure_time("receive_tensor") result = torch.from_numpy( numpy.frombuffer( data_blob, dtype=str(response.result.descriptors[0].dataType), ) ) - self._perf_timer.measure_time("deserialize_tensor") + self.perf_timer.measure_time("deserialize_tensor") - self._perf_timer.end_timings() + self.perf_timer.end_timings() return result def set_model(self, key: str, model: bytes): self._ddict[key] = model - def print_timings(self, to_file: bool): - self._perf_timer.print_timings(to_file) class ResNetWrapper(): @@ -154,7 +152,7 @@ def name(self): if __name__ == "__main__": parser = argparse.ArgumentParser("Mock application") - parser.add_argument("--device", default="cpu") + parser.add_argument("--device", default="cpu", type=str) parser.add_argument("--log_max_batchsize", default=8, type=int) args = parser.parse_args() @@ -163,7 +161,10 @@ def name(self): client = ProtoClient(timing_on=True) client.set_model(resnet.name, resnet.model) - pt_model = torch.jit.load(io.BytesIO(initial_bytes=(resnet.model))).to("cuda:0") + if CHECK_RESULTS_AND_MAKE_ALL_SLOWER: + # TODO: adapt to non-Nvidia devices + torch_device = args.device.replace("gpu", "cuda") + pt_model = torch.jit.load(io.BytesIO(initial_bytes=(resnet.model))).to(torch_device) TOTAL_ITERATIONS = 100 @@ -172,15 +173,15 @@ def name(self): logger.info(f"Batch size: {b_size}") for iteration_number in range(TOTAL_ITERATIONS + int(b_size==1)): logger.info(f"Iteration: {iteration_number}") - batch = resnet.get_batch(b_size) - remote_result = client.run_model(resnet.name, batch) - logger.info(client._perf_timer.get_last("total_time")) + sample_batch = resnet.get_batch(b_size) + remote_result = client.run_model(resnet.name, sample_batch) + logger.info(client.perf_timer.get_last("total_time")) if CHECK_RESULTS_AND_MAKE_ALL_SLOWER: - local_res = pt_model(batch.to("cuda:0")) - err_norm = torch.linalg.vector_norm(torch.flatten(remote_result).to("cuda:0")-torch.flatten(local_res), ord=1).cpu() + local_res = pt_model(sample_batch.to(torch_device)) + err_norm = torch.linalg.vector_norm(torch.flatten(remote_result).to(torch_device)-torch.flatten(local_res), ord=1).cpu() res_norm = torch.linalg.vector_norm(remote_result, ord=1).item() local_res_norm = torch.linalg.vector_norm(local_res, ord=1).item() logger.info(f"Avg norm of error {err_norm.item()/b_size} compared to result norm of {res_norm/b_size}:{local_res_norm/b_size}") torch.cuda.synchronize() - client.print_timings(to_file=True) \ No newline at end of file + client.perf_timer.print_timings(to_file=True) \ No newline at end of file diff --git a/ex/high_throughput_inference/redis_driver.py b/ex/high_throughput_inference/redis_driver.py index 6a8b00c2a8..ff57725d40 100644 --- a/ex/high_throughput_inference/redis_driver.py +++ b/ex/high_throughput_inference/redis_driver.py @@ -33,7 +33,7 @@ DEVICE = "gpu" filedir = os.path.dirname(__file__) app_script_name = os.path.join(filedir, "mock_app_redis.py") -model_name = os.path.join(filedir, f"resnet50.{DEVICE.upper()}.pt") +model_name = os.path.join(filedir, f"resnet50.{DEVICE}.pt") exp_path = os.path.join(filedir, "redis_ai_multi") diff --git a/ex/high_throughput_inference/standalone_workermanager.py b/ex/high_throughput_inference/standalone_workermanager.py index 72e2bd20f0..a17039d0fd 100644 --- a/ex/high_throughput_inference/standalone_workermanager.py +++ b/ex/high_throughput_inference/standalone_workermanager.py @@ -28,7 +28,6 @@ import 
dragon # pylint disable=import-error -import dragon.globalservices.pool as dragon_gs_pool import dragon.infrastructure.policy as dragon_policy import dragon.infrastructure.process_desc as dragon_process_desc import dragon.native.process as dragon_process @@ -69,31 +68,16 @@ ) from smartsim._core.mli.infrastructure.worker.worker import MachineLearningWorkerBase +from smartsim.log import get_logger + +logger = get_logger("Worker Manager Entry Point") + mp.set_start_method("dragon") pid = os.getpid() affinity = os.sched_getaffinity(pid) -print("Entry point:", socket.gethostname(), affinity) -print("CPUS:", os.cpu_count()) - - -def create_request_dispatcher( - batch_size: int, - batch_timeout: float, - comm_channel_type: t.Type[CommChannelBase], - worker_type: t.Type[MachineLearningWorkerBase], - config_loader: EnvironmentConfigLoader, -) -> RequestDispatcher: - mem_pool = MemoryPool.attach(dragon_gs_pool.create(2 * 1024**3).sdesc) - - return RequestDispatcher( - batch_timeout=batch_timeout, - batch_size=batch_size, - config_loader=config_loader, - comm_channel_type=comm_channel_type, - mem_pool=mem_pool, - worker_type=worker_type, - ) +logger.log(f"Entry point: {socket.gethostname()}, {affinity}") +logger.log(f"CPUS: {os.cpu_count()}") def create_worker_manager( @@ -102,15 +86,7 @@ def create_worker_manager( device: str, dispatcher_queue: mp.Queue, ) -> WorkerManager: - return WorkerManager( - config_loader=config_loader, - worker_type=worker_type, - as_service=True, - cooldown=10, - comm_channel_type=DragonCommChannel, - device=device, - task_queue=dispatcher_queue, - ) + return def service_as_dragon_proc( @@ -191,31 +167,36 @@ def service_as_dragon_proc( ss_config_loader = EnvironmentConfigLoader() - dispatcher = create_request_dispatcher( - batch_size=args.batch_size, + dispatcher = RequestDispatcher( batch_timeout=args.batch_timeout, + batch_size=args.batch_size, + config_loader=ss_config_loader, comm_channel_type=DragonCommChannel, worker_type=arg_worker_type, - config_loader=ss_config_loader, ) wms = [] worker_device = args.device for wm_idx in range(args.num_workers): - # if args.num_workers > 0: - # worker_device = f"{args.device}:{wm_idx}" - worker_manager = create_worker_manager( - worker_type=arg_worker_type, + + worker_manager = WorkerManager( config_loader=ss_config_loader, + worker_type=arg_worker_type, + as_service=True, + cooldown=10, + comm_channel_type=DragonCommChannel, device=worker_device, - dispatcher_queue=dispatcher.task_queue, + task_queue=dispatcher.task_queue, ) + wms.append(worker_manager) wm_affinity: list[int] = [] disp_affinity: list[int] = [] - # This is hardcoded for a specific type of node! + # This is hardcoded for a specific type of node: + # the GPU-to-CPU mapping is taken from the nvidia-smi tool + # TODO can this be computed on the fly? gpu_to_cpu_aff: dict[int, list[int]] = {} gpu_to_cpu_aff[0] = list(range(48,64)) + list(range(112,128)) gpu_to_cpu_aff[1] = list(range(32,48)) + list(range(96,112)) diff --git a/smartsim/_core/mli/infrastructure/control/commons.py b/smartsim/_core/mli/infrastructure/control/commons.py new file mode 100644 index 0000000000..4c67fb47b0 --- /dev/null +++ b/smartsim/_core/mli/infrastructure/control/commons.py @@ -0,0 +1,65 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. 
Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import typing as t + +from .....log import get_logger +from ...comm.channel.channel import CommChannelBase +from ...message_handler import MessageHandler +from ...mli_schemas.response.response_capnp import ResponseBuilder + + +logger = get_logger(__file__) + + +def build_failure_reply(status: "Status", message: str) -> ResponseBuilder: + return MessageHandler.build_response( + status=status, + message=message, + result=[], + custom_attributes=None, + ) + +def exception_handler( + exc: Exception, reply_channel: t.Optional[CommChannelBase], failure_message: str +) -> None: + """ + Logs exceptions and sends a failure response. + + :param exc: The exception to be logged + :param reply_channel: The channel used to send replies + :param failure_message: Failure message to log and send back + """ + logger.exception( + f"{failure_message}\n" + f"Exception type: {type(exc).__name__}\n" + f"Exception message: {str(exc)}" + ) + serialized_resp = MessageHandler.serialize_response( + build_failure_reply("fail", failure_message) + ) + if reply_channel: + reply_channel.send(serialized_resp) \ No newline at end of file diff --git a/smartsim/_core/mli/infrastructure/control/devicemanager.py b/smartsim/_core/mli/infrastructure/control/devicemanager.py index 14b83a5044..9a56dd3ba5 100644 --- a/smartsim/_core/mli/infrastructure/control/devicemanager.py +++ b/smartsim/_core/mli/infrastructure/control/devicemanager.py @@ -25,12 +25,10 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
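# --- Editor's note (not part of the patch) ----------------------------------
# Typical call pattern for the shared exception_handler introduced in
# commons.py above: wrap a pipeline step and report the failure on the
# request's callback channel when one is available (passing None suppresses
# the reply). The names worker, batch, feature_store and request are assumed
# to be in scope.
#     try:
#         fetch_results = worker.fetch_inputs(batch=batch, feature_store=feature_store)
#     except Exception as exc:
#         exception_handler(exc, request.callback, "Error fetching input.")
# -----------------------------------------------------------------------------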
import typing as t -from threading import RLock -from types import TracebackType from ...infrastructure.storage.featurestore import FeatureStore from ..worker.worker import MachineLearningWorkerBase -from .requestdispatcher import InferenceBatch +from .requestdispatcher import RequestBatch class WorkerDevice: @@ -40,91 +38,83 @@ def __init__(self, name: str) -> None: """ self._name = name """The name used by the toolkit to identify this device""" - self._lock = RLock() - """Lock to ensure only one thread at the time accesses this device""" self._models: dict[str, t.Any] = {} + """Dict of keys to models which are loaded on this device""" - def acquire(self, blocking: bool = True, timeout: float = -1) -> t.Optional[bool]: - return self._lock.acquire(blocking=blocking, timeout=timeout) - - def release(self) -> None: - self._lock.release() - - def __enter__(self) -> None: - self.acquire() @property def name(self) -> str: + """The identifier of the device represented by this object""" return self._name def add_model(self, key: str, model: t.Any) -> None: + """Add a reference to a model loaded on this device and assign it a key + + :param key: The key under which the model is saved + :param model: The model which is added + """ self._models[key] = model def remove_model(self, key: str) -> None: + """Remove the reference to a model loaded on this device + + :param key: The key of the model to remove + """ self._models.pop(key) def get_model(self, key: str) -> t.Any: + """Get the model corresponding to a given key + + :param key: the model key + """ return self._models[key] def __contains__(self, key: str) -> bool: return key in self._models - def __exit__( - self, - exc_type: t.Optional[t.Type[BaseException]], - exc_val: t.Optional[BaseException], - exc_tb: t.Optional[TracebackType], - ) -> None: - self.release() - class DeviceManager: - def __init__(self, devices: list[WorkerDevice]): - self._devices = devices - """Dictionary of model key to devices on which it is loaded""" + def __init__(self, device: WorkerDevice): + self._device = device + """Device managed by this object""" + + def _load_model_on_device(self, + worker: MachineLearningWorkerBase, + batch: RequestBatch, + feature_store: t.Optional[FeatureStore], + ) -> None: + model_bytes = worker.fetch_model(batch, feature_store) + loaded_model = worker.load_model( + batch, model_bytes, self._device.name + ) + self._device.add_model(batch.model_key, loaded_model.model) - def get_free_device( + def get_device( self, worker: MachineLearningWorkerBase, - batch: InferenceBatch, + batch: RequestBatch, feature_store: t.Optional[FeatureStore], ) -> t.Generator[WorkerDevice, None, None]: - return_device = None - sample_request = batch.requests[0] - direct_inference = sample_request.raw_model is not None - while return_device is None: - loaded_devices = [] - if not direct_inference: - # Look up devices to see if any of them already has a copy of the model - for device in self._devices: - if batch.model_key in device: - loaded_devices.append(device) - - # If a pre-loaded model is found on a device, try using that device - for device in loaded_devices: - if device.acquire(blocking=False): - return_device = device - break - - # If the model is not loaded on a free device, - # load it on another device (if available) - if return_device is None: - for candidate_device in self._devices: - if ( - candidate_device not in loaded_devices - and candidate_device.acquire(blocking=False) - ): - model_bytes = worker.fetch_model(batch, feature_store) - loaded_model = 
worker.load_model( - batch, model_bytes, candidate_device.name - ) - candidate_device.add_model(batch.model_key, loaded_model.model) - - return_device = candidate_device + """Get the device managed by this object + + the model needed to run the batch of requests is + guaranteed to be available on the model + + :param worker: The worker that wants to access the device + :param batch: The batch of requests + :param feature_store: The feature store on which part of the + data needed by the request may be stored + :return: A generator yielding the device + """ + model_in_request = batch.has_raw_model + + # Load model if not already loaded, or + # because it is sent with the request + if model_in_request or not batch.model_key in self._device: + self._load_model_on_device(worker, batch, feature_store) try: - yield return_device + yield self._device finally: - if direct_inference: - return_device.remove_model(batch.model_key) - return_device.release() + if model_in_request: + self._device.remove_model(batch.model_key) diff --git a/smartsim/_core/mli/infrastructure/control/requestdispatcher.py b/smartsim/_core/mli/infrastructure/control/requestdispatcher.py index a43290bf56..d050a646c3 100644 --- a/smartsim/_core/mli/infrastructure/control/requestdispatcher.py +++ b/smartsim/_core/mli/infrastructure/control/requestdispatcher.py @@ -29,6 +29,7 @@ import dragon from dragon.managed_memory import MemoryPool from dragon.mpbridge.queues import DragonQueue +import dragon.globalservices.pool as dragon_gs_pool # pylint: enable=import-error @@ -53,14 +54,14 @@ from ...infrastructure.environmentloader import EnvironmentConfigLoader from ...infrastructure.storage.featurestore import FeatureStore from ...infrastructure.worker.worker import ( - InferenceBatch, + RequestBatch, InferenceRequest, MachineLearningWorkerBase, ) from ...message_handler import MessageHandler from ...mli_schemas.model.model_capnp import Model -from ...mli_schemas.response.response_capnp import ResponseBuilder from ...mli_schemas.tensor.tensor_capnp import TensorDescriptor +from .commons import exception_handler if t.TYPE_CHECKING: from smartsim._core.mli.mli_schemas.response.response_capnp import Status @@ -73,18 +74,10 @@ def deserialize_message( channel_type: t.Type[CommChannelBase], ) -> InferenceRequest: """Deserialize a message from a byte stream into an InferenceRequest - :param data_blob: The byte stream to deserialize""" - # todo: consider moving to XxxCore and only making - # workers implement the inputs and model conversion? - - # alternatively, consider passing the capnproto models - # to this method instead of the data_blob... - - # something is definitely wrong here... client shouldn't have to touch - # callback (or batch size) + :param data_blob: The byte stream to deserialize + :param channel_type: The channel used to send the response""" request = MessageHandler.deserialize_request(data_blob) - # return request model_key: t.Optional[str] = None model_bytes: t.Optional[Model] = None @@ -126,37 +119,6 @@ def deserialize_message( return inference_request -def build_failure_reply(status: "Status", message: str) -> ResponseBuilder: - return MessageHandler.build_response( - status=status, - message=message, - result=[], - custom_attributes=None, - ) - - -def exception_handler( - exc: Exception, reply_channel: t.Optional[CommChannelBase], failure_message: str -) -> None: - """ - Logs exceptions and sends a failure response. 
- - :param exc: The exception to be logged - :param reply_channel: The channel used to send replies - :param failure_message: Failure message to log and send back - """ - logger.exception( - f"{failure_message}\n" - f"Exception type: {type(exc).__name__}\n" - f"Exception message: {str(exc)}" - ) - serialized_resp = MessageHandler.serialize_response( - build_failure_reply("fail", failure_message) - ) - if reply_channel: - reply_channel.send(serialized_resp) - - class WorkerDevice: def __init__(self, name: str) -> None: """Wrapper around a device to keep track of loaded Models and availability @@ -192,7 +154,7 @@ def __init__(self, batch_timeout: float, batch_size: int, model_key: str) -> Non """Queue used to store inference requests waiting to be batched and sent to Worker Managers. :param batch_timeout: Time in seconds that has to be waited before flushing a - non-full queue. The time of the firt item put is 0 seconds. + non-full queue. The time of the first item put is 0 seconds. :param batch_size: Total capacity of the queue. :param model_key: Key of the model which needs to be executed on the queued requests @@ -200,7 +162,7 @@ def __init__(self, batch_timeout: float, batch_size: int, model_key: str) -> Non super().__init__(maxsize=batch_size) self._batch_timeout = batch_timeout """Time in seconds that has to be waited before flushing a non-full queue. - The time of the firt item put is 0 seconds.""" + The time of the first item put is 0 seconds.""" self._batch_size = batch_size """Total capacity of the queue.""" self._first_put: t.Optional[float] = None @@ -212,13 +174,13 @@ def __init__(self, batch_timeout: float, batch_size: int, model_key: str) -> Non """Key of the model which needs to be executed on the queued requets""" self._flush_lock = RLock() """Lock used to make sure only one process can flush the queue (unused now)""" - self._id = str(uuid.uuid4()) - """Id of queue""" + self._uid = str(uuid.uuid4()) + """Unique ID of queue""" @property - def queue_id(self) -> str: + def uid(self) -> str: """ID of this queue""" - return self._id + return self._uid def acquire(self, blocking: bool = True, timeout: float = -1) -> t.Optional[bool]: """Acquire queue lock to flush @@ -232,6 +194,7 @@ def release(self) -> None: self._flush_lock.release() def __enter__(self) -> None: + """Method to use the Queue as a Context Manager""" self.acquire() def __exit__( @@ -240,6 +203,7 @@ def __exit__( exc_val: t.Optional[BaseException], exc_tb: t.Optional[TracebackType], ) -> None: + """Method to release the Queue as a Context Manager""" self.release() @property @@ -256,7 +220,7 @@ def put( """Put an inference request in the queue :param item: The request :param block: Whether to block when trying to put the item - :param timeout: Time to wait if block==True + :param timeout: Time (in seconds) to wait if block==True """ if not self.acquire(blocking=False): raise Full @@ -270,8 +234,8 @@ def put( self.release() @property - def _waited_time(self) -> float: - if self._first_put is None: + def _elapsed_time(self) -> float: + if self.empty(): return 0 return time.time() - self._first_put @@ -280,15 +244,15 @@ def ready(self) -> bool: """True if the queue can be flushed""" if self.empty(): return False - return self.full() or (self._waited_time >= self._batch_timeout) + return self.full() or (self._elapsed_time >= self._batch_timeout) def make_disposable(self) -> None: """Set this queue as disposable, and never use it again after it gets flushed""" self._disposable = True @property - def disposable(self) -> 
bool: - """Whether this queue can be used to put items or should be deleted""" + def can_be_removed(self) -> bool: + """Whether this queue can be deleted and garbafe collected""" return self.empty() and self._disposable def flush(self) -> list[t.Any]: @@ -298,7 +262,6 @@ def flush(self) -> list[t.Any]: num_items = self.qsize() self._first_put = None items = [] - # Avoid (unlikely) race condition error for _ in range(num_items): try: items.append(self.get()) @@ -325,28 +288,27 @@ def __init__( self, batch_timeout: float, batch_size: int, - mem_pool: MemoryPool, config_loader: EnvironmentConfigLoader, worker_type: t.Type[MachineLearningWorkerBase], comm_channel_type: t.Type[CommChannelBase] = DragonCommChannel, ) -> None: - """The RquestDispatcher intercepts inference requests, stages them in + """The RequestDispatcher intercepts inference requests, stages them in queues and batches them together before making them available to Worker Managers. - :param batch_timeout: Time in seconds that has to be waited before flushing a - non-full queue after having put at least one item on it. + :param batch_timeout: Maximum elapsed time before flushing a complete or incomplete batch :param batch_size: Total capacity of each batch queue. :param mem_pool: Memory pool used to share batched input tensors with worker managers :param config_loader: Object to load configuration from environment :param worker_type: Type of worker to instantiate to batch inputs :param comm_channel_type: Type of channel used to get requests + :raises SmartSimError: If config_loaded.get_queue() does not return a channel """ super().__init__(as_service=True, cooldown=1) - self._queues: list[BatchQueue] = [] - """All batch queues""" + self._queues: dict[str, list[BatchQueue]] = [] + """Dict of all batch queues available for a given model key""" self._active_queues: dict[str, BatchQueue] = {} - """Mapping telling which queue is the recipient of requets for a given model + """Mapping telling which queue is the recipient of requests for a given model key""" self._batch_timeout = batch_timeout """Time in seconds that has to be waited before flushing a non-full queue""" @@ -354,7 +316,10 @@ def __init__( """Total capacity of each batch queue.""" self._queue_swap_lock: t.Optional[RLock] = None """Lock used to swap the active queue for a key""" - self._incoming_channel = config_loader.get_queue() + incoming_channel = config_loader.get_queue() + if incoming_channel is None: + raise SmartSimError("No incoming channel for dispatcher") + self._incoming_channel = incoming_channel """The channel the dispatcher monitors for new tasks""" self._outgoing_queue: DragonQueue = mp.Queue(maxsize=0) """The queue on which batched inference requests are placed""" @@ -366,7 +331,7 @@ def __init__( """The type of the channel used to receive requests""" self._worker = worker_type() """The worker used to batch inputs""" - self._mem_pool = mem_pool + self._mem_pool = MemoryPool.attach(dragon_gs_pool.create(2 * 1024**3).sdesc) """Memory pool used to share batched input tensors with the Worker Managers""" self._perf_timer = PerfTimer(prefix="r_", debug=True, timing_on=True) """Performance timer""" @@ -407,9 +372,6 @@ def _on_start(self) -> None: def _on_iteration(self) -> None: - if self._incoming_channel is None: - raise SmartSimError("No incoming channel for dispatcher") - try: bytes_list: t.List[bytes] = self._incoming_channel.recv() except Exception: @@ -454,16 +416,28 @@ def task_queue(self) -> DragonQueue: return self._outgoing_queue def _swap_queue(self, 
model_key: str) -> None: + """Get an empty queue or create a new one + + and make it the active one for a given model. + + :param model_key: The key of the model for which the + queue has to be swapped + :raises SmartSimError: If the queue is not locked. + """ if self._queue_swap_lock is None: raise SmartSimError("Queues were not locked") with self._queue_swap_lock: - for queue in self._queues: - if queue.model_key == model_key and not queue.full(): - self._active_queues[model_key] = queue - return + for queue_list in self._queues[model_key]: + for queue in queue_list: + if not queue.full(): + self._active_queues[model_key] = queue + return new_queue = BatchQueue(self._batch_timeout, self._batch_size, model_key) - self._queues.append(new_queue) + if model_key in self._queues: + self._queues[model_key].append(new_queue) + else: + self._queues[model_key] = [new_queue] self._active_queues[model_key] = new_queue return @@ -493,33 +467,49 @@ def dispatch(self, request: InferenceRequest) -> None: def flush_requests(self) -> None: """Get all requests from queues which are ready to be flushed. Place all - aviable request batches in the outgoing queue. + avaliable request batches in the outgoing queue. """ - for queue in self._queues: - if queue.ready and queue.acquire(blocking=False): - self._perf_timer.measure_time("find_queue") - try: - batch = InferenceBatch( - model_key=queue.model_key, requests=queue.flush(), inputs=None - ) - finally: - self._perf_timer.measure_time("flush_requests") - queue.release() - fetch_results = self._worker.fetch_inputs( - batch=batch, feature_store=self._feature_store - ) - self._perf_timer.measure_time("fetch_input") - transformed_inputs = self._worker.transform_input( - batch=batch, fetch_results=fetch_results, mem_pool=self._mem_pool - ) - self._perf_timer.measure_time("transform_input") - batch.inputs = transformed_inputs - for request in batch.requests: - request.raw_inputs = [] - request.input_meta = [] - - self._outgoing_queue.put(batch) - self._perf_timer.measure_time("put") + for queue_list in self._queues: + for queue in queue_list: + if queue.ready and queue.acquire(blocking=False): + self._perf_timer.measure_time("find_queue") + try: + batch = RequestBatch( + model_key=queue.model_key, requests=queue.flush(), inputs=None + ) + finally: + self._perf_timer.measure_time("flush_requests") + queue.release() + try: + fetch_results = self._worker.fetch_inputs( + batch=batch, feature_store=self._feature_store + ) + except Exception as exc: + exception_handler( + exc, + None, + "Error fetching input.", + ) + self._perf_timer.measure_time("fetch_input") + try: + transformed_inputs = self._worker.transform_input( + batch=batch, fetch_results=fetch_results, mem_pool=self._mem_pool + ) + except Exception as exc: + exception_handler( + exc, + None, + "Error Transforming input.", + ) + + self._perf_timer.measure_time("transform_input") + batch.inputs = transformed_inputs + for request in batch.requests: + request.raw_inputs = [] + request.input_meta = [] + + self._outgoing_queue.put(batch) + self._perf_timer.measure_time("put") def _can_shutdown(self) -> bool: return False diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py index 12a7891914..d0d1ca81fb 100644 --- a/smartsim/_core/mli/infrastructure/control/workermanager.py +++ b/smartsim/_core/mli/infrastructure/control/workermanager.py @@ -27,12 +27,13 @@ # pylint: disable=import-error # pylint: disable-next=unused-import import dragon - 
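# Editor's note (not part of the patch): the bare `import dragon` above is
# presumably kept, despite being unused, so that the Dragon runtime is set up
# before any Dragon-backed pieces (such as the dragon multiprocessing queues)
# are used in this module -- hence the unused-import pylint suppression.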
# pylint: enable=import-error # isort: off # isort: on +from queue import Empty + import multiprocessing as mp import time import typing as t @@ -45,13 +46,14 @@ from ...infrastructure.environmentloader import EnvironmentConfigLoader from ...infrastructure.storage.featurestore import FeatureStore from ...infrastructure.worker.worker import ( - InferenceBatch, + RequestBatch, InferenceReply, LoadModelResult, MachineLearningWorkerBase, ) from ...message_handler import MessageHandler from ...mli_schemas.response.response_capnp import ResponseBuilder +from .commons import build_failure_reply, exception_handler from .devicemanager import DeviceManager, WorkerDevice if t.TYPE_CHECKING: @@ -62,14 +64,6 @@ logger = get_logger(__name__) -def build_failure_reply(status: "Status", message: str) -> ResponseBuilder: - return MessageHandler.build_response( - status=status, - message=message, - result=[], - custom_attributes=None, - ) - def prepare_outputs(reply: InferenceReply) -> t.List[t.Any]: prepared_outputs: t.List[t.Any] = [] @@ -100,29 +94,6 @@ def build_reply(reply: InferenceReply) -> ResponseBuilder: custom_attributes=None, ) - -def exception_handler( - exc: Exception, reply_channel: t.Optional[CommChannelBase], failure_message: str -) -> None: - """ - Logs exceptions and sends a failure response. - - :param exc: The exception to be logged - :param reply_channel: The channel used to send replies - :param failure_message: Failure message to log and send back - """ - logger.exception( - f"{failure_message}\n" - f"Exception type: {type(exc).__name__}\n" - f"Exception message: {str(exc)}" - ) - serialized_resp = MessageHandler.serialize_response( - build_failure_reply("fail", failure_message) - ) - if reply_channel: - reply_channel.send(serialized_resp) - - class WorkerManager(Service): """An implementation of a service managing distribution of tasks to machine learning workers""" @@ -131,7 +102,7 @@ def __init__( self, config_loader: EnvironmentConfigLoader, worker_type: t.Type[MachineLearningWorkerBase], - task_queue: "mp.Queue[InferenceBatch]", + dispatcher_queue: "mp.Queue[InferenceBatch]", as_service: bool = False, cooldown: int = 0, comm_channel_type: t.Type[CommChannelBase] = DragonCommChannel, @@ -141,7 +112,7 @@ def __init__( :param config_loader: Environment config loader that loads the task queue and feature store :param worker_type: The type of worker to manage - :param task_queue: Queue from witch the batched requests have to be pulled + :param dispatcher_queue: Queue from which the batched requests have to be pulled :param as_service: Specifies run-once or run-until-complete behavior of service :param cooldown: Number of seconds to wait before shutting down after shutdown criteria are met @@ -151,7 +122,7 @@ def __init__( """ super().__init__(as_service, cooldown) - self._task_queue = task_queue + self._dispatcher_queue = dispatcher_queue """The dispatcher queue the manager monitors for new tasks""" self._feature_store: t.Optional[FeatureStore] = ( config_loader.get_feature_store() @@ -179,8 +150,8 @@ def _on_iteration(self) -> None: pre_batch_time = time.perf_counter() try: - batch: InferenceBatch = self._task_queue.get(timeout=0.0001) - except Exception: + batch: RequestBatch = self._dispatcher_queue.get(timeout=0.0001) + except Empty: return self._perf_timer.start_timings( @@ -188,12 +159,17 @@ def _on_iteration(self) -> None: ) if batch is None or 0 == len(batch.requests): + exception_handler( + ValueError("An empty batch was received"), + None, + "Error batching inputs, the batch 
was empty.", + ) return if self._device_manager is None: raise ValueError("No Device Manager available: did you call _on_start()") device: WorkerDevice = next( - self._device_manager.get_free_device( + self._device_manager.get_device( worker=self._worker, batch=batch, feature_store=self._feature_store, diff --git a/smartsim/_core/mli/infrastructure/worker/torch_worker.py b/smartsim/_core/mli/infrastructure/worker/torch_worker.py index 6723573cfb..392e7e051e 100644 --- a/smartsim/_core/mli/infrastructure/worker/torch_worker.py +++ b/smartsim/_core/mli/infrastructure/worker/torch_worker.py @@ -39,7 +39,7 @@ ExecuteResult, FetchInputResult, FetchModelResult, - InferenceBatch, + RequestBatch, LoadModelResult, MachineLearningWorkerBase, TransformInputResult, @@ -56,13 +56,12 @@ class TorchWorker(MachineLearningWorkerBase): @staticmethod def load_model( - batch: InferenceBatch, fetch_result: FetchModelResult, device: str + batch: RequestBatch, fetch_result: FetchModelResult, device: str ) -> LoadModelResult: - request = batch.requests[0] if fetch_result.model_bytes: model_bytes = fetch_result.model_bytes - elif request.raw_model and request.raw_model.data: - model_bytes = request.raw_model.data + elif batch.raw_model and batch.raw_model.data: + model_bytes = batch.raw_model.data else: raise ValueError("Unable to load model without reference object") @@ -79,7 +78,7 @@ def load_model( @staticmethod def transform_input( - batch: InferenceBatch, + batch: RequestBatch, fetch_results: list[FetchInputResult], mem_pool: MemoryPool, ) -> TransformInputResult: @@ -135,7 +134,7 @@ def transform_input( # pylint: disable-next=unused-argument @staticmethod def execute( - batch: InferenceBatch, + batch: RequestBatch, load_result: LoadModelResult, transform_result: TransformInputResult, device: str, @@ -165,8 +164,8 @@ def execute( with torch.no_grad(): model.eval() results = [ - model(tensor.to(device, non_blocking=True)).detach() - for tensor in tensors + model(*[tensor.to(device, non_blocking=True).detach() + for tensor in tensors]) ] transform_result.transformed = [] @@ -178,7 +177,7 @@ def execute( @staticmethod def transform_output( - batch: InferenceBatch, + batch: RequestBatch, execute_result: ExecuteResult, ) -> list[TransformOutputResult]: transformed_list: list[TransformOutputResult] = [] diff --git a/smartsim/_core/mli/infrastructure/worker/worker.py b/smartsim/_core/mli/infrastructure/worker/worker.py index 01e2db6c86..0565146968 100644 --- a/smartsim/_core/mli/infrastructure/worker/worker.py +++ b/smartsim/_core/mli/infrastructure/worker/worker.py @@ -162,18 +162,33 @@ def __init__(self, result: bytes) -> None: @dataclass -class InferenceBatch: +class RequestBatch: + """A batch of aggregated inference requests + """ model_key: str requests: list[InferenceRequest] inputs: t.Optional[TransformInputResult] + @property + def has_valid_requests(self) -> bool: + return len(self.requests) > 0 + + @property + def has_raw_nodel(self) -> bool: + return self.raw_model is not None + + @property + def raw_model(self) -> t.Optional[t.Any]: + if self.has_valid_requests: + return self.requests[0].raw_model + return None class MachineLearningWorkerCore: """Basic functionality of ML worker that is shared across all worker types""" @staticmethod def fetch_model( - batch: InferenceBatch, feature_store: t.Optional[FeatureStore] + batch: RequestBatch, feature_store: t.Optional[FeatureStore] ) -> FetchModelResult: """Given a resource key, retrieve the raw model from a feature store :param batch: The batch of requests 
that triggered the pipeline @@ -204,7 +219,7 @@ def fetch_model( @staticmethod def fetch_inputs( - batch: InferenceBatch, feature_store: t.Optional[FeatureStore] + batch: RequestBatch, feature_store: t.Optional[FeatureStore] ) -> t.List[FetchInputResult]: """Given a collection of ResourceKeys, identify the physical location and input metadata @@ -276,7 +291,7 @@ class MachineLearningWorkerBase(MachineLearningWorkerCore, ABC): @staticmethod @abstractmethod def load_model( - batch: InferenceBatch, fetch_result: FetchModelResult, device: str + batch: RequestBatch, fetch_result: FetchModelResult, device: str ) -> LoadModelResult: """Given a loaded MachineLearningModel, ensure it is loaded into device memory @@ -288,7 +303,7 @@ def load_model( @staticmethod @abstractmethod def transform_input( - batch: InferenceBatch, + batch: RequestBatch, fetch_results: list[FetchInputResult], mem_pool: MemoryPool, ) -> TransformInputResult: @@ -302,7 +317,7 @@ def transform_input( @staticmethod @abstractmethod def execute( - batch: InferenceBatch, + batch: RequestBatch, load_result: LoadModelResult, transform_result: TransformInputResult, device: str, @@ -317,7 +332,7 @@ def execute( @staticmethod @abstractmethod def transform_output( - batch: InferenceBatch, execute_result: ExecuteResult + batch: RequestBatch, execute_result: ExecuteResult ) -> t.List[TransformOutputResult]: """Given inference results, perform transformations required to transmit results to the requestor. diff --git a/tests/dragon/test_error_handling.py b/tests/dragon/test_error_handling.py index 151bdd2fcc..c178426b4f 100644 --- a/tests/dragon/test_error_handling.py +++ b/tests/dragon/test_error_handling.py @@ -90,7 +90,7 @@ def setup_worker_manager_model_bytes(test_dir, monkeypatch: pytest.MonkeyPatch): test_dir, model, [tensor_key], [tensor_key], [], None ) ser_request = MessageHandler.serialize_request(request) - worker_manager._task_queue.send(ser_request) + worker_manager._dispatcher_queue.send(ser_request) return worker_manager, integrated_worker @@ -122,7 +122,7 @@ def setup_worker_manager_model_key(test_dir, monkeypatch: pytest.MonkeyPatch): test_dir, model_key, [tensor_key], [tensor_key], [], None ) ser_request = MessageHandler.serialize_request(request) - worker_manager._task_queue.send(ser_request) + worker_manager._dispatcher_queue.send(ser_request) return worker_manager, integrated_worker From af8b639f1187f73b295f96fb102ef20598fd47e5 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Wed, 14 Aug 2024 11:10:55 -0500 Subject: [PATCH 56/84] Static checker passes --- .../mli/infrastructure/control/commons.py | 5 +- .../infrastructure/control/devicemanager.py | 22 +- .../control/requestdispatcher.py | 253 ++++++++++-------- .../infrastructure/control/workermanager.py | 145 ++-------- .../mli/infrastructure/worker/torch_worker.py | 10 +- .../_core/mli/infrastructure/worker/worker.py | 55 ++-- 6 files changed, 202 insertions(+), 288 deletions(-) diff --git a/smartsim/_core/mli/infrastructure/control/commons.py b/smartsim/_core/mli/infrastructure/control/commons.py index 4c67fb47b0..a40ae014aa 100644 --- a/smartsim/_core/mli/infrastructure/control/commons.py +++ b/smartsim/_core/mli/infrastructure/control/commons.py @@ -31,6 +31,8 @@ from ...message_handler import MessageHandler from ...mli_schemas.response.response_capnp import ResponseBuilder +if t.TYPE_CHECKING: + from smartsim._core.mli.mli_schemas.response.response_capnp import Status logger = get_logger(__file__) @@ -43,6 +45,7 @@ def build_failure_reply(status: "Status", message: 
str) -> ResponseBuilder: custom_attributes=None, ) + def exception_handler( exc: Exception, reply_channel: t.Optional[CommChannelBase], failure_message: str ) -> None: @@ -62,4 +65,4 @@ def exception_handler( build_failure_reply("fail", failure_message) ) if reply_channel: - reply_channel.send(serialized_resp) \ No newline at end of file + reply_channel.send(serialized_resp) diff --git a/smartsim/_core/mli/infrastructure/control/devicemanager.py b/smartsim/_core/mli/infrastructure/control/devicemanager.py index 9a56dd3ba5..c3dfcc0261 100644 --- a/smartsim/_core/mli/infrastructure/control/devicemanager.py +++ b/smartsim/_core/mli/infrastructure/control/devicemanager.py @@ -41,7 +41,6 @@ def __init__(self, name: str) -> None: self._models: dict[str, t.Any] = {} """Dict of keys to models which are loaded on this device""" - @property def name(self) -> str: """The identifier of the device represented by this object""" @@ -78,22 +77,21 @@ def __init__(self, device: WorkerDevice): self._device = device """Device managed by this object""" - def _load_model_on_device(self, + def _load_model_on_device( + self, worker: MachineLearningWorkerBase, batch: RequestBatch, - feature_store: t.Optional[FeatureStore], + feature_stores: dict[str, FeatureStore], ) -> None: - model_bytes = worker.fetch_model(batch, feature_store) - loaded_model = worker.load_model( - batch, model_bytes, self._device.name - ) - self._device.add_model(batch.model_key, loaded_model.model) + model_bytes = worker.fetch_model(batch, feature_stores) + loaded_model = worker.load_model(batch, model_bytes, self._device.name) + self._device.add_model(batch.model_key.key, loaded_model.model) def get_device( self, worker: MachineLearningWorkerBase, batch: RequestBatch, - feature_store: t.Optional[FeatureStore], + feature_stores: dict[str, FeatureStore], ) -> t.Generator[WorkerDevice, None, None]: """Get the device managed by this object @@ -110,11 +108,11 @@ def get_device( # Load model if not already loaded, or # because it is sent with the request - if model_in_request or not batch.model_key in self._device: - self._load_model_on_device(worker, batch, feature_store) + if model_in_request or not batch.model_key.key in self._device: + self._load_model_on_device(worker, batch, feature_stores) try: yield self._device finally: if model_in_request: - self._device.remove_model(batch.model_key) + self._device.remove_model(batch.model_key.key) diff --git a/smartsim/_core/mli/infrastructure/control/requestdispatcher.py b/smartsim/_core/mli/infrastructure/control/requestdispatcher.py index d050a646c3..3c1105b501 100644 --- a/smartsim/_core/mli/infrastructure/control/requestdispatcher.py +++ b/smartsim/_core/mli/infrastructure/control/requestdispatcher.py @@ -27,9 +27,9 @@ # pylint: disable=import-error # pylint: disable-next=unused-import import dragon +import dragon.globalservices.pool as dragon_gs_pool from dragon.managed_memory import MemoryPool from dragon.mpbridge.queues import DragonQueue -import dragon.globalservices.pool as dragon_gs_pool # pylint: enable=import-error @@ -49,18 +49,13 @@ from .....error import SmartSimError from .....log import get_logger from ....utils.timings import PerfTimer -from ...comm.channel.channel import CommChannelBase -from ...comm.channel.dragonchannel import DragonCommChannel from ...infrastructure.environmentloader import EnvironmentConfigLoader -from ...infrastructure.storage.featurestore import FeatureStore +from ...infrastructure.storage.featurestore import FeatureStore, FeatureStoreKey from 
...infrastructure.worker.worker import ( - RequestBatch, InferenceRequest, MachineLearningWorkerBase, + RequestBatch, ) -from ...message_handler import MessageHandler -from ...mli_schemas.model.model_capnp import Model -from ...mli_schemas.tensor.tensor_capnp import TensorDescriptor from .commons import exception_handler if t.TYPE_CHECKING: @@ -69,56 +64,6 @@ logger = get_logger("Request Dispatcher") -def deserialize_message( - data_blob: bytes, - channel_type: t.Type[CommChannelBase], -) -> InferenceRequest: - """Deserialize a message from a byte stream into an InferenceRequest - :param data_blob: The byte stream to deserialize - :param channel_type: The channel used to send the response""" - - request = MessageHandler.deserialize_request(data_blob) - model_key: t.Optional[str] = None - model_bytes: t.Optional[Model] = None - - if request.model.which() == "key": - model_key = request.model.key.key - elif request.model.which() == "data": - model_bytes = request.model.data - - callback_key = request.replyChannel.descriptor - - # todo: shouldn't this be `CommChannel.find` instead of `DragonCommChannel` - comm_channel = channel_type(callback_key) - - input_keys: t.Optional[t.List[str]] = None - input_bytes: t.Optional[t.List[bytes]] = None - - output_keys: t.Optional[t.List[str]] = None - - input_meta: t.Optional[t.List[TensorDescriptor]] = None - - if request.input.which() == "keys": - input_keys = [input_key.key for input_key in request.input.keys] - elif request.input.which() == "descriptors": - input_meta = request.input.descriptors # type: ignore - - if request.output: - output_keys = [tensor_key.key for tensor_key in request.output] - - inference_request = InferenceRequest( - model_key=model_key, - callback=comm_channel, - raw_inputs=input_bytes, - input_keys=input_keys, - input_meta=input_meta, - output_keys=output_keys, - raw_model=model_bytes, - batch_size=0, - ) - return inference_request - - class WorkerDevice: def __init__(self, name: str) -> None: """Wrapper around a device to keep track of loaded Models and availability @@ -150,7 +95,9 @@ def __exit__( class BatchQueue(Queue[InferenceRequest]): - def __init__(self, batch_timeout: float, batch_size: int, model_key: str) -> None: + def __init__( + self, batch_timeout: float, batch_size: int, model_key: FeatureStoreKey + ) -> None: """Queue used to store inference requests waiting to be batched and sent to Worker Managers. :param batch_timeout: Time in seconds that has to be waited before flushing a @@ -170,7 +117,7 @@ def __init__(self, batch_timeout: float, batch_size: int, model_key: str) -> Non self._disposable = False """Whether the queue will not be used again and can be deleted. 
A disposable queue is always full.""" - self._model_key = model_key + self._model_key: FeatureStoreKey = model_key """Key of the model which needs to be executed on the queued requets""" self._flush_lock = RLock() """Lock used to make sure only one process can flush the queue (unused now)""" @@ -207,7 +154,7 @@ def __exit__( self.release() @property - def model_key(self) -> str: + def model_key(self) -> FeatureStoreKey: """Key of the model which needs to be run on the queued requests""" return self._model_key @@ -235,7 +182,7 @@ def put( @property def _elapsed_time(self) -> float: - if self.empty(): + if self.empty() or self._first_put is None: return 0 return time.time() - self._first_put @@ -290,22 +237,21 @@ def __init__( batch_size: int, config_loader: EnvironmentConfigLoader, worker_type: t.Type[MachineLearningWorkerBase], - comm_channel_type: t.Type[CommChannelBase] = DragonCommChannel, ) -> None: """The RequestDispatcher intercepts inference requests, stages them in queues and batches them together before making them available to Worker Managers. - :param batch_timeout: Maximum elapsed time before flushing a complete or incomplete batch + :param batch_timeout: Maximum elapsed time before flushing a complete or + incomplete batch :param batch_size: Total capacity of each batch queue. :param mem_pool: Memory pool used to share batched input tensors with worker managers :param config_loader: Object to load configuration from environment :param worker_type: Type of worker to instantiate to batch inputs - :param comm_channel_type: Type of channel used to get requests :raises SmartSimError: If config_loaded.get_queue() does not return a channel """ super().__init__(as_service=True, cooldown=1) - self._queues: dict[str, list[BatchQueue]] = [] + self._queues: dict[str, list[BatchQueue]] = {} """Dict of all batch queues available for a given model key""" self._active_queues: dict[str, BatchQueue] = {} """Mapping telling which queue is the recipient of requests for a given model @@ -323,12 +269,15 @@ def __init__( """The channel the dispatcher monitors for new tasks""" self._outgoing_queue: DragonQueue = mp.Queue(maxsize=0) """The queue on which batched inference requests are placed""" - self._feature_store: t.Optional[FeatureStore] = ( - config_loader.get_feature_store() - ) - """A feature store to retrieve models from""" - self._comm_channel_type = comm_channel_type - """The type of the channel used to receive requests""" + self._feature_stores: t.Dict[str, FeatureStore] = {} + """A collection of attached feature stores""" + self._featurestore_factory = config_loader._featurestore_factory + """A factory method to create a desired feature store client type""" + self._backbone: t.Optional[FeatureStore] = config_loader.get_backbone() + """A standalone, system-created feature store used to share internal + information among MLI components""" + self._callback_factory = config_loader._callback_factory + """The type of communication channel to construct for callbacks""" self._worker = worker_type() """The worker used to batch inputs""" self._mem_pool = MemoryPool.attach(dragon_gs_pool.create(2 * 1024**3).sdesc) @@ -336,37 +285,91 @@ def __init__( self._perf_timer = PerfTimer(prefix="r_", debug=True, timing_on=True) """Performance timer""" - def _validate_request(self, request: InferenceRequest) -> bool: - """Ensure the request can be processed. 
+ def _check_feature_stores(self, request: InferenceRequest) -> bool: + """Ensures that all feature stores required by the request are available + :param request: The request to validate - :return: True if the request is valid, False otherwise""" - if not self._feature_store: - if request.model_key: - logger.error("Unable to load model by key without feature store") - return False - - if request.input_keys: - logger.error("Unable to load inputs by key without feature store") - return False - - if request.output_keys: - logger.error("Unable to persist outputs by key without feature store") - return False - - if not request.model_key and not request.raw_model: - logger.error("Unable to continue without model bytes or feature store key") - return False + :returns: False if feature store validation fails for the request, True + otherwise + """ + # collect all feature stores required by the request + fs_model: t.Set[str] = set() + if request.model_key: + fs_model = {request.model_key.descriptor} + fs_inputs = {key.descriptor for key in request.input_keys} + fs_outputs = {key.descriptor for key in request.output_keys} - if not request.input_keys and not request.raw_inputs: - logger.error("Unable to continue without input bytes or feature store keys") - return False + # identify which feature stores are requested and unknown + fs_desired = fs_model.union(fs_inputs).union(fs_outputs) + fs_actual = {item.descriptor for item in self._feature_stores.values()} + fs_missing = fs_desired - fs_actual - if request.callback is None: - logger.error("No callback channel provided in request") + if self._featurestore_factory is None: + logger.error("No feature store factory configured") return False + # create the feature stores we need to service request + if fs_missing: + logger.debug(f"Adding feature store(s): {fs_missing}") + for descriptor in fs_missing: + feature_store = self._featurestore_factory(descriptor) + self._feature_stores[descriptor] = feature_store + return True + # pylint: disable-next=no-self-use + def _check_model(self, request: InferenceRequest) -> bool: + """Ensure that a model is available for the request + + :param request: The request to validate + :returns: False if model validation fails for the request, True otherwise + """ + if request.model_key or request.raw_model: + return True + + logger.error("Unable to continue without model bytes or feature store key") + return False + + # pylint: disable-next=no-self-use + def _check_inputs(self, request: InferenceRequest) -> bool: + """Ensure that inputs are available for the request + + :param request: The request to validate + :returns: False if input validation fails for the request, True otherwise + """ + if request.input_keys or request.raw_inputs: + return True + + logger.error("Unable to continue without input bytes or feature store keys") + return False + + # pylint: disable-next=no-self-use + def _check_callback(self, request: InferenceRequest) -> bool: + """Ensure that a callback channel is available for the request + + :param request: The request to validate + :returns: False if callback validation fails for the request, True otherwise + """ + if request.callback is not None: + return True + + logger.error("No callback channel provided in request") + return False + + def _validate_request(self, request: InferenceRequest) -> bool: + """Ensure the request can be processed + + :param request: The request to validate + :return: False if the request fails any validation checks, True otherwise""" + checks = [ + 
self._check_feature_stores(request), + self._check_model(request), + self._check_inputs(request), + self._check_callback(request), + ] + + return all(checks) + def _on_start(self) -> None: self._queue_swap_lock = RLock() @@ -388,18 +391,25 @@ def _on_iteration(self) -> None: tensor_bytes_list = bytes_list[1:] self._perf_timer.start_timings() - request = deserialize_message(request_bytes, self._comm_channel_type) + request = self._worker.deserialize_message( + request_bytes, self._callback_factory + ) if request.input_meta and tensor_bytes_list: request.raw_inputs = tensor_bytes_list self._perf_timer.measure_time("deserialize_message") - if not self._validate_request(request): - return - self._perf_timer.measure_time("validate_request") - self.dispatch(request) - - self._perf_timer.measure_time("dispatch") + if not self._validate_request(request): + exception_handler( + ValueError("Error validating the request"), + request.callback, + "Error validating the request.", + ) + self._perf_timer.measure_time("validate_request") + else: + self._perf_timer.measure_time("validate_request") + self.dispatch(request) + self._perf_timer.measure_time("dispatch") finally: self.flush_requests() # TODO: implement this @@ -415,7 +425,7 @@ def task_queue(self) -> DragonQueue: """The queue on which batched requests are placed""" return self._outgoing_queue - def _swap_queue(self, model_key: str) -> None: + def _swap_queue(self, model_key: FeatureStoreKey) -> None: """Get an empty queue or create a new one and make it the active one for a given model. @@ -427,18 +437,17 @@ def _swap_queue(self, model_key: str) -> None: if self._queue_swap_lock is None: raise SmartSimError("Queues were not locked") with self._queue_swap_lock: - for queue_list in self._queues[model_key]: - for queue in queue_list: - if not queue.full(): - self._active_queues[model_key] = queue - return + for queue in self._queues[model_key.key]: + if not queue.full(): + self._active_queues[model_key.key] = queue + return new_queue = BatchQueue(self._batch_timeout, self._batch_size, model_key) if model_key in self._queues: - self._queues[model_key].append(new_queue) + self._queues[model_key.key].append(new_queue) else: - self._queues[model_key] = [new_queue] - self._active_queues[model_key] = new_queue + self._queues[model_key.key] = [new_queue] + self._active_queues[model_key.key] = new_queue return def dispatch(self, request: InferenceRequest) -> None: @@ -449,7 +458,9 @@ def dispatch(self, request: InferenceRequest) -> None: logger.info("Direct inference requested, creating tmp queue") tmp_id = f"_tmp_{str(uuid.uuid4())}" tmp_queue: BatchQueue = BatchQueue( - batch_timeout=0, batch_size=1, model_key=tmp_id + batch_timeout=0, + batch_size=1, + model_key=FeatureStoreKey(key=tmp_id, descriptor="TMP"), ) self._active_queues[tmp_id] = tmp_queue tmp_queue.put_nowait(request) @@ -460,7 +471,7 @@ def dispatch(self, request: InferenceRequest) -> None: success = False while not success: try: - self._active_queues[request.model_key].put_nowait(request) + self._active_queues[request.model_key.key].put_nowait(request) success = True except (Full, KeyError): self._swap_queue(request.model_key) @@ -469,20 +480,22 @@ def flush_requests(self) -> None: """Get all requests from queues which are ready to be flushed. Place all avaliable request batches in the outgoing queue. 
""" - for queue_list in self._queues: + for queue_list in self._queues.values(): for queue in queue_list: if queue.ready and queue.acquire(blocking=False): self._perf_timer.measure_time("find_queue") try: batch = RequestBatch( - model_key=queue.model_key, requests=queue.flush(), inputs=None + requests=queue.flush(), + inputs=None, + model_key=queue.model_key, ) finally: self._perf_timer.measure_time("flush_requests") queue.release() try: fetch_results = self._worker.fetch_inputs( - batch=batch, feature_store=self._feature_store + batch=batch, feature_stores=self._feature_stores ) except Exception as exc: exception_handler( @@ -490,10 +503,13 @@ def flush_requests(self) -> None: None, "Error fetching input.", ) + continue self._perf_timer.measure_time("fetch_input") try: transformed_inputs = self._worker.transform_input( - batch=batch, fetch_results=fetch_results, mem_pool=self._mem_pool + batch=batch, + fetch_results=fetch_results, + mem_pool=self._mem_pool, ) except Exception as exc: exception_handler( @@ -501,6 +517,7 @@ def flush_requests(self) -> None: None, "Error Transforming input.", ) + continue self._perf_timer.measure_time("transform_input") batch.inputs = transformed_inputs diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py index fe0312e7ae..2459747ec0 100644 --- a/smartsim/_core/mli/infrastructure/control/workermanager.py +++ b/smartsim/_core/mli/infrastructure/control/workermanager.py @@ -27,33 +27,30 @@ # pylint: disable=import-error # pylint: disable-next=unused-import import dragon + # pylint: enable=import-error # isort: off # isort: on -from queue import Empty - import multiprocessing as mp import time import typing as t +from queue import Empty from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStore from .....log import get_logger from ....entrypoints.service import Service from ....utils.timings import PerfTimer -from ...comm.channel.channel import CommChannelBase -from ...comm.channel.dragonchannel import DragonCommChannel from ...infrastructure.environmentloader import EnvironmentConfigLoader from ...infrastructure.worker.worker import ( - RequestBatch, InferenceReply, LoadModelResult, MachineLearningWorkerBase, + RequestBatch, ) from ...message_handler import MessageHandler -from ...mli_schemas.response.response_capnp import ResponseBuilder from .commons import build_failure_reply, exception_handler from .devicemanager import DeviceManager, WorkerDevice @@ -63,36 +60,6 @@ logger = get_logger(__name__) - -def prepare_outputs(reply: InferenceReply) -> t.List[t.Any]: - prepared_outputs: t.List[t.Any] = [] - if reply.output_keys: - for key in reply.output_keys: - if not key: - continue - msg_key = MessageHandler.build_tensor_key(key) - prepared_outputs.append(msg_key) - elif reply.outputs: - for _ in reply.outputs: - msg_tensor_desc = MessageHandler.build_tensor_descriptor( - "c", - "float32", - [1], - ) - prepared_outputs.append(msg_tensor_desc) - return prepared_outputs - - -def build_reply(reply: InferenceReply) -> ResponseBuilder: - results = prepare_outputs(reply) - - return MessageHandler.build_response( - status=reply.status_enum, - message=reply.message, - result=results, - custom_attributes=None, - ) - class WorkerManager(Service): """An implementation of a service managing distribution of tasks to machine learning workers""" @@ -101,7 +68,7 @@ def __init__( self, config_loader: EnvironmentConfigLoader, worker_type: t.Type[MachineLearningWorkerBase], - 
dispatcher_queue: "mp.Queue[InferenceBatch]", + dispatcher_queue: "mp.Queue[RequestBatch]", as_service: bool = False, cooldown: int = 0, device: t.Literal["cpu", "gpu"] = "cpu", @@ -123,10 +90,6 @@ def __init__( self._dispatcher_queue = dispatcher_queue """The dispatcher queue the manager monitors for new tasks""" - self._feature_store: t.Optional[FeatureStore] = ( - config_loader.get_feature_store() - ) - """A feature store to retrieve models from""" self._worker = worker_type() """The ML Worker implementation""" self._callback_factory = config_loader._callback_factory @@ -148,87 +111,8 @@ def __init__( self._perf_timer = PerfTimer(prefix="w_", debug=True, timing_on=True) """Performance timer""" - def _check_feature_stores(self, request: InferenceRequest) -> bool: - """Ensures that all feature stores required by the request are available - - :param request: The request to validate - :returns: False if feature store validation fails for the request, True otherwise - """ - # collect all feature stores required by the request - fs_model: t.Set[str] = set() - if request.model_key: - fs_model = {request.model_key.descriptor} - fs_inputs = {key.descriptor for key in request.input_keys} - fs_outputs = {key.descriptor for key in request.output_keys} - - # identify which feature stores are requested and unknown - fs_desired = fs_model.union(fs_inputs).union(fs_outputs) - fs_actual = {item.descriptor for item in self._feature_stores.values()} - fs_missing = fs_desired - fs_actual - - if self._featurestore_factory is None: - logger.error("No feature store factory configured") - return False - - # create the feature stores we need to service request - if fs_missing: - logger.debug(f"Adding feature store(s): {fs_missing}") - for descriptor in fs_missing: - feature_store = self._featurestore_factory(descriptor) - self._feature_stores[descriptor] = feature_store - - return True - - def _check_model(self, request: InferenceRequest) -> bool: - """Ensure that a model is available for the request - - :param request: The request to validate - :returns: False if model validation fails for the request, True otherwise - """ - if request.model_key or request.raw_model: - return True - - logger.error("Unable to continue without model bytes or feature store key") - return False - - def _check_inputs(self, request: InferenceRequest) -> bool: - """Ensure that inputs are available for the request - - :param request: The request to validate - :returns: False if input validation fails for the request, True otherwise - """ - if request.input_keys or request.raw_inputs: - return True - - logger.error("Unable to continue without input bytes or feature store keys") - return False - - def _check_callback(self, request: InferenceRequest) -> bool: - """Ensure that a callback channel is available for the request - - :param request: The request to validate - :returns: False if callback validation fails for the request, True otherwise - """ - if request.callback is not None: - return True - - logger.error("No callback channel provided in request") - return False - - def _validate_request(self, request: InferenceRequest) -> bool: - """Ensure the request can be processed - - :param request: The request to validate - :return: False if the request fails any validation checks, True otherwise""" - checks = [ - self._check_feature_stores(request), - self._check_model(request), - self._check_inputs(request), - self._check_callback(request), - ] - - return all(checks) - + # remove this when we are done with time measurements + # pylint: 
disable-next=too-many-statements def _on_iteration(self) -> None: """Executes calls to the machine learning worker implementation to complete @@ -258,12 +142,12 @@ def _on_iteration(self) -> None: self._device_manager.get_device( worker=self._worker, batch=batch, - feature_store=self._feature_store, + feature_stores=self._feature_stores, ) ) self._perf_timer.measure_time("fetch_model") - model_result = LoadModelResult(device.get_model(batch.model_key)) + model_result = LoadModelResult(device.get_model(batch.model_key.key)) self._perf_timer.measure_time("load_model") if batch.inputs is None: @@ -302,7 +186,7 @@ def _on_iteration(self) -> None: reply.output_keys = self._worker.place_output( request, transformed_output, - self._feature_store, + self._feature_stores, ) except Exception as e: exception_handler( @@ -313,12 +197,19 @@ def _on_iteration(self) -> None: reply.outputs = transformed_output.outputs self._perf_timer.measure_time("assign_output") - if reply.outputs is None: + if reply.outputs is None or not reply.outputs: response = build_failure_reply("fail", "Outputs not found.") else: reply.status_enum = "complete" reply.message = "Success" - response = build_reply(reply) + + results = self._worker.prepare_outputs(reply) + response = MessageHandler.build_response( + status=reply.status_enum, + message=reply.message, + result=results, + custom_attributes=None, + ) self._perf_timer.measure_time("build_reply") diff --git a/smartsim/_core/mli/infrastructure/worker/torch_worker.py b/smartsim/_core/mli/infrastructure/worker/torch_worker.py index 392e7e051e..7ea09b9af9 100644 --- a/smartsim/_core/mli/infrastructure/worker/torch_worker.py +++ b/smartsim/_core/mli/infrastructure/worker/torch_worker.py @@ -39,9 +39,9 @@ ExecuteResult, FetchInputResult, FetchModelResult, - RequestBatch, LoadModelResult, MachineLearningWorkerBase, + RequestBatch, TransformInputResult, TransformOutputResult, ) @@ -164,8 +164,12 @@ def execute( with torch.no_grad(): model.eval() results = [ - model(*[tensor.to(device, non_blocking=True).detach() - for tensor in tensors]) + model( + *[ + tensor.to(device, non_blocking=True).detach() + for tensor in tensors + ] + ) ] transform_result.transformed = [] diff --git a/smartsim/_core/mli/infrastructure/worker/worker.py b/smartsim/_core/mli/infrastructure/worker/worker.py index 6e5aafca3d..374f35b594 100644 --- a/smartsim/_core/mli/infrastructure/worker/worker.py +++ b/smartsim/_core/mli/infrastructure/worker/worker.py @@ -171,18 +171,18 @@ def __init__(self, result: bytes) -> None: @dataclass class RequestBatch: - """A batch of aggregated inference requests - """ - model_key: str + """A batch of aggregated inference requests""" + requests: list[InferenceRequest] inputs: t.Optional[TransformInputResult] + model_key: FeatureStoreKey @property def has_valid_requests(self) -> bool: return len(self.requests) > 0 @property - def has_raw_nodel(self) -> bool: + def has_raw_model(self) -> bool: return self.raw_model is not None @property @@ -191,6 +191,7 @@ def raw_model(self) -> t.Optional[t.Any]: return self.requests[0].raw_model return None + class MachineLearningWorkerCore: """Basic functionality of ML worker that is shared across all worker types""" @@ -279,27 +280,26 @@ def fetch_model( :return: Raw bytes of the model""" # All requests in the same batch share the model - sample_request = batch.requests[0] - if sample_request.raw_model: - return FetchModelResult(sample_request.raw_model.data) + if batch.raw_model: + return FetchModelResult(batch.raw_model.data) if not 
feature_stores: raise ValueError("Feature store is required for model retrieval") - if not sample_request.model_key: + if batch.model_key is None: raise SmartSimError( "Key must be provided to retrieve model from feature store" ) - key, fsd = request.model_key.key, request.model_key.descriptor + key, fsd = batch.model_key.key, batch.model_key.descriptor try: feature_store = feature_stores[fsd] - raw_bytes: bytes = t.cast(bytes, feature_store[sample_key]) + raw_bytes: bytes = t.cast(bytes, feature_store[key]) return FetchModelResult(raw_bytes) except FileNotFoundError as ex: logger.exception(ex) - raise SmartSimError(f"Model could not be retrieved with key {sample_key}") from ex + raise SmartSimError(f"Model could not be retrieved with key {key}") from ex @staticmethod def fetch_inputs( @@ -321,22 +321,23 @@ def fetch_inputs( if not feature_stores: raise ValueError("No input and no feature store provided") - if request.input_keys: - data: t.List[bytes] = [] - - for fs_key in request.input_keys: - try: - feature_store = feature_stores[fs_key.descriptor] - tensor_bytes = t.cast(bytes, feature_store[fs_key.key]) - data.append(tensor_bytes) - except KeyError as ex: - logger.exception(ex) - raise SmartSimError( - f"Model could not be retrieved with key {fs_key.key}" - ) from ex - return FetchInputResult( - data, meta=None - ) # fixme: need to get both tensor and descriptor + if request.input_keys: + data: t.List[bytes] = [] + + for fs_key in request.input_keys: + try: + feature_store = feature_stores[fs_key.descriptor] + tensor_bytes = t.cast(bytes, feature_store[fs_key.key]) + data.append(tensor_bytes) + except KeyError as ex: + logger.exception(ex) + raise SmartSimError( + f"Model could not be retrieved with key {fs_key.key}" + ) from ex + fetch_results.append( + FetchInputResult(data, meta=None) + ) # fixme: need to get both tensor and descriptor + continue raise ValueError("No input source") From e4a9db0f498f417db106b621aced006b1702f7f8 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Wed, 14 Aug 2024 16:55:08 -0500 Subject: [PATCH 57/84] Working version, still slow --- ex/high_throughput_inference/mli_driver.py | 2 +- ex/high_throughput_inference/mock_app.py | 3 +- .../standalone_workermanager.py | 22 ++--- .../infrastructure/control/devicemanager.py | 4 + .../control/requestdispatcher.py | 18 ++-- .../infrastructure/control/workermanager.py | 92 ++++++++++++++++--- .../_core/mli/infrastructure/worker/worker.py | 16 ++++ smartsim/_core/utils/timings.py | 4 + 8 files changed, 123 insertions(+), 38 deletions(-) diff --git a/ex/high_throughput_inference/mli_driver.py b/ex/high_throughput_inference/mli_driver.py index 8d31d7610f..8f25540078 100644 --- a/ex/high_throughput_inference/mli_driver.py +++ b/ex/high_throughput_inference/mli_driver.py @@ -16,7 +16,7 @@ filedir = os.path.dirname(__file__) worker_manager_script_name = os.path.join(filedir, "standalone_workermanager.py") app_script_name = os.path.join(filedir, "mock_app.py") -model_name = os.path.join(filedir, f"resnet50.{DEVICE.upper()}.pt") +model_name = os.path.join(filedir, f"resnet50.{DEVICE}.pt") transport: t.Literal["hsta", "tcp"] = "hsta" diff --git a/ex/high_throughput_inference/mock_app.py b/ex/high_throughput_inference/mock_app.py index 4033ad960b..77daafd5c5 100644 --- a/ex/high_throughput_inference/mock_app.py +++ b/ex/high_throughput_inference/mock_app.py @@ -154,7 +154,6 @@ def model(self): def name(self): return self._name - if __name__ == "__main__": parser = argparse.ArgumentParser("Mock application") @@ -162,7 +161,7 @@ 
def name(self): parser.add_argument("--log_max_batchsize", default=8, type=int) args = parser.parse_args() - resnet = ResNetWrapper("resnet50", f"resnet50.{args.device.upper()}.pt") + resnet = ResNetWrapper("resnet50", f"resnet50.{args.device}.pt") client = ProtoClient(timing_on=True) client.set_model(resnet.name, resnet.model) diff --git a/ex/high_throughput_inference/standalone_workermanager.py b/ex/high_throughput_inference/standalone_workermanager.py index b30945fef3..952cf2dc5c 100644 --- a/ex/high_throughput_inference/standalone_workermanager.py +++ b/ex/high_throughput_inference/standalone_workermanager.py @@ -81,18 +81,10 @@ pid = os.getpid() affinity = os.sched_getaffinity(pid) -logger.log(f"Entry point: {socket.gethostname()}, {affinity}") -logger.log(f"CPUS: {os.cpu_count()}") +logger.info(f"Entry point: {socket.gethostname()}, {affinity}") +logger.info(f"CPUS: {os.cpu_count()}") -def create_worker_manager( - worker_type: t.Type[MachineLearningWorkerBase], - config_loader: EnvironmentConfigLoader, - device: str, - dispatcher_queue: mp.Queue, -) -> WorkerManager: - return - def service_as_dragon_proc( service: Service, cpu_affinity: list[int], gpu_affinity: list[int] @@ -102,7 +94,6 @@ def service_as_dragon_proc( local_policy = dragon_policy.Policy( placement=dragon_policy.Policy.Placement.HOST_NAME, host_name=socket.gethostname(), - affinity=dragon_policy.Policy.Affinity.SPECIFIC, cpu_affinity=cpu_affinity, gpu_affinity=gpu_affinity, ) @@ -179,8 +170,7 @@ def service_as_dragon_proc( dispatcher = RequestDispatcher( batch_timeout=args.batch_timeout, batch_size=args.batch_size, - config_loader=ss_config_loader, - comm_channel_type=DragonCommChannel, + config_loader=config_loader, worker_type=arg_worker_type, ) @@ -189,13 +179,12 @@ def service_as_dragon_proc( for wm_idx in range(args.num_workers): worker_manager = WorkerManager( - config_loader=ss_config_loader, + config_loader=config_loader, worker_type=arg_worker_type, as_service=True, cooldown=10, - comm_channel_type=DragonCommChannel, device=worker_device, - task_queue=dispatcher.task_queue, + dispatcher_queue=dispatcher.task_queue, ) wms.append(worker_manager) @@ -226,6 +215,7 @@ def service_as_dragon_proc( # TODO: use ProcessGroup and restart=True? 
all_procs = [dispatcher_proc, *worker_manager_procs] + print(f"Dispatcher proc: {dispatcher_proc}") for proc in all_procs: proc.start() diff --git a/smartsim/_core/mli/infrastructure/control/devicemanager.py b/smartsim/_core/mli/infrastructure/control/devicemanager.py index c3dfcc0261..382eca6b13 100644 --- a/smartsim/_core/mli/infrastructure/control/devicemanager.py +++ b/smartsim/_core/mli/infrastructure/control/devicemanager.py @@ -29,6 +29,9 @@ from ...infrastructure.storage.featurestore import FeatureStore from ..worker.worker import MachineLearningWorkerBase from .requestdispatcher import RequestBatch +from .....log import get_logger + +logger = get_logger(__name__) class WorkerDevice: @@ -83,6 +86,7 @@ def _load_model_on_device( batch: RequestBatch, feature_stores: dict[str, FeatureStore], ) -> None: + model_bytes = worker.fetch_model(batch, feature_stores) loaded_model = worker.load_model(batch, model_bytes, self._device.name) self._device.add_model(batch.model_key.key, loaded_model.model) diff --git a/smartsim/_core/mli/infrastructure/control/requestdispatcher.py b/smartsim/_core/mli/infrastructure/control/requestdispatcher.py index 3c1105b501..151c04496d 100644 --- a/smartsim/_core/mli/infrastructure/control/requestdispatcher.py +++ b/smartsim/_core/mli/infrastructure/control/requestdispatcher.py @@ -282,7 +282,7 @@ def __init__( """The worker used to batch inputs""" self._mem_pool = MemoryPool.attach(dragon_gs_pool.create(2 * 1024**3).sdesc) """Memory pool used to share batched input tensors with the Worker Managers""" - self._perf_timer = PerfTimer(prefix="r_", debug=True, timing_on=True) + self._perf_timer = PerfTimer(prefix="r_", debug=False, timing_on=True) """Performance timer""" def _check_feature_stores(self, request: InferenceRequest) -> bool: @@ -376,9 +376,10 @@ def _on_start(self) -> None: def _on_iteration(self) -> None: try: + self._perf_timer.set_active(True) bytes_list: t.List[bytes] = self._incoming_channel.recv() except Exception: - self._perf_timer.start_timings() + self._perf_timer.set_active(False) else: if not bytes_list: exception_handler( @@ -437,13 +438,14 @@ def _swap_queue(self, model_key: FeatureStoreKey) -> None: if self._queue_swap_lock is None: raise SmartSimError("Queues were not locked") with self._queue_swap_lock: - for queue in self._queues[model_key.key]: - if not queue.full(): - self._active_queues[model_key.key] = queue - return + if model_key.key in self._queues: + for queue in self._queues[model_key.key]: + if not queue.full(): + self._active_queues[model_key.key] = queue + return new_queue = BatchQueue(self._batch_timeout, self._batch_size, model_key) - if model_key in self._queues: + if model_key.key in self._queues: self._queues[model_key.key].append(new_queue) else: self._queues[model_key.key] = [new_queue] @@ -455,7 +457,7 @@ def dispatch(self, request: InferenceRequest) -> None: :param request: the request to place """ if request.raw_model is not None: - logger.info("Direct inference requested, creating tmp queue") + logger.debug("Direct inference requested, creating tmp queue") tmp_id = f"_tmp_{str(uuid.uuid4())}" tmp_queue: BatchQueue = BatchQueue( batch_timeout=0, diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py index 2459747ec0..fa508b3230 100644 --- a/smartsim/_core/mli/infrastructure/control/workermanager.py +++ b/smartsim/_core/mli/infrastructure/control/workermanager.py @@ -105,12 +105,56 @@ def __init__( self._backbone: 
t.Optional[FeatureStore] = config_loader.get_backbone() """A standalone, system-created feature store used to share internal information among MLI components""" - self._device_manager: t.Optional[DeviceManager] = None """Object responsible for model caching and device access""" - self._perf_timer = PerfTimer(prefix="w_", debug=True, timing_on=True) + self._perf_timer = PerfTimer(prefix="w_", debug=False, timing_on=True) """Performance timer""" + def _on_start(self) -> None: + self._device_manager = DeviceManager(WorkerDevice(self._device)) + + def _check_feature_stores(self, batch: RequestBatch) -> bool: + """Ensures that all feature stores required by the request are available + + :param batch: The batch of requests to validate + :returns: False if feature store validation fails for the batch, True otherwise + """ + # collect all feature stores required by the request + fs_model: t.Set[str] = set() + if batch.model_key: + fs_model = {batch.model_key.descriptor} + fs_inputs = {key.descriptor for key in batch.input_keys} + fs_outputs = {key.descriptor for key in batch.output_keys} + + # identify which feature stores are requested and unknown + fs_desired = fs_model.union(fs_inputs).union(fs_outputs) + fs_actual = {item.descriptor for item in self._feature_stores.values()} + fs_missing = fs_desired - fs_actual + + if self._featurestore_factory is None: + logger.error("No feature store factory configured") + return False + + # create the feature stores we need to service request + if fs_missing: + logger.debug(f"Adding feature store(s): {fs_missing}") + for descriptor in fs_missing: + feature_store = self._featurestore_factory(descriptor) + self._feature_stores[descriptor] = feature_store + + return True + + def _validate_batch(self, batch: RequestBatch) -> bool: + """Ensure the request can be processed + + :param batch: The batch of requests to validate + :return: False if the request fails any validation checks, True otherwise""" + + if batch is None or len(batch.requests)==0: + return False + + return self._check_feature_stores(batch) + # remove this when we are done with time measurements # pylint: disable-next=too-many-statements def _on_iteration(self) -> None: @@ -128,7 +172,7 @@ def _on_iteration(self) -> None: "flush_requests", time.perf_counter() - pre_batch_time ) - if batch is None or 0 == len(batch.requests): + if not self._validate_batch(batch): exception_handler( ValueError("An empty batch was received"), None, @@ -136,18 +180,44 @@ def _on_iteration(self) -> None: ) return + if self._device_manager is None: - raise ValueError("No Device Manager available: did you call _on_start()") - device: WorkerDevice = next( - self._device_manager.get_device( - worker=self._worker, - batch=batch, - feature_stores=self._feature_stores, + for request in batch.requests: + exception_handler( + ValueError("No Device Manager available: did you call _on_start()"), + request.callback, + "Error acquiring device manager" + ) + return + + try: + device: WorkerDevice = next( + self._device_manager.get_device( + worker=self._worker, + batch=batch, + feature_stores=self._feature_stores, + ) ) - ) + except Exception as exc: + for request in batch.requests: + exception_handler( + exc, + request.callback, + "Error loading model on device or getting device" + ) + return self._perf_timer.measure_time("fetch_model") - model_result = LoadModelResult(device.get_model(batch.model_key.key)) + try: + model_result = LoadModelResult(device.get_model(batch.model_key.key)) + except Exception as exc: + for request in 
batch.requests: + exception_handler( + exc, + request.callback, + "Error getting model from device" + ) + return self._perf_timer.measure_time("load_model") if batch.inputs is None: diff --git a/smartsim/_core/mli/infrastructure/worker/worker.py b/smartsim/_core/mli/infrastructure/worker/worker.py index 374f35b594..b3d47b13c7 100644 --- a/smartsim/_core/mli/infrastructure/worker/worker.py +++ b/smartsim/_core/mli/infrastructure/worker/worker.py @@ -191,6 +191,22 @@ def raw_model(self) -> t.Optional[t.Any]: return self.requests[0].raw_model return None + @property + def input_keys(self) -> t.List[FeatureStoreKey]: + keys = [] + for request in self.requests: + keys.extend(request.input_keys) + + return keys + + @property + def output_keys(self) -> t.List[FeatureStoreKey]: + keys = [] + for request in self.requests: + keys.extend(request.output_keys) + + return keys + class MachineLearningWorkerCore: """Basic functionality of ML worker that is shared across all worker types""" diff --git a/smartsim/_core/utils/timings.py b/smartsim/_core/utils/timings.py index 286bd4f4a8..79e51e4f8c 100644 --- a/smartsim/_core/utils/timings.py +++ b/smartsim/_core/utils/timings.py @@ -132,3 +132,7 @@ def print_timings(self, to_file: bool = False) -> None: print(" ".join(self._format_number(value) for value in value_array[i])) if to_file: np.save(self._prefix + self._filename + ".npy", value_array) + + def set_active(self, active: bool = True) -> None: + """Set whether the timer will record time""" + self._timing_on = active From 0c0637c657d6d158878ed34445296df43d398fde Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Wed, 14 Aug 2024 17:39:40 -0500 Subject: [PATCH 58/84] Last fixes --- .../_core/mli/infrastructure/control/requestdispatcher.py | 4 +--- smartsim/_core/utils/timings.py | 4 ++++ 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/smartsim/_core/mli/infrastructure/control/requestdispatcher.py b/smartsim/_core/mli/infrastructure/control/requestdispatcher.py index 151c04496d..056dc73f0e 100644 --- a/smartsim/_core/mli/infrastructure/control/requestdispatcher.py +++ b/smartsim/_core/mli/infrastructure/control/requestdispatcher.py @@ -374,7 +374,6 @@ def _on_start(self) -> None: self._queue_swap_lock = RLock() def _on_iteration(self) -> None: - try: self._perf_timer.set_active(True) bytes_list: t.List[bytes] = self._incoming_channel.recv() @@ -418,7 +417,7 @@ def _on_iteration(self) -> None: self._perf_timer.end_timings() - if self._perf_timer.max_length == 801: + if self._perf_timer.max_length == 801 and self._perf_timer.is_active: self._perf_timer.print_timings(True) @property @@ -430,7 +429,6 @@ def _swap_queue(self, model_key: FeatureStoreKey) -> None: """Get an empty queue or create a new one and make it the active one for a given model. - :param model_key: The key of the model for which the queue has to be swapped :raises SmartSimError: If the queue is not locked. 
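For context on the timing changes in this patch and the `is_active` helper added to `timings.py` below: the dispatcher now flips the timer on only when a message was actually received, so idle polls do not pollute the statistics. One caveat worth noting: because `is_active` is defined as a regular method, the truthiness check `self._perf_timer.is_active` in the dispatcher above is always true unless it is called (or the method is made a property). The snippet below is an illustrative stand-in only, not the real `PerfTimer`; the names `GatedTimer` and `poll_once` are invented for the example, and it uses a property to avoid that pitfall.

import time
import typing as t


class GatedTimer:
    """Minimal stand-in for a perf timer that can be switched off.

    When inactive, every call is a no-op, so an idle polling loop
    records nothing.
    """

    def __init__(self) -> None:
        self._active = True
        self._timings: dict[str, list[float]] = {}
        self._last = 0.0

    def set_active(self, active: bool = True) -> None:
        """Enable or disable recording."""
        self._active = active

    @property
    def is_active(self) -> bool:
        """True if the timer is currently recording."""
        return self._active

    def start_timings(self) -> None:
        if self._active:
            self._last = time.perf_counter()

    def measure_time(self, label: str) -> None:
        """Record the elapsed time since the previous measurement."""
        if not self._active:
            return
        now = time.perf_counter()
        self._timings.setdefault(label, []).append(now - self._last)
        self._last = now


def poll_once(channel: t.Any, timer: GatedTimer) -> None:
    """Illustrative polling step mirroring the dispatcher's _on_iteration."""
    try:
        timer.set_active(True)
        payload = channel.recv()  # hypothetical channel with a recv() method
    except Exception:
        # nothing arrived: disable the timer so this iteration is not recorded
        timer.set_active(False)
    else:
        timer.start_timings()
        # ... deserialize, validate, dispatch the payload ...
        timer.measure_time("dispatch")
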
diff --git a/smartsim/_core/utils/timings.py b/smartsim/_core/utils/timings.py index 79e51e4f8c..2bf266e5a9 100644 --- a/smartsim/_core/utils/timings.py +++ b/smartsim/_core/utils/timings.py @@ -136,3 +136,7 @@ def print_timings(self, to_file: bool = False) -> None: def set_active(self, active: bool = True) -> None: """Set whether the timer will record time""" self._timing_on = active + + def is_active(self) -> bool: + """Returns true if the timer will record time""" + return self._timing_on From 7dbeded8663def673087c1f7ec3a07c58bf6e734 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Sat, 17 Aug 2024 10:25:32 -0500 Subject: [PATCH 59/84] Fixing tests --- .../infrastructure/control/devicemanager.py | 6 +- .../control/requestdispatcher.py | 22 +++++- .../infrastructure/control/workermanager.py | 10 +-- tests/dragon/test_error_handling.py | 67 ++++++++++++++----- 4 files changed, 77 insertions(+), 28 deletions(-) diff --git a/smartsim/_core/mli/infrastructure/control/devicemanager.py b/smartsim/_core/mli/infrastructure/control/devicemanager.py index 382eca6b13..a42efa1d9d 100644 --- a/smartsim/_core/mli/infrastructure/control/devicemanager.py +++ b/smartsim/_core/mli/infrastructure/control/devicemanager.py @@ -113,8 +113,10 @@ def get_device( # Load model if not already loaded, or # because it is sent with the request if model_in_request or not batch.model_key.key in self._device: - self._load_model_on_device(worker, batch, feature_stores) - + try: + self._load_model_on_device(worker, batch, feature_stores) + except Exception as exc: + raise exc try: yield self._device finally: diff --git a/smartsim/_core/mli/infrastructure/control/requestdispatcher.py b/smartsim/_core/mli/infrastructure/control/requestdispatcher.py index 056dc73f0e..c59951204f 100644 --- a/smartsim/_core/mli/infrastructure/control/requestdispatcher.py +++ b/smartsim/_core/mli/infrastructure/control/requestdispatcher.py @@ -63,6 +63,8 @@ logger = get_logger("Request Dispatcher") +# Placeholder +ModelIdentifier = FeatureStoreKey class WorkerDevice: def __init__(self, name: str) -> None: @@ -77,12 +79,23 @@ def __init__(self, name: str) -> None: """Lock to ensure only one thread at the time accesses this device""" def acquire(self, blocking: bool = True, timeout: float = -1) -> t.Optional[bool]: + """Acquire and lock this device to prevent other threads + + from acquiring it concurrently. + :param blocking: If set to True, the call will block + for the time specified by ``timeout`` until the lock + can be acquired + :param timeout: Time (in seconds) to wait to acquire lock. + Ignored if ``blocking`` is set to False. + """ return self._lock.acquire(blocking=blocking, timeout=timeout) def release(self) -> None: + """Release device to allow other threads to acquire it""" self._lock.release() def __enter__(self) -> None: + """Locked context creator for this device""" self.acquire() def __exit__( @@ -91,12 +104,13 @@ def __exit__( exc_val: t.Optional[BaseException], exc_tb: t.Optional[TracebackType], ) -> None: + """Locked context destructor for this device""" self.release() class BatchQueue(Queue[InferenceRequest]): def __init__( - self, batch_timeout: float, batch_size: int, model_key: FeatureStoreKey + self, batch_timeout: float, batch_size: int, model_key: ModelIdentifier ) -> None: """Queue used to store inference requests waiting to be batched and sent to Worker Managers. 
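As background for the `ModelIdentifier`-keyed queues in this file: the dispatcher keeps one active `BatchQueue` per model and swaps in a fresh queue whenever the active one fills up, which is why it tracks both `_queues` (all queues per key) and `_active_queues` (the current recipient per key). The snippet below is a simplified, self-contained sketch of that pattern, not the production code; a plain `queue.Queue` stands in for `BatchQueue` and a plain string stands in for the model key.

import queue
import typing as t

BATCH_SIZE = 8


class PerModelQueues:
    """Simplified sketch of the active-queue-per-model pattern used by
    RequestDispatcher: requests for the same model are grouped, and a
    full queue is replaced by a fresh one instead of blocking."""

    def __init__(self) -> None:
        self._queues: dict[str, list[queue.Queue]] = {}
        self._active: dict[str, queue.Queue] = {}

    def _swap_queue(self, model_key: str) -> None:
        # reuse any existing queue for this model that still has room
        for candidate in self._queues.get(model_key, []):
            if not candidate.full():
                self._active[model_key] = candidate
                return
        # otherwise create a new queue and make it the active one
        new_queue: queue.Queue = queue.Queue(maxsize=BATCH_SIZE)
        self._queues.setdefault(model_key, []).append(new_queue)
        self._active[model_key] = new_queue

    def dispatch(self, model_key: str, request: t.Any) -> None:
        """Place a request on the active queue for its model, swapping
        queues when none exists yet or the current one is full."""
        while True:
            try:
                self._active[model_key].put_nowait(request)
                return
            except (queue.Full, KeyError):
                # no active queue yet, or it filled up: swap and retry
                self._swap_queue(model_key)


# usage: requests for the same model end up on the same (non-full) queue
qs = PerModelQueues()
qs.dispatch("resnet50", {"inputs": b"..."})
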
@@ -154,7 +168,7 @@ def __exit__( self.release() @property - def model_key(self) -> FeatureStoreKey: + def model_key(self) -> ModelIdentifier: """Key of the model which needs to be run on the queued requests""" return self._model_key @@ -168,6 +182,7 @@ def put( :param item: The request :param block: Whether to block when trying to put the item :param timeout: Time (in seconds) to wait if block==True + :raises Full: If an item cannot be put on the queue """ if not self.acquire(blocking=False): raise Full @@ -182,6 +197,7 @@ def put( @property def _elapsed_time(self) -> float: + """Time elapsed since the first item was put on this queue""" if self.empty() or self._first_put is None: return 0 return time.time() - self._first_put @@ -199,7 +215,7 @@ def make_disposable(self) -> None: @property def can_be_removed(self) -> bool: - """Whether this queue can be deleted and garbafe collected""" + """Whether this queue can be deleted and garbage collected""" return self.empty() and self._disposable def flush(self) -> list[t.Any]: diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py index fa508b3230..80145fb8e1 100644 --- a/smartsim/_core/mli/infrastructure/control/workermanager.py +++ b/smartsim/_core/mli/infrastructure/control/workermanager.py @@ -107,7 +107,7 @@ def __init__( information among MLI components""" self._device_manager: t.Optional[DeviceManager] = None """Object responsible for model caching and device access""" - self._perf_timer = PerfTimer(prefix="w_", debug=False, timing_on=True) + self._perf_timer = PerfTimer(prefix="w_", debug=True, timing_on=True) """Performance timer""" def _on_start(self) -> None: @@ -166,6 +166,7 @@ def _on_iteration(self) -> None: try: batch: RequestBatch = self._dispatcher_queue.get(timeout=0.0001) except Empty: + logger.info("Empty queue") return self._perf_timer.start_timings( @@ -174,17 +175,16 @@ def _on_iteration(self) -> None: if not self._validate_batch(batch): exception_handler( - ValueError("An empty batch was received"), + ValueError("An invalid batch was received"), None, - "Error batching inputs, the batch was empty.", + "Error batching inputs, the batch was invalid.", ) return - if self._device_manager is None: for request in batch.requests: exception_handler( - ValueError("No Device Manager available: did you call _on_start()"), + ValueError("No Device Manager available: did you call _on_start()?"), request.callback, "Error acquiring device manager" ) diff --git a/tests/dragon/test_error_handling.py b/tests/dragon/test_error_handling.py index bacffba933..345ebba5e9 100644 --- a/tests/dragon/test_error_handling.py +++ b/tests/dragon/test_error_handling.py @@ -40,17 +40,22 @@ WorkerManager, exception_handler, ) +from smartsim._core.mli.infrastructure.control.requestdispatcher import ( + RequestDispatcher, +) from smartsim._core.mli.infrastructure.environmentloader import EnvironmentConfigLoader from smartsim._core.mli.infrastructure.storage.dragonfeaturestore import ( DragonFeatureStore, ) -from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStore +from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStore, FeatureStoreKey from smartsim._core.mli.infrastructure.worker.worker import ( ExecuteResult, FetchInputResult, FetchModelResult, InferenceReply, + InferenceRequest, LoadModelResult, + RequestBatch, TransformInputResult, TransformOutputResult, ) @@ -85,35 +90,56 @@ def setup_worker_manager_model_bytes( 
backbone_descriptor: str, app_feature_store: FeatureStore, ): - integrated_worker = IntegratedTorchWorker() + integrated_worker_type = IntegratedTorchWorker chan = Channel.make_process_local() queue = FLInterface(main_ch=chan) + wrapped_queue = DragonFLIChannel(queue.serialize()) + monkeypatch.setenv("SS_REQUEST_QUEUE", du.B64.bytes_to_str(queue.serialize())) # Put backbone descriptor into env var for the `EnvironmentConfigLoader` monkeypatch.setenv("SS_INFRA_BACKBONE", backbone_descriptor) + config_loader = EnvironmentConfigLoader( + featurestore_factory=DragonFeatureStore.from_descriptor, + callback_factory=FileSystemCommChannel.from_descriptor, + queue_factory=DragonFLIChannel.from_descriptor, + ) + + dispatcher = RequestDispatcher( + batch_timeout=0, + batch_size=1, + config_loader=config_loader, + worker_type=integrated_worker_type, + ) + worker_manager = WorkerManager( - EnvironmentConfigLoader( - featurestore_factory=DragonFeatureStore.from_descriptor, - callback_factory=FileSystemCommChannel.from_descriptor, - queue_factory=DragonFLIChannel.from_descriptor, - ), - integrated_worker, + config_loader=config_loader, + worker_type=integrated_worker_type, + dispatcher_queue=dispatcher.task_queue, as_service=False, cooldown=3, ) - tensor_key = MessageHandler.build_tensor_key("key", app_feature_store.descriptor) - output_key = MessageHandler.build_tensor_key("key", app_feature_store.descriptor) - model = MessageHandler.build_model(b"model", "model name", "v 0.0.1") - request = MessageHandler.build_request( - test_dir, model, [tensor_key], [output_key], [], None + tensor_key = FeatureStoreKey(key="key", descriptor=app_feature_store.descriptor) + output_key = FeatureStoreKey(key="key", descriptor=app_feature_store.descriptor) + + request = InferenceRequest(model_key= None, callback = None, raw_inputs= None, input_keys=[tensor_key], input_meta = None, output_keys=[output_key], raw_model=b'model', batch_size=0) + + model_id = FeatureStoreKey(key="key", descriptor=app_feature_store.descriptor) + + request_batch = RequestBatch( + [request], + TransformInputResult(b"transformed", [slice(0, 1)], [[1, 2]]), + model_id, ) - ser_request = MessageHandler.serialize_request(request) - worker_manager._dispatcher_queue.send(ser_request) - return worker_manager, integrated_worker + dispatcher.task_queue.put(request_batch) + + # + # wrapped_queue.send(ser_request) + + return worker_manager, integrated_worker_type @pytest.fixture @@ -147,7 +173,8 @@ def setup_worker_manager_model_key( model_key = MessageHandler.build_model_key( "model key", app_feature_store.descriptor ) - request = MessageHandler.build_request( + + MessageHandler.build_request( test_dir, model_key, [tensor_key], [output_key], [], None ) ser_request = MessageHandler.serialize_request(request) @@ -223,7 +250,10 @@ def test_pipeline_stage_errors_handled( error_message: str, ): """Ensures that the worker manager does not crash after a failure in various pipeline stages""" - worker_manager, integrated_worker = request.getfixturevalue(setup_worker_manager) + worker_manager, integrated_worker_type = request.getfixturevalue( + setup_worker_manager + ) + integrated_worker = worker_manager._worker mock_reply_fn = mock_pipeline_stage(monkeypatch, integrated_worker, stage) if stage not in ["fetch_model"]: @@ -279,6 +309,7 @@ def test_pipeline_stage_errors_handled( ), ) + worker_manager._on_start() worker_manager._on_iteration() mock_reply_fn.assert_called_once() From 0eadc63f1881ca63c212ba7dda735c904d0cef2c Mon Sep 17 00:00:00 2001 From: Al Rigazzi 
Date: Sat, 17 Aug 2024 10:26:25 -0500 Subject: [PATCH 60/84] MLI driver multi-client --- ex/high_throughput_inference/mli_driver.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/ex/high_throughput_inference/mli_driver.py b/ex/high_throughput_inference/mli_driver.py index 8f25540078..807a70b219 100644 --- a/ex/high_throughput_inference/mli_driver.py +++ b/ex/high_throughput_inference/mli_driver.py @@ -1,4 +1,3 @@ -import argparse import os import base64 import cloudpickle @@ -11,7 +10,7 @@ import typing as t DEVICE = "gpu" -NUM_RANKS = 1 +NUM_RANKS = 4 NUM_WORKERS = 1 filedir = os.path.dirname(__file__) worker_manager_script_name = os.path.join(filedir, "standalone_workermanager.py") @@ -39,7 +38,7 @@ "--batch_size", str(NUM_RANKS//NUM_WORKERS), "--batch_timeout", - str(0.002), + str(0.00), "--num_workers", str(NUM_WORKERS) ], @@ -54,7 +53,7 @@ app_rs: DragonRunSettings = exp.create_run_settings( sys.executable, - exe_args=[app_script_name, "--device", DEVICE, "--log_max_batchsize", str(7)], + exe_args=[app_script_name, "--device", DEVICE, "--log_max_batchsize", str(6)], ) app_rs.set_tasks_per_node(NUM_RANKS) From 8e178d938db00303f8288d5d7dfc3a375432aac7 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Tue, 20 Aug 2024 12:10:02 -0500 Subject: [PATCH 61/84] Fixed broken test --- .../infrastructure/control/devicemanager.py | 2 +- .../control/requestdispatcher.py | 1 + .../infrastructure/control/workermanager.py | 16 +- tests/dragon/test_error_handling.py | 139 +++++++++--------- 4 files changed, 82 insertions(+), 76 deletions(-) diff --git a/smartsim/_core/mli/infrastructure/control/devicemanager.py b/smartsim/_core/mli/infrastructure/control/devicemanager.py index a42efa1d9d..09fab32f95 100644 --- a/smartsim/_core/mli/infrastructure/control/devicemanager.py +++ b/smartsim/_core/mli/infrastructure/control/devicemanager.py @@ -26,10 +26,10 @@ import typing as t +from .....log import get_logger from ...infrastructure.storage.featurestore import FeatureStore from ..worker.worker import MachineLearningWorkerBase from .requestdispatcher import RequestBatch -from .....log import get_logger logger = get_logger(__name__) diff --git a/smartsim/_core/mli/infrastructure/control/requestdispatcher.py b/smartsim/_core/mli/infrastructure/control/requestdispatcher.py index c59951204f..f4e02dfc02 100644 --- a/smartsim/_core/mli/infrastructure/control/requestdispatcher.py +++ b/smartsim/_core/mli/infrastructure/control/requestdispatcher.py @@ -66,6 +66,7 @@ # Placeholder ModelIdentifier = FeatureStoreKey + class WorkerDevice: def __init__(self, name: str) -> None: """Wrapper around a device to keep track of loaded Models and availability diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py index 80145fb8e1..c89ed211ee 100644 --- a/smartsim/_core/mli/infrastructure/control/workermanager.py +++ b/smartsim/_core/mli/infrastructure/control/workermanager.py @@ -150,7 +150,7 @@ def _validate_batch(self, batch: RequestBatch) -> bool: :param batch: The batch of requests to validate :return: False if the request fails any validation checks, True otherwise""" - if batch is None or len(batch.requests)==0: + if batch is None or len(batch.requests) == 0: return False return self._check_feature_stores(batch) @@ -184,9 +184,11 @@ def _on_iteration(self) -> None: if self._device_manager is None: for request in batch.requests: exception_handler( - ValueError("No Device Manager available: did you call _on_start()?"), + 
ValueError( + "No Device Manager available: did you call _on_start()?" + ), request.callback, - "Error acquiring device manager" + "Error acquiring device manager", ) return @@ -203,7 +205,7 @@ def _on_iteration(self) -> None: exception_handler( exc, request.callback, - "Error loading model on device or getting device" + "Error loading model on device or getting device.", ) return self._perf_timer.measure_time("fetch_model") @@ -213,9 +215,7 @@ def _on_iteration(self) -> None: except Exception as exc: for request in batch.requests: exception_handler( - exc, - request.callback, - "Error getting model from device" + exc, request.callback, "Error getting model from device." ) return self._perf_timer.measure_time("load_model") @@ -236,7 +236,7 @@ def _on_iteration(self) -> None: ) except Exception as e: for request in batch.requests: - exception_handler(e, request.callback, "Error executing worker.") + exception_handler(e, request.callback, "Failed while executing.") return self._perf_timer.measure_time("execute") diff --git a/tests/dragon/test_error_handling.py b/tests/dragon/test_error_handling.py index 345ebba5e9..17cd344c1e 100644 --- a/tests/dragon/test_error_handling.py +++ b/tests/dragon/test_error_handling.py @@ -24,30 +24,37 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -from unittest.mock import MagicMock +from unittest.mock import MagicMock, patch import pytest dragon = pytest.importorskip("dragon") +import multiprocessing as mp + import dragon.utils as du from dragon.channels import Channel from dragon.data.ddict.ddict import DDict from dragon.fli import FLInterface +from dragon.mpbridge.queues import DragonQueue from smartsim._core.mli.comm.channel.dragonfli import DragonFLIChannel +from smartsim._core.mli.infrastructure.control.devicemanager import WorkerDevice +from smartsim._core.mli.infrastructure.control.requestdispatcher import ( + RequestDispatcher, +) from smartsim._core.mli.infrastructure.control.workermanager import ( WorkerManager, exception_handler, ) -from smartsim._core.mli.infrastructure.control.requestdispatcher import ( - RequestDispatcher, -) from smartsim._core.mli.infrastructure.environmentloader import EnvironmentConfigLoader from smartsim._core.mli.infrastructure.storage.dragonfeaturestore import ( DragonFeatureStore, ) -from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStore, FeatureStoreKey +from smartsim._core.mli.infrastructure.storage.featurestore import ( + FeatureStore, + FeatureStoreKey, +) from smartsim._core.mli.infrastructure.worker.worker import ( ExecuteResult, FetchInputResult, @@ -106,17 +113,12 @@ def setup_worker_manager_model_bytes( queue_factory=DragonFLIChannel.from_descriptor, ) - dispatcher = RequestDispatcher( - batch_timeout=0, - batch_size=1, - config_loader=config_loader, - worker_type=integrated_worker_type, - ) + dispatcher_task_queue = mp.Queue(maxsize=0) worker_manager = WorkerManager( config_loader=config_loader, worker_type=integrated_worker_type, - dispatcher_queue=dispatcher.task_queue, + dispatcher_queue=dispatcher_task_queue, as_service=False, cooldown=3, ) @@ -124,7 +126,16 @@ def setup_worker_manager_model_bytes( tensor_key = FeatureStoreKey(key="key", descriptor=app_feature_store.descriptor) output_key = FeatureStoreKey(key="key", descriptor=app_feature_store.descriptor) - request = InferenceRequest(model_key= None, callback = None, raw_inputs= None, input_keys=[tensor_key], input_meta = 
None, output_keys=[output_key], raw_model=b'model', batch_size=0) + request = InferenceRequest( + model_key=None, + callback=None, + raw_inputs=None, + input_keys=[tensor_key], + input_meta=None, + output_keys=[output_key], + raw_model=b"model", + batch_size=0, + ) model_id = FeatureStoreKey(key="key", descriptor=app_feature_store.descriptor) @@ -134,11 +145,7 @@ def setup_worker_manager_model_bytes( model_id, ) - dispatcher.task_queue.put(request_batch) - - # - # wrapped_queue.send(ser_request) - + dispatcher_task_queue.put(request_batch) return worker_manager, integrated_worker_type @@ -149,7 +156,7 @@ def setup_worker_manager_model_key( backbone_descriptor: str, app_feature_store: FeatureStore, ): - integrated_worker = IntegratedTorchWorker() + integrated_worker_type = IntegratedTorchWorker chan = Channel.make_process_local() queue = FLInterface(main_ch=chan) @@ -157,30 +164,46 @@ def setup_worker_manager_model_key( # Put backbone descriptor into env var for the `EnvironmentConfigLoader` monkeypatch.setenv("SS_INFRA_BACKBONE", backbone_descriptor) + config_loader = EnvironmentConfigLoader( + featurestore_factory=DragonFeatureStore.from_descriptor, + callback_factory=FileSystemCommChannel.from_descriptor, + queue_factory=DragonFLIChannel.from_descriptor, + ) + + dispatcher_task_queue = mp.Queue(maxsize=0) + worker_manager = WorkerManager( - EnvironmentConfigLoader( - featurestore_factory=DragonFeatureStore.from_descriptor, - callback_factory=FileSystemCommChannel.from_descriptor, - queue_factory=DragonFLIChannel.from_descriptor, - ), - integrated_worker, + config_loader=config_loader, + worker_type=integrated_worker_type, + dispatcher_queue=dispatcher_task_queue, as_service=False, cooldown=3, ) - tensor_key = MessageHandler.build_tensor_key("key", app_feature_store.descriptor) - output_key = MessageHandler.build_tensor_key("key", app_feature_store.descriptor) - model_key = MessageHandler.build_model_key( - "model key", app_feature_store.descriptor + tensor_key = FeatureStoreKey(key="key", descriptor=app_feature_store.descriptor) + output_key = FeatureStoreKey(key="key", descriptor=app_feature_store.descriptor) + model_key = FeatureStoreKey( + key="model key", descriptor=app_feature_store.descriptor ) - MessageHandler.build_request( - test_dir, model_key, [tensor_key], [output_key], [], None + request = InferenceRequest( + model_key=model_key, + callback=None, + raw_inputs=None, + input_keys=[tensor_key], + input_meta=None, + output_keys=[output_key], + raw_model=b"model", + batch_size=0, + ) + request_batch = RequestBatch( + [request], + TransformInputResult(b"transformed", [slice(0, 1)], [[1, 2]]), + model_key=model_key, ) - ser_request = MessageHandler.serialize_request(request) - worker_manager._dispatcher_queue.send(ser_request) - return worker_manager, integrated_worker + dispatcher_task_queue.put(request_batch) + return worker_manager, integrated_worker_type def mock_pipeline_stage(monkeypatch: pytest.MonkeyPatch, integrated_worker, stage): @@ -190,7 +213,7 @@ def mock_stage(*args, **kwargs): monkeypatch.setattr(integrated_worker, stage, mock_stage) mock_reply_fn = MagicMock() monkeypatch.setattr( - "smartsim._core.mli.infrastructure.control.workermanager.build_failure_reply", + "smartsim._core.mli.infrastructure.control.commons.build_failure_reply", mock_reply_fn, ) @@ -216,21 +239,15 @@ def mock_exception_handler(exc, reply_channel, failure_message): "stage, error_message", [ pytest.param( - "fetch_model", "Failed while fetching the model.", id="fetch model" + "fetch_model", + 
"Error loading model on device or getting device.", + id="fetch model", ), pytest.param( "load_model", - "Failed while loading model from feature store.", + "Error loading model on device or getting device.", id="load model", ), - pytest.param( - "fetch_inputs", "Failed while fetching the inputs.", id="fetch inputs" - ), - pytest.param( - "transform_input", - "Failed while transforming the input.", - id="transform inputs", - ), pytest.param("execute", "Failed while executing.", id="execute"), pytest.param( "transform_output", @@ -242,7 +259,7 @@ def mock_exception_handler(exc, reply_channel, failure_message): ), ], ) -def test_pipeline_stage_errors_handled( +def test_wm_pipeline_stage_errors_handled( request, setup_worker_manager, monkeypatch: pytest.MonkeyPatch, @@ -254,6 +271,9 @@ def test_pipeline_stage_errors_handled( setup_worker_manager ) integrated_worker = worker_manager._worker + + worker_manager._on_start() + device = worker_manager._device_manager._device mock_reply_fn = mock_pipeline_stage(monkeypatch, integrated_worker, stage) if stage not in ["fetch_model"]: @@ -262,42 +282,28 @@ def test_pipeline_stage_errors_handled( "fetch_model", MagicMock(return_value=FetchModelResult(b"result_bytes")), ) - if stage not in ["fetch_model", "load_model"]: monkeypatch.setattr( integrated_worker, "load_model", MagicMock(return_value=LoadModelResult(b"result_bytes")), ) - if stage not in ["fetch_model", "load_model", "fetch_inputs"]: monkeypatch.setattr( - integrated_worker, - "fetch_inputs", - MagicMock(return_value=FetchInputResult([b"result_bytes"], None)), - ) - if stage not in ["fetch_model", "load_model", "fetch_inputs", "transform_input"]: - monkeypatch.setattr( - integrated_worker, - "transform_input", - MagicMock(return_value=TransformInputResult(b"result_bytes")), + device, + "get_model", + MagicMock(return_value=b"result_bytes"), ) if stage not in [ "fetch_model", - "load_model", - "fetch_inputs", - "transform_input", "execute", ]: monkeypatch.setattr( integrated_worker, "execute", - MagicMock(return_value=ExecuteResult(b"result_bytes")), + MagicMock(return_value=ExecuteResult(b"result_bytes", [slice(0, 1)])), ) if stage not in [ "fetch_model", - "load_model", - "fetch_inputs", - "transform_input", "execute", "transform_output", ]: @@ -305,11 +311,10 @@ def test_pipeline_stage_errors_handled( integrated_worker, "transform_output", MagicMock( - return_value=TransformOutputResult(b"result", [], "c", "float32") + return_value=[TransformOutputResult(b"result", [], "c", "float32")] ), ) - worker_manager._on_start() worker_manager._on_iteration() mock_reply_fn.assert_called_once() @@ -323,7 +328,7 @@ def test_exception_handling_helper(monkeypatch: pytest.MonkeyPatch): mock_reply_fn = MagicMock() monkeypatch.setattr( - "smartsim._core.mli.infrastructure.control.workermanager.build_failure_reply", + "smartsim._core.mli.infrastructure.control.commons.build_failure_reply", mock_reply_fn, ) From 5fb822494872c51b96aa3890cb6d0a84e12260c2 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Tue, 20 Aug 2024 12:21:15 -0500 Subject: [PATCH 62/84] MyPy --- .../infrastructure/control/workermanager.py | 23 +++++++++---------- smartsim/_core/utils/timings.py | 1 + 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py index c89ed211ee..fad470c80c 100644 --- a/smartsim/_core/mli/infrastructure/control/workermanager.py +++ 
b/smartsim/_core/mli/infrastructure/control/workermanager.py @@ -121,7 +121,7 @@ def _check_feature_stores(self, batch: RequestBatch) -> bool: """ # collect all feature stores required by the request fs_model: t.Set[str] = set() - if batch.model_key: + if batch.model_key.key: fs_model = {batch.model_key.descriptor} fs_inputs = {key.descriptor for key in batch.input_keys} fs_outputs = {key.descriptor for key in batch.output_keys} @@ -181,18 +181,17 @@ def _on_iteration(self) -> None: ) return - if self._device_manager is None: - for request in batch.requests: - exception_handler( - ValueError( - "No Device Manager available: did you call _on_start()?" - ), - request.callback, - "Error acquiring device manager", - ) - return - try: + if self._device_manager is None: + for request in batch.requests: + exception_handler( + ValueError( + "No Device Manager available: did you call _on_start()?" + ), + request.callback, + "Error acquiring device manager", + ) + return device: WorkerDevice = next( self._device_manager.get_device( worker=self._worker, diff --git a/smartsim/_core/utils/timings.py b/smartsim/_core/utils/timings.py index 2bf266e5a9..34595c8586 100644 --- a/smartsim/_core/utils/timings.py +++ b/smartsim/_core/utils/timings.py @@ -137,6 +137,7 @@ def set_active(self, active: bool = True) -> None: """Set whether the timer will record time""" self._timing_on = active + @property def is_active(self) -> bool: """Returns true if the timer will record time""" return self._timing_on From b6ea732bc236dbd1ac441aa589b33c051b00e66b Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Wed, 21 Aug 2024 15:43:22 -0500 Subject: [PATCH 63/84] Fix WM test and add dispatcher error handling --- .../control/requestdispatcher.py | 4 +- .../infrastructure/control/workermanager.py | 1 - tests/dragon/test_error_handling.py | 94 ++++++++++++++++++- tests/dragon/test_worker_manager.py | 9 +- 4 files changed, 102 insertions(+), 6 deletions(-) diff --git a/smartsim/_core/mli/infrastructure/control/requestdispatcher.py b/smartsim/_core/mli/infrastructure/control/requestdispatcher.py index f4e02dfc02..6fb4b7d084 100644 --- a/smartsim/_core/mli/infrastructure/control/requestdispatcher.py +++ b/smartsim/_core/mli/infrastructure/control/requestdispatcher.py @@ -299,7 +299,7 @@ def __init__( """The worker used to batch inputs""" self._mem_pool = MemoryPool.attach(dragon_gs_pool.create(2 * 1024**3).sdesc) """Memory pool used to share batched input tensors with the Worker Managers""" - self._perf_timer = PerfTimer(prefix="r_", debug=False, timing_on=True) + self._perf_timer = PerfTimer(prefix="r_", debug=True, timing_on=True) """Performance timer""" def _check_feature_stores(self, request: InferenceRequest) -> bool: @@ -480,6 +480,7 @@ def dispatch(self, request: InferenceRequest) -> None: model_key=FeatureStoreKey(key=tmp_id, descriptor="TMP"), ) self._active_queues[tmp_id] = tmp_queue + self._queues[tmp_id] = [tmp_queue] tmp_queue.put_nowait(request) tmp_queue.make_disposable() return @@ -497,6 +498,7 @@ def flush_requests(self) -> None: """Get all requests from queues which are ready to be flushed. Place all avaliable request batches in the outgoing queue. 
""" + print(self._queues.items()) for queue_list in self._queues.values(): for queue in queue_list: if queue.ready and queue.acquire(blocking=False): diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py index fad470c80c..3949476b6b 100644 --- a/smartsim/_core/mli/infrastructure/control/workermanager.py +++ b/smartsim/_core/mli/infrastructure/control/workermanager.py @@ -166,7 +166,6 @@ def _on_iteration(self) -> None: try: batch: RequestBatch = self._dispatcher_queue.get(timeout=0.0001) except Empty: - logger.info("Empty queue") return self._perf_timer.start_timings( diff --git a/tests/dragon/test_error_handling.py b/tests/dragon/test_error_handling.py index 17cd344c1e..1d9391212d 100644 --- a/tests/dragon/test_error_handling.py +++ b/tests/dragon/test_error_handling.py @@ -24,7 +24,7 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -from unittest.mock import MagicMock, patch +from unittest.mock import MagicMock import pytest @@ -206,6 +206,48 @@ def setup_worker_manager_model_key( return worker_manager, integrated_worker_type +@pytest.fixture +def setup_request_dispatcher_model_bytes( + test_dir, + monkeypatch: pytest.MonkeyPatch, + backbone_descriptor: str, + app_feature_store: FeatureStore, +): + integrated_worker_type = IntegratedTorchWorker + + chan = Channel.make_process_local() + queue = FLInterface(main_ch=chan) + monkeypatch.setenv("SS_REQUEST_QUEUE", du.B64.bytes_to_str(queue.serialize())) + # Put backbone descriptor into env var for the `EnvironmentConfigLoader` + monkeypatch.setenv("SS_INFRA_BACKBONE", backbone_descriptor) + + config_loader = EnvironmentConfigLoader( + featurestore_factory=DragonFeatureStore.from_descriptor, + callback_factory=FileSystemCommChannel.from_descriptor, + queue_factory=DragonFLIChannel.from_descriptor, + ) + + request_dispatcher = RequestDispatcher( + batch_timeout=0, + batch_size=0, + config_loader=config_loader, + worker_type=integrated_worker_type, + ) + request_dispatcher._on_start() + + tensor_key = MessageHandler.build_tensor_key("key", app_feature_store.descriptor) + output_key = MessageHandler.build_tensor_key("key", app_feature_store.descriptor) + model = MessageHandler.build_model(b"model", "model name", "v 0.0.1") + request = MessageHandler.build_request( + test_dir, model, [tensor_key], [output_key], [], None + ) + ser_request = MessageHandler.serialize_request(request) + + request_dispatcher._incoming_channel.send(ser_request) + + return request_dispatcher, integrated_worker_type + + def mock_pipeline_stage(monkeypatch: pytest.MonkeyPatch, integrated_worker, stage): def mock_stage(*args, **kwargs): raise ValueError(f"Simulated error in {stage}") @@ -321,6 +363,56 @@ def test_wm_pipeline_stage_errors_handled( mock_reply_fn.assert_called_with("fail", error_message) +@pytest.mark.parametrize( + "setup_request_dispatcher", + [ + pytest.param("setup_request_dispatcher_model_bytes"), + # pytest.param("setup_worker_manager_model_key"), + ], +) +@pytest.mark.parametrize( + "stage, error_message", + [ + pytest.param( + "fetch_inputs", + "Error fetching input.", + id="fetch input", + ), + pytest.param( + "transform_input", + "Error Transforming input.", + id="transform input", + ), + ], +) +def test_dispatcher_pipeline_stage_errors_handled( + request, + setup_request_dispatcher, + monkeypatch: pytest.MonkeyPatch, + stage: str, + error_message: str, +): + """Ensures 
that the request dispatcher does not crash after a failure in various pipeline stages""" + request_dispatcher, integrated_worker_type = request.getfixturevalue( + setup_request_dispatcher + ) + integrated_worker = request_dispatcher._worker + + mock_reply_fn = mock_pipeline_stage(monkeypatch, integrated_worker, stage) + + if stage not in ["fetch_inputs"]: + monkeypatch.setattr( + integrated_worker, + "fetch_inputs", + MagicMock(return_value=[FetchInputResult(result=[b"result"], meta=None)]), + ) + + request_dispatcher._on_iteration() + + mock_reply_fn.assert_called_once() + mock_reply_fn.assert_called_with("fail", error_message) + + def test_exception_handling_helper(monkeypatch: pytest.MonkeyPatch): """Ensures that the worker manager does not crash after a failure in the execute pipeline stage""" diff --git a/tests/dragon/test_worker_manager.py b/tests/dragon/test_worker_manager.py index 864e14993c..52e7a84d51 100644 --- a/tests/dragon/test_worker_manager.py +++ b/tests/dragon/test_worker_manager.py @@ -26,7 +26,6 @@ import io import logging -import multiprocessing as mp import pathlib import time @@ -40,6 +39,9 @@ import dragon.channels as dch from dragon import fli +from dragon.mpbridge.queues import DragonQueue + +import multiprocessing as mp from smartsim._core.mli.comm.channel.channel import CommChannelBase from smartsim._core.mli.comm.channel.dragonfli import DragonFLIChannel @@ -174,14 +176,15 @@ def test_worker_manager(prepare_environment: pathlib.Path) -> None: callback_factory=FileSystemCommChannel.from_descriptor, queue_factory=DragonFLIChannel.from_descriptor, ) - integrated_worker = TorchWorker() + integrated_worker_type = TorchWorker worker_manager = WorkerManager( config_loader, - integrated_worker, + integrated_worker_type, as_service=True, cooldown=5, device="cpu", + dispatcher_queue=mp.Queue(maxsize=0) ) worker_queue = config_loader.get_queue() From 67242ecf7969ff4d4423fbfc9469d05e41549295 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Mon, 26 Aug 2024 06:08:41 -0500 Subject: [PATCH 64/84] Add RequestDispatcher tests --- .../control/requestdispatcher.py | 130 ++---- .../mli/infrastructure/worker/torch_worker.py | 13 +- .../_core/mli/infrastructure/worker/worker.py | 8 +- tests/dragon/test_request_dispatcher.py | 395 ++++++++++++++++++ tests/dragon/test_worker_manager.py | 5 +- 5 files changed, 448 insertions(+), 103 deletions(-) create mode 100644 tests/dragon/test_request_dispatcher.py diff --git a/smartsim/_core/mli/infrastructure/control/requestdispatcher.py b/smartsim/_core/mli/infrastructure/control/requestdispatcher.py index 6fb4b7d084..20786fdf9a 100644 --- a/smartsim/_core/mli/infrastructure/control/requestdispatcher.py +++ b/smartsim/_core/mli/infrastructure/control/requestdispatcher.py @@ -41,7 +41,6 @@ import typing as t import uuid from queue import Empty, Full, Queue -from threading import RLock from types import TracebackType from smartsim._core.entrypoints.service import Service @@ -76,37 +75,6 @@ def __init__(self, name: str) -> None: """The name used by the toolkit to identify this device""" self._models: dict[str, t.Any] = {} """Dictionary of model key to model for models stored on this device""" - self._lock = RLock() - """Lock to ensure only one thread at the time accesses this device""" - - def acquire(self, blocking: bool = True, timeout: float = -1) -> t.Optional[bool]: - """Acquire and lock this device to prevent other threads - - from acquiring it concurrently. 
- :param blocking: If set to True, the call will block - for the time specified by ``timeout`` until the lock - can be acquired - :param timeout: Time (in seconds) to wait to acquire lock. - Ignored if ``blocking`` is set to False. - """ - return self._lock.acquire(blocking=blocking, timeout=timeout) - - def release(self) -> None: - """Release device to allow other threads to acquire it""" - self._lock.release() - - def __enter__(self) -> None: - """Locked context creator for this device""" - self.acquire() - - def __exit__( - self, - exc_type: t.Optional[t.Type[BaseException]], - exc_val: t.Optional[BaseException], - exc_tb: t.Optional[TracebackType], - ) -> None: - """Locked context destructor for this device""" - self.release() class BatchQueue(Queue[InferenceRequest]): @@ -134,8 +102,6 @@ def __init__( A disposable queue is always full.""" self._model_key: FeatureStoreKey = model_key """Key of the model which needs to be executed on the queued requets""" - self._flush_lock = RLock() - """Lock used to make sure only one process can flush the queue (unused now)""" self._uid = str(uuid.uuid4()) """Unique ID of queue""" @@ -144,30 +110,6 @@ def uid(self) -> str: """ID of this queue""" return self._uid - def acquire(self, blocking: bool = True, timeout: float = -1) -> t.Optional[bool]: - """Acquire queue lock to flush - :param blocking: whether to block on lock acquisition - :param timeout: Time to wait if blocking, before raising exception - """ - return self._flush_lock.acquire(blocking=blocking, timeout=timeout) - - def release(self) -> None: - """Release queue lock""" - self._flush_lock.release() - - def __enter__(self) -> None: - """Method to use the Queue as a Context Manager""" - self.acquire() - - def __exit__( - self, - exc_type: t.Optional[t.Type[BaseException]], - exc_val: t.Optional[BaseException], - exc_tb: t.Optional[TracebackType], - ) -> None: - """Method to release the Queue as a Context Manager""" - self.release() - @property def model_key(self) -> ModelIdentifier: """Key of the model which needs to be run on the queued requests""" @@ -185,16 +127,11 @@ def put( :param timeout: Time (in seconds) to wait if block==True :raises Full: If an item cannot be put on the queue """ - if not self.acquire(blocking=False): + if self.full(): raise Full - try: - if self.full(): - raise Full - if self._first_put is None: - self._first_put = time.time() - super().put(item, block=block, timeout=timeout) - finally: - self.release() + if self._first_put is None: + self._first_put = time.time() + super().put(item, block=block, timeout=timeout) @property def _elapsed_time(self) -> float: @@ -208,7 +145,12 @@ def ready(self) -> bool: """True if the queue can be flushed""" if self.empty(): return False - return self.full() or (self._elapsed_time >= self._batch_timeout) + + timed_out = ( + self._batch_timeout > 0 and self._elapsed_time >= self._batch_timeout + ) + logger.debug(f"Is full: {self.full()} or has timed out: {timed_out}") + return self.full() or timed_out def make_disposable(self) -> None: """Set this queue as disposable, and never use it again after it gets flushed""" @@ -277,8 +219,6 @@ def __init__( """Time in seconds that has to be waited before flushing a non-full queue""" self._batch_size = batch_size """Total capacity of each batch queue.""" - self._queue_swap_lock: t.Optional[RLock] = None - """Lock used to swap the active queue for a key""" incoming_channel = config_loader.get_queue() if incoming_channel is None: raise SmartSimError("No incoming channel for dispatcher") @@ 
-299,7 +239,7 @@ def __init__( """The worker used to batch inputs""" self._mem_pool = MemoryPool.attach(dragon_gs_pool.create(2 * 1024**3).sdesc) """Memory pool used to share batched input tensors with the Worker Managers""" - self._perf_timer = PerfTimer(prefix="r_", debug=True, timing_on=True) + self._perf_timer = PerfTimer(prefix="r_", debug=False, timing_on=True) """Performance timer""" def _check_feature_stores(self, request: InferenceRequest) -> bool: @@ -387,9 +327,6 @@ def _validate_request(self, request: InferenceRequest) -> bool: return all(checks) - def _on_start(self) -> None: - self._queue_swap_lock = RLock() - def _on_iteration(self) -> None: try: self._perf_timer.set_active(True) @@ -448,24 +385,21 @@ def _swap_queue(self, model_key: FeatureStoreKey) -> None: and make it the active one for a given model. :param model_key: The key of the model for which the queue has to be swapped - :raises SmartSimError: If the queue is not locked. """ - if self._queue_swap_lock is None: - raise SmartSimError("Queues were not locked") - with self._queue_swap_lock: - if model_key.key in self._queues: - for queue in self._queues[model_key.key]: - if not queue.full(): - self._active_queues[model_key.key] = queue - return - - new_queue = BatchQueue(self._batch_timeout, self._batch_size, model_key) - if model_key.key in self._queues: - self._queues[model_key.key].append(new_queue) - else: - self._queues[model_key.key] = [new_queue] - self._active_queues[model_key.key] = new_queue - return + + if model_key.key in self._queues: + for queue in self._queues[model_key.key]: + if not queue.full(): + self._active_queues[model_key.key] = queue + return + + new_queue = BatchQueue(self._batch_timeout, self._batch_size, model_key) + if model_key.key in self._queues: + self._queues[model_key.key].append(new_queue) + else: + self._queues[model_key.key] = [new_queue] + self._active_queues[model_key.key] = new_queue + return def dispatch(self, request: InferenceRequest) -> None: """Assign a request to a batch queue @@ -498,10 +432,9 @@ def flush_requests(self) -> None: """Get all requests from queues which are ready to be flushed. Place all avaliable request batches in the outgoing queue. 
""" - print(self._queues.items()) for queue_list in self._queues.values(): for queue in queue_list: - if queue.ready and queue.acquire(blocking=False): + if queue.ready: self._perf_timer.measure_time("find_queue") try: batch = RequestBatch( @@ -511,7 +444,6 @@ def flush_requests(self) -> None: ) finally: self._perf_timer.measure_time("flush_requests") - queue.release() try: fetch_results = self._worker.fetch_inputs( batch=batch, feature_stores=self._feature_stores @@ -544,7 +476,15 @@ def flush_requests(self) -> None: request.raw_inputs = [] request.input_meta = [] - self._outgoing_queue.put(batch) + try: + self._outgoing_queue.put(batch) + except Exception as exc: + exception_handler( + exc, + None, + "Error placing batch on task queue.", + ) + continue self._perf_timer.measure_time("put") def _can_shutdown(self) -> bool: diff --git a/smartsim/_core/mli/infrastructure/worker/torch_worker.py b/smartsim/_core/mli/infrastructure/worker/torch_worker.py index 7ea09b9af9..0639d59696 100644 --- a/smartsim/_core/mli/infrastructure/worker/torch_worker.py +++ b/smartsim/_core/mli/infrastructure/worker/torch_worker.py @@ -46,6 +46,9 @@ TransformOutputResult, ) +# pylint: enable=import-error + + torch.set_num_threads(1) torch.set_num_interop_threads(4) logger = get_logger(__name__) @@ -129,7 +132,7 @@ def transform_input( results.append(mem_alloc.serialize()) - return TransformInputResult(results, slices, all_dims) + return TransformInputResult(results, slices, all_dims, all_dtypes) # pylint: disable-next=unused-argument @staticmethod @@ -147,15 +150,17 @@ def execute( tensors = [] mem_allocs = [] - for transformed, dims in zip( - transform_result.transformed, transform_result.dims + for transformed, dims, dtype in zip( + transform_result.transformed, transform_result.dims, transform_result.dtypes ): mem_alloc = MemoryAlloc.attach(transformed) mem_allocs.append(mem_alloc) + itemsize = np.empty((1), dtype=dtype).itemsize tensors.append( torch.from_numpy( np.frombuffer( - mem_alloc.get_memview()[0 : np.prod(dims) * 4], dtype=np.float32 + mem_alloc.get_memview()[0 : np.prod(dims) * itemsize], + dtype=dtype, ).reshape(dims) ) ) diff --git a/smartsim/_core/mli/infrastructure/worker/worker.py b/smartsim/_core/mli/infrastructure/worker/worker.py index b3d47b13c7..30d41c0285 100644 --- a/smartsim/_core/mli/infrastructure/worker/worker.py +++ b/smartsim/_core/mli/infrastructure/worker/worker.py @@ -110,7 +110,11 @@ class TransformInputResult: """A wrapper around a transformed batch of input tensors""" def __init__( - self, result: t.Any, slices: list[slice], dims: list[list[int]] + self, + result: t.Any, + slices: list[slice], + dims: list[list[int]], + dtypes: list[str], ) -> None: """Initialize the object""" self.transformed = result @@ -120,6 +124,8 @@ def __init__( which request""" self.dims = dims """Dimension of the transformed tensors""" + self.dtypes = dtypes + """Data type of transformed tensors""" class ExecuteResult: diff --git a/tests/dragon/test_request_dispatcher.py b/tests/dragon/test_request_dispatcher.py new file mode 100644 index 0000000000..8bed9fc16d --- /dev/null +++ b/tests/dragon/test_request_dispatcher.py @@ -0,0 +1,395 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. 
Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import io +import logging +import pathlib +import socket +import time +import typing as t +from queue import Empty + +import numpy as np +import pytest + +torch = pytest.importorskip("torch") +dragon = pytest.importorskip("dragon") + +import base64 +import multiprocessing as mp + +try: + mp.set_start_method("dragon") +except Exception: + pass + +import os + +import dragon.channels as dch +import dragon.infrastructure.policy as dragon_policy +import dragon.infrastructure.process_desc as dragon_process_desc +import dragon.native.process as dragon_process +from dragon import fli +from dragon.channels import Channel +from dragon.data.ddict.ddict import DDict +from dragon.managed_memory import MemoryAlloc, MemoryPool +from dragon.mpbridge.queues import DragonQueue + +from smartsim._core.entrypoints.service import Service +from smartsim._core.mli.comm.channel.channel import CommChannelBase +from smartsim._core.mli.comm.channel.dragonchannel import DragonCommChannel +from smartsim._core.mli.comm.channel.dragonfli import DragonFLIChannel +from smartsim._core.mli.infrastructure.control.requestdispatcher import ( + RequestBatch, + RequestDispatcher, +) +from smartsim._core.mli.infrastructure.control.workermanager import ( + EnvironmentConfigLoader, +) +from smartsim._core.mli.infrastructure.storage.dragonfeaturestore import ( + DragonFeatureStore, +) +from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStore +from smartsim._core.mli.infrastructure.worker.torch_worker import TorchWorker +from smartsim._core.mli.message_handler import MessageHandler +from smartsim.log import get_logger + +from .featurestore import FileSystemFeatureStore +from .utils.channel import FileSystemCommChannel + +logger = get_logger(__name__) +# The tests in this file belong to the dragon group +pytestmark = pytest.mark.dragon + + +def persist_model_file(model_path: pathlib.Path) -> pathlib.Path: + """Create a simple torch model and persist to disk for + testing purposes. 
+ + TODO: remove once unit tests are in place""" + # test_path = pathlib.Path(work_dir) + if not model_path.parent.exists(): + model_path.parent.mkdir(parents=True, exist_ok=True) + + model_path.unlink(missing_ok=True) + # model_path = test_path / "basic.pt" + + model = torch.nn.Linear(2, 1) + torch.save(model, model_path) + + return model_path + + +def mock_messages( + request_dispatcher_queue: DragonFLIChannel, + feature_store: FeatureStore, + feature_store_root_dir: pathlib.Path, + comm_channel_root_dir: pathlib.Path, +) -> None: + """Mock event producer for triggering the inference pipeline""" + feature_store_root_dir.mkdir(parents=True, exist_ok=True) + comm_channel_root_dir.mkdir(parents=True, exist_ok=True) + + model_path = persist_model_file(feature_store_root_dir.parent / "model_original.pt") + model_bytes = model_path.read_bytes() + model_key = str(feature_store_root_dir / "model_fs.pt") + + feature_store[model_key] = model_bytes + + for iteration_number in range(2): + time.sleep(1) + # 1. for demo, ignore upstream and just put stuff into downstream + # 2. for demo, only one downstream but we'd normally have to filter + # msg content and send to the correct downstream (worker) queue + # timestamp = time.time_ns() + # mock_channel = test_path / f"brainstorm-{timestamp}.txt" + # mock_channel.touch() + + # thread - just look for key (wait for keys) + # call checkpoint, try to get non-persistent key, it blocks + # working set size > 1 has side-effects + # only incurs cost when working set size has been exceeded + + channel_key = Channel.make_process_local().serialize() + callback_channel = DragonCommChannel(channel_key) + + input_path = feature_store_root_dir / f"{iteration_number}/input.pt" + output_path = feature_store_root_dir / f"{iteration_number}/output.pt" + + input_key = str(input_path) + output_key = str(output_path) + + tensor = ( + (iteration_number + 1) * torch.ones((1, 2), dtype=torch.float32) + ).numpy() + fsd = feature_store.descriptor + + tensor_desc = MessageHandler.build_tensor_descriptor( + "c", "float32", list(tensor.shape) + ) + + message_tensor_output_key = MessageHandler.build_tensor_key(output_key, fsd) + message_tensor_input_key = MessageHandler.build_tensor_key(input_key, fsd) + message_model_key = MessageHandler.build_model_key(model_key, fsd) + + request = MessageHandler.build_request( + reply_channel=callback_channel.descriptor, + model=message_model_key, + inputs=[tensor_desc], + outputs=[message_tensor_output_key], + output_descriptors=[], + custom_attributes=None, + ) + request_bytes = MessageHandler.serialize_request(request) + with request_dispatcher_queue._fli.sendh( + timeout=None, stream_channel=request_dispatcher_queue._channel + ) as sendh: + sendh.send_bytes(request_bytes) + sendh.send_bytes(tensor.tobytes()) + + +@pytest.fixture +def prepare_environment(test_dir: str) -> pathlib.Path: + """Cleanup prior outputs to run demo repeatedly""" + path = pathlib.Path(f"{test_dir}/workermanager.log") + logging.basicConfig(filename=path.absolute(), level=logging.DEBUG) + return path + + +def service_as_dragon_proc( + service: Service, cpu_affinity: list[int], gpu_affinity: list[int] +) -> dragon_process.Process: + + options = dragon_process_desc.ProcessOptions(make_inf_channels=True) + local_policy = dragon_policy.Policy( + placement=dragon_policy.Policy.Placement.HOST_NAME, + host_name=socket.gethostname(), + cpu_affinity=cpu_affinity, + gpu_affinity=gpu_affinity, + ) + return dragon_process.Process( + target=service.execute, + args=[], + 
cwd=os.getcwd(), + policy=local_policy, + options=options, + stderr=dragon_process.Popen.STDOUT, + stdout=dragon_process.Popen.STDOUT, + ) + + +def test_request_dispatcher_batching(prepare_environment: pathlib.Path) -> None: + """Test dispatcher's batching of requests""" + + test_path = prepare_environment + fs_path = test_path / "feature_store" + comm_path = test_path / "comm_store" + + to_worker_channel = dch.Channel.make_process_local() + to_worker_fli = fli.FLInterface(main_ch=to_worker_channel, manager_ch=None) + to_worker_fli_serialized = to_worker_fli.serialize() + + # NOTE: env vars should be set prior to instantiating EnvironmentConfigLoader + # or test environment may be unable to send messages w/queue + descriptor = base64.b64encode(to_worker_fli_serialized).decode("utf-8") + os.environ["_SMARTSIM_REQUEST_QUEUE"] = descriptor + + ddict = DDict(1, 1) + dd_descriptor = ddict.serialize() + + config_loader = EnvironmentConfigLoader( + featurestore_factory=DragonFeatureStore.from_descriptor, + callback_factory=DragonCommChannel, + queue_factory=DragonFLIChannel.from_descriptor, + ) + integrated_worker_type = TorchWorker + + request_dispatcher = RequestDispatcher( + batch_timeout=0, + batch_size=2, + config_loader=config_loader, + worker_type=integrated_worker_type, + ) + + worker_queue = config_loader.get_queue() + if worker_queue is None: + logger.warn( + "FLI input queue not loaded correctly from config_loader: " + f"{config_loader._queue_descriptor}" + ) + + # create a mock client application to populate the request queue + msg_pump = mp.Process( + target=mock_messages, + args=( + worker_queue, + DragonFeatureStore(ddict), + fs_path, + comm_path, + ), + ) + msg_pump.start() + + # create a process to execute commands + process = service_as_dragon_proc(request_dispatcher, [], []) + process.start() + + batch: RequestBatch = request_dispatcher.task_queue.get(timeout=None) + + try: + + assert batch.has_valid_requests + tensors = [] + mem_allocs = [] + + transform_result = batch.inputs + for transformed, dims, dtype in zip( + transform_result.transformed, transform_result.dims, transform_result.dtypes + ): + mem_alloc = MemoryAlloc.attach(transformed) + mem_allocs.append(mem_alloc) + itemsize = np.empty((1), dtype=dtype).itemsize + tensors.append( + torch.from_numpy( + np.frombuffer( + mem_alloc.get_memview()[0 : np.prod(dims) * itemsize], + dtype=dtype, + ).reshape(dims) + ) + ) + + assert len(batch.requests) == 2 + assert len(tensors) == 1 + assert tensors[0].shape == torch.Size([2, 2]) + model_key = str(fs_path / "model_fs.pt") + assert batch.model_key.key == model_key + + for tensor in tensors: + for sample_idx in range(tensor.shape[0]): + tensor_in = tensor[sample_idx] + tensor_out = (sample_idx + 1) * torch.ones((2,), dtype=torch.float32) + assert torch.equal(tensor_in, tensor_out) + + except Exception as exc: + raise exc + finally: + for mem_alloc in mem_allocs: + mem_alloc.free() + + process.join(timeout=5) + process.kill() + msg_pump.kill() + + +def test_request_dispatcher_queues(prepare_environment: pathlib.Path) -> None: + """Test the request dispatcher internal queues""" + + test_path = prepare_environment + fs_path = test_path / "feature_store" + comm_path = test_path / "comm_store" + + to_worker_channel = dch.Channel.make_process_local() + to_worker_fli = fli.FLInterface(main_ch=to_worker_channel, manager_ch=None) + to_worker_fli_serialized = to_worker_fli.serialize() + + # NOTE: env vars should be set prior to instantiating EnvironmentConfigLoader + # or test environment 
may be unable to send messages w/queue + descriptor = base64.b64encode(to_worker_fli_serialized).decode("utf-8") + os.environ["_SMARTSIM_REQUEST_QUEUE"] = descriptor + + ddict = DDict(1, 1) + dd_descriptor = ddict.serialize() + + config_loader = EnvironmentConfigLoader( + featurestore_factory=DragonFeatureStore.from_descriptor, + callback_factory=DragonCommChannel, + queue_factory=DragonFLIChannel.from_descriptor, + ) + integrated_worker_type = TorchWorker + + request_dispatcher = RequestDispatcher( + batch_timeout=0, + batch_size=2, + config_loader=config_loader, + worker_type=integrated_worker_type, + ) + + worker_queue = config_loader.get_queue() + if worker_queue is None: + logger.warn( + "FLI input queue not loaded correctly from config_loader: " + f"{config_loader._queue_descriptor}" + ) + + request_dispatcher._on_start() + + # create a mock client application to populate the request queue + msg_pump = mp.Process( + target=mock_messages, + args=( + worker_queue, + DragonFeatureStore(ddict), + fs_path, + comm_path, + ), + ) + msg_pump.start() + + batch: t.Optional[RequestBatch] = None + for attempts in range(10): + try: + request_dispatcher._on_iteration() + batch = request_dispatcher.task_queue.get(timeout=1) + break + except Empty as exc: + continue + + try: + assert batch is not None + assert batch.has_valid_requests + mem_allocs = [] + + transform_result = batch.inputs + for transformed in transform_result.transformed: + mem_alloc = MemoryAlloc.attach(transformed) + mem_allocs.append(mem_alloc) + + assert len(batch.requests) == 2 + model_key = str(fs_path / "model_fs.pt") + assert batch.model_key.key == model_key + assert model_key in request_dispatcher._queues + assert model_key in request_dispatcher._active_queues + assert len(request_dispatcher._queues[model_key]) == 1 + assert request_dispatcher._queues[model_key][0].empty() + assert request_dispatcher._queues[model_key][0].model_key.key == model_key + + except Exception as exc: + raise exc + finally: + for mem_alloc in mem_allocs: + mem_alloc.free() + + msg_pump.kill() diff --git a/tests/dragon/test_worker_manager.py b/tests/dragon/test_worker_manager.py index fcbcc20b77..ac466491d7 100644 --- a/tests/dragon/test_worker_manager.py +++ b/tests/dragon/test_worker_manager.py @@ -35,14 +35,13 @@ dragon = pytest.importorskip("dragon") import base64 +import multiprocessing as mp import os import dragon.channels as dch from dragon import fli from dragon.mpbridge.queues import DragonQueue -import multiprocessing as mp - from smartsim._core.mli.comm.channel.channel import CommChannelBase from smartsim._core.mli.comm.channel.dragonfli import DragonFLIChannel from smartsim._core.mli.infrastructure.control.workermanager import ( @@ -184,7 +183,7 @@ def test_worker_manager(prepare_environment: pathlib.Path) -> None: as_service=True, cooldown=5, device="cpu", - dispatcher_queue=mp.Queue(maxsize=0) + dispatcher_queue=mp.Queue(maxsize=0), ) worker_queue = config_loader.get_queue() From 4a5185bcf8c75d7fa116432644d5bc1a0258b1f3 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Mon, 26 Aug 2024 11:17:16 -0500 Subject: [PATCH 65/84] Added tests for device manager --- .../infrastructure/control/devicemanager.py | 43 +++-- .../control/requestdispatcher.py | 40 +++-- .../infrastructure/control/workermanager.py | 153 +++++++++-------- .../_core/mli/infrastructure/worker/worker.py | 1 - tests/dragon/test_error_handling.py | 4 +- tests/dragon/test_request_dispatcher.py | 130 +++++++------- tests/mli/test_device_manager.py | 162 ++++++++++++++++++ 7 
files changed, 366 insertions(+), 167 deletions(-) create mode 100644 tests/mli/test_device_manager.py diff --git a/smartsim/_core/mli/infrastructure/control/devicemanager.py b/smartsim/_core/mli/infrastructure/control/devicemanager.py index 09fab32f95..49f8403b8c 100644 --- a/smartsim/_core/mli/infrastructure/control/devicemanager.py +++ b/smartsim/_core/mli/infrastructure/control/devicemanager.py @@ -24,12 +24,12 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +from contextlib import contextmanager import typing as t from .....log import get_logger from ...infrastructure.storage.featurestore import FeatureStore -from ..worker.worker import MachineLearningWorkerBase -from .requestdispatcher import RequestBatch +from ..worker.worker import MachineLearningWorkerBase, RequestBatch logger = get_logger(__name__) @@ -68,15 +68,33 @@ def get_model(self, key: str) -> t.Any: """Get the model corresponding to a given key :param key: the model key + :returns: the model for the given key """ return self._models[key] def __contains__(self, key: str) -> bool: + """Check if model with a given key is available on the device + + :param key: the key of the model to check for existence + :returns: whether the model is available on the device + """ return key in self._models + @contextmanager + def get(self, key_to_remove: t.Optional[str]): + yield self + if key_to_remove is not None: + self.remove_model(key_to_remove) class DeviceManager: def __init__(self, device: WorkerDevice): + """An object to manage devices such as GPUs and CPUs. + + The main goal of the ``DeviceManager`` is to ensure that + the managed device is ready to be used by a worker to + run a given model + :param device: The managed device + """ self._device = device """Device managed by this object""" @@ -86,6 +104,14 @@ def _load_model_on_device( batch: RequestBatch, feature_stores: dict[str, FeatureStore], ) -> None: + """Load the model needed to execute on a batch on the managed device. + + The model is loaded by the worker. 
+ + :param worker: the worker that loads the model + :param batch: the batch for which the model is needed + :param feature_stores: feature stores where the model could be stored + """ model_bytes = worker.fetch_model(batch, feature_stores) loaded_model = worker.load_model(batch, model_bytes, self._device.name) @@ -113,12 +139,7 @@ def get_device( # Load model if not already loaded, or # because it is sent with the request if model_in_request or not batch.model_key.key in self._device: - try: - self._load_model_on_device(worker, batch, feature_stores) - except Exception as exc: - raise exc - try: - yield self._device - finally: - if model_in_request: - self._device.remove_model(batch.model_key.key) + self._load_model_on_device(worker, batch, feature_stores) + + key_to_remove = batch.model_key.key if model_in_request else None + return self._device.get(key_to_remove) diff --git a/smartsim/_core/mli/infrastructure/control/requestdispatcher.py b/smartsim/_core/mli/infrastructure/control/requestdispatcher.py index 20786fdf9a..0016c18a9b 100644 --- a/smartsim/_core/mli/infrastructure/control/requestdispatcher.py +++ b/smartsim/_core/mli/infrastructure/control/requestdispatcher.py @@ -41,7 +41,6 @@ import typing as t import uuid from queue import Empty, Full, Queue -from types import TracebackType from smartsim._core.entrypoints.service import Service @@ -65,18 +64,6 @@ # Placeholder ModelIdentifier = FeatureStoreKey - -class WorkerDevice: - def __init__(self, name: str) -> None: - """Wrapper around a device to keep track of loaded Models and availability - :param name: name used by the toolkit to identify this device, e.g. ``cuda:0`` - """ - self._name = name - """The name used by the toolkit to identify this device""" - self._models: dict[str, t.Any] = {} - """Dictionary of model key to model for models stored on this device""" - - class BatchQueue(Queue[InferenceRequest]): def __init__( self, batch_timeout: float, batch_size: int, model_key: ModelIdentifier @@ -366,14 +353,37 @@ def _on_iteration(self) -> None: self._perf_timer.measure_time("dispatch") finally: self.flush_requests() - # TODO: implement this - # self.remove_queues() + self.remove_queues() self._perf_timer.end_timings() if self._perf_timer.max_length == 801 and self._perf_timer.is_active: self._perf_timer.print_timings(True) + def remove_queues(self) -> None: + """Remove references to queues that can be removed + and allow them to be garbage collected""" + queue_lists_to_remove = [] + for key, queues in self._queues.items(): + queues_to_remove = [] + for queue in queues: + if queue.can_be_removed: + queues_to_remove.append(queue) + + for queue_to_remove in queues_to_remove: + queues.remove(queue_to_remove) + if ( + key in self._active_queues + and self._active_queues[key] == queue_to_remove + ): + del self._active_queues[key] + + if len(queues) == 0: + queue_lists_to_remove.append(key) + + for key in queue_lists_to_remove: + del self._queues[key] + @property def task_queue(self) -> DragonQueue: """The queue on which batched requests are placed""" diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py index 3949476b6b..8256ce4f55 100644 --- a/smartsim/_core/mli/infrastructure/control/workermanager.py +++ b/smartsim/_core/mli/infrastructure/control/workermanager.py @@ -191,13 +191,12 @@ def _on_iteration(self) -> None: "Error acquiring device manager", ) return - device: WorkerDevice = next( - self._device_manager.get_device( + device_cm = 
self._device_manager.get_device( worker=self._worker, batch=batch, feature_stores=self._feature_stores, ) - ) + except Exception as exc: for request in batch.requests: exception_handler( @@ -208,90 +207,92 @@ def _on_iteration(self) -> None: return self._perf_timer.measure_time("fetch_model") - try: - model_result = LoadModelResult(device.get_model(batch.model_key.key)) - except Exception as exc: - for request in batch.requests: - exception_handler( - exc, request.callback, "Error getting model from device." - ) - return - self._perf_timer.measure_time("load_model") + with device_cm as device: - if batch.inputs is None: - for request in batch.requests: - exception_handler( - ValueError("Error batching inputs"), - request.callback, - "Error batching inputs.", - ) - return - transformed_input = batch.inputs + try: + model_result = LoadModelResult(device.get_model(batch.model_key.key)) + except Exception as exc: + for request in batch.requests: + exception_handler( + exc, request.callback, "Error getting model from device." + ) + return + self._perf_timer.measure_time("load_model") - try: - execute_result = self._worker.execute( - batch, model_result, transformed_input, device.name - ) - except Exception as e: - for request in batch.requests: - exception_handler(e, request.callback, "Failed while executing.") - return - self._perf_timer.measure_time("execute") + if batch.inputs is None: + for request in batch.requests: + exception_handler( + ValueError("Error batching inputs"), + request.callback, + "Error batching inputs.", + ) + return + transformed_input = batch.inputs - try: - transformed_outputs = self._worker.transform_output(batch, execute_result) - except Exception as e: - for request in batch.requests: - exception_handler( - e, request.callback, "Failed while transforming the output." + try: + execute_result = self._worker.execute( + batch, model_result, transformed_input, device.name ) - return + except Exception as e: + for request in batch.requests: + exception_handler(e, request.callback, "Failed while executing.") + return + self._perf_timer.measure_time("execute") - for request, transformed_output in zip(batch.requests, transformed_outputs): - reply = InferenceReply() - if request.output_keys: - try: - reply.output_keys = self._worker.place_output( - request, - transformed_output, - self._feature_stores, - ) - except Exception as e: + try: + transformed_outputs = self._worker.transform_output(batch, execute_result) + except Exception as e: + for request in batch.requests: exception_handler( - e, request.callback, "Failed while placing the output." + e, request.callback, "Failed while transforming the output." + ) + return + + for request, transformed_output in zip(batch.requests, transformed_outputs): + reply = InferenceReply() + if request.output_keys: + try: + reply.output_keys = self._worker.place_output( + request, + transformed_output, + self._feature_stores, + ) + except Exception as e: + exception_handler( + e, request.callback, "Failed while placing the output." 
+ ) + continue + else: + reply.outputs = transformed_output.outputs + self._perf_timer.measure_time("assign_output") + + if reply.outputs is None or not reply.outputs: + response = build_failure_reply("fail", "Outputs not found.") + else: + reply.status_enum = "complete" + reply.message = "Success" + + results = self._worker.prepare_outputs(reply) + response = MessageHandler.build_response( + status=reply.status_enum, + message=reply.message, + result=results, + custom_attributes=None, ) - continue - else: - reply.outputs = transformed_output.outputs - self._perf_timer.measure_time("assign_output") - - if reply.outputs is None or not reply.outputs: - response = build_failure_reply("fail", "Outputs not found.") - else: - reply.status_enum = "complete" - reply.message = "Success" - - results = self._worker.prepare_outputs(reply) - response = MessageHandler.build_response( - status=reply.status_enum, - message=reply.message, - result=results, - custom_attributes=None, - ) - self._perf_timer.measure_time("build_reply") + self._perf_timer.measure_time("build_reply") - serialized_resp = MessageHandler.serialize_response(response) + serialized_resp = MessageHandler.serialize_response(response) - self._perf_timer.measure_time("serialize_resp") + self._perf_timer.measure_time("serialize_resp") - if request.callback: - request.callback.send(serialized_resp) - if reply.outputs: - # send tensor data after response - for output in reply.outputs: - request.callback.send(output) - self._perf_timer.measure_time("send") + if request.callback: + request.callback.send(serialized_resp) + if reply.outputs: + # send tensor data after response + for output in reply.outputs: + request.callback.send(output) + self._perf_timer.measure_time("send") self._perf_timer.end_timings() diff --git a/smartsim/_core/mli/infrastructure/worker/worker.py b/smartsim/_core/mli/infrastructure/worker/worker.py index 30d41c0285..008b6202be 100644 --- a/smartsim/_core/mli/infrastructure/worker/worker.py +++ b/smartsim/_core/mli/infrastructure/worker/worker.py @@ -406,7 +406,6 @@ def load_model( device memory :param request: The request that triggered the pipeline :param device: The device on which the model must be placed - :param device: The device on which the model must be placed :return: ModelLoadResult wrapping the model loaded for the request""" @staticmethod diff --git a/tests/dragon/test_error_handling.py b/tests/dragon/test_error_handling.py index 21a5758311..113f7ccba0 100644 --- a/tests/dragon/test_error_handling.py +++ b/tests/dragon/test_error_handling.py @@ -141,7 +141,7 @@ def setup_worker_manager_model_bytes( request_batch = RequestBatch( [request], - TransformInputResult(b"transformed", [slice(0, 1)], [[1, 2]]), + TransformInputResult(b"transformed", [slice(0, 1)], [[1, 2]], ["float32"]), model_id, ) @@ -200,7 +200,7 @@ def setup_worker_manager_model_key( ) request_batch = RequestBatch( [request], - TransformInputResult(b"transformed", [slice(0, 1)], [[1, 2]]), + TransformInputResult(b"transformed", [slice(0, 1)], [[1, 2]], ["float32"]), model_key=model_key, ) diff --git a/tests/dragon/test_request_dispatcher.py b/tests/dragon/test_request_dispatcher.py index 8bed9fc16d..d1e97a8b5b 100644 --- a/tests/dragon/test_request_dispatcher.py +++ b/tests/dragon/test_request_dispatcher.py @@ -120,18 +120,7 @@ def mock_messages( feature_store[model_key] = model_bytes for iteration_number in range(2): - time.sleep(1) - # 1. for demo, ignore upstream and just put stuff into downstream - # 2. 
for demo, only one downstream but we'd normally have to filter - # msg content and send to the correct downstream (worker) queue - # timestamp = time.time_ns() - # mock_channel = test_path / f"brainstorm-{timestamp}.txt" - # mock_channel.touch() - - # thread - just look for key (wait for keys) - # call checkpoint, try to get non-persistent key, it blocks - # working set size > 1 has side-effects - # only incurs cost when working set size has been exceeded + time.sleep(0.1) channel_key = Channel.make_process_local().serialize() callback_channel = DragonCommChannel(channel_key) @@ -156,7 +145,7 @@ def mock_messages( message_model_key = MessageHandler.build_model_key(model_key, fsd) request = MessageHandler.build_request( - reply_channel=callback_channel.descriptor, + reply_channel=base64.b64encode(callback_channel.descriptor).decode("utf-8"), model=message_model_key, inputs=[tensor_desc], outputs=[message_tensor_output_key], @@ -218,11 +207,10 @@ def test_request_dispatcher_batching(prepare_environment: pathlib.Path) -> None: os.environ["_SMARTSIM_REQUEST_QUEUE"] = descriptor ddict = DDict(1, 1) - dd_descriptor = ddict.serialize() config_loader = EnvironmentConfigLoader( featurestore_factory=DragonFeatureStore.from_descriptor, - callback_factory=DragonCommChannel, + callback_factory=DragonCommChannel.from_descriptor, queue_factory=DragonFLIChannel.from_descriptor, ) integrated_worker_type = TorchWorker @@ -260,7 +248,6 @@ def test_request_dispatcher_batching(prepare_environment: pathlib.Path) -> None: batch: RequestBatch = request_dispatcher.task_queue.get(timeout=None) try: - assert batch.has_valid_requests tensors = [] mem_allocs = [] @@ -305,7 +292,11 @@ def test_request_dispatcher_batching(prepare_environment: pathlib.Path) -> None: def test_request_dispatcher_queues(prepare_environment: pathlib.Path) -> None: - """Test the request dispatcher internal queues""" + """Test the request dispatcher internal queues + + This also includes setting a queue to disposable, checking that it is no + longer referenced and that it is re-created when needed. 
+ """ test_path = prepare_environment fs_path = test_path / "feature_store" @@ -321,11 +312,10 @@ def test_request_dispatcher_queues(prepare_environment: pathlib.Path) -> None: os.environ["_SMARTSIM_REQUEST_QUEUE"] = descriptor ddict = DDict(1, 1) - dd_descriptor = ddict.serialize() config_loader = EnvironmentConfigLoader( featurestore_factory=DragonFeatureStore.from_descriptor, - callback_factory=DragonCommChannel, + callback_factory=DragonCommChannel.from_descriptor, queue_factory=DragonFLIChannel.from_descriptor, ) integrated_worker_type = TorchWorker @@ -346,50 +336,66 @@ def test_request_dispatcher_queues(prepare_environment: pathlib.Path) -> None: request_dispatcher._on_start() - # create a mock client application to populate the request queue - msg_pump = mp.Process( - target=mock_messages, - args=( - worker_queue, - DragonFeatureStore(ddict), - fs_path, - comm_path, - ), - ) - msg_pump.start() + model_key = str(fs_path / "model_fs.pt") - batch: t.Optional[RequestBatch] = None - for attempts in range(10): - try: - request_dispatcher._on_iteration() - batch = request_dispatcher.task_queue.get(timeout=1) - break - except Empty as exc: - continue - - try: - assert batch is not None - assert batch.has_valid_requests + for iteration in range(2): + batch: t.Optional[RequestBatch] = None mem_allocs = [] - transform_result = batch.inputs - for transformed in transform_result.transformed: - mem_alloc = MemoryAlloc.attach(transformed) - mem_allocs.append(mem_alloc) - - assert len(batch.requests) == 2 - model_key = str(fs_path / "model_fs.pt") - assert batch.model_key.key == model_key - assert model_key in request_dispatcher._queues - assert model_key in request_dispatcher._active_queues - assert len(request_dispatcher._queues[model_key]) == 1 - assert request_dispatcher._queues[model_key][0].empty() - assert request_dispatcher._queues[model_key][0].model_key.key == model_key - - except Exception as exc: - raise exc - finally: - for mem_alloc in mem_allocs: - mem_alloc.free() + # create a mock client application to populate the request queue + msg_pump = mp.Process( + target=mock_messages, + args=( + worker_queue, + DragonFeatureStore(ddict), + fs_path, + comm_path, + ), + ) + msg_pump.start() + + for attempts in range(15): + try: + request_dispatcher._on_iteration() + batch = request_dispatcher.task_queue.get(timeout=1) + break + except Empty: + logger.info("Empty queue") + continue + except Exception as exc: + logger.info(f"Failed at iteration #{iteration}") + raise exc - msg_pump.kill() + try: + assert batch is not None + assert batch.has_valid_requests + + transform_result = batch.inputs + for transformed in transform_result.transformed: + mem_alloc = MemoryAlloc.attach(transformed) + mem_allocs.append(mem_alloc) + + assert len(batch.requests) == 2 + assert batch.model_key.key == model_key + assert model_key in request_dispatcher._queues + assert model_key in request_dispatcher._active_queues + assert len(request_dispatcher._queues[model_key]) == 1 + assert request_dispatcher._queues[model_key][0].empty() + assert request_dispatcher._queues[model_key][0].model_key.key == model_key + + except Exception as exc: + logger.log(f"Failed at iteration #{iteration}") + raise exc + finally: + for mem_alloc in mem_allocs: + mem_alloc.free() + + msg_pump.kill() + + request_dispatcher._active_queues[model_key].make_disposable() + assert request_dispatcher._active_queues[model_key].can_be_removed + + request_dispatcher._on_iteration() + + assert model_key not in request_dispatcher._active_queues + 
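        # A rough sketch, assuming the attribute and method names used in this
        # patch, of the cleanup that makes the assertions around this point hold:
        # remove_queues(), called from _on_iteration(), drops every BatchQueue
        # whose can_be_removed flag is set and deletes a model's bookkeeping once
        # its queue list is empty, e.g.:
        #
        #     for key, queues in list(dispatcher._queues.items()):
        #         queues[:] = [q for q in queues if not q.can_be_removed]
        #         if not queues:
        #             del dispatcher._queues[key]
        #             dispatcher._active_queues.pop(key, None)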
assert model_key not in request_dispatcher._queues diff --git a/tests/mli/test_device_manager.py b/tests/mli/test_device_manager.py new file mode 100644 index 0000000000..12fe2578af --- /dev/null +++ b/tests/mli/test_device_manager.py @@ -0,0 +1,162 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import typing as t + + +from smartsim._core.mli.infrastructure.control.devicemanager import DeviceManager, WorkerDevice +from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStore, FeatureStoreKey +from smartsim._core.mli.infrastructure.worker.worker import MachineLearningWorkerBase, ExecuteResult, FetchInputResult, FetchModelResult, InferenceRequest, LoadModelResult, RequestBatch, TransformInputResult, TransformOutputResult + +class MockWorker(MachineLearningWorkerBase): + @staticmethod + def fetch_model( + batch: RequestBatch, feature_stores: t.Dict[str, FeatureStore] + ) -> FetchModelResult: + if batch.has_raw_model: + return FetchModelResult(batch.raw_model) + return FetchModelResult(b'fetched_model') + + @staticmethod + def load_model( + batch: RequestBatch, fetch_result: FetchModelResult, device: str + ) -> LoadModelResult: + return LoadModelResult(fetch_result.model_bytes) + + @staticmethod + def transform_input( + batch: RequestBatch, + fetch_results: list[FetchInputResult], + mem_pool: "MemoryPool", + ) -> TransformInputResult: + return TransformInputResult(b'result', [slice(0,1)], [[1,2]], ["float32"]) + + @staticmethod + def execute( + batch: RequestBatch, + load_result: LoadModelResult, + transform_result: TransformInputResult, + device: str, + ) -> ExecuteResult: + return ExecuteResult(b'result', [slice(0,1)]) + + @staticmethod + def transform_output( + batch: RequestBatch, execute_result: ExecuteResult + ) -> t.List[TransformOutputResult]: + return [TransformOutputResult(b'result', None, "c", "float32")] + + +def test_worker_device(): + worker_device = WorkerDevice("gpu:0") + assert worker_device.name == "gpu:0" + + model_key = "my_model_key" + model = b"the model" + + worker_device.add_model(model_key, model) + + assert model_key in worker_device + assert worker_device.get_model(model_key) == model + 
worker_device.remove_model(model_key) + + assert model_key not in worker_device + + +def test_device_manager_model_in_request(): + + worker_device = WorkerDevice("gpu:0") + device_manager = DeviceManager(worker_device) + + worker = MockWorker() + + tensor_key = FeatureStoreKey(key="key", descriptor="desc") + output_key = FeatureStoreKey(key="key", descriptor="desc") + model_key = FeatureStoreKey( + key="model key", descriptor="desc" + ) + + request = InferenceRequest( + model_key=model_key, + callback=None, + raw_inputs=None, + input_keys=[tensor_key], + input_meta=None, + output_keys=[output_key], + raw_model=b"raw model", + batch_size=0, + ) + + request_batch = RequestBatch( + [request], + TransformInputResult(b"transformed", [slice(0, 1)], [[1, 2]], ["float32"]), + model_key=model_key, + ) + + with device_manager.get_device(worker=worker, batch=request_batch, feature_stores={}) as returned_device: + + assert returned_device == worker_device + assert worker_device.get_model(model_key.key) == b"raw model" + + assert model_key.key not in worker_device + + +def test_device_manager_model_key(): + + worker_device = WorkerDevice("gpu:0") + device_manager = DeviceManager(worker_device) + + worker = MockWorker() + + tensor_key = FeatureStoreKey(key="key", descriptor="desc") + output_key = FeatureStoreKey(key="key", descriptor="desc") + model_key = FeatureStoreKey( + key="model key", descriptor="desc" + ) + + request = InferenceRequest( + model_key=model_key, + callback=None, + raw_inputs=None, + input_keys=[tensor_key], + input_meta=None, + output_keys=[output_key], + raw_model=None, + batch_size=0, + ) + + request_batch = RequestBatch( + [request], + TransformInputResult(b"transformed", [slice(0, 1)], [[1, 2]], ["float32"]), + model_key=model_key, + ) + + with device_manager.get_device(worker=worker, batch=request_batch, feature_stores={}) as returned_device: + + assert returned_device == worker_device + assert worker_device.get_model(model_key.key) == b"fetched_model" + + assert model_key.key in worker_device \ No newline at end of file From 9d0ba309d1c61d6a3bb86c7c1fa90084e13ac5fa Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Mon, 26 Aug 2024 13:00:11 -0500 Subject: [PATCH 66/84] Fix tests --- tests/dragon/test_request_dispatcher.py | 167 +++++++----------------- tests/dragon/test_worker_manager.py | 6 + 2 files changed, 50 insertions(+), 123 deletions(-) diff --git a/tests/dragon/test_request_dispatcher.py b/tests/dragon/test_request_dispatcher.py index d1e97a8b5b..f47ef46d7a 100644 --- a/tests/dragon/test_request_dispatcher.py +++ b/tests/dragon/test_request_dispatcher.py @@ -95,7 +95,6 @@ def persist_model_file(model_path: pathlib.Path) -> pathlib.Path: model_path.parent.mkdir(parents=True, exist_ok=True) model_path.unlink(missing_ok=True) - # model_path = test_path / "basic.pt" model = torch.nn.Linear(2, 1) torch.save(model, model_path) @@ -110,6 +109,7 @@ def mock_messages( comm_channel_root_dir: pathlib.Path, ) -> None: """Mock event producer for triggering the inference pipeline""" + logger.info("Mocking messages") feature_store_root_dir.mkdir(parents=True, exist_ok=True) comm_channel_root_dir.mkdir(parents=True, exist_ok=True) @@ -117,10 +117,11 @@ def mock_messages( model_bytes = model_path.read_bytes() model_key = str(feature_store_root_dir / "model_fs.pt") + logger.info("Putting model on FS") feature_store[model_key] = model_bytes for iteration_number in range(2): - time.sleep(0.1) + logger.info(f"Message #{iteration_number}") channel_key = 
Channel.make_process_local().serialize() callback_channel = DragonCommChannel(channel_key) @@ -158,6 +159,7 @@ def mock_messages( ) as sendh: sendh.send_bytes(request_bytes) sendh.send_bytes(tensor.tobytes()) + time.sleep(1) @pytest.fixture @@ -189,9 +191,12 @@ def service_as_dragon_proc( stdout=dragon_process.Popen.STDOUT, ) +def test_request_dispatcher(prepare_environment: pathlib.Path) -> None: + """Test the request dispatcher batching and queueing system -def test_request_dispatcher_batching(prepare_environment: pathlib.Path) -> None: - """Test dispatcher's batching of requests""" + This also includes setting a queue to disposable, checking that it is no + longer referenced by the dispatcher. + """ test_path = prepare_environment fs_path = test_path / "feature_store" @@ -206,7 +211,8 @@ def test_request_dispatcher_batching(prepare_environment: pathlib.Path) -> None: descriptor = base64.b64encode(to_worker_fli_serialized).decode("utf-8") os.environ["_SMARTSIM_REQUEST_QUEUE"] = descriptor - ddict = DDict(1, 1) + ddict = DDict(1, 1, 2*1024**2) + dragon_fs = DragonFeatureStore(ddict) config_loader = EnvironmentConfigLoader( featurestore_factory=DragonFeatureStore.from_descriptor, @@ -229,28 +235,43 @@ def test_request_dispatcher_batching(prepare_environment: pathlib.Path) -> None: f"{config_loader._queue_descriptor}" ) + request_dispatcher._on_start() + + batch: t.Optional[RequestBatch] = None + mem_allocs = [] + tensors = [] + fs_path = test_path / f"feature_store" + comm_path = test_path / f"comm_store" + model_key = str(fs_path / "model_fs.pt") + # create a mock client application to populate the request queue msg_pump = mp.Process( target=mock_messages, args=( worker_queue, - DragonFeatureStore(ddict), + dragon_fs, fs_path, comm_path, ), ) + msg_pump.start() - # create a process to execute commands - process = service_as_dragon_proc(request_dispatcher, [], []) - process.start() + time.sleep(1) - batch: RequestBatch = request_dispatcher.task_queue.get(timeout=None) + for attempts in range(15): + try: + request_dispatcher._on_iteration() + batch = request_dispatcher.task_queue.get(timeout=1) + break + except Empty: + continue + except Exception as exc: + raise exc try: + assert batch is not None assert batch.has_valid_requests - tensors = [] - mem_allocs = [] transform_result = batch.inputs for transformed, dims, dtype in zip( @@ -269,10 +290,14 @@ def test_request_dispatcher_batching(prepare_environment: pathlib.Path) -> None: ) assert len(batch.requests) == 2 + assert batch.model_key.key == model_key + assert model_key in request_dispatcher._queues + assert model_key in request_dispatcher._active_queues + assert len(request_dispatcher._queues[model_key]) == 1 + assert request_dispatcher._queues[model_key][0].empty() + assert request_dispatcher._queues[model_key][0].model_key.key == model_key assert len(tensors) == 1 assert tensors[0].shape == torch.Size([2, 2]) - model_key = str(fs_path / "model_fs.pt") - assert batch.model_key.key == model_key for tensor in tensors: for sample_idx in range(tensor.shape[0]): @@ -286,116 +311,12 @@ def test_request_dispatcher_batching(prepare_environment: pathlib.Path) -> None: for mem_alloc in mem_allocs: mem_alloc.free() - process.join(timeout=5) - process.kill() msg_pump.kill() + request_dispatcher._active_queues[model_key].make_disposable() + assert request_dispatcher._active_queues[model_key].can_be_removed -def test_request_dispatcher_queues(prepare_environment: pathlib.Path) -> None: - """Test the request dispatcher internal queues - - This also 
includes setting a queue to disposable, checking that it is no - longer referenced and that it is re-created when needed. - """ - - test_path = prepare_environment - fs_path = test_path / "feature_store" - comm_path = test_path / "comm_store" - - to_worker_channel = dch.Channel.make_process_local() - to_worker_fli = fli.FLInterface(main_ch=to_worker_channel, manager_ch=None) - to_worker_fli_serialized = to_worker_fli.serialize() - - # NOTE: env vars should be set prior to instantiating EnvironmentConfigLoader - # or test environment may be unable to send messages w/queue - descriptor = base64.b64encode(to_worker_fli_serialized).decode("utf-8") - os.environ["_SMARTSIM_REQUEST_QUEUE"] = descriptor - - ddict = DDict(1, 1) - - config_loader = EnvironmentConfigLoader( - featurestore_factory=DragonFeatureStore.from_descriptor, - callback_factory=DragonCommChannel.from_descriptor, - queue_factory=DragonFLIChannel.from_descriptor, - ) - integrated_worker_type = TorchWorker - - request_dispatcher = RequestDispatcher( - batch_timeout=0, - batch_size=2, - config_loader=config_loader, - worker_type=integrated_worker_type, - ) - - worker_queue = config_loader.get_queue() - if worker_queue is None: - logger.warn( - "FLI input queue not loaded correctly from config_loader: " - f"{config_loader._queue_descriptor}" - ) - - request_dispatcher._on_start() - - model_key = str(fs_path / "model_fs.pt") - - for iteration in range(2): - batch: t.Optional[RequestBatch] = None - mem_allocs = [] - - # create a mock client application to populate the request queue - msg_pump = mp.Process( - target=mock_messages, - args=( - worker_queue, - DragonFeatureStore(ddict), - fs_path, - comm_path, - ), - ) - msg_pump.start() - - for attempts in range(15): - try: - request_dispatcher._on_iteration() - batch = request_dispatcher.task_queue.get(timeout=1) - break - except Empty: - logger.info("Empty queue") - continue - except Exception as exc: - logger.info(f"Failed at iteration #{iteration}") - raise exc - - try: - assert batch is not None - assert batch.has_valid_requests - - transform_result = batch.inputs - for transformed in transform_result.transformed: - mem_alloc = MemoryAlloc.attach(transformed) - mem_allocs.append(mem_alloc) - - assert len(batch.requests) == 2 - assert batch.model_key.key == model_key - assert model_key in request_dispatcher._queues - assert model_key in request_dispatcher._active_queues - assert len(request_dispatcher._queues[model_key]) == 1 - assert request_dispatcher._queues[model_key][0].empty() - assert request_dispatcher._queues[model_key][0].model_key.key == model_key - - except Exception as exc: - logger.log(f"Failed at iteration #{iteration}") - raise exc - finally: - for mem_alloc in mem_allocs: - mem_alloc.free() - - msg_pump.kill() - - request_dispatcher._active_queues[model_key].make_disposable() - assert request_dispatcher._active_queues[model_key].can_be_removed - - request_dispatcher._on_iteration() + request_dispatcher._on_iteration() - assert model_key not in request_dispatcher._active_queues - assert model_key not in request_dispatcher._queues + assert model_key not in request_dispatcher._active_queues + assert model_key not in request_dispatcher._queues diff --git a/tests/dragon/test_worker_manager.py b/tests/dragon/test_worker_manager.py index ac466491d7..a334164257 100644 --- a/tests/dragon/test_worker_manager.py +++ b/tests/dragon/test_worker_manager.py @@ -36,6 +36,12 @@ import base64 import multiprocessing as mp + +try: + mp.set_start_method("dragon") +except Exception: + 
pass + import os import dragon.channels as dch From 99da3558d080018497c4e90e96c6854dfc8b67e4 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Mon, 26 Aug 2024 13:08:35 -0500 Subject: [PATCH 67/84] Style and type --- .../infrastructure/control/devicemanager.py | 7 +-- .../control/requestdispatcher.py | 1 + .../infrastructure/control/workermanager.py | 12 +++-- tests/dragon/test_request_dispatcher.py | 3 +- tests/mli/test_device_manager.py | 48 ++++++++++++------- 5 files changed, 46 insertions(+), 25 deletions(-) diff --git a/smartsim/_core/mli/infrastructure/control/devicemanager.py b/smartsim/_core/mli/infrastructure/control/devicemanager.py index 49f8403b8c..37256581db 100644 --- a/smartsim/_core/mli/infrastructure/control/devicemanager.py +++ b/smartsim/_core/mli/infrastructure/control/devicemanager.py @@ -24,8 +24,8 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -from contextlib import contextmanager import typing as t +from contextlib import contextmanager, _GeneratorContextManager from .....log import get_logger from ...infrastructure.storage.featurestore import FeatureStore @@ -81,11 +81,12 @@ def __contains__(self, key: str) -> bool: return key in self._models @contextmanager - def get(self, key_to_remove: t.Optional[str]): + def get(self, key_to_remove: t.Optional[str]) -> t.Iterator[t.Self]: yield self if key_to_remove is not None: self.remove_model(key_to_remove) + class DeviceManager: def __init__(self, device: WorkerDevice): """An object to manage devices such as GPUs and CPUs. @@ -122,7 +123,7 @@ def get_device( worker: MachineLearningWorkerBase, batch: RequestBatch, feature_stores: dict[str, FeatureStore], - ) -> t.Generator[WorkerDevice, None, None]: + ) -> _GeneratorContextManager[WorkerDevice]: """Get the device managed by this object the model needed to run the batch of requests is diff --git a/smartsim/_core/mli/infrastructure/control/requestdispatcher.py b/smartsim/_core/mli/infrastructure/control/requestdispatcher.py index 0016c18a9b..a4de00a9f0 100644 --- a/smartsim/_core/mli/infrastructure/control/requestdispatcher.py +++ b/smartsim/_core/mli/infrastructure/control/requestdispatcher.py @@ -64,6 +64,7 @@ # Placeholder ModelIdentifier = FeatureStoreKey + class BatchQueue(Queue[InferenceRequest]): def __init__( self, batch_timeout: float, batch_size: int, model_key: ModelIdentifier diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py index 8256ce4f55..e2ce19dd6d 100644 --- a/smartsim/_core/mli/infrastructure/control/workermanager.py +++ b/smartsim/_core/mli/infrastructure/control/workermanager.py @@ -192,10 +192,10 @@ def _on_iteration(self) -> None: ) return device_cm = self._device_manager.get_device( - worker=self._worker, - batch=batch, - feature_stores=self._feature_stores, - ) + worker=self._worker, + batch=batch, + feature_stores=self._feature_stores, + ) except Exception as exc: for request in batch.requests: @@ -240,7 +240,9 @@ def _on_iteration(self) -> None: self._perf_timer.measure_time("execute") try: - transformed_outputs = self._worker.transform_output(batch, execute_result) + transformed_outputs = self._worker.transform_output( + batch, execute_result + ) except Exception as e: for request in batch.requests: exception_handler( diff --git a/tests/dragon/test_request_dispatcher.py b/tests/dragon/test_request_dispatcher.py index f47ef46d7a..768467c245 100644 --- 
a/tests/dragon/test_request_dispatcher.py +++ b/tests/dragon/test_request_dispatcher.py @@ -191,6 +191,7 @@ def service_as_dragon_proc( stdout=dragon_process.Popen.STDOUT, ) + def test_request_dispatcher(prepare_environment: pathlib.Path) -> None: """Test the request dispatcher batching and queueing system @@ -211,7 +212,7 @@ def test_request_dispatcher(prepare_environment: pathlib.Path) -> None: descriptor = base64.b64encode(to_worker_fli_serialized).decode("utf-8") os.environ["_SMARTSIM_REQUEST_QUEUE"] = descriptor - ddict = DDict(1, 1, 2*1024**2) + ddict = DDict(1, 1, 2 * 1024**2) dragon_fs = DragonFeatureStore(ddict) config_loader = EnvironmentConfigLoader( diff --git a/tests/mli/test_device_manager.py b/tests/mli/test_device_manager.py index 12fe2578af..1c8b9172da 100644 --- a/tests/mli/test_device_manager.py +++ b/tests/mli/test_device_manager.py @@ -26,10 +26,26 @@ import typing as t +from smartsim._core.mli.infrastructure.control.devicemanager import ( + DeviceManager, + WorkerDevice, +) +from smartsim._core.mli.infrastructure.storage.featurestore import ( + FeatureStore, + FeatureStoreKey, +) +from smartsim._core.mli.infrastructure.worker.worker import ( + ExecuteResult, + FetchInputResult, + FetchModelResult, + InferenceRequest, + LoadModelResult, + MachineLearningWorkerBase, + RequestBatch, + TransformInputResult, + TransformOutputResult, +) -from smartsim._core.mli.infrastructure.control.devicemanager import DeviceManager, WorkerDevice -from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStore, FeatureStoreKey -from smartsim._core.mli.infrastructure.worker.worker import MachineLearningWorkerBase, ExecuteResult, FetchInputResult, FetchModelResult, InferenceRequest, LoadModelResult, RequestBatch, TransformInputResult, TransformOutputResult class MockWorker(MachineLearningWorkerBase): @staticmethod @@ -38,7 +54,7 @@ def fetch_model( ) -> FetchModelResult: if batch.has_raw_model: return FetchModelResult(batch.raw_model) - return FetchModelResult(b'fetched_model') + return FetchModelResult(b"fetched_model") @staticmethod def load_model( @@ -52,7 +68,7 @@ def transform_input( fetch_results: list[FetchInputResult], mem_pool: "MemoryPool", ) -> TransformInputResult: - return TransformInputResult(b'result', [slice(0,1)], [[1,2]], ["float32"]) + return TransformInputResult(b"result", [slice(0, 1)], [[1, 2]], ["float32"]) @staticmethod def execute( @@ -61,13 +77,13 @@ def execute( transform_result: TransformInputResult, device: str, ) -> ExecuteResult: - return ExecuteResult(b'result', [slice(0,1)]) + return ExecuteResult(b"result", [slice(0, 1)]) @staticmethod def transform_output( batch: RequestBatch, execute_result: ExecuteResult ) -> t.List[TransformOutputResult]: - return [TransformOutputResult(b'result', None, "c", "float32")] + return [TransformOutputResult(b"result", None, "c", "float32")] def test_worker_device(): @@ -95,9 +111,7 @@ def test_device_manager_model_in_request(): tensor_key = FeatureStoreKey(key="key", descriptor="desc") output_key = FeatureStoreKey(key="key", descriptor="desc") - model_key = FeatureStoreKey( - key="model key", descriptor="desc" - ) + model_key = FeatureStoreKey(key="model key", descriptor="desc") request = InferenceRequest( model_key=model_key, @@ -116,7 +130,9 @@ def test_device_manager_model_in_request(): model_key=model_key, ) - with device_manager.get_device(worker=worker, batch=request_batch, feature_stores={}) as returned_device: + with device_manager.get_device( + worker=worker, batch=request_batch, feature_stores={} + ) 
as returned_device: assert returned_device == worker_device assert worker_device.get_model(model_key.key) == b"raw model" @@ -133,9 +149,7 @@ def test_device_manager_model_key(): tensor_key = FeatureStoreKey(key="key", descriptor="desc") output_key = FeatureStoreKey(key="key", descriptor="desc") - model_key = FeatureStoreKey( - key="model key", descriptor="desc" - ) + model_key = FeatureStoreKey(key="model key", descriptor="desc") request = InferenceRequest( model_key=model_key, @@ -154,9 +168,11 @@ def test_device_manager_model_key(): model_key=model_key, ) - with device_manager.get_device(worker=worker, batch=request_batch, feature_stores={}) as returned_device: + with device_manager.get_device( + worker=worker, batch=request_batch, feature_stores={} + ) as returned_device: assert returned_device == worker_device assert worker_device.get_model(model_key.key) == b"fetched_model" - assert model_key.key in worker_device \ No newline at end of file + assert model_key.key in worker_device From c3646d7b477b0aa20448d3acdf25aba9ae343049 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Mon, 26 Aug 2024 13:15:58 -0500 Subject: [PATCH 68/84] Fix mock app --- ex/high_throughput_inference/mock_app.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/ex/high_throughput_inference/mock_app.py b/ex/high_throughput_inference/mock_app.py index 26045f9020..ea72b3dc16 100644 --- a/ex/high_throughput_inference/mock_app.py +++ b/ex/high_throughput_inference/mock_app.py @@ -100,14 +100,11 @@ def run_model(self, model: bytes | str, batch: torch.Tensor): self.perf_timer.measure_time("build_request") request_bytes = MessageHandler.serialize_request(request) self.perf_timer.measure_time("serialize_request") - tensor_bytes = [bytes(tensor.data) for tensor in tensors] - # tensor_bytes = [tensor.reshape(-1).view(numpy.uint8).data for tensor in tensors] self.perf_timer.measure_time("serialize_tensor") with self._to_worker_fli.sendh(timeout=None, stream_channel=self._to_worker_ch) as to_sendh: to_sendh.send_bytes(request_bytes) - for tb in tensor_bytes: - to_sendh.send_bytes(tb) #TODO NOT FAST ENOUGH!!! - # to_sendh.send_bytes(bytes(t.data)) + for tensor in tensors: + to_sendh.send_bytes(tensor.tobytes()) #TODO NOT FAST ENOUGH!!! 
self.perf_timer.measure_time("send") with self._from_worker_ch.recvh(timeout=None) as from_recvh: From c54e8802dace9768625ce89aa6322280bd5148c1 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Mon, 26 Aug 2024 13:28:06 -0500 Subject: [PATCH 69/84] Small change to app --- ex/high_throughput_inference/mock_app.py | 1 + 1 file changed, 1 insertion(+) diff --git a/ex/high_throughput_inference/mock_app.py b/ex/high_throughput_inference/mock_app.py index ea72b3dc16..aaa1ee86ca 100644 --- a/ex/high_throughput_inference/mock_app.py +++ b/ex/high_throughput_inference/mock_app.py @@ -54,6 +54,7 @@ torch.set_num_threads(1) logger = get_logger("App") +logger.info("Started app") CHECK_RESULTS_AND_MAKE_ALL_SLOWER = False From 093d70621efdb007707463dcd000fb8bd2a52d8a Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Mon, 26 Aug 2024 16:53:56 -0500 Subject: [PATCH 70/84] Small change to app --- smartsim/_core/mli/infrastructure/control/workermanager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py index e2ce19dd6d..7f6eb8edbf 100644 --- a/smartsim/_core/mli/infrastructure/control/workermanager.py +++ b/smartsim/_core/mli/infrastructure/control/workermanager.py @@ -107,7 +107,7 @@ def __init__( information among MLI components""" self._device_manager: t.Optional[DeviceManager] = None """Object responsible for model caching and device access""" - self._perf_timer = PerfTimer(prefix="w_", debug=True, timing_on=True) + self._perf_timer = PerfTimer(prefix="w_", debug=False, timing_on=True) """Performance timer""" def _on_start(self) -> None: From d9de5c13f6a91bbc26a80c482525c6d687900fba Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Tue, 27 Aug 2024 13:05:27 -0500 Subject: [PATCH 71/84] Last fixes! 
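The diffs below rename BatchQueue.model_key to model_id, move the error-handling helpers into error_handling.py, and finish the docstrings on the batching and device-management paths. For reference, here is a condensed, illustrative sketch of the flush policy that BatchQueue implements: a queue is released either when it holds batch_size requests or once batch_timeout has elapsed since its first put. Names such as SketchBatchQueue are placeholders for illustration, not the shipped API.

    import time
    import typing as t
    from queue import Queue


    class SketchBatchQueue(Queue):
        """Queue that is ready when full or when its timeout expires."""

        def __init__(self, batch_timeout: float, batch_size: int) -> None:
            super().__init__(maxsize=batch_size)
            self._batch_timeout = batch_timeout
            self._batch_size = batch_size
            self._first_put: t.Optional[float] = None

        def put(
            self, item: t.Any, block: bool = False, timeout: t.Optional[float] = None
        ) -> None:
            # Enqueue first; start the timeout clock only once an item is stored
            super().put(item, block=block, timeout=timeout)
            if self._first_put is None:
                self._first_put = time.time()

        @property
        def ready(self) -> bool:
            if self.empty():
                return False
            elapsed = time.time() - self._first_put if self._first_put else 0.0
            return self.qsize() >= self._batch_size or elapsed >= self._batch_timeout

        def flush(self) -> list:
            # Drain everything queued so far and reset the timeout clock
            items = []
            while not self.empty():
                items.append(self.get_nowait())
            self._first_put = None
            return items


    q = SketchBatchQueue(batch_timeout=0.0, batch_size=2)
    q.put("req-1")
    assert q.ready  # a timeout of 0 releases a partial batch immediately

Similarly, a condensed sketch of the context-manager pattern behind WorkerDevice.get and DeviceManager.get_device in these diffs: the device caches loaded models by key, and a model that was shipped inline with a request is evicted again when the context exits, which is what the device manager tests assert. Again, the class name here is illustrative only.

    import typing as t
    from contextlib import contextmanager


    class SketchWorkerDevice:
        def __init__(self, name: str) -> None:
            self.name = name
            self._models: dict[str, t.Any] = {}

        def add_model(self, key: str, model: t.Any) -> None:
            self._models[key] = model

        def remove_model(self, key: str) -> None:
            del self._models[key]

        def __contains__(self, key: str) -> bool:
            return key in self._models

        @contextmanager
        def get(
            self, key_to_remove: t.Optional[str]
        ) -> t.Iterator["SketchWorkerDevice"]:
            # Hand the device to the caller; evict a request-scoped model afterwards
            yield self
            if key_to_remove is not None:
                self.remove_model(key_to_remove)


    device = SketchWorkerDevice("gpu:0")
    device.add_model("model-sent-with-request", b"raw bytes")
    with device.get("model-sent-with-request") as dev:
        assert "model-sent-with-request" in dev
    assert "model-sent-with-request" not in device
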
--- doc/changelog.md | 1 + .../mock_app_redis.py | 16 +- .../infrastructure/control/devicemanager.py | 6 +- .../control/{commons.py => error_handling.py} | 2 + .../control/requestdispatcher.py | 64 +++---- .../infrastructure/control/workermanager.py | 48 ++--- .../_core/mli/infrastructure/worker/worker.py | 41 ++++- tests/dragon/test_error_handling.py | 59 ++++++- tests/dragon/test_request_dispatcher.py | 164 +++++++++--------- 9 files changed, 243 insertions(+), 158 deletions(-) rename smartsim/_core/mli/infrastructure/control/{commons.py => error_handling.py} (96%) diff --git a/doc/changelog.md b/doc/changelog.md index 964e62b49d..ac09ecf604 100644 --- a/doc/changelog.md +++ b/doc/changelog.md @@ -13,6 +13,7 @@ Jump to: Description +- Add RequestDispatcher and the possibility of batching inference requests - Enable hostname selection for dragon tasks - Remove pydantic dependency from MLI code - Update MLI environment variables using new naming convention diff --git a/ex/high_throughput_inference/mock_app_redis.py b/ex/high_throughput_inference/mock_app_redis.py index c0e67f82df..8978bcea23 100644 --- a/ex/high_throughput_inference/mock_app_redis.py +++ b/ex/high_throughput_inference/mock_app_redis.py @@ -31,6 +31,7 @@ import torch from mpi4py import MPI from smartsim.log import get_logger +from smartsim._core.utils.timings import PerfTimer from smartredis import Client logger = get_logger("App") @@ -69,26 +70,21 @@ def name(self): client = Client(cluster=False, address=None) client.set_model(resnet.name, resnet.model, backend='TORCH', device=args.device.upper()) + perf_timer: PerfTimer = PerfTimer(debug=False, timing_on=timing_on, prefix=f"redis{rank}_") + total_iterations = 100 timings=[] for batch_size in [1, 2, 4, 8, 16, 32, 64, 128]: logger.info(f"Batch size: {batch_size}") for iteration_number in range(total_iterations + int(batch_size==1)): - timing = [batch_size] + perf_timer.start_timings("batch_size", batch_size) logger.info(f"Iteration: {iteration_number}") - start = time.perf_counter() input_name = f"batch_{rank}" output_name = f"result_{rank}" client.put_tensor(name=input_name, data=resnet.get_batch(batch_size).numpy()) client.run_model(name=resnet.name, inputs=[input_name], outputs=[output_name]) result = client.get_tensor(name=output_name) - end = time.perf_counter() - timing.append(end-start) - timings.append(timing) - + perf_timer.end_timings() - timings_np = numpy.asarray(timings) - numpy.save(f"timings_{rank}.npy", timings_np) - for timing in timings: - print(" ".join(str(t) for t in timing)) + perf_timer.print_timings(True) diff --git a/smartsim/_core/mli/infrastructure/control/devicemanager.py b/smartsim/_core/mli/infrastructure/control/devicemanager.py index 37256581db..d716d756e4 100644 --- a/smartsim/_core/mli/infrastructure/control/devicemanager.py +++ b/smartsim/_core/mli/infrastructure/control/devicemanager.py @@ -116,7 +116,7 @@ def _load_model_on_device( model_bytes = worker.fetch_model(batch, feature_stores) loaded_model = worker.load_model(batch, model_bytes, self._device.name) - self._device.add_model(batch.model_key.key, loaded_model.model) + self._device.add_model(batch.model_id.key, loaded_model.model) def get_device( self, @@ -139,8 +139,8 @@ def get_device( # Load model if not already loaded, or # because it is sent with the request - if model_in_request or not batch.model_key.key in self._device: + if model_in_request or not batch.model_id.key in self._device: self._load_model_on_device(worker, batch, feature_stores) - key_to_remove = 
batch.model_key.key if model_in_request else None + key_to_remove = batch.model_id.key if model_in_request else None return self._device.get(key_to_remove) diff --git a/smartsim/_core/mli/infrastructure/control/commons.py b/smartsim/_core/mli/infrastructure/control/error_handling.py similarity index 96% rename from smartsim/_core/mli/infrastructure/control/commons.py rename to smartsim/_core/mli/infrastructure/control/error_handling.py index a40ae014aa..e2c5bcd9e1 100644 --- a/smartsim/_core/mli/infrastructure/control/commons.py +++ b/smartsim/_core/mli/infrastructure/control/error_handling.py @@ -66,3 +66,5 @@ def exception_handler( ) if reply_channel: reply_channel.send(serialized_resp) + else: + logger.warning("Unable to notify client of error without reply_channel") diff --git a/smartsim/_core/mli/infrastructure/control/requestdispatcher.py b/smartsim/_core/mli/infrastructure/control/requestdispatcher.py index a4de00a9f0..d56912a8f0 100644 --- a/smartsim/_core/mli/infrastructure/control/requestdispatcher.py +++ b/smartsim/_core/mli/infrastructure/control/requestdispatcher.py @@ -48,33 +48,31 @@ from .....log import get_logger from ....utils.timings import PerfTimer from ...infrastructure.environmentloader import EnvironmentConfigLoader -from ...infrastructure.storage.featurestore import FeatureStore, FeatureStoreKey +from ...infrastructure.storage.featurestore import FeatureStore from ...infrastructure.worker.worker import ( InferenceRequest, MachineLearningWorkerBase, + ModelIdentifier, RequestBatch, ) -from .commons import exception_handler +from .error_handling import exception_handler if t.TYPE_CHECKING: from smartsim._core.mli.mli_schemas.response.response_capnp import Status logger = get_logger("Request Dispatcher") -# Placeholder -ModelIdentifier = FeatureStoreKey - class BatchQueue(Queue[InferenceRequest]): def __init__( - self, batch_timeout: float, batch_size: int, model_key: ModelIdentifier + self, batch_timeout: float, batch_size: int, model_id: ModelIdentifier ) -> None: """Queue used to store inference requests waiting to be batched and sent to Worker Managers. :param batch_timeout: Time in seconds that has to be waited before flushing a non-full queue. The time of the first item put is 0 seconds. :param batch_size: Total capacity of the queue. - :param model_key: Key of the model which needs to be executed on the queued + :param model_id: Key of the model which needs to be executed on the queued requests """ super().__init__(maxsize=batch_size) @@ -88,8 +86,8 @@ def __init__( self._disposable = False """Whether the queue will not be used again and can be deleted. 
A disposable queue is always full.""" - self._model_key: FeatureStoreKey = model_key - """Key of the model which needs to be executed on the queued requets""" + self._model_id: ModelIdentifier = model_id + """Key of the model which needs to be executed on the queued requests""" self._uid = str(uuid.uuid4()) """Unique ID of queue""" @@ -99,9 +97,9 @@ def uid(self) -> str: return self._uid @property - def model_key(self) -> ModelIdentifier: + def model_id(self) -> ModelIdentifier: """Key of the model which needs to be run on the queued requests""" - return self._model_key + return self._model_id def put( self, @@ -115,11 +113,9 @@ def put( :param timeout: Time (in seconds) to wait if block==True :raises Full: If an item cannot be put on the queue """ - if self.full(): - raise Full + super().put(item, block=block, timeout=timeout) if self._first_put is None: self._first_put = time.time() - super().put(item, block=block, timeout=timeout) @property def _elapsed_time(self) -> float: @@ -168,8 +164,6 @@ def full(self) -> bool: """Return True if the queue has reached its maximum capacity""" if self._disposable: return True - if self._batch_size <= 0: - return False return self.qsize() >= self._batch_size def empty(self) -> bool: @@ -184,6 +178,7 @@ def __init__( batch_size: int, config_loader: EnvironmentConfigLoader, worker_type: t.Type[MachineLearningWorkerBase], + mem_pool_size: int = 2 * 1024**3, ) -> None: """The RequestDispatcher intercepts inference requests, stages them in queues and batches them together before making them available to Worker @@ -195,11 +190,12 @@ def __init__( managers :param config_loader: Object to load configuration from environment :param worker_type: Type of worker to instantiate to batch inputs + :param mem_pool_size: Size of the memory pool used to allocate tensors :raises SmartSimError: If config_loaded.get_queue() does not return a channel """ super().__init__(as_service=True, cooldown=1) self._queues: dict[str, list[BatchQueue]] = {} - """Dict of all batch queues available for a given model key""" + """Dict of all batch queues available for a given model id""" self._active_queues: dict[str, BatchQueue] = {} """Mapping telling which queue is the recipient of requests for a given model key""" @@ -225,7 +221,7 @@ def __init__( """The type of communication channel to construct for callbacks""" self._worker = worker_type() """The worker used to batch inputs""" - self._mem_pool = MemoryPool.attach(dragon_gs_pool.create(2 * 1024**3).sdesc) + self._mem_pool = MemoryPool.attach(dragon_gs_pool.create(mem_pool_size).sdesc) """Memory pool used to share batched input tensors with the Worker Managers""" self._perf_timer = PerfTimer(prefix="r_", debug=False, timing_on=True) """Performance timer""" @@ -316,6 +312,9 @@ def _validate_request(self, request: InferenceRequest) -> bool: return all(checks) def _on_iteration(self) -> None: + """This method is executed repeatedly until ``Service`` shutdown + conditions are satisfied and cooldown is elapsed. + """ try: self._perf_timer.set_active(True) bytes_list: t.List[bytes] = self._incoming_channel.recv() @@ -390,26 +389,25 @@ def task_queue(self) -> DragonQueue: """The queue on which batched requests are placed""" return self._outgoing_queue - def _swap_queue(self, model_key: FeatureStoreKey) -> None: + def _swap_queue(self, model_id: ModelIdentifier) -> None: """Get an empty queue or create a new one and make it the active one for a given model. 
- :param model_key: The key of the model for which the + :param model_id: The id of the model for which the queue has to be swapped """ - - if model_key.key in self._queues: - for queue in self._queues[model_key.key]: + if model_id.key in self._queues: + for queue in self._queues[model_id.key]: if not queue.full(): - self._active_queues[model_key.key] = queue + self._active_queues[model_id.key] = queue return - new_queue = BatchQueue(self._batch_timeout, self._batch_size, model_key) - if model_key.key in self._queues: - self._queues[model_key.key].append(new_queue) + new_queue = BatchQueue(self._batch_timeout, self._batch_size, model_id) + if model_id.key in self._queues: + self._queues[model_id.key].append(new_queue) else: - self._queues[model_key.key] = [new_queue] - self._active_queues[model_key.key] = new_queue + self._queues[model_id.key] = [new_queue] + self._active_queues[model_id.key] = new_queue return def dispatch(self, request: InferenceRequest) -> None: @@ -422,7 +420,7 @@ def dispatch(self, request: InferenceRequest) -> None: tmp_queue: BatchQueue = BatchQueue( batch_timeout=0, batch_size=1, - model_key=FeatureStoreKey(key=tmp_id, descriptor="TMP"), + model_id=ModelIdentifier(key=tmp_id, descriptor="TMP"), ) self._active_queues[tmp_id] = tmp_queue self._queues[tmp_id] = [tmp_queue] @@ -451,7 +449,7 @@ def flush_requests(self) -> None: batch = RequestBatch( requests=queue.flush(), inputs=None, - model_key=queue.model_key, + model_id=queue.model_id, ) finally: self._perf_timer.measure_time("flush_requests") @@ -499,4 +497,8 @@ def flush_requests(self) -> None: self._perf_timer.measure_time("put") def _can_shutdown(self) -> bool: + """Whether the Service can be shut down""" return False + + def __del__(self) -> None: + self._mem_pool.destroy() diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py index 7f6eb8edbf..da65412d23 100644 --- a/smartsim/_core/mli/infrastructure/control/workermanager.py +++ b/smartsim/_core/mli/infrastructure/control/workermanager.py @@ -51,7 +51,7 @@ RequestBatch, ) from ...message_handler import MessageHandler -from .commons import build_failure_reply, exception_handler +from .error_handling import build_failure_reply, exception_handler from .devicemanager import DeviceManager, WorkerDevice if t.TYPE_CHECKING: @@ -75,21 +75,20 @@ def __init__( ) -> None: """Initialize the WorkerManager - :param config_loader: Environment config loader that loads the task queue and - feature store + :param config_loader: Environment config loader for loading queues + and feature stores :param worker_type: The type of worker to manage - :param dispatcher_queue: Queue from which the batched requests have to be pulled + :param dispatcher_queue: Queue from which the batched requests are pulled :param as_service: Specifies run-once or run-until-complete behavior of service :param cooldown: Number of seconds to wait before shutting down after shutdown criteria are met - :param comm_channel_type: The type of communication channel used for callbacks :param device: The device on which the Worker should run. Every worker manager is assigned one single GPU (if available), thus the device should have no index. 
""" super().__init__(as_service, cooldown) self._dispatcher_queue = dispatcher_queue - """The dispatcher queue the manager monitors for new tasks""" + """The Dispatcher queue that the WorkerManager monitors for new batches""" self._worker = worker_type() """The ML Worker implementation""" self._callback_factory = config_loader._callback_factory @@ -111,6 +110,8 @@ def __init__( """Performance timer""" def _on_start(self) -> None: + """Called on initial entry into Service `execute` event loop before + `_on_iteration` is invoked.""" self._device_manager = DeviceManager(WorkerDevice(self._device)) def _check_feature_stores(self, batch: RequestBatch) -> bool: @@ -121,8 +122,8 @@ def _check_feature_stores(self, batch: RequestBatch) -> bool: """ # collect all feature stores required by the request fs_model: t.Set[str] = set() - if batch.model_key.key: - fs_model = {batch.model_key.descriptor} + if batch.model_id.key: + fs_model = {batch.model_id.descriptor} fs_inputs = {key.descriptor for key in batch.input_keys} fs_outputs = {key.descriptor for key in batch.output_keys} @@ -180,23 +181,30 @@ def _on_iteration(self) -> None: ) return + if self._device_manager is None: + for request in batch.requests: + msg = "No Device Manager found. WorkerManager._on_start() " + "must be called after initialization. If possible, " + "you should use `WorkerManager.execute()` instead of " + "directly calling `_on_iteration()`." + try: + self._dispatcher_queue.put(batch) + except Exception: + msg += "\nThe batch could not be put back in the queue " + "and will not be processed." + exception_handler( + RuntimeError(msg), + request.callback, + "Error acquiring device manager", + ) + return + try: - if self._device_manager is None: - for request in batch.requests: - exception_handler( - ValueError( - "No Device Manager available: did you call _on_start()?" - ), - request.callback, - "Error acquiring device manager", - ) - return device_cm = self._device_manager.get_device( worker=self._worker, batch=batch, feature_stores=self._feature_stores, ) - except Exception as exc: for request in batch.requests: exception_handler( @@ -210,7 +218,7 @@ def _on_iteration(self) -> None: with device_cm as device: try: - model_result = LoadModelResult(device.get_model(batch.model_key.key)) + model_result = LoadModelResult(device.get_model(batch.model_id.key)) except Exception as exc: for request in batch.requests: exception_handler( diff --git a/smartsim/_core/mli/infrastructure/worker/worker.py b/smartsim/_core/mli/infrastructure/worker/worker.py index 008b6202be..6ce3323407 100644 --- a/smartsim/_core/mli/infrastructure/worker/worker.py +++ b/smartsim/_core/mli/infrastructure/worker/worker.py @@ -47,6 +47,8 @@ logger = get_logger(__name__) +# Placeholder +ModelIdentifier = FeatureStoreKey class InferenceRequest: """Internal representation of an inference request from a client""" @@ -181,24 +183,38 @@ class RequestBatch: requests: list[InferenceRequest] inputs: t.Optional[TransformInputResult] - model_key: FeatureStoreKey + model_id: ModelIdentifier @property def has_valid_requests(self) -> bool: + """Returns whether the batch contains at least one request. 
+ + :return: True if at least one request is available + """ return len(self.requests) > 0 @property def has_raw_model(self) -> bool: + """Returns whether the batch has a raw model + + :return: True if the batch has a raw model + """ return self.raw_model is not None @property def raw_model(self) -> t.Optional[t.Any]: + """Returns the raw model to use to execute for this batch + if it is available. + :return: A model if available, otherwise None""" if self.has_valid_requests: return self.requests[0].raw_model return None @property def input_keys(self) -> t.List[FeatureStoreKey]: + """All input keys available in this batch's requests + + :return: All input keys belonging to requests in this batch""" keys = [] for request in self.requests: keys.extend(request.input_keys) @@ -207,6 +223,9 @@ def input_keys(self) -> t.List[FeatureStoreKey]: @property def output_keys(self) -> t.List[FeatureStoreKey]: + """All output keys available in this batch's requests + + :return: All output keys belonging to requests in this batch""" keys = [] for request in self.requests: keys.extend(request.output_keys) @@ -299,7 +318,11 @@ def fetch_model( """Given a resource key, retrieve the raw model from a feature store :param batch: The batch of requests that triggered the pipeline :param feature_stores: Available feature stores used for persistence - :return: Raw bytes of the model""" + :return: Raw bytes of the model + :raises SmartSimError: if neither a key or a model are provided or the + model cannot be retrieved from the feature store + :raises ValueError: if a feature store is not available and a raw + model is not provided""" # All requests in the same batch share the model if batch.raw_model: @@ -308,12 +331,12 @@ def fetch_model( if not feature_stores: raise ValueError("Feature store is required for model retrieval") - if batch.model_key is None: + if batch.model_id is None: raise SmartSimError( "Key must be provided to retrieve model from feature store" ) - key, fsd = batch.model_key.key, batch.model_key.descriptor + key, fsd = batch.model_id.key, batch.model_id.descriptor try: feature_store = feature_stores[fsd] @@ -331,7 +354,9 @@ def fetch_inputs( and input metadata :param batch: The batch of requests that triggered the pipeline :param feature_stores: Available feature stores used for persistence - :return: the fetched input""" + :return: the fetched input + :raises ValueError: If neither an input key or an input tensor are provided + :raises SmartSimError: If a tensor for a given key cannot be retrieved""" fetch_results = [] for request in batch.requests: if request.raw_inputs: @@ -354,7 +379,7 @@ def fetch_inputs( except KeyError as ex: logger.exception(ex) raise SmartSimError( - f"Model could not be retrieved with key {fs_key.key}" + f"Tensor could not be retrieved with key {fs_key.key}" ) from ex fetch_results.append( FetchInputResult(data, meta=None) @@ -376,7 +401,9 @@ def place_output( :param request: The request that triggered the pipeline :param execute_result: Results from inference :param feature_stores: Available feature stores used for persistence - :return: A collection of keys that were placed in the feature store""" + :return: A collection of keys that were placed in the feature store + :raises ValueError: If a feature store is not provided + """ if not feature_stores: raise ValueError("Feature store is required for output persistence") diff --git a/tests/dragon/test_error_handling.py b/tests/dragon/test_error_handling.py index 113f7ccba0..9544768447 100644 --- 
a/tests/dragon/test_error_handling.py +++ b/tests/dragon/test_error_handling.py @@ -142,7 +142,7 @@ def setup_worker_manager_model_bytes( request_batch = RequestBatch( [request], TransformInputResult(b"transformed", [slice(0, 1)], [[1, 2]], ["float32"]), - model_id, + model_id=model_id, ) dispatcher_task_queue.put(request_batch) @@ -184,12 +184,12 @@ def setup_worker_manager_model_key( tensor_key = FeatureStoreKey(key="key", descriptor=app_feature_store.descriptor) output_key = FeatureStoreKey(key="key", descriptor=app_feature_store.descriptor) - model_key = FeatureStoreKey( + model_id = FeatureStoreKey( key="model key", descriptor=app_feature_store.descriptor ) request = InferenceRequest( - model_key=model_key, + model_key=model_id, callback=None, raw_inputs=None, input_keys=[tensor_key], @@ -201,7 +201,7 @@ def setup_worker_manager_model_key( request_batch = RequestBatch( [request], TransformInputResult(b"transformed", [slice(0, 1)], [[1, 2]], ["float32"]), - model_key=model_key, + model_id=model_id, ) dispatcher_task_queue.put(request_batch) @@ -252,6 +252,51 @@ def setup_request_dispatcher_model_bytes( return request_dispatcher, integrated_worker_type +@pytest.fixture +def setup_request_dispatcher_model_key( + test_dir, + monkeypatch: pytest.MonkeyPatch, + backbone_descriptor: str, + app_feature_store: FeatureStore, +): + integrated_worker_type = IntegratedTorchWorker + + chan = Channel.make_process_local() + queue = FLInterface(main_ch=chan) + monkeypatch.setenv( + "_SMARTSIM_REQUEST_QUEUE", du.B64.bytes_to_str(queue.serialize()) + ) + # Put backbone descriptor into env var for the `EnvironmentConfigLoader` + monkeypatch.setenv("_SMARTSIM_INFRA_BACKBONE", backbone_descriptor) + + config_loader = EnvironmentConfigLoader( + featurestore_factory=DragonFeatureStore.from_descriptor, + callback_factory=FileSystemCommChannel.from_descriptor, + queue_factory=DragonFLIChannel.from_descriptor, + ) + + request_dispatcher = RequestDispatcher( + batch_timeout=0, + batch_size=0, + config_loader=config_loader, + worker_type=integrated_worker_type, + ) + request_dispatcher._on_start() + + tensor_key = MessageHandler.build_tensor_key("key", app_feature_store.descriptor) + output_key = MessageHandler.build_tensor_key("key", app_feature_store.descriptor) + model_key = MessageHandler.build_model_key( + key="model key", feature_store_descriptor=app_feature_store.descriptor + ) + request = MessageHandler.build_request( + test_dir, model_key, [tensor_key], [output_key], [], None + ) + ser_request = MessageHandler.serialize_request(request) + + request_dispatcher._incoming_channel.send(ser_request) + + return request_dispatcher, integrated_worker_type + def mock_pipeline_stage(monkeypatch: pytest.MonkeyPatch, integrated_worker, stage): def mock_stage(*args, **kwargs): raise ValueError(f"Simulated error in {stage}") @@ -259,7 +304,7 @@ def mock_stage(*args, **kwargs): monkeypatch.setattr(integrated_worker, stage, mock_stage) mock_reply_fn = MagicMock() monkeypatch.setattr( - "smartsim._core.mli.infrastructure.control.commons.build_failure_reply", + "smartsim._core.mli.infrastructure.control.error_handling.build_failure_reply", mock_reply_fn, ) @@ -371,7 +416,7 @@ def test_wm_pipeline_stage_errors_handled( "setup_request_dispatcher", [ pytest.param("setup_request_dispatcher_model_bytes"), - # pytest.param("setup_worker_manager_model_key"), + pytest.param("setup_request_dispatcher_model_key"), ], ) @pytest.mark.parametrize( @@ -424,7 +469,7 @@ def test_exception_handling_helper(monkeypatch: pytest.MonkeyPatch): 
mock_reply_fn = MagicMock() monkeypatch.setattr( - "smartsim._core.mli.infrastructure.control.commons.build_failure_reply", + "smartsim._core.mli.infrastructure.control.error_handling.build_failure_reply", mock_reply_fn, ) diff --git a/tests/dragon/test_request_dispatcher.py b/tests/dragon/test_request_dispatcher.py index 768467c245..8ccd55f634 100644 --- a/tests/dragon/test_request_dispatcher.py +++ b/tests/dragon/test_request_dispatcher.py @@ -24,6 +24,7 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +import gc import io import logging import pathlib @@ -109,7 +110,6 @@ def mock_messages( comm_channel_root_dir: pathlib.Path, ) -> None: """Mock event producer for triggering the inference pipeline""" - logger.info("Mocking messages") feature_store_root_dir.mkdir(parents=True, exist_ok=True) comm_channel_root_dir.mkdir(parents=True, exist_ok=True) @@ -117,11 +117,9 @@ def mock_messages( model_bytes = model_path.read_bytes() model_key = str(feature_store_root_dir / "model_fs.pt") - logger.info("Putting model on FS") feature_store[model_key] = model_bytes for iteration_number in range(2): - logger.info(f"Message #{iteration_number}") channel_key = Channel.make_process_local().serialize() callback_channel = DragonCommChannel(channel_key) @@ -212,7 +210,7 @@ def test_request_dispatcher(prepare_environment: pathlib.Path) -> None: descriptor = base64.b64encode(to_worker_fli_serialized).decode("utf-8") os.environ["_SMARTSIM_REQUEST_QUEUE"] = descriptor - ddict = DDict(1, 1, 2 * 1024**2) + ddict = DDict(1, 2, 4 * 1024**2) dragon_fs = DragonFeatureStore(ddict) config_loader = EnvironmentConfigLoader( @@ -227,6 +225,7 @@ def test_request_dispatcher(prepare_environment: pathlib.Path) -> None: batch_size=2, config_loader=config_loader, worker_type=integrated_worker_type, + mem_pool_size=2*1024**2, ) worker_queue = config_loader.get_queue() @@ -238,86 +237,91 @@ def test_request_dispatcher(prepare_environment: pathlib.Path) -> None: request_dispatcher._on_start() - batch: t.Optional[RequestBatch] = None - mem_allocs = [] - tensors = [] - fs_path = test_path / f"feature_store" - comm_path = test_path / f"comm_store" - model_key = str(fs_path / "model_fs.pt") - - # create a mock client application to populate the request queue - msg_pump = mp.Process( - target=mock_messages, - args=( - worker_queue, - dragon_fs, - fs_path, - comm_path, - ), - ) + for _ in range(2): + batch: t.Optional[RequestBatch] = None + mem_allocs = [] + tensors = [] + fs_path = test_path / f"feature_store" + comm_path = test_path / f"comm_store" + model_key = str(fs_path / "model_fs.pt") + + # create a mock client application to populate the request queue + msg_pump = mp.Process( + target=mock_messages, + args=( + worker_queue, + dragon_fs, + fs_path, + comm_path, + ), + ) + + msg_pump.start() - msg_pump.start() + time.sleep(1) - time.sleep(1) + for attempts in range(15): + try: + request_dispatcher._on_iteration() + batch = request_dispatcher.task_queue.get(timeout=1) + break + except Empty: + continue + except Exception as exc: + raise exc - for attempts in range(15): try: - request_dispatcher._on_iteration() - batch = request_dispatcher.task_queue.get(timeout=1) - break - except Empty: - continue + assert batch is not None + assert batch.has_valid_requests + + transform_result = batch.inputs + for transformed, dims, dtype in zip( + transform_result.transformed, transform_result.dims, transform_result.dtypes + ): + mem_alloc = 
MemoryAlloc.attach(transformed) + mem_allocs.append(mem_alloc) + itemsize = np.empty((1), dtype=dtype).itemsize + tensors.append( + torch.from_numpy( + np.frombuffer( + mem_alloc.get_memview()[0 : np.prod(dims) * itemsize], + dtype=dtype, + ).reshape(dims) + ) + ) + + assert len(batch.requests) == 2 + assert batch.model_id.key == model_key + assert model_key in request_dispatcher._queues + assert model_key in request_dispatcher._active_queues + assert len(request_dispatcher._queues[model_key]) == 1 + assert request_dispatcher._queues[model_key][0].empty() + assert request_dispatcher._queues[model_key][0].model_id.key == model_key + assert len(tensors) == 1 + assert tensors[0].shape == torch.Size([2, 2]) + + for tensor in tensors: + for sample_idx in range(tensor.shape[0]): + tensor_in = tensor[sample_idx] + tensor_out = (sample_idx + 1) * torch.ones((2,), dtype=torch.float32) + assert torch.equal(tensor_in, tensor_out) + except Exception as exc: raise exc + finally: + for mem_alloc in mem_allocs: + mem_alloc.free() - try: - assert batch is not None - assert batch.has_valid_requests - - transform_result = batch.inputs - for transformed, dims, dtype in zip( - transform_result.transformed, transform_result.dims, transform_result.dtypes - ): - mem_alloc = MemoryAlloc.attach(transformed) - mem_allocs.append(mem_alloc) - itemsize = np.empty((1), dtype=dtype).itemsize - tensors.append( - torch.from_numpy( - np.frombuffer( - mem_alloc.get_memview()[0 : np.prod(dims) * itemsize], - dtype=dtype, - ).reshape(dims) - ) - ) - - assert len(batch.requests) == 2 - assert batch.model_key.key == model_key - assert model_key in request_dispatcher._queues - assert model_key in request_dispatcher._active_queues - assert len(request_dispatcher._queues[model_key]) == 1 - assert request_dispatcher._queues[model_key][0].empty() - assert request_dispatcher._queues[model_key][0].model_key.key == model_key - assert len(tensors) == 1 - assert tensors[0].shape == torch.Size([2, 2]) - - for tensor in tensors: - for sample_idx in range(tensor.shape[0]): - tensor_in = tensor[sample_idx] - tensor_out = (sample_idx + 1) * torch.ones((2,), dtype=torch.float32) - assert torch.equal(tensor_in, tensor_out) - - except Exception as exc: - raise exc - finally: - for mem_alloc in mem_allocs: - mem_alloc.free() - - msg_pump.kill() - - request_dispatcher._active_queues[model_key].make_disposable() - assert request_dispatcher._active_queues[model_key].can_be_removed - - request_dispatcher._on_iteration() - - assert model_key not in request_dispatcher._active_queues - assert model_key not in request_dispatcher._queues + msg_pump.kill() + + request_dispatcher._active_queues[model_key].make_disposable() + assert request_dispatcher._active_queues[model_key].can_be_removed + + request_dispatcher._on_iteration() + + assert model_key not in request_dispatcher._active_queues + assert model_key not in request_dispatcher._queues + + # Try to remove the dispatcher and free the memory + del request_dispatcher + gc.collect() \ No newline at end of file From eb03f0835c7820f091dd7f9cf3530a324e7ec119 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Tue, 27 Aug 2024 13:19:22 -0500 Subject: [PATCH 72/84] Avoid using t.Self --- smartsim/_core/mli/infrastructure/control/devicemanager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/smartsim/_core/mli/infrastructure/control/devicemanager.py b/smartsim/_core/mli/infrastructure/control/devicemanager.py index d716d756e4..74d278c9a9 100644 --- 
a/smartsim/_core/mli/infrastructure/control/devicemanager.py +++ b/smartsim/_core/mli/infrastructure/control/devicemanager.py @@ -81,7 +81,7 @@ def __contains__(self, key: str) -> bool: return key in self._models @contextmanager - def get(self, key_to_remove: t.Optional[str]) -> t.Iterator[t.Self]: + def get(self, key_to_remove: t.Optional[str]) -> t.Iterator["WorkerDevice"]: yield self if key_to_remove is not None: self.remove_model(key_to_remove) From 1e1b8c910a7f46752ebeea61d6838b7dacaba50c Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Tue, 27 Aug 2024 13:31:00 -0500 Subject: [PATCH 73/84] Remove unused timing --- ex/high_throughput_inference/mock_app.py | 1 - 1 file changed, 1 deletion(-) diff --git a/ex/high_throughput_inference/mock_app.py b/ex/high_throughput_inference/mock_app.py index aaa1ee86ca..0e43caf6a7 100644 --- a/ex/high_throughput_inference/mock_app.py +++ b/ex/high_throughput_inference/mock_app.py @@ -101,7 +101,6 @@ def run_model(self, model: bytes | str, batch: torch.Tensor): self.perf_timer.measure_time("build_request") request_bytes = MessageHandler.serialize_request(request) self.perf_timer.measure_time("serialize_request") - self.perf_timer.measure_time("serialize_tensor") with self._to_worker_fli.sendh(timeout=None, stream_channel=self._to_worker_ch) as to_sendh: to_sendh.send_bytes(request_bytes) for tensor in tensors: From be0b8e0ea675e16e9cfc723ff7f06bd1f9d2a31f Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Tue, 27 Aug 2024 13:32:33 -0500 Subject: [PATCH 74/84] Split timing for request and tensors --- ex/high_throughput_inference/mock_app.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ex/high_throughput_inference/mock_app.py b/ex/high_throughput_inference/mock_app.py index 0e43caf6a7..517d18fb2f 100644 --- a/ex/high_throughput_inference/mock_app.py +++ b/ex/high_throughput_inference/mock_app.py @@ -103,10 +103,10 @@ def run_model(self, model: bytes | str, batch: torch.Tensor): self.perf_timer.measure_time("serialize_request") with self._to_worker_fli.sendh(timeout=None, stream_channel=self._to_worker_ch) as to_sendh: to_sendh.send_bytes(request_bytes) + self.perf_timer.measure_time("send_request") for tensor in tensors: to_sendh.send_bytes(tensor.tobytes()) #TODO NOT FAST ENOUGH!!! 
- - self.perf_timer.measure_time("send") + self.perf_timer.measure_time("send_tensors") with self._from_worker_ch.recvh(timeout=None) as from_recvh: resp = from_recvh.recv_bytes(timeout=None) self.perf_timer.measure_time("receive_response") From bc11d92b84bc63c244a6137780c765db2a11d42c Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Tue, 27 Aug 2024 14:00:24 -0500 Subject: [PATCH 75/84] Pin watchdog to <5 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 512da78de9..709913eda8 100644 --- a/setup.py +++ b/setup.py @@ -177,7 +177,7 @@ class BuildError(Exception): "filelock>=3.4.2", "protobuf~=3.20", "jinja2>=3.1.2", - "watchdog>=4.0.0", + "watchdog>=4.0.0,<5", "pycapnp==2.0.0", "pydantic==1.10.14", "pyzmq>=25.1.2", From b04f4c155fff42d805aac2b98e9050ab55a2d388 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Tue, 27 Aug 2024 14:07:08 -0500 Subject: [PATCH 76/84] Style --- smartsim/_core/mli/infrastructure/control/devicemanager.py | 2 +- smartsim/_core/mli/infrastructure/control/workermanager.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/smartsim/_core/mli/infrastructure/control/devicemanager.py b/smartsim/_core/mli/infrastructure/control/devicemanager.py index 74d278c9a9..3570bd51ed 100644 --- a/smartsim/_core/mli/infrastructure/control/devicemanager.py +++ b/smartsim/_core/mli/infrastructure/control/devicemanager.py @@ -25,7 +25,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import typing as t -from contextlib import contextmanager, _GeneratorContextManager +from contextlib import _GeneratorContextManager, contextmanager from .....log import get_logger from ...infrastructure.storage.featurestore import FeatureStore diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py index da65412d23..54a245b813 100644 --- a/smartsim/_core/mli/infrastructure/control/workermanager.py +++ b/smartsim/_core/mli/infrastructure/control/workermanager.py @@ -51,8 +51,8 @@ RequestBatch, ) from ...message_handler import MessageHandler -from .error_handling import build_failure_reply, exception_handler from .devicemanager import DeviceManager, WorkerDevice +from .error_handling import build_failure_reply, exception_handler if t.TYPE_CHECKING: from smartsim._core.mli.mli_schemas.response.response_capnp import Status From 47088f09debf3e4643ce7b15cc5c83106e3a4b4e Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Tue, 27 Aug 2024 14:08:12 -0500 Subject: [PATCH 77/84] Other styling fixes --- smartsim/_core/mli/infrastructure/worker/worker.py | 1 + tests/dragon/test_error_handling.py | 5 ++--- tests/dragon/test_request_dispatcher.py | 12 ++++++++---- 3 files changed, 11 insertions(+), 7 deletions(-) diff --git a/smartsim/_core/mli/infrastructure/worker/worker.py b/smartsim/_core/mli/infrastructure/worker/worker.py index 6ce3323407..25e4dc49f7 100644 --- a/smartsim/_core/mli/infrastructure/worker/worker.py +++ b/smartsim/_core/mli/infrastructure/worker/worker.py @@ -50,6 +50,7 @@ # Placeholder ModelIdentifier = FeatureStoreKey + class InferenceRequest: """Internal representation of an inference request from a client""" diff --git a/tests/dragon/test_error_handling.py b/tests/dragon/test_error_handling.py index 9544768447..b20424866a 100644 --- a/tests/dragon/test_error_handling.py +++ b/tests/dragon/test_error_handling.py @@ -184,9 +184,7 @@ def setup_worker_manager_model_key( tensor_key = FeatureStoreKey(key="key", 
descriptor=app_feature_store.descriptor) output_key = FeatureStoreKey(key="key", descriptor=app_feature_store.descriptor) - model_id = FeatureStoreKey( - key="model key", descriptor=app_feature_store.descriptor - ) + model_id = FeatureStoreKey(key="model key", descriptor=app_feature_store.descriptor) request = InferenceRequest( model_key=model_id, @@ -297,6 +295,7 @@ def setup_request_dispatcher_model_key( return request_dispatcher, integrated_worker_type + def mock_pipeline_stage(monkeypatch: pytest.MonkeyPatch, integrated_worker, stage): def mock_stage(*args, **kwargs): raise ValueError(f"Simulated error in {stage}") diff --git a/tests/dragon/test_request_dispatcher.py b/tests/dragon/test_request_dispatcher.py index 8ccd55f634..c8d97dd7ed 100644 --- a/tests/dragon/test_request_dispatcher.py +++ b/tests/dragon/test_request_dispatcher.py @@ -225,7 +225,7 @@ def test_request_dispatcher(prepare_environment: pathlib.Path) -> None: batch_size=2, config_loader=config_loader, worker_type=integrated_worker_type, - mem_pool_size=2*1024**2, + mem_pool_size=2 * 1024**2, ) worker_queue = config_loader.get_queue() @@ -276,7 +276,9 @@ def test_request_dispatcher(prepare_environment: pathlib.Path) -> None: transform_result = batch.inputs for transformed, dims, dtype in zip( - transform_result.transformed, transform_result.dims, transform_result.dtypes + transform_result.transformed, + transform_result.dims, + transform_result.dtypes, ): mem_alloc = MemoryAlloc.attach(transformed) mem_allocs.append(mem_alloc) @@ -303,7 +305,9 @@ def test_request_dispatcher(prepare_environment: pathlib.Path) -> None: for tensor in tensors: for sample_idx in range(tensor.shape[0]): tensor_in = tensor[sample_idx] - tensor_out = (sample_idx + 1) * torch.ones((2,), dtype=torch.float32) + tensor_out = (sample_idx + 1) * torch.ones( + (2,), dtype=torch.float32 + ) assert torch.equal(tensor_in, tensor_out) except Exception as exc: @@ -324,4 +328,4 @@ def test_request_dispatcher(prepare_environment: pathlib.Path) -> None: # Try to remove the dispatcher and free the memory del request_dispatcher - gc.collect() \ No newline at end of file + gc.collect() From 0609eec4437680bb3fc810558b767df1c13ce006 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Tue, 27 Aug 2024 14:20:30 -0500 Subject: [PATCH 78/84] Move tests that require dragon.MemoryPool --- tests/{mli => dragon}/test_core_machine_learning_worker.py | 4 ++-- tests/{mli => dragon}/test_device_manager.py | 2 ++ tests/{mli => dragon}/test_torch_worker.py | 4 ++-- 3 files changed, 6 insertions(+), 4 deletions(-) rename tests/{mli => dragon}/test_core_machine_learning_worker.py (99%) rename tests/{mli => dragon}/test_device_manager.py (98%) rename tests/{mli => dragon}/test_torch_worker.py (98%) diff --git a/tests/mli/test_core_machine_learning_worker.py b/tests/dragon/test_core_machine_learning_worker.py similarity index 99% rename from tests/mli/test_core_machine_learning_worker.py rename to tests/dragon/test_core_machine_learning_worker.py index 7ef4ab259b..145fe5b2cd 100644 --- a/tests/mli/test_core_machine_learning_worker.py +++ b/tests/dragon/test_core_machine_learning_worker.py @@ -42,8 +42,8 @@ from .featurestore import FileSystemFeatureStore, MemoryFeatureStore -# The tests in this file belong to the group_a group -pytestmark = pytest.mark.group_b +# The tests in this file belong to the group_dragon group +pytestmark = pytest.mark.group_dragon # retrieved from pytest fixtures is_dragon = ( diff --git a/tests/mli/test_device_manager.py b/tests/dragon/test_device_manager.py 
similarity index 98% rename from tests/mli/test_device_manager.py rename to tests/dragon/test_device_manager.py index 1c8b9172da..b89f286c86 100644 --- a/tests/mli/test_device_manager.py +++ b/tests/dragon/test_device_manager.py @@ -46,6 +46,8 @@ TransformOutputResult, ) +# The tests in this file belong to the dragon group +pytestmark = pytest.mark.dragon class MockWorker(MachineLearningWorkerBase): @staticmethod diff --git a/tests/mli/test_torch_worker.py b/tests/dragon/test_torch_worker.py similarity index 98% rename from tests/mli/test_torch_worker.py rename to tests/dragon/test_torch_worker.py index 1e8bba7e33..4ff4fb9e55 100644 --- a/tests/mli/test_torch_worker.py +++ b/tests/dragon/test_torch_worker.py @@ -45,8 +45,8 @@ from smartsim.log import get_logger logger = get_logger(__name__) -# The tests in this file belong to the group_a group -pytestmark = pytest.mark.group_a +# The tests in this file belong to the group_dragon group +pytestmark = pytest.mark.group_dragon # simple MNIST in PyTorch From 275e102963339d473c865109565aaa127f6a09b7 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Tue, 27 Aug 2024 17:51:27 -0500 Subject: [PATCH 79/84] Update tests --- .../test_core_machine_learning_worker.py | 94 +++++++++++-------- tests/dragon/test_device_manager.py | 5 +- tests/dragon/test_torch_worker.py | 83 +++++++++++----- 3 files changed, 116 insertions(+), 66 deletions(-) diff --git a/tests/dragon/test_core_machine_learning_worker.py b/tests/dragon/test_core_machine_learning_worker.py index 145fe5b2cd..5b6056e5b7 100644 --- a/tests/dragon/test_core_machine_learning_worker.py +++ b/tests/dragon/test_core_machine_learning_worker.py @@ -35,6 +35,7 @@ from smartsim._core.mli.infrastructure.worker.worker import ( InferenceRequest, MachineLearningWorkerCore, + RequestBatch, TransformInputResult, TransformOutputResult, ) @@ -42,8 +43,8 @@ from .featurestore import FileSystemFeatureStore, MemoryFeatureStore -# The tests in this file belong to the group_dragon group -pytestmark = pytest.mark.group_dragon +# The tests in this file belong to the dragon group +pytestmark = pytest.mark.dragon # retrieved from pytest fixtures is_dragon = ( @@ -94,9 +95,11 @@ def test_fetch_model_disk(persist_torch_model: pathlib.Path, test_dir: str) -> N fsd = feature_store.descriptor feature_store[str(persist_torch_model)] = persist_torch_model.read_bytes() - request = InferenceRequest(model_key=FeatureStoreKey(key=key, descriptor=fsd)) + model_key = FeatureStoreKey(key=key, descriptor=fsd) + request = InferenceRequest(model_key=model_key) + batch = RequestBatch([request], None, model_key) - fetch_result = worker.fetch_model(request, {fsd: feature_store}) + fetch_result = worker.fetch_model(batch, {fsd: feature_store}) assert fetch_result.model_bytes assert fetch_result.model_bytes == persist_torch_model.read_bytes() @@ -110,10 +113,12 @@ def test_fetch_model_disk_missing() -> None: key = "/path/that/doesnt/exist" - request = InferenceRequest(model_key=FeatureStoreKey(key=key, descriptor=fsd)) + model_key = FeatureStoreKey(key=key, descriptor=fsd) + request = InferenceRequest(model_key=model_key) + batch = RequestBatch([request], None, model_key) with pytest.raises(sse.SmartSimError) as ex: - worker.fetch_model(request, {fsd: feature_store}) + worker.fetch_model(batch, {fsd: feature_store}) # ensure the error message includes key-identifying information assert key in ex.value.args[0] @@ -133,10 +138,11 @@ def test_fetch_model_feature_store(persist_torch_model: pathlib.Path) -> None: fsd = feature_store.descriptor 
feature_store[key] = persist_torch_model.read_bytes() - request = InferenceRequest( - model_key=FeatureStoreKey(key=key, descriptor=feature_store.descriptor) - ) - fetch_result = worker.fetch_model(request, {fsd: feature_store}) + model_key = FeatureStoreKey(key=key, descriptor=feature_store.descriptor) + request = InferenceRequest(model_key=model_key) + batch = RequestBatch([request], None, model_key) + + fetch_result = worker.fetch_model(batch, {fsd: feature_store}) assert fetch_result.model_bytes assert fetch_result.model_bytes == persist_torch_model.read_bytes() @@ -150,13 +156,13 @@ def test_fetch_model_feature_store_missing() -> None: feature_store = MemoryFeatureStore() fsd = feature_store.descriptor - request = InferenceRequest( - model_key=FeatureStoreKey(key=key, descriptor=feature_store.descriptor) - ) + model_key = FeatureStoreKey(key=key, descriptor=feature_store.descriptor) + request = InferenceRequest(model_key=model_key) + batch = RequestBatch([request], None, model_key) # todo: consider that raising this exception shows impl. replace... with pytest.raises(sse.SmartSimError) as ex: - worker.fetch_model(request, {fsd: feature_store}) + worker.fetch_model(batch, {fsd: feature_store}) # ensure the error message includes key-identifying information assert key in ex.value.args[0] @@ -173,11 +179,11 @@ def test_fetch_model_memory(persist_torch_model: pathlib.Path) -> None: fsd = feature_store.descriptor feature_store[key] = persist_torch_model.read_bytes() - request = InferenceRequest( - model_key=FeatureStoreKey(key=key, descriptor=feature_store.descriptor) - ) + model_key = FeatureStoreKey(key=key, descriptor=feature_store.descriptor) + request = InferenceRequest(model_key=model_key) + batch = RequestBatch([request], None, model_key) - fetch_result = worker.fetch_model(request, {fsd: feature_store}) + fetch_result = worker.fetch_model(batch, {fsd: feature_store}) assert fetch_result.model_bytes assert fetch_result.model_bytes == persist_torch_model.read_bytes() @@ -193,12 +199,16 @@ def test_fetch_input_disk(persist_torch_tensor: pathlib.Path) -> None: request = InferenceRequest( input_keys=[FeatureStoreKey(key=tensor_name, descriptor=fsd)] ) + + model_key = FeatureStoreKey(key="test-model", descriptor=fsd) + batch = RequestBatch([request], None, model_key) + worker = MachineLearningWorkerCore feature_store[tensor_name] = persist_torch_tensor.read_bytes() - fetch_result = worker.fetch_inputs(request, {fsd: feature_store}) - assert fetch_result.inputs is not None + fetch_result = worker.fetch_inputs(batch, {fsd: feature_store}) + assert fetch_result[0].inputs is not None def test_fetch_input_disk_missing() -> None: @@ -212,8 +222,11 @@ def test_fetch_input_disk_missing() -> None: request = InferenceRequest(input_keys=[FeatureStoreKey(key=key, descriptor=fsd)]) + model_key = FeatureStoreKey(key="test-model", descriptor=fsd) + batch = RequestBatch([request], None, model_key) + with pytest.raises(sse.SmartSimError) as ex: - worker.fetch_inputs(request, {fsd: feature_store}) + worker.fetch_inputs(batch, {fsd: feature_store}) # ensure the error message includes key-identifying information assert key[0] in ex.value.args[0] @@ -236,9 +249,12 @@ def test_fetch_input_feature_store(persist_torch_tensor: pathlib.Path) -> None: # put model bytes into the feature store feature_store[tensor_name] = persist_torch_tensor.read_bytes() - fetch_result = worker.fetch_inputs(request, {fsd: feature_store}) - assert fetch_result.inputs - assert list(fetch_result.inputs)[0][:10] == 
persist_torch_tensor.read_bytes()[:10] + model_key = FeatureStoreKey(key="test-model", descriptor=fsd) + batch = RequestBatch([request], None, model_key) + + fetch_result = worker.fetch_inputs(batch, {fsd: feature_store}) + assert fetch_result[0].inputs + assert list(fetch_result[0].inputs)[0][:10] == persist_torch_tensor.read_bytes()[:10] @pytest.mark.skipif(not torch_available, reason="Torch backend is not installed") @@ -269,9 +285,12 @@ def test_fetch_multi_input_feature_store(persist_torch_tensor: pathlib.Path) -> ] ) - fetch_result = worker.fetch_inputs(request, {fsd: feature_store}) + model_key = FeatureStoreKey(key="test-model", descriptor=fsd) + batch = RequestBatch([request], None, model_key) - raw_bytes = list(fetch_result.inputs) + fetch_result = worker.fetch_inputs(batch, {fsd: feature_store}) + + raw_bytes = list(fetch_result[0].inputs) assert raw_bytes assert raw_bytes[0][:10] == persist_torch_tensor.read_bytes()[:10] assert raw_bytes[1][:10] == body2[:10] @@ -288,8 +307,11 @@ def test_fetch_input_feature_store_missing() -> None: fsd = feature_store.descriptor request = InferenceRequest(input_keys=[FeatureStoreKey(key=key, descriptor=fsd)]) + model_key = FeatureStoreKey(key="test-model", descriptor=fsd) + batch = RequestBatch([request], None, model_key) + with pytest.raises(sse.SmartSimError) as ex: - worker.fetch_inputs(request, {fsd: feature_store}) + worker.fetch_inputs(batch, {fsd: feature_store}) # ensure the error message includes key-identifying information assert key in ex.value.args[0] @@ -307,21 +329,11 @@ def test_fetch_input_memory(persist_torch_tensor: pathlib.Path) -> None: feature_store[key] = persist_torch_tensor.read_bytes() request = InferenceRequest(input_keys=[FeatureStoreKey(key=key, descriptor=fsd)]) - fetch_result = worker.fetch_inputs(request, {fsd: feature_store}) - assert fetch_result.inputs is not None - - -def test_batch_requests() -> None: - """Verify batch requests handles an empty data set gracefully""" - worker = MachineLearningWorkerCore - result = TransformInputResult([]) - - request = InferenceRequest(batch_size=10) + model_key = FeatureStoreKey(key="test-model", descriptor=fsd) + batch = RequestBatch([request], None, model_key) - with pytest.raises(NotImplementedError): - # NOTE: we expect this to fail since it's not yet implemented. - # TODO: once implemented, replace this expectation of failure... - worker.batch_requests(request, result) + fetch_result = worker.fetch_inputs(batch, {fsd: feature_store}) + assert fetch_result[0].inputs is not None def test_place_outputs() -> None: diff --git a/tests/dragon/test_device_manager.py b/tests/dragon/test_device_manager.py index b89f286c86..71ea844ed8 100644 --- a/tests/dragon/test_device_manager.py +++ b/tests/dragon/test_device_manager.py @@ -24,6 +24,7 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+import pytest import typing as t from smartsim._core.mli.infrastructure.control.devicemanager import ( @@ -129,7 +130,7 @@ def test_device_manager_model_in_request(): request_batch = RequestBatch( [request], TransformInputResult(b"transformed", [slice(0, 1)], [[1, 2]], ["float32"]), - model_key=model_key, + model_id=model_key, ) with device_manager.get_device( @@ -167,7 +168,7 @@ def test_device_manager_model_key(): request_batch = RequestBatch( [request], TransformInputResult(b"transformed", [slice(0, 1)], [[1, 2]], ["float32"]), - model_key=model_key, + model_id=model_key, ) with device_manager.get_device( diff --git a/tests/dragon/test_torch_worker.py b/tests/dragon/test_torch_worker.py index 4ff4fb9e55..2d10af623d 100644 --- a/tests/dragon/test_torch_worker.py +++ b/tests/dragon/test_torch_worker.py @@ -26,8 +26,15 @@ import io +import numpy as np import pytest import torch +import typing as t + +dragon = pytest.importorskip("dragon") +import dragon.globalservices.pool as dragon_gs_pool +from dragon.managed_memory import MemoryPool, MemoryAlloc + from torch import nn from torch.nn import functional as F @@ -39,14 +46,15 @@ FetchModelResult, InferenceRequest, LoadModelResult, + RequestBatch, TransformInputResult, ) from smartsim._core.mli.message_handler import MessageHandler from smartsim.log import get_logger logger = get_logger(__name__) -# The tests in this file belong to the group_dragon group -pytestmark = pytest.mark.group_dragon +# The tests in this file belong to the dragon group +pytestmark = pytest.mark.dragon # simple MNIST in PyTorch @@ -60,7 +68,7 @@ def __init__(self): self.fc1 = nn.Linear(9216, 128) self.fc2 = nn.Linear(128, 10) - def forward(self, x): + def forward(self, x, y): x = self.conv1(x) x = F.relu(x) x = self.conv2(x) @@ -86,7 +94,7 @@ def get_batch() -> torch.Tensor: def create_torch_model(): n = Net() example_forward_input = get_batch() - module = torch.jit.trace(n, example_forward_input) + module = torch.jit.trace(n, [example_forward_input, example_forward_input]) model_buffer = io.BytesIO() torch.jit.save(module, model_buffer) return model_buffer.getvalue() @@ -112,18 +120,23 @@ def get_request() -> InferenceRequest: batch_size=0, ) +def get_request_batch_from_request(request: InferenceRequest, inputs: t.Optional[TransformInputResult] = None) -> RequestBatch: + + return RequestBatch([request], inputs, request.model_key) sample_request: InferenceRequest = get_request() +sample_request_batch: RequestBatch = get_request_batch_from_request(sample_request) worker = TorchWorker() def test_load_model(mlutils) -> None: fetch_model_result = FetchModelResult(sample_request.raw_model) load_model_result = worker.load_model( - sample_request, fetch_model_result, mlutils.get_test_device().lower() + sample_request_batch, fetch_model_result, mlutils.get_test_device().lower() ) assert load_model_result.model( + get_batch().to(torch_device[mlutils.get_test_device().lower()]), get_batch().to(torch_device[mlutils.get_test_device().lower()]) ).shape == torch.Size((20, 10)) @@ -133,44 +146,68 @@ def test_transform_input(mlutils) -> None: sample_request.raw_inputs, sample_request.input_meta ) + mem_pool = MemoryPool.attach(dragon_gs_pool.create(1024**2).sdesc) + transform_input_result = worker.transform_input( - sample_request, fetch_input_result, mlutils.get_test_device().lower() + sample_request_batch, [fetch_input_result], mem_pool ) - assert all( - transformed.shape == get_batch().shape - for transformed in transform_input_result.transformed - ) + batch = get_batch().numpy() 
+ assert transform_input_result.slices[0] == slice(0, batch.shape[0]) + + for tensor_index in range(2): + assert torch.Size(transform_input_result.dims[tensor_index]) == batch.shape + assert transform_input_result.dtypes[tensor_index] == str(batch.dtype) + mem_alloc = MemoryAlloc.attach(transform_input_result.transformed[tensor_index]) + itemsize = batch.itemsize + tensor = torch.from_numpy( + np.frombuffer( + mem_alloc.get_memview()[0 : np.prod(transform_input_result.dims[tensor_index]) * itemsize], + dtype=transform_input_result.dtypes[tensor_index], + ).reshape(transform_input_result.dims[tensor_index]) + ) + + assert torch.equal(tensor, torch.from_numpy(sample_request.raw_inputs[tensor_index])) + + mem_pool.destroy() def test_execute(mlutils) -> None: load_model_result = LoadModelResult( Net().to(torch_device[mlutils.get_test_device().lower()]) ) - transform_result = TransformInputResult( - [ - get_batch().to(torch_device[mlutils.get_test_device().lower()]) - for _ in range(2) - ] + fetch_input_result = FetchInputResult( + sample_request.raw_inputs, sample_request.input_meta + ) + + request_batch = get_request_batch_from_request(sample_request, fetch_input_result) + + mem_pool = MemoryPool.attach(dragon_gs_pool.create(1024**2).sdesc) + + transform_result = worker.transform_input( + request_batch, [fetch_input_result], mem_pool ) - execute_result = worker.execute(sample_request, load_model_result, transform_result) + execute_result = worker.execute(request_batch, load_model_result, transform_result, mlutils.get_test_device().lower()) assert all( result.shape == torch.Size((20, 10)) for result in execute_result.predictions ) + mem_pool.destroy() + def test_transform_output(mlutils): - execute_result = ExecuteResult([torch.rand((20, 10)) for _ in range(2)]) + tensors = [torch.rand((20, 10)) for _ in range(2)] + execute_result = ExecuteResult(tensors, [slice(0, 20)]) transformed_output = worker.transform_output( - sample_request, execute_result, torch_device[mlutils.get_test_device().lower()] + sample_request_batch, execute_result ) - assert transformed_output.outputs == [ - item.numpy().tobytes() for item in execute_result.predictions + assert transformed_output[0].outputs == [ + item.numpy().tobytes() for item in tensors ] - assert transformed_output.shape == None - assert transformed_output.order == "c" - assert transformed_output.dtype == "float32" + assert transformed_output[0].shape == None + assert transformed_output[0].order == "c" + assert transformed_output[0].dtype == "float32" From b220d99e1180d36c354852b40b8b9e0f52cc4580 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Tue, 27 Aug 2024 17:58:11 -0500 Subject: [PATCH 80/84] Style --- .../test_core_machine_learning_worker.py | 4 +- tests/dragon/test_device_manager.py | 4 +- tests/dragon/test_torch_worker.py | 46 +++++++++++-------- 3 files changed, 33 insertions(+), 21 deletions(-) diff --git a/tests/dragon/test_core_machine_learning_worker.py b/tests/dragon/test_core_machine_learning_worker.py index 5b6056e5b7..d576997ea9 100644 --- a/tests/dragon/test_core_machine_learning_worker.py +++ b/tests/dragon/test_core_machine_learning_worker.py @@ -254,7 +254,9 @@ def test_fetch_input_feature_store(persist_torch_tensor: pathlib.Path) -> None: fetch_result = worker.fetch_inputs(batch, {fsd: feature_store}) assert fetch_result[0].inputs - assert list(fetch_result[0].inputs)[0][:10] == persist_torch_tensor.read_bytes()[:10] + assert ( + list(fetch_result[0].inputs)[0][:10] == persist_torch_tensor.read_bytes()[:10] + ) 
@pytest.mark.skipif(not torch_available, reason="Torch backend is not installed") diff --git a/tests/dragon/test_device_manager.py b/tests/dragon/test_device_manager.py index 71ea844ed8..fccb9b42f9 100644 --- a/tests/dragon/test_device_manager.py +++ b/tests/dragon/test_device_manager.py @@ -24,9 +24,10 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import pytest import typing as t +import pytest + from smartsim._core.mli.infrastructure.control.devicemanager import ( DeviceManager, WorkerDevice, @@ -50,6 +51,7 @@ # The tests in this file belong to the dragon group pytestmark = pytest.mark.dragon + class MockWorker(MachineLearningWorkerBase): @staticmethod def fetch_model( diff --git a/tests/dragon/test_torch_worker.py b/tests/dragon/test_torch_worker.py index 2d10af623d..88e800240f 100644 --- a/tests/dragon/test_torch_worker.py +++ b/tests/dragon/test_torch_worker.py @@ -25,16 +25,15 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import io +import typing as t import numpy as np import pytest import torch -import typing as t dragon = pytest.importorskip("dragon") import dragon.globalservices.pool as dragon_gs_pool -from dragon.managed_memory import MemoryPool, MemoryAlloc - +from dragon.managed_memory import MemoryAlloc, MemoryPool from torch import nn from torch.nn import functional as F @@ -120,10 +119,14 @@ def get_request() -> InferenceRequest: batch_size=0, ) -def get_request_batch_from_request(request: InferenceRequest, inputs: t.Optional[TransformInputResult] = None) -> RequestBatch: + +def get_request_batch_from_request( + request: InferenceRequest, inputs: t.Optional[TransformInputResult] = None +) -> RequestBatch: return RequestBatch([request], inputs, request.model_key) + sample_request: InferenceRequest = get_request() sample_request_batch: RequestBatch = get_request_batch_from_request(sample_request) worker = TorchWorker() @@ -137,7 +140,7 @@ def test_load_model(mlutils) -> None: assert load_model_result.model( get_batch().to(torch_device[mlutils.get_test_device().lower()]), - get_batch().to(torch_device[mlutils.get_test_device().lower()]) + get_batch().to(torch_device[mlutils.get_test_device().lower()]), ).shape == torch.Size((20, 10)) @@ -161,13 +164,17 @@ def test_transform_input(mlutils) -> None: mem_alloc = MemoryAlloc.attach(transform_input_result.transformed[tensor_index]) itemsize = batch.itemsize tensor = torch.from_numpy( - np.frombuffer( - mem_alloc.get_memview()[0 : np.prod(transform_input_result.dims[tensor_index]) * itemsize], - dtype=transform_input_result.dtypes[tensor_index], - ).reshape(transform_input_result.dims[tensor_index]) - ) - - assert torch.equal(tensor, torch.from_numpy(sample_request.raw_inputs[tensor_index])) + np.frombuffer( + mem_alloc.get_memview()[ + 0 : np.prod(transform_input_result.dims[tensor_index]) * itemsize + ], + dtype=transform_input_result.dtypes[tensor_index], + ).reshape(transform_input_result.dims[tensor_index]) + ) + + assert torch.equal( + tensor, torch.from_numpy(sample_request.raw_inputs[tensor_index]) + ) mem_pool.destroy() @@ -188,7 +195,12 @@ def test_execute(mlutils) -> None: request_batch, [fetch_input_result], mem_pool ) - execute_result = worker.execute(request_batch, load_model_result, transform_result, mlutils.get_test_device().lower()) + execute_result = worker.execute( + request_batch, + load_model_result, + transform_result, + mlutils.get_test_device().lower(), + ) assert all( 
result.shape == torch.Size((20, 10)) for result in execute_result.predictions @@ -201,13 +213,9 @@ def test_transform_output(mlutils): tensors = [torch.rand((20, 10)) for _ in range(2)] execute_result = ExecuteResult(tensors, [slice(0, 20)]) - transformed_output = worker.transform_output( - sample_request_batch, execute_result - ) + transformed_output = worker.transform_output(sample_request_batch, execute_result) - assert transformed_output[0].outputs == [ - item.numpy().tobytes() for item in tensors - ] + assert transformed_output[0].outputs == [item.numpy().tobytes() for item in tensors] assert transformed_output[0].shape == None assert transformed_output[0].order == "c" assert transformed_output[0].dtype == "float32" From d3ab796004cb1f83f07b4c6f136a78b6956c3f82 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Tue, 27 Aug 2024 18:05:39 -0500 Subject: [PATCH 81/84] Import or skip dragon --- tests/dragon/test_core_machine_learning_worker.py | 2 ++ tests/dragon/test_device_manager.py | 1 + 2 files changed, 3 insertions(+) diff --git a/tests/dragon/test_core_machine_learning_worker.py b/tests/dragon/test_core_machine_learning_worker.py index d576997ea9..940c76c8a1 100644 --- a/tests/dragon/test_core_machine_learning_worker.py +++ b/tests/dragon/test_core_machine_learning_worker.py @@ -27,6 +27,8 @@ import pathlib import time +dragon = pytest.importorskip("dragon") + import pytest import torch diff --git a/tests/dragon/test_device_manager.py b/tests/dragon/test_device_manager.py index fccb9b42f9..2b7fa1f549 100644 --- a/tests/dragon/test_device_manager.py +++ b/tests/dragon/test_device_manager.py @@ -27,6 +27,7 @@ import typing as t import pytest +dragon = pytest.importorskip("dragon") from smartsim._core.mli.infrastructure.control.devicemanager import ( DeviceManager, From 14e627e8e91c3e63d664027bc255ec5edd319219 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Tue, 27 Aug 2024 18:12:46 -0500 Subject: [PATCH 82/84] Isort --- tests/dragon/test_device_manager.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/dragon/test_device_manager.py b/tests/dragon/test_device_manager.py index 2b7fa1f549..8edeb60fbb 100644 --- a/tests/dragon/test_device_manager.py +++ b/tests/dragon/test_device_manager.py @@ -27,6 +27,7 @@ import typing as t import pytest + dragon = pytest.importorskip("dragon") from smartsim._core.mli.infrastructure.control.devicemanager import ( From bbe97ff8899dbdf5573df19707e321fa09140192 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Tue, 27 Aug 2024 18:20:14 -0500 Subject: [PATCH 83/84] Fix pytest import --- tests/dragon/test_core_machine_learning_worker.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/dragon/test_core_machine_learning_worker.py b/tests/dragon/test_core_machine_learning_worker.py index 940c76c8a1..231a971241 100644 --- a/tests/dragon/test_core_machine_learning_worker.py +++ b/tests/dragon/test_core_machine_learning_worker.py @@ -27,9 +27,10 @@ import pathlib import time +import pytest + dragon = pytest.importorskip("dragon") -import pytest import torch import smartsim.error as sse From eea793e95f5971250da98f758d1c6ee247d3782c Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Tue, 27 Aug 2024 18:33:20 -0500 Subject: [PATCH 84/84] Adapt syntax for python 3.9 --- smartsim/_core/utils/timings.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/smartsim/_core/utils/timings.py b/smartsim/_core/utils/timings.py index 34595c8586..a61a243220 100644 --- a/smartsim/_core/utils/timings.py +++ 
b/smartsim/_core/utils/timings.py @@ -56,13 +56,13 @@ def _add_label_to_timings(self, label: str) -> None: self._timings[label] = [] @staticmethod - def _format_number(number: float | int) -> str: + def _format_number(number: t.Union[float, int]) -> str: return f"{number:0.4e}" def start_timings( self, first_label: t.Optional[str] = None, - first_value: t.Optional[float | int] = None, + first_value: t.Optional[t.Union[float, int]] = None, ) -> None: if self._timing_on: if first_label is not None and first_value is not None: @@ -86,7 +86,7 @@ def end_timings(self) -> None: def _make_label(self, label: str) -> str: return self._prefix + label - def _get_delta(self) -> float | int: + def _get_delta(self) -> t.Union[float, int]: if self._interm is None: return 0 return time.perf_counter() - self._interm
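The closing patch trades PEP 604 unions for `typing.Union` so that the annotations in `smartsim/_core/utils/timings.py` still evaluate under Python 3.9. A minimal, self-contained illustration of the failure mode and the portable spelling (the `fmt` helper below is hypothetical, not part of the patch series):

    import typing as t

    # Without `from __future__ import annotations`, annotations are evaluated
    # at definition time. On Python 3.9 the expression `float | int` raises
    # "TypeError: unsupported operand type(s) for |" because builtin types only
    # gained `__or__` support in Python 3.10:
    #
    #     def fmt(number: float | int) -> str:  # fails to import on 3.9
    #         return f"{number:0.4e}"

    def fmt(number: t.Union[float, int]) -> str:
        # Portable spelling that evaluates on every supported interpreter.
        return f"{number:0.4e}"

    print(fmt(3.14159))  # -> 3.1416e+00

Deferring evaluation with `from __future__ import annotations` would also let the `|` syntax import on 3.9, but `t.Union` additionally keeps the hints usable for runtime introspection on older interpreters.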
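For the reworked dragon tests, the read-back pattern repeated in `test_torch_worker.py` and `test_request_dispatcher.py` is: attach to the serialized allocation produced by `transform_input`, wrap its memoryview with NumPy using the recorded dims and dtypes, and rebuild a torch tensor. A condensed sketch of that pattern, assuming a running Dragon environment; the `tensor_from_alloc` helper is illustrative and not part of the patches:

    import typing as t

    import numpy as np
    import torch
    from dragon.managed_memory import MemoryAlloc


    def tensor_from_alloc(descriptor: bytes, dims: t.List[int], dtype: str) -> torch.Tensor:
        # Attach to the allocation the worker wrote into the shared memory pool.
        mem_alloc = MemoryAlloc.attach(descriptor)
        try:
            itemsize = np.empty((1,), dtype=dtype).itemsize
            nbytes = int(np.prod(dims)) * itemsize
            # NumPy view over the shared buffer, reshaped to the dims recorded
            # in the TransformInputResult.
            view = np.frombuffer(mem_alloc.get_memview()[0:nbytes], dtype=dtype).reshape(dims)
            # Copy out so the allocation can be freed before the tensor is used.
            return torch.from_numpy(view.copy())
        finally:
            mem_alloc.free()

The tests instead keep the allocations alive until after the assertions and free them in a `finally` block; the copy here simply makes the sketch safe to use outside that lifetime.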