
Commit 93d2380

Add file system descriptor to tensor & model keys
1 parent eace71e commit 93d2380

20 files changed: +438 −263 lines

doc/changelog.md

Lines changed: 1 addition & 0 deletions
@@ -13,6 +13,7 @@ Jump to:
 
 Description
 
+- Enable dynamic feature store selection
 - Add TorchWorker first implementation and mock inference app example
 - Add EnvironmentConfigLoader for ML Worker Manager
 - Add Model schema with model metadata included

smartsim/_core/mli/infrastructure/control/workermanager.py

Lines changed: 78 additions & 136 deletions
@@ -25,18 +25,9 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 import sys
-
-# isort: off
-import dragon
-from dragon import fli
-
-# isort: on
-
 import time
 import typing as t
 
-import numpy as np
-
 from .....error import SmartSimError
 from .....log import get_logger
 from ....entrypoints.service import Service
@@ -54,108 +45,28 @@
 from ...mli_schemas.response.response_capnp import Response
 
 if t.TYPE_CHECKING:
-    from dragon.fli import FLInterface
-
-    from smartsim._core.mli.mli_schemas.model.model_capnp import Model
     from smartsim._core.mli.mli_schemas.response.response_capnp import StatusEnum
 
 logger = get_logger(__name__)
 
 
-def deserialize_message(
-    data_blob: bytes,
-    channel_type: t.Type[CommChannelBase],
-    device: t.Literal["cpu", "gpu"],
-) -> InferenceRequest:
-    """Deserialize a message from a byte stream into an InferenceRequest
-    :param data_blob: The byte stream to deserialize"""
-    # todo: consider moving to XxxCore and only making
-    # workers implement the inputs and model conversion?
-
-    # alternatively, consider passing the capnproto models
-    # to this method instead of the data_blob...
-
-    # something is definitely wrong here... client shouldn't have to touch
-    # callback (or batch size)
-
-    request = MessageHandler.deserialize_request(data_blob)
-    # return request
-    model_key: t.Optional[str] = None
-    model_bytes: t.Optional[Model] = None
-
-    if request.model.which() == "key":
-        model_key = request.model.key.key
-    elif request.model.which() == "data":
-        model_bytes = request.model.data
-
-    callback_key = request.replyChannel.reply
-
-    # todo: shouldn't this be `CommChannel.find` instead of `DragonCommChannel`
-    comm_channel = channel_type(callback_key)
-    # comm_channel = DragonCommChannel(request.replyChannel)
-
-    input_keys: t.Optional[t.List[str]] = None
-    input_bytes: t.Optional[t.List[bytes]] = (
-        None  # these will really be tensors already
-    )
-
-    input_meta: t.List[t.Any] = []
-
-    if request.input.which() == "keys":
-        input_keys = [input_key.key for input_key in request.input.keys]
-    elif request.input.which() == "data":
-        input_bytes = [data.blob for data in request.input.data]
-        input_meta = [data.tensorDescriptor for data in request.input.data]
-
-    inference_request = InferenceRequest(
-        model_key=model_key,
-        callback=comm_channel,
-        raw_inputs=input_bytes,
-        input_meta=input_meta,
-        input_keys=input_keys,
-        raw_model=model_bytes,
-        batch_size=0,
-    )
-    return inference_request
-
-
 def build_failure_reply(status: "StatusEnum", message: str) -> Response:
+    """Build a response indicating a failure occurred
+    :param status: The status of the response
+    :param message: The error message to include in the response"""
     return MessageHandler.build_response(
         status=status,  # todo: need to indicate correct status
         message=message,  # todo: decide what these will be
-        result=[],
+        result=None,
         custom_attributes=None,
     )
 
 
-def prepare_outputs(reply: InferenceReply) -> t.List[t.Any]:
-    prepared_outputs: t.List[t.Any] = []
-    if reply.output_keys:
-        for key in reply.output_keys:
-            if not key:
-                continue
-            msg_key = MessageHandler.build_tensor_key(key)
-            prepared_outputs.append(msg_key)
-    elif reply.outputs:
-        arrays: t.List[np.ndarray[t.Any, np.dtype[t.Any]]] = [
-            output.numpy() for output in reply.outputs
-        ]
-        for tensor in arrays:
-            # todo: need to have the output attributes specified in the req?
-            # maybe, add `MessageHandler.dtype_of(tensor)`?
-            # can `build_tensor` do dtype and shape?
-            msg_tensor = MessageHandler.build_tensor(
-                tensor,
-                "c",
-                "float32",
-                [1],
-            )
-            prepared_outputs.append(msg_tensor)
-    return prepared_outputs
-
-
-def build_reply(reply: InferenceReply) -> Response:
-    results = prepare_outputs(reply)
+def build_reply(worker: MachineLearningWorkerBase, reply: InferenceReply) -> Response:
+    """Builds a response for a successful inference request
+    :param worker: A worker to process the reply with
+    :param reply: The internal representation of the reply"""
+    results = worker.prepare_outputs(reply)
 
     return MessageHandler.build_response(
         status="complete",
@@ -191,10 +102,6 @@ def __init__(
 
         self._task_queue: t.Optional[CommChannelBase] = config_loader.get_queue()
         """the queue the manager monitors for new tasks"""
-        self._feature_store: t.Optional[FeatureStore] = (
-            config_loader.get_feature_store()
-        )
-        """a feature store to retrieve models from"""
        self._worker = worker
         """The ML Worker implementation"""
         self._comm_channel_type = comm_channel_type
@@ -203,37 +110,68 @@ def __init__(
         """Device on which workers need to run"""
         self._cached_models: dict[str, t.Any] = {}
         """Dictionary of previously loaded models"""
+        self._feature_stores = config_loader.get_feature_stores()
+        """A collection of attached feature stores"""
+
+    def _check_feature_stores(self, request: InferenceRequest) -> bool:
+        """Ensures that all feature stores required by the request are available
+        :param request: The request to validate"""
+        # collect all feature stores required by the request
+        fs_model = {request.model_key.descriptor}
+        fs_inputs = {key.descriptor for key in request.input_keys}
+        fs_outputs = {key.descriptor for key in request.output_keys}
+
+        # identify which feature stores are requested and unknown
+        fs_desired = fs_model | fs_inputs | fs_outputs
+        fs_actual = {key for key in self._feature_stores}
+        fs_missing = fs_desired - fs_actual
+
+        # exit if all desired feature stores are not available
+        if fs_missing:
+            logger.error(f"Missing feature store(s): {fs_missing}")
+            return False
 
-    def _validate_request(self, request: InferenceRequest) -> bool:
-        """Ensure the request can be processed.
-        :param request: The request to validate
-        :return: True if the request is valid, False otherwise"""
-        if not self._feature_store:
-            if request.model_key:
-                logger.error("Unable to load model by key without feature store")
-                return False
+        return True
 
-            if request.input_keys:
-                logger.error("Unable to load inputs by key without feature store")
-                return False
+    def _check_model(self, request: InferenceRequest) -> bool:
+        """Ensure that a model is available for the request
+        :param request: The request to validate"""
+        if request.model_key or request.raw_model:
+            return True
 
-            if request.output_keys:
-                logger.error("Unable to persist outputs by key without feature store")
-                return False
+        logger.error("Unable to continue without model bytes or feature store key")
+        return False
 
-        if not request.model_key and not request.raw_model:
-            logger.error("Unable to continue without model bytes or feature store key")
-            return False
+    def _check_inputs(self, request: InferenceRequest) -> bool:
+        """Ensure that inputs are available for the request
+        :param request: The request to validate"""
+        if request.input_keys or request.raw_inputs:
+            return True
 
-        if not request.input_keys and not request.raw_inputs:
-            logger.error("Unable to continue without input bytes or feature store keys")
-            return False
+        logger.error("Unable to continue without input bytes or feature store keys")
+        return False
 
-        if request.callback is None:
-            logger.error("No callback channel provided in request")
-            return False
+    def _check_callback(self, request: InferenceRequest) -> bool:
+        """Ensure that a callback channel is available for the request
+        :param request: The request to validate"""
+        if request.callback is not None:
+            return True
 
-        return True
+        logger.error("No callback channel provided in request")
+        return False
+
+    def _validate_request(self, request: InferenceRequest) -> bool:
+        """Ensure the request can be processed.
+        :param request: The request to validate
+        :return: True if the request is valid, False otherwise"""
+        checks = [
+            self._check_feature_stores(request),
+            self._check_model(request),
+            self._check_inputs(request),
+            self._check_callback(request),
+        ]
+
+        return all(checks)
 
     def _on_iteration(self) -> None:
         """Executes calls to the machine learning worker implementation to complete
@@ -249,8 +187,8 @@ def _on_iteration(self) -> None:
         request_bytes: bytes = self._task_queue.recv()
 
         interm = time.perf_counter()  # timing
-        request = deserialize_message(
-            request_bytes, self._comm_channel_type, self._device
+        request = self._worker.deserialize_message(
+            request_bytes, self._comm_channel_type
         )
         if not self._validate_request(request):
             return
@@ -262,18 +200,21 @@ def _on_iteration(self) -> None:
             if request.model_key is None:
                 # A valid request should never get here.
                 raise ValueError("Could not read model key")
-            if request.model_key in self._cached_models:
+
+            if request.model_key.key in self._cached_models:
                 timings.append(time.perf_counter() - interm)  # timing
                 interm = time.perf_counter()  # timing
-                model_result = LoadModelResult(self._cached_models[request.model_key])
+                model_result = LoadModelResult(
+                    self._cached_models[request.model_key.key]
+                )
 
             else:
                 fetch_model_result = None
                 while True:
                     try:
                         interm = time.perf_counter()  # timing
                         fetch_model_result = self._worker.fetch_model(
-                            request, self._feature_store
+                            request, self._feature_stores
                         )
                     except KeyError:
                         time.sleep(0.1)
@@ -287,16 +228,17 @@ def _on_iteration(self) -> None:
                 model_result = self._worker.load_model(
                     request, fetch_model_result, self._device
                 )
-                self._cached_models[request.model_key] = model_result.model
+                self._cached_models[request.model_key.key] = model_result.model
         else:
-            fetch_model_result = self._worker.fetch_model(request, None)
+            fetch_model_result = self._worker.fetch_model(request, {})
             model_result = self._worker.load_model(
                 request, fetch_result=fetch_model_result, device=self._device
             )
 
         timings.append(time.perf_counter() - interm)  # timing
         interm = time.perf_counter()  # timing
-        fetch_input_result = self._worker.fetch_inputs(request, self._feature_store)
+
+        fetch_input_result = self._worker.fetch_inputs(request, self._feature_stores)
 
         timings.append(time.perf_counter() - interm)  # timing
         interm = time.perf_counter()  # timing
@@ -324,7 +266,7 @@ def _on_iteration(self) -> None:
         interm = time.perf_counter()  # timing
         if request.output_keys:
             reply.output_keys = self._worker.place_output(
-                request, transformed_output, self._feature_store
+                request, transformed_output, self._feature_stores
             )
         else:
             reply.outputs = transformed_output.outputs
@@ -341,7 +283,7 @@ def _on_iteration(self) -> None:
         if reply.outputs is None or not reply.outputs:
             response = build_failure_reply("fail", "no-results")
 
-        response = build_reply(reply)
+        response = build_reply(self._worker, reply)
 
         timings.append(time.perf_counter() - interm)  # timing
         interm = time.perf_counter()  # timing
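
The validation rework above reduces the feature store check to set arithmetic: gather the descriptor of every key the request references, subtract the descriptors of the attached stores, and reject the request if anything is left over. A minimal standalone sketch of that idea; the FakeKey/FakeRequest stand-ins are illustrative only and not part of the SmartSim API:

import typing as t
from dataclasses import dataclass, field


@dataclass
class FakeKey:
    """Illustrative stand-in for a key naming both an item and its feature store"""
    key: str
    descriptor: str


@dataclass
class FakeRequest:
    """Illustrative stand-in for an inference request"""
    model_key: FakeKey
    input_keys: t.List[FakeKey] = field(default_factory=list)
    output_keys: t.List[FakeKey] = field(default_factory=list)


def missing_stores(request: FakeRequest, attached: t.Set[str]) -> t.Set[str]:
    """Return the descriptors a request needs but that are not attached"""
    desired = {request.model_key.descriptor}
    desired |= {k.descriptor for k in request.input_keys}
    desired |= {k.descriptor for k in request.output_keys}
    return desired - attached


req = FakeRequest(model_key=FakeKey("my-model", "fs-A"), input_keys=[FakeKey("x", "fs-B")])
print(missing_stores(req, attached={"fs-A"}))  # {'fs-B'} -> the request would be rejected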

smartsim/_core/mli/infrastructure/environmentloader.py

Lines changed: 31 additions & 6 deletions
@@ -31,10 +31,15 @@
 
 from dragon.fli import FLInterface  # pylint: disable=all
 
+from smartsim.error.errors import SmartSimError
+from smartsim.log import get_logger
 from smartsim._core.mli.comm.channel.dragonfli import DragonFLIChannel
 from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStore
 
 
+logger = get_logger(__name__)
+
+
 class EnvironmentConfigLoader:
     """
     Facilitates the loading of a FeatureStore and Queue
@@ -47,15 +52,35 @@ def __init__(self) -> None:
         )
         self._queue_descriptor: t.Optional[str] = os.getenv("SSQueue", None)
         self.feature_store: t.Optional[FeatureStore] = None
+        self.feature_stores: t.Optional[t.Dict[str, FeatureStore]] = None
         self.queue: t.Optional[DragonFLIChannel] = None
 
-    def get_feature_store(self) -> t.Optional[FeatureStore]:
-        """Loads the Feature Store previously set in SSFeatureStore"""
-        if self._feature_store_descriptor is not None:
-            self.feature_store = pickle.loads(
-                base64.b64decode(self._feature_store_descriptor)
+    def _load_feature_store(self, env_var: str) -> FeatureStore:
+        """Load a feature store from a descriptor
+        :param env_var: The environment variable holding the descriptor
+        :returns: The hydrated feature store"""
+        logger.debug(f"Loading feature store from env: {env_var}")
+
+        value = os.getenv(env_var)
+        if not value:
+            raise SmartSimError(f"Empty feature store descriptor in environment: {env_var}")
+
+        try:
+            return pickle.loads(base64.b64decode(value))
+        except Exception:
+            raise SmartSimError(
+                f"Invalid feature store descriptor in environment: {env_var}"
             )
-        return self.feature_store
+
+    def get_feature_stores(self) -> t.Dict[str, FeatureStore]:
+        """Loads multiple Feature Stores by scanning environment for variables
+        prefixed with `SSFeatureStore`"""
+        prefix = "SSFeatureStore"
+        if self.feature_stores is None:
+            env_vars = [var for var in os.environ if var.startswith(prefix)]
+            stores = [self._load_feature_store(var) for var in env_vars]
+            self.feature_stores = {fs.descriptor: fs for fs in stores}
+        return self.feature_stores
 
     def get_queue(self, sender_supplied: bool = True) -> t.Optional[DragonFLIChannel]:
         """Returns the Queue previously set in SSQueue"""

smartsim/_core/mli/infrastructure/storage/dragonfeaturestore.py

Lines changed: 7 additions & 0 deletions
@@ -69,3 +69,10 @@ def __contains__(self, key: str) -> bool:
         Return `True` if the key is found, `False` otherwise
         :param key: Unique key of an item to retrieve from the feature store"""
         return key in self._storage
+
+    @property
+    def descriptor(self) -> str:
+        """Return a unique identifier enabling a client to connect to
+        the feature store
+        :returns: A descriptor encoded as a string"""
+        return str(self._storage.serialize())
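
Because the descriptor is derived from self._storage.serialize(), the same string appears to serve two roles: a client that receives it can connect to the backing storage, and the worker manager can use it directly as the lookup key in the dictionary returned by get_feature_stores(). That is what lets tensor and model keys name the store they live in without pinning the worker manager to a single feature store.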
