update TM as per latest estimator changes

Dewen Qi · Dewen Qi · commit 46bba6883a6b · 2022-08-04T20:33:13.000-07:00
diff --git a/src/sagemaker/instance_group.py b/src/sagemaker/instance_group.py
@@ -13,15 +13,19 @@
 """Defines the InstanceGroup class that configures a heterogeneous cluster."""
 from __future__ import absolute_import
 
+from typing import Optional, Union
+
+from sagemaker.workflow.entities import PipelineVariable
+
 
 class InstanceGroup(object):
     """The class to create instance groups for a heterogeneous cluster."""
 
     def __init__(
         self,
-        instance_group_name=None,
-        instance_type=None,
-        instance_count=None,
+        instance_group_name: Optional[Union[str, PipelineVariable]] = None,
+        instance_type: Optional[Union[str, PipelineVariable]] = None,
+        instance_count: Optional[Union[int, PipelineVariable]] = None,
     ):
         """It initializes an ``InstanceGroup`` instance.
 
diff --git a/tests/unit/sagemaker/workflow/test_mechanism/test_code/__init__.py b/tests/unit/sagemaker/workflow/test_mechanism/test_code/__init__.py
@@ -19,6 +19,7 @@
 
 from sagemaker import ModelMetrics, MetricsSource, FileSource, Predictor
 from sagemaker.drift_check_baselines import DriftCheckBaselines
+from sagemaker.instance_group import InstanceGroup
 from sagemaker.metadata_properties import MetadataProperties
 from sagemaker.model import FrameworkModel
 from sagemaker.parameter import IntegerParameter
@@ -233,14 +234,17 @@ def _generate_all_pipeline_vars() -> dict:
     )
 
 
+# TODO: remove _IS_TRUE_TMP and replace its usages with IS_TRUE.
+# Because `instance_groups` does not currently work well with some estimator subclasses,
+# we temporarily hard-code it to False, which disables the instance_groups.
+_IS_TRUE_TMP = False
 IS_TRUE = bool(getrandbits(1))
 PIPELINE_SESSION = _generate_mock_pipeline_session()
 PIPELINE_VARIABLES = _generate_all_pipeline_vars()
 
 # TODO: need to recursively assign with Pipeline Variable in later changes
 FIXED_ARGUMENTS = dict(
     common=dict(
-        instance_type=INSTANCE_TYPE,
         role=ROLE,
         sagemaker_session=PIPELINE_SESSION,
         source_dir=f"s3://{BUCKET}/source",
@@ -281,6 +285,7 @@ def _generate_all_pipeline_vars() -> dict:
         response_types=["application/json"],
     ),
     processor=dict(
+        instance_type=INSTANCE_TYPE,
         estimator_cls=PyTorch,
         code=f"s3://{BUCKET}/code",
         spark_event_logs_s3_uri=f"s3://{BUCKET}/my-spark-output-path",
@@ -438,13 +443,33 @@ def _generate_all_pipeline_vars() -> dict:
                 input_mode=ParameterString(name="train_inputs_input_mode"),
                 attribute_names=[ParameterString(name="train_inputs_attribute_name")],
                 target_attribute_name=ParameterString(name="train_inputs_target_attr_name"),
+                instance_groups=[ParameterString(name="train_inputs_instance_groups")],
             ),
         },
+        instance_groups=[
+            InstanceGroup(
+                instance_group_name=ParameterString(name="instance_group_name"),
+                # Hard-code the instance_type here because InstanceGroup.instance_type
+                # would be used to retrieve image_uri if image_uri is not provided,
+                # and the test mechanism does not currently support skipping the test cases
+                # related to bonded parameters in composite variables (i.e. the InstanceGroup).
+                # TODO: support skipping tests on bonded parameters in composite vars
+                instance_type="ml.m5.xlarge",
+                instance_count=ParameterString(name="instance_group_instance_count"),
+            ),
+        ]
+        if _IS_TRUE_TMP
+        else None,
+        instance_type="ml.m5.xlarge" if not _IS_TRUE_TMP else None,
+        instance_count=1 if not _IS_TRUE_TMP else None,
+        distribution={} if not _IS_TRUE_TMP else None,
     ),
     transformer=dict(
+        instance_type=INSTANCE_TYPE,
         data=f"s3://{BUCKET}/data",
     ),
     tuner=dict(
+        instance_type=INSTANCE_TYPE,
         estimator=TensorFlow(
             entry_point=TENSORFLOW_ENTRY_POINT,
             role=ROLE,
@@ -475,12 +500,14 @@ def _generate_all_pipeline_vars() -> dict:
         include_cls_metadata={"estimator-1": IS_TRUE},
     ),
     model=dict(
+        instance_type=INSTANCE_TYPE,
         serverless_inference_config=ServerlessInferenceConfig(),
         framework_version="1.11.0",
         py_version="py3",
         accelerator_type="ml.eia2.xlarge",
     ),
     pipelinemodel=dict(
+        instance_type=INSTANCE_TYPE,
         models=[
             SparkMLModel(
                 name="MySparkMLModel",
@@ -577,12 +604,17 @@ def _generate_all_pipeline_vars() -> dict:
         },
     ),
 )
-# A dict to keep the optional arguments which should not be None according to the logic
-# specific to the subclass.
+# A dict to keep the optional arguments which should not be set to None
+# in the test iteration according to the logic specific to the subclass.
 PARAMS_SHOULD_NOT_BE_NONE = dict(
     estimator=dict(
         init=dict(
-            common={"instance_count", "instance_type"},
+            # TODO: remove the three instance_* parameters from this set.
+            # For mutually exclusive parameters (instance_groups
+            # vs. instance_count/instance_type), if either side is set to None during iteration,
+            # the other side should get a non-None value, instead of listing them here
+            # and forcing them to be non-None.
+            common={"instance_count", "instance_type", "instance_groups"},
             LDA={"mini_batch_size"},
         )
     ),
@@ -692,7 +724,10 @@ def _generate_all_pipeline_vars() -> dict:
     ),
     estimator=dict(
         init=dict(
-            common=dict(),
+            common=dict(
+                entry_point={"enable_network_isolation"},
+                source_dir={"enable_network_isolation"},
+            ),
             TensorFlow=dict(
                 image_uri={"compiler_config"},
                 compiler_config={"image_uri"},
@@ -701,7 +736,13 @@ def _generate_all_pipeline_vars() -> dict:
                 image_uri={"compiler_config"},
                 compiler_config={"image_uri"},
             ),
-        )
+        ),
+        fit=dict(
+            common=dict(
+                instance_count={"instance_groups"},
+                instance_type={"instance_groups"},
+            ),
+        ),
     ),
 )
 
diff --git a/tests/unit/sagemaker/workflow/test_mechanism/test_code/test_pipeline_var_compatibility_template.py b/tests/unit/sagemaker/workflow/test_mechanism/test_code/test_pipeline_var_compatibility_template.py
@@ -15,7 +15,7 @@
 import json
 
 from random import getrandbits
-from typing import Optional
+from typing import Optional, List
 from typing_extensions import get_origin
 
 from sagemaker import Model, PipelineModel, AlgorithmEstimator
@@ -368,14 +368,14 @@ def _verify_composite_object_against_pipeline_var(
         self,
         param_with_none: str,
         step_dsl: str,
-        step_dsl_obj: object,
+        step_dsl_obj: List[dict],
     ):
         """verify pipeline definition regarding composite objects against pipeline variables
 
         Args:
             param_with_none (str): The name of the parameter with None value.
             step_dsl (str): The step definition retrieved from the pipeline definition DSL.
-            step_dsl_obj (objet): The json load object of the step definition.
+            step_dsl_obj (List[dict]): The JSON-loaded object of the step definition.
         """
         # TODO: remove the following hard code assertion once recursive assignment is added
         if issubclass(self.clazz, Processor):
@@ -398,6 +398,12 @@ def _verify_composite_object_against_pipeline_var(
                     assert '{"Get": "Parameters.proc_input_s3_data_type"}' in step_dsl
                     assert '{"Get": "Parameters.proc_input_app_managed"}' in step_dsl
         elif issubclass(self.clazz, EstimatorBase):
+            if (
+                param_with_none != "instance_groups"
+                and self.default_args[CLAZZ_ARGS]["instance_groups"]
+            ):
+                assert '{"Get": "Parameters.instance_group_name"}' in step_dsl
+                assert '{"Get": "Parameters.instance_group_instance_count"}' in step_dsl
             if issubclass(self.clazz, AmazonAlgorithmEstimatorBase):
                 # AmazonAlgorithmEstimatorBase's input is records
                 if param_with_none != "records":
@@ -415,6 +421,7 @@ def _verify_composite_object_against_pipeline_var(
                     assert '{"Get": "Parameters.train_inputs_input_mode"}' in step_dsl
                     assert '{"Get": "Parameters.train_inputs_attribute_name"}' in step_dsl
                     assert '{"Get": "Parameters.train_inputs_target_attr_name"}' in step_dsl
+                    assert '{"Get": "Parameters.train_inputs_instance_groups"}' in step_dsl
             if not issubclass(self.clazz, (TensorFlow, MXNet, PyTorch, AlgorithmEstimator)):
                 # debugger_hook_config may be disabled for these first 3 frameworks
                 # AlgorithmEstimator ignores the kwargs
diff --git a/tests/unit/sagemaker/workflow/test_mechanism/test_entries/test_pipeline_var_compatibility_with_estimators.py b/tests/unit/sagemaker/workflow/test_mechanism/test_entries/test_pipeline_var_compatibility_with_estimators.py
@@ -291,7 +291,6 @@ def test_sklearn_estimator_compatibility():
         clazz_args=dict(
             py_version="py3",
             instance_count=1,
-            instance_type="ml.m5.xlarge",
             framework_version="0.20.0",
         ),
         func_args=dict(),