From c412f8e47240271ba8d5c930be976efcaed59145 Mon Sep 17 00:00:00 2001
From: Keerthan Vasist <kvasist@amazon.com>
Date: Mon, 27 Dec 2021 12:10:11 -0800
Subject: [PATCH] deprecation: add deprecation warning for
 s3_data_distribution_type in Clarify DataConfig

---
 src/sagemaker/clarify.py   | 18 +++++++++++++++---
 tests/unit/test_clarify.py | 13 +++++++++++++
 2 files changed, 28 insertions(+), 3 deletions(-)

diff --git a/src/sagemaker/clarify.py b/src/sagemaker/clarify.py
index 70353ea2c2..a06ca79f12 100644
--- a/src/sagemaker/clarify.py
+++ b/src/sagemaker/clarify.py
@@ -22,11 +22,17 @@
 import tempfile
 from abc import ABC, abstractmethod
 from sagemaker import image_uris, s3, utils
+from sagemaker.deprecations import deprecation_warning
 from sagemaker.processing import ProcessingInput, ProcessingOutput, Processor
 
 logger = logging.getLogger(__name__)
 
 
+@deprecation_warning(
+    msg="s3_data_distribution_type parameter will no longer be supported. Everything else will"
+    " remain as is",
+    date="15 Mar 2022",
+)
 class DataConfig:
     """Config object related to configurations of the input and output dataset."""
 
@@ -58,8 +64,8 @@ def __init__(
                 dataset format is JSONLines.
             dataset_type (str): Format of the dataset. Valid values are "text/csv" for CSV,
                 "application/jsonlines" for JSONLines, and "application/x-parquet" for Parquet.
-            s3_data_distribution_type (str): Valid options are "FullyReplicated" or
-                "ShardedByS3Key".
+            s3_data_distribution_type (str): Deprecated. The only valid option is
+                "FullyReplicated"; any other value is ignored and a warning is logged.
             s3_compression_type (str): Valid options are "None" or "Gzip".
             joinsource (str): The name or index of the column in the dataset that acts as an
                 identifier column (for instance, while performing a join). This column is only
@@ -80,7 +86,13 @@ def __init__(
         self.s3_data_input_path = s3_data_input_path
         self.s3_output_path = s3_output_path
         self.s3_analysis_config_output_path = s3_analysis_config_output_path
-        self.s3_data_distribution_type = s3_data_distribution_type
+        if s3_data_distribution_type != "FullyReplicated":
+            logger.warning(
+                "s3_data_distribution_type parameter, set to %s, is being ignored. Only"
+                " valid option is FullyReplicated",
+                s3_data_distribution_type,
+            )
+        self.s3_data_distribution_type = "FullyReplicated"
         self.s3_compression_type = s3_compression_type
         self.label = label
         self.headers = headers
diff --git a/tests/unit/test_clarify.py b/tests/unit/test_clarify.py
index 56057351f6..910268a7dc 100644
--- a/tests/unit/test_clarify.py
+++ b/tests/unit/test_clarify.py
@@ -82,6 +82,19 @@ def test_invalid_data_config():
         )
 
 
+def test_s3_data_distribution_type_ignorance():
+    data_config = DataConfig(
+        s3_data_input_path="s3://input/train.csv",
+        s3_output_path="s3://output/analysis_test_result",
+        label="Label",
+        headers=["Label", "F1", "F2", "F3", "F4"],
+        dataset_type="text/csv",
+        joinsource="F4",
+        s3_data_distribution_type="ShardedByS3Key",
+    )
+    assert data_config.s3_data_distribution_type == "FullyReplicated"
+
+
 def test_bias_config():
     label_values = [1]
     facet_name = "F1"