From c412f8e47240271ba8d5c930be976efcaed59145 Mon Sep 17 00:00:00 2001 From: Keerthan Vasist Date: Mon, 27 Dec 2021 12:10:11 -0800 Subject: [PATCH] deprecation: add deprecation warning for s3_data_distribution_type in Clarify DataConfig --- src/sagemaker/clarify.py | 18 +++++++++++++++--- tests/unit/test_clarify.py | 13 +++++++++++++ 2 files changed, 28 insertions(+), 3 deletions(-) diff --git a/src/sagemaker/clarify.py b/src/sagemaker/clarify.py index 70353ea2c2..a06ca79f12 100644 --- a/src/sagemaker/clarify.py +++ b/src/sagemaker/clarify.py @@ -22,11 +22,17 @@ import tempfile from abc import ABC, abstractmethod from sagemaker import image_uris, s3, utils +from sagemaker.deprecations import deprecation_warning from sagemaker.processing import ProcessingInput, ProcessingOutput, Processor logger = logging.getLogger(__name__) +@deprecation_warning( + msg="s3_data_distribution_type parameter will no longer be supported. Everything else will" + " remain as is", + date="15 Mar 2022", +) class DataConfig: """Config object related to configurations of the input and output dataset.""" @@ -58,8 +64,8 @@ def __init__( dataset format is JSONLines. dataset_type (str): Format of the dataset. Valid values are "text/csv" for CSV, "application/jsonlines" for JSONLines, and "application/x-parquet" for Parquet. - s3_data_distribution_type (str): Valid options are "FullyReplicated" or - "ShardedByS3Key". + s3_data_distribution_type (str): Deprecated. Only valid option is "FullyReplicated". + Any other value is ignored. s3_compression_type (str): Valid options are "None" or "Gzip". joinsource (str): The name or index of the column in the dataset that acts as an identifier column (for instance, while performing a join). This column is only @@ -80,7 +86,13 @@ def __init__( self.s3_data_input_path = s3_data_input_path self.s3_output_path = s3_output_path self.s3_analysis_config_output_path = s3_analysis_config_output_path - self.s3_data_distribution_type = s3_data_distribution_type + if s3_data_distribution_type != "FullyReplicated": + logger.warning( + "s3_data_distribution_type parameter, set to %s, is being ignored. Only" + " valid option is FullyReplicated", + s3_data_distribution_type, + ) + self.s3_data_distribution_type = "FullyReplicated" self.s3_compression_type = s3_compression_type self.label = label self.headers = headers diff --git a/tests/unit/test_clarify.py b/tests/unit/test_clarify.py index 56057351f6..910268a7dc 100644 --- a/tests/unit/test_clarify.py +++ b/tests/unit/test_clarify.py @@ -82,6 +82,19 @@ def test_invalid_data_config(): ) +def test_s3_data_distribution_type_ignorance(): + data_config = DataConfig( + s3_data_input_path="s3://input/train.csv", + s3_output_path="s3://output/analysis_test_result", + label="Label", + headers=["Label", "F1", "F2", "F3", "F4"], + dataset_type="text/csv", + joinsource="F4", + s3_data_distribution_type="ShardedByS3Key", + ) + assert data_config.s3_data_distribution_type == "FullyReplicated" + + def test_bias_config(): label_values = [1] facet_name = "F1"