From fefd9c4d3b0626f329321bc8f53a96013fdc8a4b Mon Sep 17 00:00:00 2001 From: sagemaker-bot Date: Wed, 13 Dec 2023 20:23:06 +0000 Subject: [PATCH 01/76] change: update image_uri_configs 12-13-2023 12:23:06 PST --- src/sagemaker/image_uri_config/autogluon.json | 234 +++++++--- .../image_uri_config/blazingtext.json | 17 +- src/sagemaker/image_uri_config/clarify.json | 6 +- .../image_uri_config/detailed-profiler.json | 27 +- .../image_uri_config/djl-deepspeed.json | 31 +- .../djl-fastertransformer.json | 18 +- .../image_uri_config/djl-neuronx.json | 18 +- .../image_uri_config/djl-tensorrtllm.json | 9 +- .../factorization-machines.json | 17 +- .../image_uri_config/forecasting-deepar.json | 15 +- .../image_uri_config/huggingface-llm.json | 118 +++-- .../image_uri_config/huggingface-neuron.json | 29 +- .../image_uri_config/huggingface-neuronx.json | 94 ++-- .../huggingface-training-compiler.json | 61 ++- .../image_uri_config/huggingface.json | 408 +++++++++++++----- .../image-classification.json | 17 +- .../image_uri_config/ipinsights.json | 15 +- src/sagemaker/image_uri_config/kmeans.json | 15 +- src/sagemaker/image_uri_config/knn.json | 15 +- .../image_uri_config/linear-learner.json | 15 +- .../image_uri_config/model-monitor.json | 10 +- src/sagemaker/image_uri_config/mxnet.json | 175 ++++++-- src/sagemaker/image_uri_config/ntm.json | 15 +- .../image_uri_config/object-detection.json | 15 +- .../image_uri_config/object2vec.json | 15 +- src/sagemaker/image_uri_config/pca.json | 15 +- .../image_uri_config/pytorch-neuron.json | 63 +-- .../pytorch-training-compiler.json | 8 +- src/sagemaker/image_uri_config/pytorch.json | 150 ++++--- .../image_uri_config/randomcutforest.json | 15 +- .../sagemaker-base-python.json | 39 +- .../semantic-segmentation.json | 15 +- src/sagemaker/image_uri_config/seq2seq.json | 15 +- src/sagemaker/image_uri_config/sklearn.json | 146 ++++--- src/sagemaker/image_uri_config/spark.json | 239 ++++++---- .../image_uri_config/stabilityai.json | 13 +- .../image_uri_config/tensorflow.json | 259 +++++++---- src/sagemaker/image_uri_config/xgboost.json | 218 ++++++---- 38 files changed, 1698 insertions(+), 906 deletions(-) diff --git a/src/sagemaker/image_uri_config/autogluon.json b/src/sagemaker/image_uri_config/autogluon.json index c9ecd5b068..aecc3584df 100644 --- a/src/sagemaker/image_uri_config/autogluon.json +++ b/src/sagemaker/image_uri_config/autogluon.json @@ -1,6 +1,9 @@ { "training": { - "processors": ["cpu", "gpu"], + "processors": [ + "cpu", + "gpu" + ], "version_aliases": { "0.3": "0.3.2", "0.4": "0.4.3", @@ -43,10 +46,13 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "autogluon-training", - "py_versions": ["py37"] + "py_versions": [ + "py37" + ] }, "0.3.2": { "registries": { @@ -81,10 +87,13 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "autogluon-training", - "py_versions": ["py38"] + "py_versions": [ + "py38" + ] }, "0.4.0": { "registries": { @@ -119,11 +128,17 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "autogluon-training", - "processors": ["cpu", "gpu"], - "py_versions": ["py38"] + 
"processors": [ + "cpu", + "gpu" + ], + "py_versions": [ + "py38" + ] }, "0.4.2": { "registries": { @@ -158,11 +173,17 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "autogluon-training", - "processors": ["cpu", "gpu"], - "py_versions": ["py38"] + "processors": [ + "cpu", + "gpu" + ], + "py_versions": [ + "py38" + ] }, "0.4.3": { "registries": { @@ -197,11 +218,17 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "autogluon-training", - "processors": ["cpu", "gpu"], - "py_versions": ["py38"] + "processors": [ + "cpu", + "gpu" + ], + "py_versions": [ + "py38" + ] }, "0.5.2": { "registries": { @@ -236,11 +263,17 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "autogluon-training", - "processors": ["cpu", "gpu"], - "py_versions": ["py38"] + "processors": [ + "cpu", + "gpu" + ], + "py_versions": [ + "py38" + ] }, "0.6.1": { "registries": { @@ -271,11 +304,17 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "autogluon-training", - "processors": ["cpu", "gpu"], - "py_versions": ["py38"] + "processors": [ + "cpu", + "gpu" + ], + "py_versions": [ + "py38" + ] }, "0.6.2": { "registries": { @@ -306,11 +345,17 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "autogluon-training", - "processors": ["cpu", "gpu"], - "py_versions": ["py38"] + "processors": [ + "cpu", + "gpu" + ], + "py_versions": [ + "py38" + ] }, "0.7.0": { "registries": { @@ -341,11 +386,17 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "autogluon-training", - "processors": ["cpu", "gpu"], - "py_versions": ["py39"] + "processors": [ + "cpu", + "gpu" + ], + "py_versions": [ + "py39" + ] }, "0.8.2": { "registries": { @@ -376,11 +427,17 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "autogluon-training", - "processors": ["cpu", "gpu"], - "py_versions": ["py39"] + "processors": [ + "cpu", + "gpu" + ], + "py_versions": [ + "py39" + ] } } }, @@ -429,11 +486,16 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "autogluon-inference", - "processors": ["cpu"], - "py_versions": ["py37"] + "processors": [ + "cpu" + ], + "py_versions": [ + "py37" + ] }, "0.3.2": { "registries": { @@ -470,11 +532,16 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": 
"autogluon-inference", - "processors": ["cpu"], - "py_versions": ["py38"] + "processors": [ + "cpu" + ], + "py_versions": [ + "py38" + ] }, "0.4.0": { "registries": { @@ -511,11 +578,16 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "autogluon-inference", - "processors": ["cpu"], - "py_versions": ["py38"] + "processors": [ + "cpu" + ], + "py_versions": [ + "py38" + ] }, "0.4.2": { "registries": { @@ -552,11 +624,17 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "autogluon-inference", - "processors": ["cpu", "gpu"], - "py_versions": ["py38"] + "processors": [ + "cpu", + "gpu" + ], + "py_versions": [ + "py38" + ] }, "0.4.3": { "registries": { @@ -593,11 +671,17 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "autogluon-inference", - "processors": ["cpu", "gpu"], - "py_versions": ["py38"] + "processors": [ + "cpu", + "gpu" + ], + "py_versions": [ + "py38" + ] }, "0.5.2": { "registries": { @@ -634,11 +718,17 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "autogluon-inference", - "processors": ["cpu", "gpu"], - "py_versions": ["py38"] + "processors": [ + "cpu", + "gpu" + ], + "py_versions": [ + "py38" + ] }, "0.6.1": { "registries": { @@ -671,11 +761,17 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "autogluon-inference", - "processors": ["cpu", "gpu"], - "py_versions": ["py38"] + "processors": [ + "cpu", + "gpu" + ], + "py_versions": [ + "py38" + ] }, "0.6.2": { "registries": { @@ -708,11 +804,17 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "autogluon-inference", - "processors": ["cpu", "gpu"], - "py_versions": ["py38"] + "processors": [ + "cpu", + "gpu" + ], + "py_versions": [ + "py38" + ] }, "0.7.0": { "registries": { @@ -745,11 +847,17 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "autogluon-inference", - "processors": ["cpu", "gpu"], - "py_versions": ["py39"] + "processors": [ + "cpu", + "gpu" + ], + "py_versions": [ + "py39" + ] }, "0.8.2": { "registries": { @@ -782,12 +890,18 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "autogluon-inference", - "processors": ["cpu", "gpu"], - "py_versions": ["py39"] + "processors": [ + "cpu", + "gpu" + ], + "py_versions": [ + "py39" + ] } } } -} +} \ No newline at end of file diff --git a/src/sagemaker/image_uri_config/blazingtext.json b/src/sagemaker/image_uri_config/blazingtext.json index 
66a58720ee..2c5601b356 100644 --- a/src/sagemaker/image_uri_config/blazingtext.json +++ b/src/sagemaker/image_uri_config/blazingtext.json @@ -1,5 +1,8 @@ { - "scope": ["inference", "training"], + "scope": [ + "inference", + "training" + ], "versions": { "1": { "registries": { @@ -8,11 +11,11 @@ "ap-northeast-1": "501404015308", "ap-northeast-2": "306986355934", "ap-northeast-3": "867004704886", - "ap-southeast-3": "951798379941", "ap-south-1": "991648021394", "ap-south-2": "628508329040", "ap-southeast-1": "475088953585", "ap-southeast-2": "544295431143", + "ap-southeast-3": "951798379941", "ap-southeast-4": "106583098589", "ca-central-1": "469771592824", "cn-north-1": "390948362332", @@ -20,19 +23,19 @@ "eu-central-1": "813361260812", "eu-central-2": "680994064768", "eu-north-1": "669576153137", + "eu-south-1": "257386234256", + "eu-south-2": "104374241257", "eu-west-1": "685385470294", "eu-west-2": "644912444149", "eu-west-3": "749696950732", - "eu-south-1": "257386234256", - "eu-south-2": "104374241257", "il-central-1": "898809789911", - "me-south-1": "249704162688", "me-central-1": "272398656194", + "me-south-1": "249704162688", "sa-east-1": "855470959533", "us-east-1": "811284229777", "us-east-2": "825641698319", - "us-gov-west-1": "226302683700", "us-gov-east-1": "237065988967", + "us-gov-west-1": "226302683700", "us-iso-east-1": "490574956308", "us-isob-east-1": "765400339828", "us-west-1": "632365934929", @@ -41,4 +44,4 @@ "repository": "blazingtext" } } -} +} \ No newline at end of file diff --git a/src/sagemaker/image_uri_config/clarify.json b/src/sagemaker/image_uri_config/clarify.json index a558a98822..4058a8803f 100644 --- a/src/sagemaker/image_uri_config/clarify.json +++ b/src/sagemaker/image_uri_config/clarify.json @@ -25,12 +25,12 @@ "sa-east-1": "520018980103", "us-east-1": "205585389593", "us-east-2": "211330385671", + "us-gov-west-1": "598674086554", "us-west-1": "740489534195", - "us-west-2": "306415355426", - "us-gov-west-1": "598674086554" + "us-west-2": "306415355426" }, "repository": "sagemaker-clarify-processing" } } } -} +} \ No newline at end of file diff --git a/src/sagemaker/image_uri_config/detailed-profiler.json b/src/sagemaker/image_uri_config/detailed-profiler.json index 14c966b806..f765c51a2c 100644 --- a/src/sagemaker/image_uri_config/detailed-profiler.json +++ b/src/sagemaker/image_uri_config/detailed-profiler.json @@ -1,35 +1,18 @@ { - "scope": ["detailed-profiler"], + "scope": [ + "detailed-profiler" + ], "versions": { "latest": { "registries": { - "af-south-1": "314341159256", - "ap-east-1": "199566480951", - "ap-northeast-1": "430734990657", - "ap-northeast-2": "578805364391", - "ap-northeast-3": "479947661362", - "ap-south-1": "904829902805", - "ap-southeast-1": "972752614525", - "ap-southeast-2": "184798709955", - "ca-central-1": "519511493484", - "cn-north-1": "618459771430", - "cn-northwest-1": "658757709296", "eu-central-1": "482524230118", - "eu-north-1": "314864569078", - "eu-south-1": "563282790590", "eu-west-1": "929884845733", - "eu-west-2": "250201462417", - "eu-west-3": "447278800020", - "il-central-1": "216881608335", - "me-south-1": "986000313247", - "sa-east-1": "818342061345", + "il-central-1": "216881608335", "us-east-1": "503895931360", "us-east-2": "915447279597", - "us-gov-west-1": "515509971035", - "us-west-1": "685455198987", "us-west-2": "895741380848" }, "repository": "detailed-profiler-processing" } } -} +} \ No newline at end of file diff --git a/src/sagemaker/image_uri_config/djl-deepspeed.json 
b/src/sagemaker/image_uri_config/djl-deepspeed.json index 0172953da3..1612d85ff7 100644 --- a/src/sagemaker/image_uri_config/djl-deepspeed.json +++ b/src/sagemaker/image_uri_config/djl-deepspeed.json @@ -1,5 +1,7 @@ { - "scope": ["inference"], + "scope": [ + "inference" + ], "versions": { "0.25.0": { "registries": { @@ -27,12 +29,13 @@ "us-east-1": "763104351884", "us-east-2": "763104351884", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "djl-inference", "tag_prefix": "0.25.0-deepspeed0.11.0-cu118" }, - "0.24.0": { + "0.24.0": { "registries": { "af-south-1": "626614931356", "il-central-1": "780543022126", @@ -58,12 +61,13 @@ "us-east-1": "763104351884", "us-east-2": "763104351884", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "djl-inference", "tag_prefix": "0.24.0-deepspeed0.10.0-cu118" }, - "0.23.0": { + "0.23.0": { "registries": { "af-south-1": "626614931356", "il-central-1": "780543022126", @@ -89,7 +93,8 @@ "us-east-1": "763104351884", "us-east-2": "763104351884", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "djl-inference", "tag_prefix": "0.23.0-deepspeed0.9.5-cu118" @@ -120,7 +125,8 @@ "us-east-1": "763104351884", "us-east-2": "763104351884", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "djl-inference", "tag_prefix": "0.22.1-deepspeed0.9.2-cu118" @@ -151,7 +157,8 @@ "us-east-1": "763104351884", "us-east-2": "763104351884", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "djl-inference", "tag_prefix": "0.21.0-deepspeed0.8.3-cu117" @@ -182,7 +189,8 @@ "us-east-1": "763104351884", "us-east-2": "763104351884", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "djl-inference", "tag_prefix": "0.20.0-deepspeed0.7.5-cu116" @@ -213,10 +221,11 @@ "us-east-1": "763104351884", "us-east-2": "763104351884", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "djl-inference", "tag_prefix": "0.19.0-deepspeed0.7.3-cu113" } } -} +} \ No newline at end of file diff --git a/src/sagemaker/image_uri_config/djl-fastertransformer.json b/src/sagemaker/image_uri_config/djl-fastertransformer.json index 01c7335048..fd9ced32fe 100644 --- a/src/sagemaker/image_uri_config/djl-fastertransformer.json +++ b/src/sagemaker/image_uri_config/djl-fastertransformer.json @@ -1,5 +1,7 @@ { - "scope": ["inference"], + "scope": [ + "inference" + ], "versions": { "0.24.0": { "registries": { @@ -27,7 +29,8 @@ "us-east-1": "763104351884", "us-east-2": "763104351884", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "djl-inference", "tag_prefix": "0.24.0-fastertransformer5.3.0-cu118" @@ -58,7 +61,8 @@ "us-east-1": "763104351884", "us-east-2": "763104351884", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "djl-inference", "tag_prefix": "0.23.0-fastertransformer5.3.0-cu118" @@ -89,7 +93,8 @@ "us-east-1": "763104351884", "us-east-2": 
"763104351884", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "djl-inference", "tag_prefix": "0.22.1-fastertransformer5.3.0-cu118" @@ -120,10 +125,11 @@ "us-east-1": "763104351884", "us-east-2": "763104351884", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "djl-inference", "tag_prefix": "0.21.0-fastertransformer5.3.0-cu117" } } -} +} \ No newline at end of file diff --git a/src/sagemaker/image_uri_config/djl-neuronx.json b/src/sagemaker/image_uri_config/djl-neuronx.json index 6802f86e0c..b8c0f2be1a 100644 --- a/src/sagemaker/image_uri_config/djl-neuronx.json +++ b/src/sagemaker/image_uri_config/djl-neuronx.json @@ -1,5 +1,7 @@ { - "scope": ["inference"], + "scope": [ + "inference" + ], "versions": { "0.25.0": { "registries": { @@ -13,7 +15,8 @@ "sa-east-1": "763104351884", "us-east-1": "763104351884", "us-east-2": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "djl-inference", "tag_prefix": "0.25.0-neuronx-sdk2.15.0" @@ -30,7 +33,8 @@ "sa-east-1": "763104351884", "us-east-1": "763104351884", "us-east-2": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "djl-inference", "tag_prefix": "0.24.0-neuronx-sdk2.14.1" @@ -47,7 +51,8 @@ "sa-east-1": "763104351884", "us-east-1": "763104351884", "us-east-2": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "djl-inference", "tag_prefix": "0.23.0-neuronx-sdk2.12.0" @@ -64,10 +69,11 @@ "sa-east-1": "763104351884", "us-east-1": "763104351884", "us-east-2": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "djl-inference", "tag_prefix": "0.22.1-neuronx-sdk2.10.0" } } -} +} \ No newline at end of file diff --git a/src/sagemaker/image_uri_config/djl-tensorrtllm.json b/src/sagemaker/image_uri_config/djl-tensorrtllm.json index 5cb7bcfe38..545e49f630 100644 --- a/src/sagemaker/image_uri_config/djl-tensorrtllm.json +++ b/src/sagemaker/image_uri_config/djl-tensorrtllm.json @@ -1,5 +1,7 @@ { - "scope": ["inference"], + "scope": [ + "inference" + ], "versions": { "0.25.0": { "registries": { @@ -27,10 +29,11 @@ "us-east-1": "763104351884", "us-east-2": "763104351884", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "djl-inference", "tag_prefix": "0.25.0-tensorrtllm0.5.0-cu122" } } -} +} \ No newline at end of file diff --git a/src/sagemaker/image_uri_config/factorization-machines.json b/src/sagemaker/image_uri_config/factorization-machines.json index 8ec3b5e4de..610f36e000 100644 --- a/src/sagemaker/image_uri_config/factorization-machines.json +++ b/src/sagemaker/image_uri_config/factorization-machines.json @@ -1,5 +1,8 @@ { - "scope": ["inference", "training"], + "scope": [ + "inference", + "training" + ], "versions": { "1": { "registries": { @@ -8,11 +11,11 @@ "ap-northeast-1": "351501993468", "ap-northeast-2": "835164637446", "ap-northeast-3": "867004704886", - "ap-southeast-3": "951798379941", "ap-south-1": "991648021394", "ap-south-2": "628508329040", "ap-southeast-1": "475088953585", "ap-southeast-2": "712309505854", + "ap-southeast-3": "951798379941", "ap-southeast-4": "106583098589", 
"ca-central-1": "469771592824", "cn-north-1": "390948362332", @@ -20,19 +23,19 @@ "eu-central-1": "664544806723", "eu-central-2": "680994064768", "eu-north-1": "669576153137", + "eu-south-1": "257386234256", + "eu-south-2": "104374241257", "eu-west-1": "438346466558", "eu-west-2": "644912444149", "eu-west-3": "749696950732", - "eu-south-1": "257386234256", - "eu-south-2": "104374241257", "il-central-1": "898809789911", - "me-south-1": "249704162688", "me-central-1": "272398656194", + "me-south-1": "249704162688", "sa-east-1": "855470959533", "us-east-1": "382416733822", "us-east-2": "404615174143", - "us-gov-west-1": "226302683700", "us-gov-east-1": "237065988967", + "us-gov-west-1": "226302683700", "us-iso-east-1": "490574956308", "us-isob-east-1": "765400339828", "us-west-1": "632365934929", @@ -41,4 +44,4 @@ "repository": "factorization-machines" } } -} +} \ No newline at end of file diff --git a/src/sagemaker/image_uri_config/forecasting-deepar.json b/src/sagemaker/image_uri_config/forecasting-deepar.json index 721150a715..1adf88d7f3 100644 --- a/src/sagemaker/image_uri_config/forecasting-deepar.json +++ b/src/sagemaker/image_uri_config/forecasting-deepar.json @@ -1,5 +1,8 @@ { - "scope": ["inference", "training"], + "scope": [ + "inference", + "training" + ], "versions": { "1": { "registries": { @@ -20,19 +23,19 @@ "eu-central-1": "495149712605", "eu-central-2": "680994064768", "eu-north-1": "669576153137", + "eu-south-1": "257386234256", + "eu-south-2": "104374241257", "eu-west-1": "224300973850", "eu-west-2": "644912444149", "eu-west-3": "749696950732", - "eu-south-1": "257386234256", - "eu-south-2": "104374241257", "il-central-1": "898809789911", - "me-south-1": "249704162688", "me-central-1": "272398656194", + "me-south-1": "249704162688", "sa-east-1": "855470959533", "us-east-1": "522234722520", "us-east-2": "566113047672", - "us-gov-west-1": "226302683700", "us-gov-east-1": "237065988967", + "us-gov-west-1": "226302683700", "us-iso-east-1": "490574956308", "us-isob-east-1": "765400339828", "us-west-1": "632365934929", @@ -41,4 +44,4 @@ "repository": "forecasting-deepar" } } -} +} \ No newline at end of file diff --git a/src/sagemaker/image_uri_config/huggingface-llm.json b/src/sagemaker/image_uri_config/huggingface-llm.json index d527a472c7..3431a8e445 100644 --- a/src/sagemaker/image_uri_config/huggingface-llm.json +++ b/src/sagemaker/image_uri_config/huggingface-llm.json @@ -1,18 +1,20 @@ { "inference": { - "processors": ["gpu"], + "processors": [ + "gpu" + ], "version_aliases": { "0.6": "0.6.0", "0.8": "0.8.2", "0.9": "0.9.3", "1.0": "1.0.3", - "1.1": "1.1.0", - "1.2": "1.2.0", - "1.3": "1.3.1" + "1.1": "1.1.0" }, "versions": { "0.6.0": { - "py_versions": ["py39"], + "py_versions": [ + "py39" + ], "registries": { "af-south-1": "626614931356", "il-central-1": "780543022126", @@ -47,14 +49,19 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "tag_prefix": "2.0.0-tgi0.6.0", "repository": "huggingface-pytorch-tgi-inference", - "container_version": {"gpu": "cu118-ubuntu20.04"} + "container_version": { + "gpu": "cu118-ubuntu20.04" + } }, "0.8.2": { - "py_versions": ["py39"], + "py_versions": [ + "py39" + ], "registries": { "af-south-1": "626614931356", "il-central-1": "780543022126", @@ -89,14 +96,19 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": 
"763104351884", + "ca-west-1": "204538143572" }, "tag_prefix": "2.0.0-tgi0.8.2", "repository": "huggingface-pytorch-tgi-inference", - "container_version": {"gpu": "cu118-ubuntu20.04"} + "container_version": { + "gpu": "cu118-ubuntu20.04" + } }, "0.9.3": { - "py_versions": ["py39"], + "py_versions": [ + "py39" + ], "registries": { "af-south-1": "626614931356", "il-central-1": "780543022126", @@ -131,14 +143,19 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "tag_prefix": "2.0.1-tgi0.9.3", "repository": "huggingface-pytorch-tgi-inference", - "container_version": {"gpu": "cu118-ubuntu20.04"} + "container_version": { + "gpu": "cu118-ubuntu20.04" + } }, "1.0.3": { - "py_versions": ["py39"], + "py_versions": [ + "py39" + ], "registries": { "af-south-1": "626614931356", "il-central-1": "780543022126", @@ -173,14 +190,19 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "tag_prefix": "2.0.1-tgi1.0.3", "repository": "huggingface-pytorch-tgi-inference", - "container_version": {"gpu": "cu118-ubuntu20.04"} + "container_version": { + "gpu": "cu118-ubuntu20.04" + } }, "1.1.0": { - "py_versions": ["py39"], + "py_versions": [ + "py39" + ], "registries": { "af-south-1": "626614931356", "il-central-1": "780543022126", @@ -215,14 +237,19 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "tag_prefix": "2.0.1-tgi1.1.0", "repository": "huggingface-pytorch-tgi-inference", - "container_version": {"gpu": "cu118-ubuntu20.04"} + "container_version": { + "gpu": "cu118-ubuntu20.04" + } }, "1.2.0": { - "py_versions": ["py310"], + "py_versions": [ + "py310" + ], "registries": { "af-south-1": "626614931356", "il-central-1": "780543022126", @@ -257,54 +284,15 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "tag_prefix": "2.1.1-tgi1.2.0", "repository": "huggingface-pytorch-tgi-inference", - "container_version": {"gpu": "cu121-ubuntu20.04"} - }, - "1.3.1": { - "py_versions": ["py310"], - "registries": { - "af-south-1": "626614931356", - "il-central-1": "780543022126", - "ap-east-1": "871362719292", - "ap-northeast-1": "763104351884", - "ap-northeast-2": "763104351884", - "ap-northeast-3": "364406365360", - "ap-south-1": "763104351884", - "ap-south-2": "772153158452", - "ap-southeast-1": "763104351884", - "ap-southeast-2": "763104351884", - "ap-southeast-3": "907027046896", - "ap-southeast-4": "457447274322", - "ca-central-1": "763104351884", - "cn-north-1": "727897471807", - "cn-northwest-1": "727897471807", - "eu-central-1": "763104351884", - "eu-central-2": "380420809688", - "eu-north-1": "763104351884", - "eu-west-1": "763104351884", - "eu-west-2": "763104351884", - "eu-west-3": "763104351884", - "eu-south-1": "692866216735", - "eu-south-2": "503227376785", - "me-south-1": "217643126080", - "me-central-1": "914824155844", - "sa-east-1": "763104351884", - "us-east-1": "763104351884", - "us-east-2": "763104351884", - "us-gov-east-1": "446045086412", - "us-gov-west-1": "442386744353", - "us-iso-east-1": "886529160074", - "us-isob-east-1": "094389454867", - 
"us-west-1": "763104351884", - "us-west-2": "763104351884" - }, - "tag_prefix": "2.1.1-tgi1.3.1", - "repository": "huggingface-pytorch-tgi-inference", - "container_version": {"gpu": "cu121-ubuntu20.04"} + "container_version": { + "gpu": "cu121-ubuntu20.04" + } } } } -} +} \ No newline at end of file diff --git a/src/sagemaker/image_uri_config/huggingface-neuron.json b/src/sagemaker/image_uri_config/huggingface-neuron.json index 386846e832..ae38ce209b 100644 --- a/src/sagemaker/image_uri_config/huggingface-neuron.json +++ b/src/sagemaker/image_uri_config/huggingface-neuron.json @@ -1,12 +1,20 @@ { "inference": { - "processors": ["inf"], - "version_aliases": {"4.12": "4.12.3"}, + "processors": [ + "inf" + ], + "version_aliases": { + "4.12": "4.12.3" + }, "versions": { "4.12.3": { - "version_aliases": {"pytorch1.9": "pytorch1.9.1"}, + "version_aliases": { + "pytorch1.9": "pytorch1.9.1" + }, "pytorch1.9.1": { - "py_versions": ["py37"], + "py_versions": [ + "py37" + ], "repository": "huggingface-pytorch-inference-neuron", "registries": { "ap-northeast-1": "763104351884", @@ -24,12 +32,17 @@ "sa-east-1": "763104351884", "us-east-1": "763104351884", "us-east-2": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, - "container_version": {"inf": "ubuntu18.04"}, - "sdk_versions": ["sdk1.17.1"] + "container_version": { + "inf": "ubuntu18.04" + }, + "sdk_versions": [ + "sdk1.17.1" + ] } } } } -} +} \ No newline at end of file diff --git a/src/sagemaker/image_uri_config/huggingface-neuronx.json b/src/sagemaker/image_uri_config/huggingface-neuronx.json index e4371e753c..4765937d8f 100644 --- a/src/sagemaker/image_uri_config/huggingface-neuronx.json +++ b/src/sagemaker/image_uri_config/huggingface-neuronx.json @@ -1,12 +1,20 @@ { "training": { - "processors": ["trn"], - "version_aliases": {"4.28": "4.28.1"}, + "processors": [ + "trn" + ], + "version_aliases": { + "4.28": "4.28.1" + }, "versions": { "4.28.1": { - "version_aliases": {"pytorch1.13": "pytorch1.13.0"}, + "version_aliases": { + "pytorch1.13": "pytorch1.13.0" + }, "pytorch1.13.0": { - "py_versions": ["py38"], + "py_versions": [ + "py38" + ], "repository": "huggingface-pytorch-training-neuronx", "registries": { "ap-northeast-1": "763104351884", @@ -24,16 +32,25 @@ "sa-east-1": "763104351884", "us-east-1": "763104351884", "us-east-2": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, - "container_version": {"trn": "ubuntu20.04"}, - "sdk_versions": ["sdk2.9.1"] + "container_version": { + "trn": "ubuntu20.04" + }, + "sdk_versions": [ + "sdk2.9.1" + ] } }, "4.34.1": { - "version_aliases": {"pytorch1.13": "pytorch1.13.1"}, + "version_aliases": { + "pytorch1.13": "pytorch1.13.1" + }, "pytorch1.13.1": { - "py_versions": ["py310"], + "py_versions": [ + "py310" + ], "repository": "huggingface-pytorch-inference-neuronx", "registries": { "ap-northeast-1": "763104351884", @@ -51,22 +68,35 @@ "sa-east-1": "763104351884", "us-east-1": "763104351884", "us-east-2": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" + }, + "container_version": { + "inf": "ubuntu20.04" }, - "container_version": {"inf": "ubuntu20.04"}, - "sdk_versions": ["sdk2.15.0"] + "sdk_versions": [ + "sdk2.15.0" + ] } } } }, "inference": { - "processors": ["inf"], - "version_aliases": {"4.28": "4.28.1"}, + "processors": [ + "inf" + ], + "version_aliases": { + "4.28": "4.28.1" + }, "versions": { "4.28.1": { - 
"version_aliases": {"pytorch1.13": "pytorch1.13.0"}, + "version_aliases": { + "pytorch1.13": "pytorch1.13.0" + }, "pytorch1.13.0": { - "py_versions": ["py38"], + "py_versions": [ + "py38" + ], "repository": "huggingface-pytorch-inference-neuronx", "registries": { "af-south-1": "626614931356", @@ -100,16 +130,25 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, - "container_version": {"inf": "ubuntu20.04"}, - "sdk_versions": ["sdk2.9.1"] + "container_version": { + "inf": "ubuntu20.04" + }, + "sdk_versions": [ + "sdk2.9.1" + ] } }, "4.34.1": { - "version_aliases": {"pytorch1.13": "pytorch1.13.1"}, + "version_aliases": { + "pytorch1.13": "pytorch1.13.1" + }, "pytorch1.13.1": { - "py_versions": ["py310"], + "py_versions": [ + "py310" + ], "repository": "huggingface-pytorch-inference-neuronx", "registries": { "af-south-1": "626614931356", @@ -143,12 +182,17 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" + }, + "container_version": { + "inf": "ubuntu20.04" }, - "container_version": {"inf": "ubuntu20.04"}, - "sdk_versions": ["sdk2.15.0"] + "sdk_versions": [ + "sdk2.15.0" + ] } } } } -} +} \ No newline at end of file diff --git a/src/sagemaker/image_uri_config/huggingface-training-compiler.json b/src/sagemaker/image_uri_config/huggingface-training-compiler.json index 76ad9a6e6d..735e7917b3 100644 --- a/src/sagemaker/image_uri_config/huggingface-training-compiler.json +++ b/src/sagemaker/image_uri_config/huggingface-training-compiler.json @@ -1,6 +1,8 @@ { "training": { - "processors": ["gpu"], + "processors": [ + "gpu" + ], "version_aliases": { "4.11": "4.11.0", "4.17": "4.17.0", @@ -13,26 +15,36 @@ "tensorflow2.5": "tensorflow2.5.1" }, "pytorch1.9.0": { - "py_versions": ["py38"], + "py_versions": [ + "py38" + ], "registries": { "eu-west-1": "763104351884", "us-east-1": "763104351884", "us-east-2": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "huggingface-pytorch-trcomp-training", - "container_version": {"gpu":"cu111-ubuntu20.04"} + "container_version": { + "gpu": "cu111-ubuntu20.04" + } }, "tensorflow2.5.1": { - "py_versions": ["py37"], + "py_versions": [ + "py37" + ], "registries": { "eu-west-1": "763104351884", "us-east-1": "763104351884", "us-east-2": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "huggingface-tensorflow-trcomp-training", - "container_version": {"gpu":"cu112-ubuntu18.04"} + "container_version": { + "gpu": "cu112-ubuntu18.04" + } } }, "4.17.0": { @@ -41,7 +53,9 @@ "tensorflow2.6": "tensorflow2.6.3" }, "pytorch1.10.2": { - "py_versions": ["py38"], + "py_versions": [ + "py38" + ], "registries": { "af-south-1": "626614931356", "il-central-1": "780543022126", @@ -70,13 +84,18 @@ "us-east-1": "763104351884", "us-east-2": "763104351884", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "huggingface-pytorch-trcomp-training", - "container_version": {"gpu":"cu113-ubuntu20.04"} + "container_version": { + "gpu": "cu113-ubuntu20.04" + } }, "tensorflow2.6.3": { - "py_versions": ["py38"], + "py_versions": [ + "py38" + ], "registries": { "af-south-1": "626614931356", 
"il-central-1": "780543022126", @@ -105,10 +124,13 @@ "us-east-1": "763104351884", "us-east-2": "763104351884", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "huggingface-tensorflow-trcomp-training", - "container_version": {"gpu":"cu112-ubuntu20.04"} + "container_version": { + "gpu": "cu112-ubuntu20.04" + } } }, "4.21.1": { @@ -116,7 +138,9 @@ "pytorch1.11": "pytorch1.11.0" }, "pytorch1.11.0": { - "py_versions": ["py38"], + "py_versions": [ + "py38" + ], "registries": { "af-south-1": "626614931356", "il-central-1": "780543022126", @@ -145,12 +169,15 @@ "us-east-1": "763104351884", "us-east-2": "763104351884", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "huggingface-pytorch-trcomp-training", - "container_version": {"gpu":"cu113-ubuntu20.04"} + "container_version": { + "gpu": "cu113-ubuntu20.04" + } } } } } -} +} \ No newline at end of file diff --git a/src/sagemaker/image_uri_config/huggingface.json b/src/sagemaker/image_uri_config/huggingface.json index 15eeac1eb5..c39a79e0fb 100644 --- a/src/sagemaker/image_uri_config/huggingface.json +++ b/src/sagemaker/image_uri_config/huggingface.json @@ -1,6 +1,8 @@ { "training": { - "processors": ["gpu"], + "processors": [ + "gpu" + ], "version_aliases": { "4.4": "4.4.2", "4.5": "4.5.0", @@ -19,7 +21,9 @@ "tensorflow2.4": "tensorflow2.4.1" }, "pytorch1.6.0": { - "py_versions": ["py36"], + "py_versions": [ + "py36" + ], "registries": { "af-south-1": "626614931356", "il-central-1": "780543022126", @@ -54,12 +58,15 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "huggingface-pytorch-training" }, "tensorflow2.4.1": { - "py_versions": ["py37"], + "py_versions": [ + "py37" + ], "registries": { "af-south-1": "626614931356", "il-central-1": "780543022126", @@ -94,7 +101,8 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "huggingface-tensorflow-training" } @@ -105,7 +113,9 @@ "tensorflow2.4": "tensorflow2.4.1" }, "pytorch1.6.0": { - "py_versions": ["py36"], + "py_versions": [ + "py36" + ], "registries": { "af-south-1": "626614931356", "il-central-1": "780543022126", @@ -140,12 +150,15 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "huggingface-pytorch-training" }, "tensorflow2.4.1": { - "py_versions": ["py37"], + "py_versions": [ + "py37" + ], "registries": { "af-south-1": "626614931356", "il-central-1": "780543022126", @@ -180,7 +193,8 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "huggingface-tensorflow-training" } @@ -193,7 +207,9 @@ "tensorflow2.4": "tensorflow2.4.1" }, "pytorch1.6.0": { - "py_versions": ["py36"], + "py_versions": [ + "py36" + ], "registries": { "af-south-1": "626614931356", "il-central-1": "780543022126", @@ -228,13 +244,18 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - 
"us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "huggingface-pytorch-training", - "container_version": {"gpu":"cu110-ubuntu18.04"} + "container_version": { + "gpu": "cu110-ubuntu18.04" + } }, "pytorch1.7.1": { - "py_versions": ["py36"], + "py_versions": [ + "py36" + ], "registries": { "af-south-1": "626614931356", "il-central-1": "780543022126", @@ -269,13 +290,18 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "huggingface-pytorch-training", - "container_version": {"gpu":"cu110-ubuntu18.04"} + "container_version": { + "gpu": "cu110-ubuntu18.04" + } }, "pytorch1.8.1": { - "py_versions": ["py36"], + "py_versions": [ + "py36" + ], "registries": { "af-south-1": "626614931356", "il-central-1": "780543022126", @@ -310,13 +336,18 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "huggingface-pytorch-training", - "container_version": {"gpu":"cu111-ubuntu18.04"} + "container_version": { + "gpu": "cu111-ubuntu18.04" + } }, "tensorflow2.4.1": { - "py_versions": ["py37"], + "py_versions": [ + "py37" + ], "registries": { "af-south-1": "626614931356", "il-central-1": "780543022126", @@ -351,10 +382,13 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "huggingface-tensorflow-training", - "container_version": {"gpu":"cu110-ubuntu18.04"} + "container_version": { + "gpu": "cu110-ubuntu18.04" + } } }, "4.10.2": { @@ -365,7 +399,9 @@ "tensorflow2.5": "tensorflow2.5.1" }, "pytorch1.8.1": { - "py_versions": ["py36"], + "py_versions": [ + "py36" + ], "registries": { "af-south-1": "626614931356", "il-central-1": "780543022126", @@ -400,13 +436,18 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "huggingface-pytorch-training", - "container_version": {"gpu":"cu110-ubuntu18.04"} + "container_version": { + "gpu": "cu110-ubuntu18.04" + } }, "pytorch1.9.0": { - "py_versions": ["py38"], + "py_versions": [ + "py38" + ], "registries": { "af-south-1": "626614931356", "il-central-1": "780543022126", @@ -441,13 +482,18 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "huggingface-pytorch-training", - "container_version": {"gpu": "cu111-ubuntu20.04"} + "container_version": { + "gpu": "cu111-ubuntu20.04" + } }, "tensorflow2.4.1": { - "py_versions": ["py37"], + "py_versions": [ + "py37" + ], "registries": { "af-south-1": "626614931356", "il-central-1": "780543022126", @@ -482,13 +528,18 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "huggingface-tensorflow-training", - "container_version": {"gpu":"cu110-ubuntu18.04"} + "container_version": { + "gpu": "cu110-ubuntu18.04" + } }, "tensorflow2.5.1": { - "py_versions": ["py37"], + 
"py_versions": [ + "py37" + ], "registries": { "af-south-1": "626614931356", "il-central-1": "780543022126", @@ -523,10 +574,13 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "huggingface-tensorflow-training", - "container_version": {"gpu": "cu112-ubuntu18.04"} + "container_version": { + "gpu": "cu112-ubuntu18.04" + } } }, "4.11.0": { @@ -535,7 +589,9 @@ "tensorflow2.5": "tensorflow2.5.1" }, "pytorch1.9.0": { - "py_versions": ["py38"], + "py_versions": [ + "py38" + ], "registries": { "af-south-1": "626614931356", "il-central-1": "780543022126", @@ -570,13 +626,18 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "huggingface-pytorch-training", - "container_version": {"gpu": "cu111-ubuntu20.04"} + "container_version": { + "gpu": "cu111-ubuntu20.04" + } }, "tensorflow2.5.1": { - "py_versions": ["py37"], + "py_versions": [ + "py37" + ], "registries": { "af-south-1": "626614931356", "il-central-1": "780543022126", @@ -611,10 +672,13 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "huggingface-tensorflow-training", - "container_version": {"gpu": "cu112-ubuntu18.04"} + "container_version": { + "gpu": "cu112-ubuntu18.04" + } } }, "4.12.3": { @@ -623,7 +687,9 @@ "tensorflow2.5": "tensorflow2.5.1" }, "pytorch1.9.1": { - "py_versions": ["py38"], + "py_versions": [ + "py38" + ], "registries": { "af-south-1": "626614931356", "il-central-1": "780543022126", @@ -658,13 +724,18 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "huggingface-pytorch-training", - "container_version": {"gpu": "cu111-ubuntu20.04"} + "container_version": { + "gpu": "cu111-ubuntu20.04" + } }, "tensorflow2.5.1": { - "py_versions": ["py37"], + "py_versions": [ + "py37" + ], "registries": { "af-south-1": "626614931356", "il-central-1": "780543022126", @@ -699,10 +770,13 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "huggingface-tensorflow-training", - "container_version": {"gpu": "cu112-ubuntu18.04"} + "container_version": { + "gpu": "cu112-ubuntu18.04" + } } }, "4.17.0": { @@ -711,7 +785,9 @@ "tensorflow2.6": "tensorflow2.6.3" }, "pytorch1.10.2": { - "py_versions": ["py38"], + "py_versions": [ + "py38" + ], "registries": { "af-south-1": "626614931356", "il-central-1": "780543022126", @@ -746,13 +822,18 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "huggingface-pytorch-training", - "container_version": {"gpu": "cu113-ubuntu20.04"} + "container_version": { + "gpu": "cu113-ubuntu20.04" + } }, "tensorflow2.6.3": { - "py_versions": ["py38"], + "py_versions": [ + "py38" + ], "registries": { "af-south-1": "626614931356", "il-central-1": "780543022126", @@ -787,10 +868,13 @@ "us-iso-east-1": 
"886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "huggingface-tensorflow-training", - "container_version": {"gpu": "cu112-ubuntu20.04"} + "container_version": { + "gpu": "cu112-ubuntu20.04" + } } }, "4.26.0": { @@ -798,7 +882,9 @@ "pytorch1.13": "pytorch1.13.1" }, "pytorch1.13.1": { - "py_versions": ["py39"], + "py_versions": [ + "py39" + ], "registries": { "af-south-1": "626614931356", "il-central-1": "780543022126", @@ -829,10 +915,13 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "huggingface-pytorch-training", - "container_version": {"gpu": "cu117-ubuntu20.04"} + "container_version": { + "gpu": "cu117-ubuntu20.04" + } } }, "4.28.1": { @@ -840,7 +929,9 @@ "pytorch2.0": "pytorch2.0.0" }, "pytorch2.0.0": { - "py_versions": ["py310"], + "py_versions": [ + "py310" + ], "registries": { "af-south-1": "626614931356", "il-central-1": "780543022126", @@ -871,17 +962,22 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "huggingface-pytorch-training", - "container_version": {"gpu": "cu118-ubuntu20.04"} + "container_version": { + "gpu": "cu118-ubuntu20.04" + } } } } }, - "inference": { - "processors": ["gpu", "cpu"], + "processors": [ + "gpu", + "cpu" + ], "version_aliases": { "4.6": "4.6.1", "4.10": "4.10.2", @@ -898,7 +994,9 @@ "tensorflow2.4": "tensorflow2.4.1" }, "pytorch1.7.1": { - "py_versions": ["py36"], + "py_versions": [ + "py36" + ], "registries": { "af-south-1": "626614931356", "il-central-1": "780543022126", @@ -933,13 +1031,19 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "huggingface-pytorch-inference", - "container_version": {"gpu":"cu110-ubuntu18.04", "cpu":"ubuntu18.04" } + "container_version": { + "gpu": "cu110-ubuntu18.04", + "cpu": "ubuntu18.04" + } }, "pytorch1.8.1": { - "py_versions": ["py36"], + "py_versions": [ + "py36" + ], "registries": { "af-south-1": "626614931356", "il-central-1": "780543022126", @@ -974,13 +1078,19 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "huggingface-pytorch-inference", - "container_version": {"gpu":"cu111-ubuntu18.04", "cpu":"ubuntu18.04" } + "container_version": { + "gpu": "cu111-ubuntu18.04", + "cpu": "ubuntu18.04" + } }, "tensorflow2.4.1": { - "py_versions": ["py37"], + "py_versions": [ + "py37" + ], "registries": { "af-south-1": "626614931356", "il-central-1": "780543022126", @@ -1015,10 +1125,14 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "huggingface-tensorflow-inference", - "container_version": {"gpu":"cu110-ubuntu18.04", "cpu":"ubuntu18.04" } + "container_version": { + "gpu": "cu110-ubuntu18.04", + "cpu": "ubuntu18.04" + } } }, "4.10.2": { @@ -1029,7 +1143,9 @@ "tensorflow2.5": "tensorflow2.5.1" }, "pytorch1.8.1": { 
- "py_versions": ["py36"], + "py_versions": [ + "py36" + ], "registries": { "af-south-1": "626614931356", "il-central-1": "780543022126", @@ -1064,13 +1180,19 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "huggingface-pytorch-inference", - "container_version": {"gpu":"cu111-ubuntu18.04", "cpu":"ubuntu18.04" } + "container_version": { + "gpu": "cu111-ubuntu18.04", + "cpu": "ubuntu18.04" + } }, "pytorch1.9.0": { - "py_versions": ["py38"], + "py_versions": [ + "py38" + ], "registries": { "af-south-1": "626614931356", "il-central-1": "780543022126", @@ -1105,13 +1227,19 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "huggingface-pytorch-inference", - "container_version": {"gpu": "cu111-ubuntu20.04", "cpu": "ubuntu20.04" } + "container_version": { + "gpu": "cu111-ubuntu20.04", + "cpu": "ubuntu20.04" + } }, "tensorflow2.4.1": { - "py_versions": ["py37"], + "py_versions": [ + "py37" + ], "registries": { "af-south-1": "626614931356", "il-central-1": "780543022126", @@ -1146,13 +1274,19 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "huggingface-tensorflow-inference", - "container_version": {"gpu":"cu110-ubuntu18.04", "cpu":"ubuntu18.04" } + "container_version": { + "gpu": "cu110-ubuntu18.04", + "cpu": "ubuntu18.04" + } }, "tensorflow2.5.1": { - "py_versions": ["py37"], + "py_versions": [ + "py37" + ], "registries": { "af-south-1": "626614931356", "il-central-1": "780543022126", @@ -1187,10 +1321,14 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "huggingface-tensorflow-inference", - "container_version": {"gpu": "cu112-ubuntu18.04", "cpu": "ubuntu18.04" } + "container_version": { + "gpu": "cu112-ubuntu18.04", + "cpu": "ubuntu18.04" + } } }, "4.11.0": { @@ -1199,7 +1337,9 @@ "tensorflow2.5": "tensorflow2.5.1" }, "pytorch1.9.0": { - "py_versions": ["py38"], + "py_versions": [ + "py38" + ], "registries": { "af-south-1": "626614931356", "il-central-1": "780543022126", @@ -1234,13 +1374,19 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "huggingface-pytorch-inference", - "container_version": {"gpu": "cu111-ubuntu20.04", "cpu": "ubuntu20.04" } + "container_version": { + "gpu": "cu111-ubuntu20.04", + "cpu": "ubuntu20.04" + } }, "tensorflow2.5.1": { - "py_versions": ["py37"], + "py_versions": [ + "py37" + ], "registries": { "af-south-1": "626614931356", "il-central-1": "780543022126", @@ -1275,10 +1421,14 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "huggingface-tensorflow-inference", - "container_version": {"gpu": "cu112-ubuntu18.04", "cpu": "ubuntu18.04" } + "container_version": { + "gpu": "cu112-ubuntu18.04", + "cpu": "ubuntu18.04" + } } }, "4.12.3": { @@ 
-1287,7 +1437,9 @@ "tensorflow2.5": "tensorflow2.5.1" }, "pytorch1.9.1": { - "py_versions": ["py38"], + "py_versions": [ + "py38" + ], "registries": { "af-south-1": "626614931356", "il-central-1": "780543022126", @@ -1322,13 +1474,19 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "huggingface-pytorch-inference", - "container_version": {"gpu": "cu111-ubuntu20.04", "cpu": "ubuntu20.04" } + "container_version": { + "gpu": "cu111-ubuntu20.04", + "cpu": "ubuntu20.04" + } }, "tensorflow2.5.1": { - "py_versions": ["py37"], + "py_versions": [ + "py37" + ], "registries": { "af-south-1": "626614931356", "il-central-1": "780543022126", @@ -1363,10 +1521,14 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "huggingface-tensorflow-inference", - "container_version": {"gpu": "cu112-ubuntu18.04", "cpu": "ubuntu18.04" } + "container_version": { + "gpu": "cu112-ubuntu18.04", + "cpu": "ubuntu18.04" + } } }, "4.17.0": { @@ -1375,7 +1537,9 @@ "tensorflow2.6": "tensorflow2.6.3" }, "pytorch1.10.2": { - "py_versions": ["py38"], + "py_versions": [ + "py38" + ], "registries": { "af-south-1": "626614931356", "il-central-1": "780543022126", @@ -1410,13 +1574,19 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "huggingface-pytorch-inference", - "container_version": {"gpu": "cu113-ubuntu20.04", "cpu": "ubuntu20.04" } + "container_version": { + "gpu": "cu113-ubuntu20.04", + "cpu": "ubuntu20.04" + } }, "tensorflow2.6.3": { - "py_versions": ["py38"], + "py_versions": [ + "py38" + ], "registries": { "af-south-1": "626614931356", "il-central-1": "780543022126", @@ -1451,10 +1621,14 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "huggingface-tensorflow-inference", - "container_version": {"gpu": "cu112-ubuntu20.04", "cpu": "ubuntu20.04" } + "container_version": { + "gpu": "cu112-ubuntu20.04", + "cpu": "ubuntu20.04" + } } }, "4.26.0": { @@ -1463,7 +1637,9 @@ "tensorflow2.11": "tensorflow2.11.0" }, "pytorch1.13.1": { - "py_versions": ["py39"], + "py_versions": [ + "py39" + ], "registries": { "af-south-1": "626614931356", "il-central-1": "780543022126", @@ -1498,13 +1674,19 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "huggingface-pytorch-inference", - "container_version": {"gpu": "cu117-ubuntu20.04", "cpu": "ubuntu20.04" } + "container_version": { + "gpu": "cu117-ubuntu20.04", + "cpu": "ubuntu20.04" + } }, "tensorflow2.11.0": { - "py_versions": ["py39"], + "py_versions": [ + "py39" + ], "registries": { "af-south-1": "626614931356", "il-central-1": "780543022126", @@ -1539,10 +1721,14 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "huggingface-tensorflow-inference", - "container_version": 
{"gpu": "cu112-ubuntu20.04", "cpu": "ubuntu20.04" } + "container_version": { + "gpu": "cu112-ubuntu20.04", + "cpu": "ubuntu20.04" + } } }, "4.28.1": { @@ -1550,7 +1736,9 @@ "pytorch2.0": "pytorch2.0.0" }, "pytorch2.0.0": { - "py_versions": ["py310"], + "py_versions": [ + "py310" + ], "registries": { "af-south-1": "626614931356", "il-central-1": "780543022126", @@ -1585,12 +1773,16 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "huggingface-pytorch-inference", - "container_version": {"gpu": "cu118-ubuntu20.04", "cpu": "ubuntu20.04" } + "container_version": { + "gpu": "cu118-ubuntu20.04", + "cpu": "ubuntu20.04" + } } } } } -} +} \ No newline at end of file diff --git a/src/sagemaker/image_uri_config/image-classification.json b/src/sagemaker/image_uri_config/image-classification.json index d9fc0e3308..ae2fc4a31b 100644 --- a/src/sagemaker/image_uri_config/image-classification.json +++ b/src/sagemaker/image_uri_config/image-classification.json @@ -1,5 +1,8 @@ { - "scope": ["inference", "training"], + "scope": [ + "inference", + "training" + ], "versions": { "1": { "registries": { @@ -7,32 +10,32 @@ "ap-east-1": "286214385809", "ap-northeast-1": "501404015308", "ap-northeast-2": "306986355934", + "ap-northeast-3": "867004704886", "ap-south-1": "991648021394", "ap-south-2": "628508329040", "ap-southeast-1": "475088953585", "ap-southeast-2": "544295431143", "ap-southeast-3": "951798379941", "ap-southeast-4": "106583098589", - "ap-northeast-3": "867004704886", "ca-central-1": "469771592824", "cn-north-1": "390948362332", "cn-northwest-1": "387376663083", "eu-central-1": "813361260812", "eu-central-2": "680994064768", "eu-north-1": "669576153137", + "eu-south-1": "257386234256", + "eu-south-2": "104374241257", "eu-west-1": "685385470294", "eu-west-2": "644912444149", "eu-west-3": "749696950732", - "eu-south-1": "257386234256", - "eu-south-2": "104374241257", "il-central-1": "898809789911", - "me-south-1": "249704162688", "me-central-1": "272398656194", + "me-south-1": "249704162688", "sa-east-1": "855470959533", "us-east-1": "811284229777", "us-east-2": "825641698319", - "us-gov-west-1": "226302683700", "us-gov-east-1": "237065988967", + "us-gov-west-1": "226302683700", "us-iso-east-1": "490574956308", "us-isob-east-1": "765400339828", "us-west-1": "632365934929", @@ -41,4 +44,4 @@ "repository": "image-classification" } } -} +} \ No newline at end of file diff --git a/src/sagemaker/image_uri_config/ipinsights.json b/src/sagemaker/image_uri_config/ipinsights.json index 195772d67e..aa4012ce94 100644 --- a/src/sagemaker/image_uri_config/ipinsights.json +++ b/src/sagemaker/image_uri_config/ipinsights.json @@ -1,5 +1,8 @@ { - "scope": ["inference", "training"], + "scope": [ + "inference", + "training" + ], "versions": { "1": { "registries": { @@ -20,19 +23,19 @@ "eu-central-1": "664544806723", "eu-central-2": "680994064768", "eu-north-1": "669576153137", + "eu-south-1": "257386234256", + "eu-south-2": "104374241257", "eu-west-1": "438346466558", "eu-west-2": "644912444149", "eu-west-3": "749696950732", - "eu-south-1": "257386234256", - "eu-south-2": "104374241257", "il-central-1": "898809789911", - "me-south-1": "249704162688", "me-central-1": "272398656194", + "me-south-1": "249704162688", "sa-east-1": "855470959533", "us-east-1": "382416733822", "us-east-2": "404615174143", - "us-gov-west-1": "226302683700", "us-gov-east-1": "237065988967", + 
"us-gov-west-1": "226302683700", "us-iso-east-1": "490574956308", "us-isob-east-1": "765400339828", "us-west-1": "632365934929", @@ -41,4 +44,4 @@ "repository": "ipinsights" } } -} +} \ No newline at end of file diff --git a/src/sagemaker/image_uri_config/kmeans.json b/src/sagemaker/image_uri_config/kmeans.json index a8a31144c5..091e99ada8 100644 --- a/src/sagemaker/image_uri_config/kmeans.json +++ b/src/sagemaker/image_uri_config/kmeans.json @@ -1,5 +1,8 @@ { - "scope": ["inference", "training"], + "scope": [ + "inference", + "training" + ], "versions": { "1": { "registries": { @@ -20,19 +23,19 @@ "eu-central-1": "664544806723", "eu-central-2": "680994064768", "eu-north-1": "669576153137", + "eu-south-1": "257386234256", + "eu-south-2": "104374241257", "eu-west-1": "438346466558", "eu-west-2": "644912444149", "eu-west-3": "749696950732", - "eu-south-1": "257386234256", - "eu-south-2": "104374241257", "il-central-1": "898809789911", - "me-south-1": "249704162688", "me-central-1": "272398656194", + "me-south-1": "249704162688", "sa-east-1": "855470959533", "us-east-1": "382416733822", "us-east-2": "404615174143", - "us-gov-west-1": "226302683700", "us-gov-east-1": "237065988967", + "us-gov-west-1": "226302683700", "us-iso-east-1": "490574956308", "us-isob-east-1": "765400339828", "us-west-1": "632365934929", @@ -41,4 +44,4 @@ "repository": "kmeans" } } -} +} \ No newline at end of file diff --git a/src/sagemaker/image_uri_config/knn.json b/src/sagemaker/image_uri_config/knn.json index 4bd66172c5..7d54e730f4 100644 --- a/src/sagemaker/image_uri_config/knn.json +++ b/src/sagemaker/image_uri_config/knn.json @@ -1,5 +1,8 @@ { - "scope": ["inference", "training"], + "scope": [ + "inference", + "training" + ], "versions": { "1": { "registries": { @@ -20,19 +23,19 @@ "eu-central-1": "664544806723", "eu-central-2": "680994064768", "eu-north-1": "669576153137", + "eu-south-1": "257386234256", + "eu-south-2": "104374241257", "eu-west-1": "438346466558", "eu-west-2": "644912444149", "eu-west-3": "749696950732", - "eu-south-1": "257386234256", - "eu-south-2": "104374241257", "il-central-1": "898809789911", - "me-south-1": "249704162688", "me-central-1": "272398656194", + "me-south-1": "249704162688", "sa-east-1": "855470959533", "us-east-1": "382416733822", "us-east-2": "404615174143", - "us-gov-west-1": "226302683700", "us-gov-east-1": "237065988967", + "us-gov-west-1": "226302683700", "us-iso-east-1": "490574956308", "us-isob-east-1": "765400339828", "us-west-1": "632365934929", @@ -41,4 +44,4 @@ "repository": "knn" } } -} +} \ No newline at end of file diff --git a/src/sagemaker/image_uri_config/linear-learner.json b/src/sagemaker/image_uri_config/linear-learner.json index 5768c6e7d3..f59384e2af 100644 --- a/src/sagemaker/image_uri_config/linear-learner.json +++ b/src/sagemaker/image_uri_config/linear-learner.json @@ -1,5 +1,8 @@ { - "scope": ["inference", "training"], + "scope": [ + "inference", + "training" + ], "versions": { "1": { "registries": { @@ -20,19 +23,19 @@ "eu-central-1": "664544806723", "eu-central-2": "680994064768", "eu-north-1": "669576153137", + "eu-south-1": "257386234256", + "eu-south-2": "104374241257", "eu-west-1": "438346466558", "eu-west-2": "644912444149", "eu-west-3": "749696950732", - "eu-south-1": "257386234256", - "eu-south-2": "104374241257", "il-central-1": "898809789911", - "me-south-1": "249704162688", "me-central-1": "272398656194", + "me-south-1": "249704162688", "sa-east-1": "855470959533", "us-east-1": "382416733822", "us-east-2": "404615174143", - 
"us-gov-west-1": "226302683700", "us-gov-east-1": "237065988967", + "us-gov-west-1": "226302683700", "us-iso-east-1": "490574956308", "us-isob-east-1": "765400339828", "us-west-1": "632365934929", @@ -41,4 +44,4 @@ "repository": "linear-learner" } } -} +} \ No newline at end of file diff --git a/src/sagemaker/image_uri_config/model-monitor.json b/src/sagemaker/image_uri_config/model-monitor.json index 000151bc28..744f4d5acb 100644 --- a/src/sagemaker/image_uri_config/model-monitor.json +++ b/src/sagemaker/image_uri_config/model-monitor.json @@ -1,5 +1,7 @@ { - "scope": ["monitoring"], + "scope": [ + "monitoring" + ], "versions": { "": { "registries": { @@ -21,15 +23,15 @@ "eu-west-1": "468650794304", "eu-west-2": "749857270468", "eu-west-3": "680080141114", + "il-central-1": "843974653677", "me-south-1": "607024016150", "sa-east-1": "539772159869", "us-east-1": "156813124566", "us-east-2": "777275614652", "us-west-1": "890145073186", - "us-west-2": "159807026194", - "il-central-1": "843974653677" + "us-west-2": "159807026194" }, "repository": "sagemaker-model-monitor-analyzer" } } -} +} \ No newline at end of file diff --git a/src/sagemaker/image_uri_config/mxnet.json b/src/sagemaker/image_uri_config/mxnet.json index 231ddba2e7..fe962a2420 100644 --- a/src/sagemaker/image_uri_config/mxnet.json +++ b/src/sagemaker/image_uri_config/mxnet.json @@ -1,6 +1,9 @@ { "training": { - "processors": ["cpu", "gpu"], + "processors": [ + "cpu", + "gpu" + ], "version_aliases": { "0.12": "0.12.1", "1.0": "1.0.0", @@ -43,7 +46,10 @@ "us-west-2": "520713654638" }, "repository": "sagemaker-mxnet", - "py_versions": ["py2", "py3"] + "py_versions": [ + "py2", + "py3" + ] }, "1.0.0": { "registries": { @@ -74,7 +80,10 @@ "us-west-2": "520713654638" }, "repository": "sagemaker-mxnet", - "py_versions": ["py2", "py3"] + "py_versions": [ + "py2", + "py3" + ] }, "1.1.0": { "registries": { @@ -105,7 +114,10 @@ "us-west-2": "520713654638" }, "repository": "sagemaker-mxnet", - "py_versions": ["py2", "py3"] + "py_versions": [ + "py2", + "py3" + ] }, "1.2.1": { "registries": { @@ -136,7 +148,10 @@ "us-west-2": "520713654638" }, "repository": "sagemaker-mxnet", - "py_versions": ["py2", "py3"] + "py_versions": [ + "py2", + "py3" + ] }, "1.3.0": { "registries": { @@ -167,7 +182,10 @@ "us-west-2": "520713654638" }, "repository": "sagemaker-mxnet", - "py_versions": ["py2", "py3"] + "py_versions": [ + "py2", + "py3" + ] }, "1.4.0": { "registries": { @@ -198,7 +216,10 @@ "us-west-2": "520713654638" }, "repository": "sagemaker-mxnet", - "py_versions": ["py2", "py3"] + "py_versions": [ + "py2", + "py3" + ] }, "1.4.1": { "py2": { @@ -234,7 +255,7 @@ "py3": { "registries": { "af-south-1": "626614931356", - "il-central-1": "780543022126", + "il-central-1": "780543022126", "ap-east-1": "871362719292", "ap-northeast-1": "763104351884", "ap-northeast-2": "763104351884", @@ -266,7 +287,8 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "mxnet-training" } @@ -306,10 +328,14 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "mxnet-training", - "py_versions": ["py2", "py3"] + "py_versions": [ + "py2", + "py3" + ] }, "1.7.0": { "registries": { @@ -346,10 +372,13 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", 
"us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "mxnet-training", - "py_versions": ["py3"] + "py_versions": [ + "py3" + ] }, "1.8.0": { "registries": { @@ -386,10 +415,13 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "mxnet-training", - "py_versions": ["py37"] + "py_versions": [ + "py37" + ] }, "1.9.0": { "registries": { @@ -426,15 +458,21 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "mxnet-training", - "py_versions": ["py38"] + "py_versions": [ + "py38" + ] } } }, "inference": { - "processors": ["cpu", "gpu"], + "processors": [ + "cpu", + "gpu" + ], "version_aliases": { "0.12": "0.12.1", "1.0": "1.0.0", @@ -477,7 +515,10 @@ "us-west-2": "520713654638" }, "repository": "sagemaker-mxnet", - "py_versions": ["py2", "py3"] + "py_versions": [ + "py2", + "py3" + ] }, "1.0.0": { "registries": { @@ -508,7 +549,10 @@ "us-west-2": "520713654638" }, "repository": "sagemaker-mxnet", - "py_versions": ["py2", "py3"] + "py_versions": [ + "py2", + "py3" + ] }, "1.1.0": { "registries": { @@ -539,7 +583,10 @@ "us-west-2": "520713654638" }, "repository": "sagemaker-mxnet", - "py_versions": ["py2", "py3"] + "py_versions": [ + "py2", + "py3" + ] }, "1.2.1": { "registries": { @@ -570,7 +617,10 @@ "us-west-2": "520713654638" }, "repository": "sagemaker-mxnet", - "py_versions": ["py2", "py3"] + "py_versions": [ + "py2", + "py3" + ] }, "1.3.0": { "registries": { @@ -601,7 +651,10 @@ "us-west-2": "520713654638" }, "repository": "sagemaker-mxnet", - "py_versions": ["py2", "py3"] + "py_versions": [ + "py2", + "py3" + ] }, "1.4.0": { "registries": { @@ -632,7 +685,10 @@ "us-west-2": "520713654638" }, "repository": "sagemaker-mxnet-serving", - "py_versions": ["py2", "py3"] + "py_versions": [ + "py2", + "py3" + ] }, "1.4.1": { "py2": { @@ -700,7 +756,8 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "mxnet-inference" } @@ -740,10 +797,14 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "mxnet-inference", - "py_versions": ["py2", "py3"] + "py_versions": [ + "py2", + "py3" + ] }, "1.7.0": { "registries": { @@ -780,10 +841,13 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "mxnet-inference", - "py_versions": ["py3"] + "py_versions": [ + "py3" + ] }, "1.8.0": { "registries": { @@ -820,10 +884,13 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "mxnet-inference", - "py_versions": ["py37"] + "py_versions": [ + "py37" + ] }, "1.9.0": { "registries": { @@ -860,15 +927,20 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + 
"us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "mxnet-inference", - "py_versions": ["py38"] + "py_versions": [ + "py38" + ] } } }, "eia": { - "processors": ["cpu"], + "processors": [ + "cpu" + ], "version_aliases": { "1.3": "1.3.0", "1.4": "1.4.1", @@ -906,7 +978,10 @@ "us-west-2": "520713654638" }, "repository": "sagemaker-mxnet-eia", - "py_versions": ["py2", "py3"] + "py_versions": [ + "py2", + "py3" + ] }, "1.4.0": { "registries": { @@ -938,7 +1013,10 @@ "us-west-2": "520713654638" }, "repository": "sagemaker-mxnet-serving-eia", - "py_versions": ["py2", "py3"] + "py_versions": [ + "py2", + "py3" + ] }, "1.4.1": { "registries": { @@ -975,10 +1053,14 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "mxnet-inference-eia", - "py_versions": ["py2", "py3"] + "py_versions": [ + "py2", + "py3" + ] }, "1.5.1": { "registries": { @@ -1015,10 +1097,14 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "mxnet-inference-eia", - "py_versions": ["py2", "py3"] + "py_versions": [ + "py2", + "py3" + ] }, "1.7.0": { "registries": { @@ -1055,11 +1141,14 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "mxnet-inference-eia", - "py_versions": ["py3"] + "py_versions": [ + "py3" + ] } } } -} +} \ No newline at end of file diff --git a/src/sagemaker/image_uri_config/ntm.json b/src/sagemaker/image_uri_config/ntm.json index 493e150885..a942c68bbb 100644 --- a/src/sagemaker/image_uri_config/ntm.json +++ b/src/sagemaker/image_uri_config/ntm.json @@ -1,5 +1,8 @@ { - "scope": ["inference", "training"], + "scope": [ + "inference", + "training" + ], "versions": { "1": { "registries": { @@ -20,19 +23,19 @@ "eu-central-1": "664544806723", "eu-central-2": "680994064768", "eu-north-1": "669576153137", + "eu-south-1": "257386234256", + "eu-south-2": "104374241257", "eu-west-1": "438346466558", "eu-west-2": "644912444149", "eu-west-3": "749696950732", - "eu-south-1": "257386234256", - "eu-south-2": "104374241257", "il-central-1": "898809789911", - "me-south-1": "249704162688", "me-central-1": "272398656194", + "me-south-1": "249704162688", "sa-east-1": "855470959533", "us-east-1": "382416733822", "us-east-2": "404615174143", - "us-gov-west-1": "226302683700", "us-gov-east-1": "237065988967", + "us-gov-west-1": "226302683700", "us-iso-east-1": "490574956308", "us-isob-east-1": "765400339828", "us-west-1": "632365934929", @@ -41,4 +44,4 @@ "repository": "ntm" } } -} +} \ No newline at end of file diff --git a/src/sagemaker/image_uri_config/object-detection.json b/src/sagemaker/image_uri_config/object-detection.json index 00a2b90213..079ef594ec 100644 --- a/src/sagemaker/image_uri_config/object-detection.json +++ b/src/sagemaker/image_uri_config/object-detection.json @@ -1,5 +1,8 @@ { - "scope": ["inference", "training"], + "scope": [ + "inference", + "training" + ], "versions": { "1": { "registries": { @@ -20,19 +23,19 @@ "eu-central-1": "813361260812", "eu-central-2": "680994064768", "eu-north-1": "669576153137", + "eu-south-1": "257386234256", + "eu-south-2": "104374241257", "eu-west-1": "685385470294", "eu-west-2": "644912444149", 
"eu-west-3": "749696950732", - "eu-south-1": "257386234256", - "eu-south-2": "104374241257", "il-central-1": "898809789911", - "me-south-1": "249704162688", "me-central-1": "272398656194", + "me-south-1": "249704162688", "sa-east-1": "855470959533", "us-east-1": "811284229777", "us-east-2": "825641698319", - "us-gov-west-1": "226302683700", "us-gov-east-1": "237065988967", + "us-gov-west-1": "226302683700", "us-iso-east-1": "490574956308", "us-isob-east-1": "765400339828", "us-west-1": "632365934929", @@ -41,4 +44,4 @@ "repository": "object-detection" } } -} +} \ No newline at end of file diff --git a/src/sagemaker/image_uri_config/object2vec.json b/src/sagemaker/image_uri_config/object2vec.json index 950bd60157..be4258a207 100644 --- a/src/sagemaker/image_uri_config/object2vec.json +++ b/src/sagemaker/image_uri_config/object2vec.json @@ -1,5 +1,8 @@ { - "scope": ["inference", "training"], + "scope": [ + "inference", + "training" + ], "versions": { "1": { "registries": { @@ -20,19 +23,19 @@ "eu-central-1": "664544806723", "eu-central-2": "680994064768", "eu-north-1": "669576153137", + "eu-south-1": "257386234256", + "eu-south-2": "104374241257", "eu-west-1": "438346466558", "eu-west-2": "644912444149", "eu-west-3": "749696950732", - "eu-south-1": "257386234256", - "eu-south-2": "104374241257", "il-central-1": "898809789911", - "me-south-1": "249704162688", "me-central-1": "272398656194", + "me-south-1": "249704162688", "sa-east-1": "855470959533", "us-east-1": "382416733822", "us-east-2": "404615174143", - "us-gov-west-1": "226302683700", "us-gov-east-1": "237065988967", + "us-gov-west-1": "226302683700", "us-iso-east-1": "490574956308", "us-isob-east-1": "765400339828", "us-west-1": "632365934929", @@ -41,4 +44,4 @@ "repository": "object2vec" } } -} +} \ No newline at end of file diff --git a/src/sagemaker/image_uri_config/pca.json b/src/sagemaker/image_uri_config/pca.json index ff5e247220..5b87591d9f 100644 --- a/src/sagemaker/image_uri_config/pca.json +++ b/src/sagemaker/image_uri_config/pca.json @@ -1,5 +1,8 @@ { - "scope": ["inference", "training"], + "scope": [ + "inference", + "training" + ], "versions": { "1": { "registries": { @@ -20,19 +23,19 @@ "eu-central-1": "664544806723", "eu-central-2": "680994064768", "eu-north-1": "669576153137", + "eu-south-1": "257386234256", + "eu-south-2": "104374241257", "eu-west-1": "438346466558", "eu-west-2": "644912444149", "eu-west-3": "749696950732", - "eu-south-1": "257386234256", - "eu-south-2": "104374241257", "il-central-1": "898809789911", - "me-south-1": "249704162688", "me-central-1": "272398656194", + "me-south-1": "249704162688", "sa-east-1": "855470959533", "us-east-1": "382416733822", "us-east-2": "404615174143", - "us-gov-west-1": "226302683700", "us-gov-east-1": "237065988967", + "us-gov-west-1": "226302683700", "us-iso-east-1": "490574956308", "us-isob-east-1": "765400339828", "us-west-1": "632365934929", @@ -41,4 +44,4 @@ "repository": "pca" } } -} +} \ No newline at end of file diff --git a/src/sagemaker/image_uri_config/pytorch-neuron.json b/src/sagemaker/image_uri_config/pytorch-neuron.json index dd4d6d3498..c69c22c646 100644 --- a/src/sagemaker/image_uri_config/pytorch-neuron.json +++ b/src/sagemaker/image_uri_config/pytorch-neuron.json @@ -1,32 +1,43 @@ { "training": { - "processors": ["trn"], - "version_aliases": {"1.11": "1.11.0"}, + "processors": [ + "trn" + ], + "version_aliases": { + "1.11": "1.11.0" + }, "versions": { - "1.11.0": { - "py_versions": ["py38"], - "repository": "pytorch-training-neuron", - "registries": { 
- "ap-northeast-1": "763104351884", - "ap-south-1": "763104351884", - "ap-south-2": "772153158452", - "ap-southeast-1": "763104351884", - "ap-southeast-2": "763104351884", - "ap-southeast-4": "457447274322", - "eu-central-1": "763104351884", - "eu-central-2": "380420809688", - "eu-south-2": "503227376785", - "eu-west-1": "763104351884", - "eu-west-3": "763104351884", - "il-central-1": "780543022126", - "sa-east-1": "763104351884", - "us-east-1": "763104351884", - "us-east-2": "763104351884", - "us-west-2": "763104351884" - }, - "container_version": {"trn": "ubuntu20.04"}, - "sdk_versions": ["sdk2.4.0"] - } + "1.11.0": { + "py_versions": [ + "py38" + ], + "repository": "pytorch-training-neuron", + "registries": { + "ap-northeast-1": "763104351884", + "ap-south-1": "763104351884", + "ap-south-2": "772153158452", + "ap-southeast-1": "763104351884", + "ap-southeast-2": "763104351884", + "ap-southeast-4": "457447274322", + "eu-central-1": "763104351884", + "eu-central-2": "380420809688", + "eu-south-2": "503227376785", + "eu-west-1": "763104351884", + "eu-west-3": "763104351884", + "il-central-1": "780543022126", + "sa-east-1": "763104351884", + "us-east-1": "763104351884", + "us-east-2": "763104351884", + "us-west-2": "763104351884", + "ca-west-1": "204538143572" + }, + "container_version": { + "trn": "ubuntu20.04" + }, + "sdk_versions": [ + "sdk2.4.0" + ] } } } +} \ No newline at end of file diff --git a/src/sagemaker/image_uri_config/pytorch-training-compiler.json b/src/sagemaker/image_uri_config/pytorch-training-compiler.json index 8563166391..ce72fb365a 100644 --- a/src/sagemaker/image_uri_config/pytorch-training-compiler.json +++ b/src/sagemaker/image_uri_config/pytorch-training-compiler.json @@ -38,7 +38,8 @@ "us-east-1": "763104351884", "us-east-2": "763104351884", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "pytorch-trcomp-training" }, @@ -69,10 +70,11 @@ "us-east-1": "763104351884", "us-east-2": "763104351884", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "pytorch-trcomp-training" } } } -} +} \ No newline at end of file diff --git a/src/sagemaker/image_uri_config/pytorch.json b/src/sagemaker/image_uri_config/pytorch.json index 5313e34732..3296b807a3 100644 --- a/src/sagemaker/image_uri_config/pytorch.json +++ b/src/sagemaker/image_uri_config/pytorch.json @@ -32,7 +32,8 @@ "eu-south-2": "503227376785", "us-east-1": "763104351884", "us-east-2": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "pytorch-inference-eia" }, @@ -51,7 +52,8 @@ "eu-south-2": "503227376785", "us-east-1": "763104351884", "us-east-2": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "pytorch-inference-eia" } @@ -223,7 +225,8 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "pytorch-inference" }, @@ -266,7 +269,8 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "pytorch-inference" }, @@ -308,7 +312,8 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", 
"us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "pytorch-inference" }, @@ -350,7 +355,8 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "pytorch-inference" }, @@ -393,7 +399,8 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "pytorch-inference" }, @@ -436,7 +443,8 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "pytorch-inference" }, @@ -479,7 +487,8 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "pytorch-inference" }, @@ -522,7 +531,8 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "pytorch-inference" }, @@ -564,7 +574,8 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "pytorch-inference" }, @@ -606,7 +617,8 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "pytorch-inference" }, @@ -648,7 +660,8 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "pytorch-inference" }, @@ -690,7 +703,8 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "pytorch-inference" }, @@ -732,7 +746,8 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "pytorch-inference" }, @@ -774,7 +789,8 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "pytorch-inference" }, @@ -815,7 +831,8 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "pytorch-inference" }, @@ -853,7 +870,8 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "pytorch-inference" }, @@ -891,7 +909,8 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + 
"ca-west-1": "204538143572" }, "repository": "pytorch-inference" }, @@ -929,7 +948,8 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "pytorch-inference" }, @@ -967,7 +987,8 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "pytorch-inference" } @@ -1020,10 +1041,13 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "pytorch-inference-graviton", - "container_version": {"cpu": "ubuntu20.04"} + "container_version": { + "cpu": "ubuntu20.04" + } }, "2.0.0": { "py_versions": [ @@ -1056,10 +1080,13 @@ "us-east-1": "763104351884", "us-east-2": "763104351884", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "pytorch-inference-graviton", - "container_version": {"cpu": "ubuntu20.04"} + "container_version": { + "cpu": "ubuntu20.04" + } }, "2.0.1": { "py_versions": [ @@ -1092,10 +1119,13 @@ "us-east-1": "763104351884", "us-east-2": "763104351884", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "pytorch-inference-graviton", - "container_version": {"cpu": "ubuntu20.04"} + "container_version": { + "cpu": "ubuntu20.04" + } }, "2.1.0": { "py_versions": [ @@ -1128,10 +1158,13 @@ "us-east-1": "763104351884", "us-east-2": "763104351884", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "pytorch-inference-graviton", - "container_version": {"cpu": "ubuntu20.04"} + "container_version": { + "cpu": "ubuntu20.04" + } } } }, @@ -1301,7 +1334,8 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "pytorch-training" }, @@ -1344,7 +1378,8 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "pytorch-training" }, @@ -1387,7 +1422,8 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "pytorch-training" }, @@ -1429,7 +1465,8 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "pytorch-training" }, @@ -1472,7 +1509,8 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "pytorch-training" }, @@ -1515,7 +1553,8 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "pytorch-training" }, @@ 
-1558,7 +1597,8 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "pytorch-training" }, @@ -1601,7 +1641,8 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "pytorch-training" }, @@ -1643,7 +1684,8 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "pytorch-training" }, @@ -1685,7 +1727,8 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "pytorch-training" }, @@ -1727,7 +1770,8 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "pytorch-training" }, @@ -1769,7 +1813,8 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "pytorch-training" }, @@ -1811,7 +1856,8 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "pytorch-training" }, @@ -1853,7 +1899,8 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "pytorch-training" }, @@ -1894,7 +1941,8 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "pytorch-training" }, @@ -1932,7 +1980,8 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "pytorch-training" }, @@ -1970,7 +2019,8 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "pytorch-training" }, @@ -2008,7 +2058,8 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "pytorch-training" }, @@ -2046,10 +2097,11 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "pytorch-training" } } } -} +} \ No newline at end of file diff --git a/src/sagemaker/image_uri_config/randomcutforest.json b/src/sagemaker/image_uri_config/randomcutforest.json index 8035001661..fe4b0cbf91 100644 --- a/src/sagemaker/image_uri_config/randomcutforest.json +++ b/src/sagemaker/image_uri_config/randomcutforest.json @@ -1,5 +1,8 @@ { - 
"scope": ["inference", "training"], + "scope": [ + "inference", + "training" + ], "versions": { "1": { "registries": { @@ -20,19 +23,19 @@ "eu-central-1": "664544806723", "eu-central-2": "680994064768", "eu-north-1": "669576153137", + "eu-south-1": "257386234256", + "eu-south-2": "104374241257", "eu-west-1": "438346466558", "eu-west-2": "644912444149", "eu-west-3": "749696950732", - "eu-south-1": "257386234256", - "eu-south-2": "104374241257", "il-central-1": "898809789911", - "me-south-1": "249704162688", "me-central-1": "272398656194", + "me-south-1": "249704162688", "sa-east-1": "855470959533", "us-east-1": "382416733822", "us-east-2": "404615174143", - "us-gov-west-1": "226302683700", "us-gov-east-1": "237065988967", + "us-gov-west-1": "226302683700", "us-iso-east-1": "490574956308", "us-isob-east-1": "765400339828", "us-west-1": "632365934929", @@ -41,4 +44,4 @@ "repository": "randomcutforest" } } -} +} \ No newline at end of file diff --git a/src/sagemaker/image_uri_config/sagemaker-base-python.json b/src/sagemaker/image_uri_config/sagemaker-base-python.json index 771f66ab95..15c4c78eb2 100644 --- a/src/sagemaker/image_uri_config/sagemaker-base-python.json +++ b/src/sagemaker/image_uri_config/sagemaker-base-python.json @@ -2,33 +2,34 @@ "versions": { "1.0": { "registries": { - "us-east-2": "429704687514", - "me-south-1": "117516905037", - "us-west-2": "236514542706", - "ca-central-1": "310906938811", + "af-south-1": "559312083959", "ap-east-1": "493642496378", - "us-east-1": "081325390199", + "ap-northeast-1": "102112518831", "ap-northeast-2": "806072073708", - "eu-west-2": "712779665605", - "ap-southeast-2": "52832661640", - "cn-northwest-1": "390780980154", - "eu-north-1": "243637512696", - "cn-north-1": "390048526115", + "ap-northeast-3": "792733760839", "ap-south-1": "394103062818", - "eu-west-3": "615547856133", + "ap-southeast-1": "492261229750", + "ap-southeast-2": "452832661640", "ap-southeast-3": "276181064229", - "af-south-1": "559312083959", - "eu-west-1": "470317259841", + "ca-central-1": "310906938811", + "cn-north-1": "390048526115", + "cn-northwest-1": "390780980154", "eu-central-1": "936697816551", - "sa-east-1": "782484402741", - "ap-northeast-3": "792733760839", + "eu-north-1": "243637512696", "eu-south-1": "592751261982", - "ap-northeast-1": "102112518831", - "us-west-1": "742091327244", - "ap-southeast-1": "492261229750", + "eu-west-1": "470317259841", + "eu-west-2": "712779665605", + "eu-west-3": "615547856133", + "il-central-1": "380164790875", "me-central-1": "103105715889", + "me-south-1": "117516905037", + "sa-east-1": "782484402741", + "us-east-1": "081325390199", + "us-east-2": "429704687514", "us-gov-east-1": "107072934176", - "us-gov-west-1": "107173498710" + "us-gov-west-1": "107173498710", + "us-west-1": "742091327244", + "us-west-2": "236514542706" }, "repository": "sagemaker-base-python" } diff --git a/src/sagemaker/image_uri_config/semantic-segmentation.json b/src/sagemaker/image_uri_config/semantic-segmentation.json index 3f21ddf2e3..37671ed7a1 100644 --- a/src/sagemaker/image_uri_config/semantic-segmentation.json +++ b/src/sagemaker/image_uri_config/semantic-segmentation.json @@ -1,5 +1,8 @@ { - "scope": ["inference", "training"], + "scope": [ + "inference", + "training" + ], "versions": { "1": { "registries": { @@ -20,19 +23,19 @@ "eu-central-1": "813361260812", "eu-central-2": "680994064768", "eu-north-1": "669576153137", + "eu-south-1": "257386234256", + "eu-south-2": "104374241257", "eu-west-1": "685385470294", "eu-west-2": "644912444149", 
"eu-west-3": "749696950732", - "eu-south-1": "257386234256", - "eu-south-2": "104374241257", "il-central-1": "898809789911", - "me-south-1": "249704162688", "me-central-1": "272398656194", + "me-south-1": "249704162688", "sa-east-1": "855470959533", "us-east-1": "811284229777", "us-east-2": "825641698319", - "us-gov-west-1": "226302683700", "us-gov-east-1": "237065988967", + "us-gov-west-1": "226302683700", "us-iso-east-1": "490574956308", "us-isob-east-1": "765400339828", "us-west-1": "632365934929", @@ -41,4 +44,4 @@ "repository": "semantic-segmentation" } } -} +} \ No newline at end of file diff --git a/src/sagemaker/image_uri_config/seq2seq.json b/src/sagemaker/image_uri_config/seq2seq.json index ecf2b180e1..cc73055bb0 100644 --- a/src/sagemaker/image_uri_config/seq2seq.json +++ b/src/sagemaker/image_uri_config/seq2seq.json @@ -1,5 +1,8 @@ { - "scope": ["inference", "training"], + "scope": [ + "inference", + "training" + ], "versions": { "1": { "registries": { @@ -20,19 +23,19 @@ "eu-central-1": "813361260812", "eu-central-2": "680994064768", "eu-north-1": "669576153137", + "eu-south-1": "257386234256", + "eu-south-2": "104374241257", "eu-west-1": "685385470294", "eu-west-2": "644912444149", "eu-west-3": "749696950732", - "eu-south-1": "257386234256", - "eu-south-2": "104374241257", "il-central-1": "898809789911", - "me-south-1": "249704162688", "me-central-1": "272398656194", + "me-south-1": "249704162688", "sa-east-1": "855470959533", "us-east-1": "811284229777", "us-east-2": "825641698319", - "us-gov-west-1": "226302683700", "us-gov-east-1": "237065988967", + "us-gov-west-1": "226302683700", "us-iso-east-1": "490574956308", "us-isob-east-1": "765400339828", "us-west-1": "632365934929", @@ -41,4 +44,4 @@ "repository": "seq2seq" } } -} +} \ No newline at end of file diff --git a/src/sagemaker/image_uri_config/sklearn.json b/src/sagemaker/image_uri_config/sklearn.json index bd3981da9f..7d25792e8c 100644 --- a/src/sagemaker/image_uri_config/sklearn.json +++ b/src/sagemaker/image_uri_config/sklearn.json @@ -2,8 +2,12 @@ "inference": { "versions": { "0.20.0": { - "processors": ["cpu"], - "py_versions": ["py3"], + "processors": [ + "cpu" + ], + "py_versions": [ + "py3" + ], "registries": { "af-south-1": "510948584623", "ap-east-1": "651117190479", @@ -22,19 +26,19 @@ "eu-central-1": "492215442770", "eu-central-2": "680994064768", "eu-north-1": "662702820516", + "eu-south-1": "978288397137", + "eu-south-2": "104374241257", "eu-west-1": "141502667606", "eu-west-2": "764974769150", "eu-west-3": "659782779980", - "eu-south-1": "978288397137", - "eu-south-2": "104374241257", "il-central-1": "898809789911", - "me-south-1": "801668240914", "me-central-1": "272398656194", + "me-south-1": "801668240914", "sa-east-1": "737474898029", "us-east-1": "683313688378", "us-east-2": "257758044811", - "us-gov-west-1": "414596584902", "us-gov-east-1": "237065988967", + "us-gov-west-1": "414596584902", "us-iso-east-1": "833128469047", "us-isob-east-1": "281123927165", "us-west-1": "746614075791", @@ -43,8 +47,12 @@ "repository": "sagemaker-scikit-learn" }, "0.23-1": { - "processors": ["cpu"], - "py_versions": ["py3"], + "processors": [ + "cpu" + ], + "py_versions": [ + "py3" + ], "registries": { "af-south-1": "510948584623", "ap-east-1": "651117190479", @@ -63,19 +71,19 @@ "eu-central-1": "492215442770", "eu-central-2": "680994064768", "eu-north-1": "662702820516", + "eu-south-1": "978288397137", + "eu-south-2": "104374241257", "eu-west-1": "141502667606", "eu-west-2": "764974769150", "eu-west-3": 
"659782779980", - "eu-south-1": "978288397137", - "eu-south-2": "104374241257", "il-central-1": "898809789911", - "me-south-1": "801668240914", "me-central-1": "272398656194", + "me-south-1": "801668240914", "sa-east-1": "737474898029", "us-east-1": "683313688378", "us-east-2": "257758044811", - "us-gov-west-1": "414596584902", "us-gov-east-1": "237065988967", + "us-gov-west-1": "414596584902", "us-iso-east-1": "833128469047", "us-isob-east-1": "281123927165", "us-west-1": "746614075791", @@ -84,8 +92,12 @@ "repository": "sagemaker-scikit-learn" }, "1.0-1": { - "processors": ["cpu"], - "py_versions": ["py3"], + "processors": [ + "cpu" + ], + "py_versions": [ + "py3" + ], "registries": { "af-south-1": "510948584623", "ap-east-1": "651117190479", @@ -104,19 +116,19 @@ "eu-central-1": "492215442770", "eu-central-2": "680994064768", "eu-north-1": "662702820516", + "eu-south-1": "978288397137", + "eu-south-2": "104374241257", "eu-west-1": "141502667606", "eu-west-2": "764974769150", "eu-west-3": "659782779980", - "eu-south-1": "978288397137", - "eu-south-2": "104374241257", "il-central-1": "898809789911", - "me-south-1": "801668240914", "me-central-1": "272398656194", + "me-south-1": "801668240914", "sa-east-1": "737474898029", "us-east-1": "683313688378", "us-east-2": "257758044811", - "us-gov-west-1": "414596584902", "us-gov-east-1": "237065988967", + "us-gov-west-1": "414596584902", "us-iso-east-1": "833128469047", "us-isob-east-1": "281123927165", "us-west-1": "746614075791", @@ -125,8 +137,12 @@ "repository": "sagemaker-scikit-learn" }, "1.2-1": { - "processors": ["cpu"], - "py_versions": ["py3"], + "processors": [ + "cpu" + ], + "py_versions": [ + "py3" + ], "registries": { "af-south-1": "510948584623", "ap-east-1": "651117190479", @@ -145,19 +161,19 @@ "eu-central-1": "492215442770", "eu-central-2": "680994064768", "eu-north-1": "662702820516", + "eu-south-1": "978288397137", + "eu-south-2": "104374241257", "eu-west-1": "141502667606", "eu-west-2": "764974769150", "eu-west-3": "659782779980", - "eu-south-1": "978288397137", - "eu-south-2": "104374241257", "il-central-1": "898809789911", - "me-south-1": "801668240914", "me-central-1": "272398656194", + "me-south-1": "801668240914", "sa-east-1": "737474898029", "us-east-1": "683313688378", "us-east-2": "257758044811", - "us-gov-west-1": "414596584902", "us-gov-east-1": "237065988967", + "us-gov-west-1": "414596584902", "us-iso-east-1": "833128469047", "us-isob-east-1": "281123927165", "us-west-1": "746614075791", @@ -170,8 +186,12 @@ "training": { "versions": { "0.20.0": { - "processors": ["cpu"], - "py_versions": ["py3"], + "processors": [ + "cpu" + ], + "py_versions": [ + "py3" + ], "registries": { "af-south-1": "510948584623", "ap-east-1": "651117190479", @@ -190,19 +210,19 @@ "eu-central-1": "492215442770", "eu-central-2": "680994064768", "eu-north-1": "662702820516", + "eu-south-1": "978288397137", + "eu-south-2": "104374241257", "eu-west-1": "141502667606", "eu-west-2": "764974769150", "eu-west-3": "659782779980", - "eu-south-1": "978288397137", - "eu-south-2": "104374241257", "il-central-1": "898809789911", - "me-south-1": "801668240914", "me-central-1": "272398656194", + "me-south-1": "801668240914", "sa-east-1": "737474898029", "us-east-1": "683313688378", "us-east-2": "257758044811", - "us-gov-west-1": "414596584902", "us-gov-east-1": "237065988967", + "us-gov-west-1": "414596584902", "us-iso-east-1": "833128469047", "us-isob-east-1": "281123927165", "us-west-1": "746614075791", @@ -211,8 +231,12 @@ "repository": 
"sagemaker-scikit-learn" }, "0.23-1": { - "processors": ["cpu"], - "py_versions": ["py3"], + "processors": [ + "cpu" + ], + "py_versions": [ + "py3" + ], "registries": { "af-south-1": "510948584623", "ap-east-1": "651117190479", @@ -231,19 +255,19 @@ "eu-central-1": "492215442770", "eu-central-2": "680994064768", "eu-north-1": "662702820516", + "eu-south-1": "978288397137", + "eu-south-2": "104374241257", "eu-west-1": "141502667606", "eu-west-2": "764974769150", "eu-west-3": "659782779980", - "eu-south-1": "978288397137", - "eu-south-2": "104374241257", "il-central-1": "898809789911", - "me-south-1": "801668240914", "me-central-1": "272398656194", + "me-south-1": "801668240914", "sa-east-1": "737474898029", "us-east-1": "683313688378", "us-east-2": "257758044811", - "us-gov-west-1": "414596584902", "us-gov-east-1": "237065988967", + "us-gov-west-1": "414596584902", "us-iso-east-1": "833128469047", "us-isob-east-1": "281123927165", "us-west-1": "746614075791", @@ -252,8 +276,12 @@ "repository": "sagemaker-scikit-learn" }, "1.0-1": { - "processors": ["cpu"], - "py_versions": ["py3"], + "processors": [ + "cpu" + ], + "py_versions": [ + "py3" + ], "registries": { "af-south-1": "510948584623", "ap-east-1": "651117190479", @@ -272,19 +300,19 @@ "eu-central-1": "492215442770", "eu-central-2": "680994064768", "eu-north-1": "662702820516", + "eu-south-1": "978288397137", + "eu-south-2": "104374241257", "eu-west-1": "141502667606", "eu-west-2": "764974769150", "eu-west-3": "659782779980", - "eu-south-1": "978288397137", - "eu-south-2": "104374241257", "il-central-1": "898809789911", - "me-south-1": "801668240914", "me-central-1": "272398656194", + "me-south-1": "801668240914", "sa-east-1": "737474898029", "us-east-1": "683313688378", "us-east-2": "257758044811", - "us-gov-west-1": "414596584902", "us-gov-east-1": "237065988967", + "us-gov-west-1": "414596584902", "us-iso-east-1": "833128469047", "us-isob-east-1": "281123927165", "us-west-1": "746614075791", @@ -293,8 +321,12 @@ "repository": "sagemaker-scikit-learn" }, "1.2-1": { - "processors": ["cpu"], - "py_versions": ["py3"], + "processors": [ + "cpu" + ], + "py_versions": [ + "py3" + ], "registries": { "af-south-1": "510948584623", "ap-east-1": "651117190479", @@ -313,19 +345,19 @@ "eu-central-1": "492215442770", "eu-central-2": "680994064768", "eu-north-1": "662702820516", + "eu-south-1": "978288397137", + "eu-south-2": "104374241257", "eu-west-1": "141502667606", "eu-west-2": "764974769150", "eu-west-3": "659782779980", - "eu-south-1": "978288397137", - "eu-south-2": "104374241257", "il-central-1": "898809789911", - "me-south-1": "801668240914", "me-central-1": "272398656194", + "me-south-1": "801668240914", "sa-east-1": "737474898029", "us-east-1": "683313688378", "us-east-2": "257758044811", - "us-gov-west-1": "414596584902", "us-gov-east-1": "237065988967", + "us-gov-west-1": "414596584902", "us-iso-east-1": "833128469047", "us-isob-east-1": "281123927165", "us-west-1": "746614075791", @@ -338,8 +370,12 @@ "inference_graviton": { "versions": { "1.0-1": { - "processors": ["cpu"], - "py_versions": ["py3"], + "processors": [ + "cpu" + ], + "py_versions": [ + "py3" + ], "registries": { "af-south-1": "510948584623", "ap-east-1": "651117190479", @@ -358,19 +394,19 @@ "eu-central-1": "492215442770", "eu-central-2": "680994064768", "eu-north-1": "662702820516", + "eu-south-1": "978288397137", + "eu-south-2": "104374241257", "eu-west-1": "141502667606", "eu-west-2": "764974769150", "eu-west-3": "659782779980", - "eu-south-1": "978288397137", - 
"eu-south-2": "104374241257", "il-central-1": "898809789911", - "me-south-1": "801668240914", "me-central-1": "272398656194", + "me-south-1": "801668240914", "sa-east-1": "737474898029", "us-east-1": "683313688378", "us-east-2": "257758044811", - "us-gov-west-1": "414596584902", "us-gov-east-1": "237065988967", + "us-gov-west-1": "414596584902", "us-iso-east-1": "833128469047", "us-isob-east-1": "281123927165", "us-west-1": "746614075791", @@ -380,4 +416,4 @@ } } } -} +} \ No newline at end of file diff --git a/src/sagemaker/image_uri_config/spark.json b/src/sagemaker/image_uri_config/spark.json index 5527cf5fd4..8fac03e716 100644 --- a/src/sagemaker/image_uri_config/spark.json +++ b/src/sagemaker/image_uri_config/spark.json @@ -1,160 +1,209 @@ { "processing": { - "processors": ["cpu"], + "processors": [ + "cpu" + ], "versions": { "2.4": { - "py_versions": ["py37"], + "py_versions": [ + "py37" + ], "registries": { - "me-south-1": "750251592176", - "ap-south-1": "105495057255", - "eu-north-1": "330188676905", - "eu-west-3": "136845547031", - "us-east-2": "314815235551", - "eu-west-1": "571004829621", - "eu-central-1": "906073651304", - "sa-east-1": "737130764395", + "af-south-1": "309385258863", "ap-east-1": "732049463269", - "us-east-1": "173754725891", - "ap-northeast-2": "860869212795", - "eu-west-2": "836651553127", "ap-northeast-1": "411782140378", - "us-west-2": "153931337802", - "us-west-1": "667973535471", + "ap-northeast-2": "860869212795", + "ap-northeast-3": "102471314380", + "ap-south-1": "105495057255", + "ap-south-2": "873151114052", "ap-southeast-1": "759080221371", "ap-southeast-2": "440695851116", + "ap-southeast-3": "800295151634", + "ap-southeast-4": "819679513684", "ca-central-1": "446299261295", "cn-north-1": "671472414489", "cn-northwest-1": "844356804704", + "eu-central-1": "906073651304", + "eu-central-2": "142351485170", + "eu-north-1": "330188676905", "eu-south-1": "753923664805", - "af-south-1": "309385258863", + "eu-south-2": "833944533722", + "eu-west-1": "571004829621", + "eu-west-2": "836651553127", + "eu-west-3": "136845547031", + "il-central-1": "408426139102", + "me-central-1": "395420993607", + "me-south-1": "750251592176", + "sa-east-1": "737130764395", + "us-east-1": "173754725891", + "us-east-2": "314815235551", + "us-gov-east-1": "260923028637", "us-gov-west-1": "271483468897", - "ap-southeast-3": "732049463269" + "us-west-1": "667973535471", + "us-west-2": "153931337802" }, "repository": "sagemaker-spark-processing" }, "3.0": { - "py_versions": ["py37"], - "registries": { - "me-south-1": "750251592176", - "ap-south-1": "105495057255", - "eu-north-1": "330188676905", - "eu-west-3": "136845547031", - "us-east-2": "314815235551", - "eu-west-1": "571004829621", - "eu-central-1": "906073651304", - "sa-east-1": "737130764395", - "ap-east-1": "732049463269", - "us-east-1": "173754725891", - "ap-northeast-2": "860869212795", - "eu-west-2": "836651553127", - "ap-northeast-1": "411782140378", - "us-west-2": "153931337802", - "us-west-1": "667973535471", - "ap-southeast-1": "759080221371", - "ap-southeast-2": "440695851116", - "ca-central-1": "446299261295", - "cn-north-1": "671472414489", - "cn-northwest-1": "844356804704", - "eu-south-1": "753923664805", - "af-south-1": "309385258863", - "us-gov-west-1": "271483468897", - "ap-southeast-3": "732049463269" - }, - "repository": "sagemaker-spark-processing" - }, - "3.1": { - "py_versions": ["py37"], + "py_versions": [ + "py37" + ], "registries": { - "me-south-1": "750251592176", + "af-south-1": "309385258863", + 
"ap-east-1": "732049463269", + "ap-northeast-1": "411782140378", + "ap-northeast-2": "860869212795", + "ap-northeast-3": "102471314380", "ap-south-1": "105495057255", + "ap-south-2": "873151114052", + "ap-southeast-1": "759080221371", + "ap-southeast-2": "440695851116", + "ap-southeast-3": "800295151634", + "ap-southeast-4": "819679513684", + "ca-central-1": "446299261295", + "cn-north-1": "671472414489", + "cn-northwest-1": "844356804704", + "eu-central-1": "906073651304", + "eu-central-2": "142351485170", "eu-north-1": "330188676905", - "eu-west-3": "136845547031", - "us-east-2": "314815235551", + "eu-south-1": "753923664805", + "eu-south-2": "833944533722", "eu-west-1": "571004829621", - "eu-central-1": "906073651304", + "eu-west-2": "836651553127", + "eu-west-3": "136845547031", + "il-central-1": "408426139102", + "me-central-1": "395420993607", + "me-south-1": "750251592176", "sa-east-1": "737130764395", - "ap-east-1": "732049463269", "us-east-1": "173754725891", - "ap-northeast-2": "860869212795", - "eu-west-2": "836651553127", - "ap-northeast-1": "411782140378", - "us-west-2": "153931337802", + "us-east-2": "314815235551", + "us-gov-east-1": "260923028637", + "us-gov-west-1": "271483468897", "us-west-1": "667973535471", + "us-west-2": "153931337802" + }, + "repository": "sagemaker-spark-processing" + }, + "3.1": { + "py_versions": [ + "py37" + ], + "registries": { + "af-south-1": "309385258863", + "ap-east-1": "732049463269", + "ap-northeast-1": "411782140378", + "ap-northeast-2": "860869212795", + "ap-northeast-3": "102471314380", + "ap-south-1": "105495057255", + "ap-south-2": "873151114052", "ap-southeast-1": "759080221371", "ap-southeast-2": "440695851116", + "ap-southeast-3": "800295151634", + "ap-southeast-4": "819679513684", "ca-central-1": "446299261295", "cn-north-1": "671472414489", "cn-northwest-1": "844356804704", + "eu-central-1": "906073651304", + "eu-central-2": "142351485170", + "eu-north-1": "330188676905", "eu-south-1": "753923664805", - "af-south-1": "309385258863", + "eu-south-2": "833944533722", + "eu-west-1": "571004829621", + "eu-west-2": "836651553127", + "eu-west-3": "136845547031", + "il-central-1": "408426139102", + "me-central-1": "395420993607", + "me-south-1": "750251592176", + "sa-east-1": "737130764395", + "us-east-1": "173754725891", + "us-east-2": "314815235551", + "us-gov-east-1": "260923028637", "us-gov-west-1": "271483468897", - "ap-southeast-3": "732049463269", - "ap-northeast-3": "102471314380" + "us-west-1": "667973535471", + "us-west-2": "153931337802" }, "repository": "sagemaker-spark-processing" }, "3.2": { - "py_versions": ["py39"], + "py_versions": [ + "py39" + ], "registries": { - "me-south-1": "750251592176", - "ap-south-1": "105495057255", - "eu-north-1": "330188676905", - "eu-west-3": "136845547031", - "us-east-2": "314815235551", - "eu-west-1": "571004829621", - "eu-central-1": "906073651304", - "sa-east-1": "737130764395", + "af-south-1": "309385258863", "ap-east-1": "732049463269", - "us-east-1": "173754725891", - "ap-northeast-2": "860869212795", - "eu-west-2": "836651553127", "ap-northeast-1": "411782140378", - "us-west-2": "153931337802", - "us-west-1": "667973535471", + "ap-northeast-2": "860869212795", + "ap-northeast-3": "102471314380", + "ap-south-1": "105495057255", + "ap-south-2": "873151114052", "ap-southeast-1": "759080221371", "ap-southeast-2": "440695851116", + "ap-southeast-3": "800295151634", + "ap-southeast-4": "819679513684", "ca-central-1": "446299261295", "cn-north-1": "671472414489", "cn-northwest-1": 
"844356804704", + "eu-central-1": "906073651304", + "eu-central-2": "142351485170", + "eu-north-1": "330188676905", "eu-south-1": "753923664805", - "af-south-1": "309385258863", + "eu-south-2": "833944533722", + "eu-west-1": "571004829621", + "eu-west-2": "836651553127", + "eu-west-3": "136845547031", + "il-central-1": "408426139102", + "me-central-1": "395420993607", + "me-south-1": "750251592176", + "sa-east-1": "737130764395", + "us-east-1": "173754725891", + "us-east-2": "314815235551", + "us-gov-east-1": "260923028637", "us-gov-west-1": "271483468897", - "ap-southeast-3": "732049463269", - "ap-northeast-3": "102471314380" + "us-west-1": "667973535471", + "us-west-2": "153931337802" }, "repository": "sagemaker-spark-processing" }, "3.3": { - "py_versions": ["py39"], + "py_versions": [ + "py39" + ], "registries": { - "me-south-1": "750251592176", - "ap-south-1": "105495057255", - "eu-north-1": "330188676905", - "eu-west-3": "136845547031", - "us-east-2": "314815235551", - "eu-west-1": "571004829621", - "eu-central-1": "906073651304", - "sa-east-1": "737130764395", + "af-south-1": "309385258863", "ap-east-1": "732049463269", - "us-east-1": "173754725891", - "ap-northeast-2": "860869212795", - "eu-west-2": "836651553127", "ap-northeast-1": "411782140378", - "us-west-2": "153931337802", - "us-west-1": "667973535471", + "ap-northeast-2": "860869212795", + "ap-northeast-3": "102471314380", + "ap-south-1": "105495057255", + "ap-south-2": "873151114052", "ap-southeast-1": "759080221371", "ap-southeast-2": "440695851116", + "ap-southeast-3": "800295151634", + "ap-southeast-4": "819679513684", "ca-central-1": "446299261295", "cn-north-1": "671472414489", "cn-northwest-1": "844356804704", + "eu-central-1": "906073651304", + "eu-central-2": "142351485170", + "eu-north-1": "330188676905", "eu-south-1": "753923664805", - "af-south-1": "309385258863", + "eu-south-2": "833944533722", + "eu-west-1": "571004829621", + "eu-west-2": "836651553127", + "eu-west-3": "136845547031", + "il-central-1": "408426139102", + "me-central-1": "395420993607", + "me-south-1": "750251592176", + "sa-east-1": "737130764395", + "us-east-1": "173754725891", + "us-east-2": "314815235551", + "us-gov-east-1": "260923028637", "us-gov-west-1": "271483468897", - "ap-southeast-3": "732049463269", - "ap-northeast-3": "102471314380" + "us-west-1": "667973535471", + "us-west-2": "153931337802" }, "repository": "sagemaker-spark-processing" } } } -} +} \ No newline at end of file diff --git a/src/sagemaker/image_uri_config/stabilityai.json b/src/sagemaker/image_uri_config/stabilityai.json index 9daa1aca32..3f3ff729f6 100644 --- a/src/sagemaker/image_uri_config/stabilityai.json +++ b/src/sagemaker/image_uri_config/stabilityai.json @@ -1,12 +1,16 @@ { "inference": { - "processors": ["gpu"], + "processors": [ + "gpu" + ], "version_aliases": { "0.1": "0.1.0" }, "versions": { "0.1.0": { - "py_versions": ["py310"], + "py_versions": [ + "py310" + ], "registries": { "af-south-1": "626614931356", "il-central-1": "780543022126", @@ -35,7 +39,8 @@ "us-east-1": "763104351884", "us-east-2": "763104351884", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "tag_prefix": "2.0.1-sgm0.1.0", "repository": "stabilityai-pytorch-inference", @@ -45,4 +50,4 @@ } } } -} +} \ No newline at end of file diff --git a/src/sagemaker/image_uri_config/tensorflow.json b/src/sagemaker/image_uri_config/tensorflow.json index 30cda89344..3453f5f120 100644 --- 
a/src/sagemaker/image_uri_config/tensorflow.json +++ b/src/sagemaker/image_uri_config/tensorflow.json @@ -172,7 +172,8 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "tensorflow-inference-eia" }, @@ -211,7 +212,8 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "tensorflow-inference-eia" }, @@ -250,7 +252,8 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "tensorflow-inference-eia" }, @@ -289,7 +292,8 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "tensorflow-inference-eia" } @@ -457,7 +461,8 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "tensorflow-inference" }, @@ -496,7 +501,8 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "tensorflow-inference" }, @@ -535,7 +541,8 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "tensorflow-inference" }, @@ -574,7 +581,8 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "tensorflow-inference" }, @@ -613,7 +621,8 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "tensorflow-inference" }, @@ -652,7 +661,8 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "tensorflow-inference" }, @@ -691,7 +701,8 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "tensorflow-inference" }, @@ -928,7 +939,8 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "tensorflow-inference" }, @@ -967,7 +979,8 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "tensorflow-inference" }, @@ -1006,7 +1019,8 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + 
"ca-west-1": "204538143572" }, "repository": "tensorflow-inference" }, @@ -1045,7 +1059,8 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "tensorflow-inference" }, @@ -1084,7 +1099,8 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "tensorflow-inference" }, @@ -1123,7 +1139,8 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "tensorflow-inference" }, @@ -1162,7 +1179,8 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "tensorflow-inference" }, @@ -1201,7 +1219,8 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "tensorflow-inference" }, @@ -1240,7 +1259,8 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "tensorflow-inference" }, @@ -1279,7 +1299,8 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "tensorflow-inference" }, @@ -1318,7 +1339,8 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "tensorflow-inference" }, @@ -1357,7 +1379,8 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "tensorflow-inference" }, @@ -1396,7 +1419,8 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "tensorflow-inference" }, @@ -1435,7 +1459,8 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "tensorflow-inference" }, @@ -1474,7 +1499,8 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "tensorflow-inference" }, @@ -1513,7 +1539,8 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "tensorflow-inference" }, @@ -1552,7 +1579,8 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": 
"204538143572" }, "repository": "tensorflow-inference" }, @@ -1591,7 +1619,8 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "tensorflow-inference" }, @@ -1630,7 +1659,8 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "tensorflow-inference" }, @@ -1669,7 +1699,8 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "tensorflow-inference" }, @@ -1708,7 +1739,8 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "tensorflow-inference" }, @@ -1747,7 +1779,8 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "tensorflow-inference" }, @@ -1786,7 +1819,8 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "tensorflow-inference" }, @@ -1824,7 +1858,8 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "tensorflow-inference" }, @@ -1862,7 +1897,8 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "tensorflow-inference" }, @@ -1900,7 +1936,8 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "tensorflow-inference" }, @@ -1938,7 +1975,8 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "tensorflow-inference" }, @@ -1976,7 +2014,8 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "tensorflow-inference" }, @@ -2014,7 +2053,8 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "tensorflow-inference" }, @@ -2052,7 +2092,8 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "tensorflow-inference" }, @@ -2090,7 +2131,8 @@ "us-gov-east-1": "446045086412", "us-gov-west-1": "442386744353", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, 
"repository": "tensorflow-inference" } @@ -2143,10 +2185,13 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "tensorflow-inference-graviton", - "container_version": {"cpu": "ubuntu20.04"} + "container_version": { + "cpu": "ubuntu20.04" + } }, "2.12.1": { "py_versions": [ @@ -2185,10 +2230,13 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "tensorflow-inference-graviton", - "container_version": {"cpu": "ubuntu20.04"} + "container_version": { + "cpu": "ubuntu20.04" + } }, "2.13.0": { "py_versions": [ @@ -2227,10 +2275,13 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "tensorflow-inference-graviton", - "container_version": {"cpu": "ubuntu20.04"} + "container_version": { + "cpu": "ubuntu20.04" + } } } }, @@ -2403,7 +2454,7 @@ "py3": { "registries": { "af-south-1": "626614931356", - "il-central-1": "780543022126", + "il-central-1": "780543022126", "ap-east-1": "871362719292", "ap-northeast-1": "763104351884", "ap-northeast-2": "763104351884", @@ -2435,7 +2486,8 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "tensorflow-training" } @@ -2479,7 +2531,8 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "tensorflow-training" }, @@ -2522,7 +2575,8 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "tensorflow-training" }, @@ -2566,7 +2620,8 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "tensorflow-training" }, @@ -2610,7 +2665,8 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "tensorflow-training" }, @@ -2654,7 +2710,8 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "tensorflow-training" }, @@ -2698,7 +2755,8 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "tensorflow-training" }, @@ -2939,7 +2997,8 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "tensorflow-training" }, @@ -2982,7 +3041,8 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", 
- "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "tensorflow-training" }, @@ -3025,7 +3085,8 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "tensorflow-training" }, @@ -3067,7 +3128,8 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "tensorflow-training" }, @@ -3109,7 +3171,8 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "tensorflow-training" }, @@ -3152,7 +3215,8 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "tensorflow-training" }, @@ -3195,7 +3259,8 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "tensorflow-training" }, @@ -3237,7 +3302,8 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "tensorflow-training" }, @@ -3279,7 +3345,8 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "tensorflow-training" }, @@ -3321,7 +3388,8 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "tensorflow-training" }, @@ -3363,7 +3431,8 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "tensorflow-training" }, @@ -3405,7 +3474,8 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "tensorflow-training" }, @@ -3447,7 +3517,8 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "tensorflow-training" }, @@ -3489,7 +3560,8 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "tensorflow-training" }, @@ -3531,7 +3603,8 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "tensorflow-training" }, @@ -3573,7 +3646,8 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + 
"us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "tensorflow-training" }, @@ -3615,7 +3689,8 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "tensorflow-training" }, @@ -3657,7 +3732,8 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "tensorflow-training" }, @@ -3699,7 +3775,8 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "tensorflow-training" }, @@ -3741,7 +3818,8 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "tensorflow-training" }, @@ -3783,7 +3861,8 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "tensorflow-training" }, @@ -3825,7 +3904,8 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "tensorflow-training" }, @@ -3867,7 +3947,8 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "tensorflow-training" }, @@ -3909,7 +3990,8 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "tensorflow-training" }, @@ -3951,7 +4033,8 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "tensorflow-training" }, @@ -3992,7 +4075,8 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "tensorflow-training" }, @@ -4030,7 +4114,8 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "tensorflow-training" }, @@ -4062,7 +4147,8 @@ "us-east-1": "763104351884", "us-east-2": "763104351884", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "tensorflow-training" }, @@ -4100,10 +4186,11 @@ "us-iso-east-1": "886529160074", "us-isob-east-1": "094389454867", "us-west-1": "763104351884", - "us-west-2": "763104351884" + "us-west-2": "763104351884", + "ca-west-1": "204538143572" }, "repository": "tensorflow-training" } } } -} +} \ No newline at end of file diff --git a/src/sagemaker/image_uri_config/xgboost.json b/src/sagemaker/image_uri_config/xgboost.json index bedd7cf067..b3883eabe7 100644 --- 
a/src/sagemaker/image_uri_config/xgboost.json +++ b/src/sagemaker/image_uri_config/xgboost.json @@ -23,18 +23,18 @@ "eu-central-1": "813361260812", "eu-central-2": "680994064768", "eu-north-1": "669576153137", + "eu-south-1": "257386234256", + "eu-south-2": "104374241257", "eu-west-1": "685385470294", "eu-west-2": "644912444149", "eu-west-3": "749696950732", - "eu-south-1": "257386234256", - "eu-south-2": "104374241257", - "me-south-1": "249704162688", "me-central-1": "272398656194", + "me-south-1": "249704162688", "sa-east-1": "855470959533", "us-east-1": "811284229777", "us-east-2": "825641698319", - "us-gov-west-1": "226302683700", "us-gov-east-1": "237065988967", + "us-gov-west-1": "226302683700", "us-iso-east-1": "490574956308", "us-isob-east-1": "765400339828", "us-west-1": "632365934929", @@ -43,8 +43,12 @@ "repository": "xgboost" }, "0.90-1": { - "processors": ["cpu"], - "py_versions": ["py3"], + "processors": [ + "cpu" + ], + "py_versions": [ + "py3" + ], "registries": { "af-south-1": "510948584623", "ap-east-1": "651117190479", @@ -63,18 +67,19 @@ "eu-central-1": "492215442770", "eu-central-2": "680994064768", "eu-north-1": "662702820516", + "eu-south-1": "978288397137", + "eu-south-2": "104374241257", "eu-west-1": "141502667606", "eu-west-2": "764974769150", "eu-west-3": "659782779980", - "eu-south-1": "978288397137", - "eu-south-2": "104374241257", - "me-south-1": "801668240914", + "il-central-1": "898809789911", "me-central-1": "272398656194", + "me-south-1": "801668240914", "sa-east-1": "737474898029", "us-east-1": "683313688378", "us-east-2": "257758044811", - "us-gov-west-1": "414596584902", "us-gov-east-1": "237065988967", + "us-gov-west-1": "414596584902", "us-iso-east-1": "833128469047", "us-isob-east-1": "281123927165", "us-west-1": "746614075791", @@ -83,10 +88,14 @@ "repository": "sagemaker-xgboost" }, "0.90-2": { - "processors": ["cpu"], - "py_versions": ["py3"], + "processors": [ + "cpu" + ], + "py_versions": [ + "py3" + ], "registries": { - "af-south-1": "510948584623", + "af-south-1": "510948584623", "ap-east-1": "651117190479", "ap-northeast-1": "354813040037", "ap-northeast-2": "366743142698", @@ -103,18 +112,19 @@ "eu-central-1": "492215442770", "eu-central-2": "680994064768", "eu-north-1": "662702820516", + "eu-south-1": "978288397137", + "eu-south-2": "104374241257", "eu-west-1": "141502667606", "eu-west-2": "764974769150", "eu-west-3": "659782779980", - "eu-south-1": "978288397137", - "eu-south-2": "104374241257", - "me-south-1": "801668240914", + "il-central-1": "898809789911", "me-central-1": "272398656194", + "me-south-1": "801668240914", "sa-east-1": "737474898029", "us-east-1": "683313688378", "us-east-2": "257758044811", - "us-gov-west-1": "414596584902", "us-gov-east-1": "237065988967", + "us-gov-west-1": "414596584902", "us-iso-east-1": "833128469047", "us-isob-east-1": "281123927165", "us-west-1": "746614075791", @@ -123,8 +133,12 @@ "repository": "sagemaker-xgboost" }, "1.0-1": { - "processors": ["cpu"], - "py_versions": ["py3"], + "processors": [ + "cpu" + ], + "py_versions": [ + "py3" + ], "registries": { "af-south-1": "510948584623", "ap-east-1": "651117190479", @@ -143,19 +157,19 @@ "eu-central-1": "492215442770", "eu-central-2": "680994064768", "eu-north-1": "662702820516", + "eu-south-1": "978288397137", + "eu-south-2": "104374241257", "eu-west-1": "141502667606", "eu-west-2": "764974769150", "eu-west-3": "659782779980", - "eu-south-1": "978288397137", - "eu-south-2": "104374241257", "il-central-1": "898809789911", - "me-south-1": 
"801668240914", "me-central-1": "272398656194", + "me-south-1": "801668240914", "sa-east-1": "737474898029", "us-east-1": "683313688378", "us-east-2": "257758044811", - "us-gov-west-1": "414596584902", "us-gov-east-1": "237065988967", + "us-gov-west-1": "414596584902", "us-iso-east-1": "833128469047", "us-isob-east-1": "281123927165", "us-west-1": "746614075791", @@ -182,19 +196,19 @@ "eu-central-1": "492215442770", "eu-central-2": "680994064768", "eu-north-1": "662702820516", + "eu-south-1": "978288397137", + "eu-south-2": "104374241257", "eu-west-1": "141502667606", "eu-west-2": "764974769150", "eu-west-3": "659782779980", - "eu-south-1": "978288397137", - "eu-south-2": "104374241257", "il-central-1": "898809789911", - "me-south-1": "801668240914", "me-central-1": "272398656194", + "me-south-1": "801668240914", "sa-east-1": "737474898029", "us-east-1": "683313688378", "us-east-2": "257758044811", - "us-gov-west-1": "414596584902", "us-gov-east-1": "237065988967", + "us-gov-west-1": "414596584902", "us-iso-east-1": "833128469047", "us-isob-east-1": "281123927165", "us-west-1": "746614075791", @@ -221,19 +235,19 @@ "eu-central-1": "492215442770", "eu-central-2": "680994064768", "eu-north-1": "662702820516", + "eu-south-1": "978288397137", + "eu-south-2": "104374241257", "eu-west-1": "141502667606", "eu-west-2": "764974769150", "eu-west-3": "659782779980", - "eu-south-1": "978288397137", - "eu-south-2": "104374241257", "il-central-1": "898809789911", - "me-south-1": "801668240914", "me-central-1": "272398656194", + "me-south-1": "801668240914", "sa-east-1": "737474898029", "us-east-1": "683313688378", "us-east-2": "257758044811", - "us-gov-west-1": "414596584902", "us-gov-east-1": "237065988967", + "us-gov-west-1": "414596584902", "us-iso-east-1": "833128469047", "us-isob-east-1": "281123927165", "us-west-1": "746614075791", @@ -260,19 +274,19 @@ "eu-central-1": "492215442770", "eu-central-2": "680994064768", "eu-north-1": "662702820516", + "eu-south-1": "978288397137", + "eu-south-2": "104374241257", "eu-west-1": "141502667606", "eu-west-2": "764974769150", "eu-west-3": "659782779980", - "eu-south-1": "978288397137", - "eu-south-2": "104374241257", "il-central-1": "898809789911", - "me-south-1": "801668240914", "me-central-1": "272398656194", + "me-south-1": "801668240914", "sa-east-1": "737474898029", "us-east-1": "683313688378", "us-east-2": "257758044811", - "us-gov-west-1": "414596584902", "us-gov-east-1": "237065988967", + "us-gov-west-1": "414596584902", "us-iso-east-1": "833128469047", "us-isob-east-1": "281123927165", "us-west-1": "746614075791", @@ -299,19 +313,19 @@ "eu-central-1": "492215442770", "eu-central-2": "680994064768", "eu-north-1": "662702820516", + "eu-south-1": "978288397137", + "eu-south-2": "104374241257", "eu-west-1": "141502667606", "eu-west-2": "764974769150", "eu-west-3": "659782779980", - "eu-south-1": "978288397137", - "eu-south-2": "104374241257", "il-central-1": "898809789911", - "me-south-1": "801668240914", "me-central-1": "272398656194", + "me-south-1": "801668240914", "sa-east-1": "737474898029", "us-east-1": "683313688378", "us-east-2": "257758044811", - "us-gov-west-1": "414596584902", "us-gov-east-1": "237065988967", + "us-gov-west-1": "414596584902", "us-iso-east-1": "833128469047", "us-isob-east-1": "281123927165", "us-west-1": "746614075791", @@ -338,19 +352,19 @@ "eu-central-1": "492215442770", "eu-central-2": "680994064768", "eu-north-1": "662702820516", + "eu-south-1": "978288397137", + "eu-south-2": "104374241257", "eu-west-1": "141502667606", 
"eu-west-2": "764974769150", "eu-west-3": "659782779980", - "eu-south-1": "978288397137", - "eu-south-2": "104374241257", "il-central-1": "898809789911", - "me-south-1": "801668240914", "me-central-1": "272398656194", + "me-south-1": "801668240914", "sa-east-1": "737474898029", "us-east-1": "683313688378", "us-east-2": "257758044811", - "us-gov-west-1": "414596584902", "us-gov-east-1": "237065988967", + "us-gov-west-1": "414596584902", "us-iso-east-1": "833128469047", "us-isob-east-1": "281123927165", "us-west-1": "746614075791", @@ -384,18 +398,18 @@ "eu-central-1": "813361260812", "eu-central-2": "680994064768", "eu-north-1": "669576153137", + "eu-south-1": "257386234256", + "eu-south-2": "104374241257", "eu-west-1": "685385470294", "eu-west-2": "644912444149", "eu-west-3": "749696950732", - "eu-south-1": "257386234256", - "eu-south-2": "104374241257", - "me-south-1": "249704162688", "me-central-1": "272398656194", + "me-south-1": "249704162688", "sa-east-1": "855470959533", "us-east-1": "811284229777", "us-east-2": "825641698319", - "us-gov-west-1": "226302683700", "us-gov-east-1": "237065988967", + "us-gov-west-1": "226302683700", "us-iso-east-1": "490574956308", "us-isob-east-1": "765400339828", "us-west-1": "632365934929", @@ -404,8 +418,12 @@ "repository": "xgboost" }, "0.90-1": { - "processors": ["cpu"], - "py_versions": ["py3"], + "processors": [ + "cpu" + ], + "py_versions": [ + "py3" + ], "registries": { "af-south-1": "510948584623", "ap-east-1": "651117190479", @@ -424,18 +442,19 @@ "eu-central-1": "492215442770", "eu-central-2": "680994064768", "eu-north-1": "662702820516", + "eu-south-1": "978288397137", + "eu-south-2": "104374241257", "eu-west-1": "141502667606", "eu-west-2": "764974769150", "eu-west-3": "659782779980", - "eu-south-1": "978288397137", - "eu-south-2": "104374241257", - "me-south-1": "801668240914", + "il-central-1": "898809789911", "me-central-1": "272398656194", + "me-south-1": "801668240914", "sa-east-1": "737474898029", "us-east-1": "683313688378", "us-east-2": "257758044811", - "us-gov-west-1": "414596584902", "us-gov-east-1": "237065988967", + "us-gov-west-1": "414596584902", "us-iso-east-1": "833128469047", "us-isob-east-1": "281123927165", "us-west-1": "746614075791", @@ -444,10 +463,14 @@ "repository": "sagemaker-xgboost" }, "0.90-2": { - "processors": ["cpu"], - "py_versions": ["py3"], + "processors": [ + "cpu" + ], + "py_versions": [ + "py3" + ], "registries": { - "af-south-1": "510948584623", + "af-south-1": "510948584623", "ap-east-1": "651117190479", "ap-northeast-1": "354813040037", "ap-northeast-2": "366743142698", @@ -464,18 +487,19 @@ "eu-central-1": "492215442770", "eu-central-2": "680994064768", "eu-north-1": "662702820516", + "eu-south-1": "978288397137", + "eu-south-2": "104374241257", "eu-west-1": "141502667606", "eu-west-2": "764974769150", "eu-west-3": "659782779980", - "eu-south-1": "978288397137", - "eu-south-2": "104374241257", - "me-south-1": "801668240914", + "il-central-1": "898809789911", "me-central-1": "272398656194", + "me-south-1": "801668240914", "sa-east-1": "737474898029", "us-east-1": "683313688378", "us-east-2": "257758044811", - "us-gov-west-1": "414596584902", "us-gov-east-1": "237065988967", + "us-gov-west-1": "414596584902", "us-iso-east-1": "833128469047", "us-isob-east-1": "281123927165", "us-west-1": "746614075791", @@ -484,8 +508,12 @@ "repository": "sagemaker-xgboost" }, "1.0-1": { - "processors": ["cpu"], - "py_versions": ["py3"], + "processors": [ + "cpu" + ], + "py_versions": [ + "py3" + ], "registries": { 
"af-south-1": "510948584623", "ap-east-1": "651117190479", @@ -504,19 +532,19 @@ "eu-central-1": "492215442770", "eu-central-2": "680994064768", "eu-north-1": "662702820516", + "eu-south-1": "978288397137", + "eu-south-2": "104374241257", "eu-west-1": "141502667606", "eu-west-2": "764974769150", "eu-west-3": "659782779980", - "eu-south-1": "978288397137", - "eu-south-2": "104374241257", "il-central-1": "898809789911", - "me-south-1": "801668240914", "me-central-1": "272398656194", + "me-south-1": "801668240914", "sa-east-1": "737474898029", "us-east-1": "683313688378", "us-east-2": "257758044811", - "us-gov-west-1": "414596584902", "us-gov-east-1": "237065988967", + "us-gov-west-1": "414596584902", "us-iso-east-1": "833128469047", "us-isob-east-1": "281123927165", "us-west-1": "746614075791", @@ -543,19 +571,19 @@ "eu-central-1": "492215442770", "eu-central-2": "680994064768", "eu-north-1": "662702820516", + "eu-south-1": "978288397137", + "eu-south-2": "104374241257", "eu-west-1": "141502667606", "eu-west-2": "764974769150", "eu-west-3": "659782779980", - "eu-south-1": "978288397137", - "eu-south-2": "104374241257", "il-central-1": "898809789911", - "me-south-1": "801668240914", "me-central-1": "272398656194", + "me-south-1": "801668240914", "sa-east-1": "737474898029", "us-east-1": "683313688378", "us-east-2": "257758044811", - "us-gov-west-1": "414596584902", "us-gov-east-1": "237065988967", + "us-gov-west-1": "414596584902", "us-iso-east-1": "833128469047", "us-isob-east-1": "281123927165", "us-west-1": "746614075791", @@ -582,19 +610,19 @@ "eu-central-1": "492215442770", "eu-central-2": "680994064768", "eu-north-1": "662702820516", + "eu-south-1": "978288397137", + "eu-south-2": "104374241257", "eu-west-1": "141502667606", "eu-west-2": "764974769150", "eu-west-3": "659782779980", - "eu-south-1": "978288397137", - "eu-south-2": "104374241257", "il-central-1": "898809789911", - "me-south-1": "801668240914", "me-central-1": "272398656194", + "me-south-1": "801668240914", "sa-east-1": "737474898029", "us-east-1": "683313688378", "us-east-2": "257758044811", - "us-gov-west-1": "414596584902", "us-gov-east-1": "237065988967", + "us-gov-west-1": "414596584902", "us-iso-east-1": "833128469047", "us-isob-east-1": "281123927165", "us-west-1": "746614075791", @@ -621,19 +649,19 @@ "eu-central-1": "492215442770", "eu-central-2": "680994064768", "eu-north-1": "662702820516", + "eu-south-1": "978288397137", + "eu-south-2": "104374241257", "eu-west-1": "141502667606", "eu-west-2": "764974769150", "eu-west-3": "659782779980", - "eu-south-1": "978288397137", - "eu-south-2": "104374241257", "il-central-1": "898809789911", - "me-south-1": "801668240914", "me-central-1": "272398656194", + "me-south-1": "801668240914", "sa-east-1": "737474898029", "us-east-1": "683313688378", "us-east-2": "257758044811", - "us-gov-west-1": "414596584902", "us-gov-east-1": "237065988967", + "us-gov-west-1": "414596584902", "us-iso-east-1": "833128469047", "us-isob-east-1": "281123927165", "us-west-1": "746614075791", @@ -660,19 +688,19 @@ "eu-central-1": "492215442770", "eu-central-2": "680994064768", "eu-north-1": "662702820516", + "eu-south-1": "978288397137", + "eu-south-2": "104374241257", "eu-west-1": "141502667606", "eu-west-2": "764974769150", "eu-west-3": "659782779980", - "eu-south-1": "978288397137", - "eu-south-2": "104374241257", "il-central-1": "898809789911", - "me-south-1": "801668240914", "me-central-1": "272398656194", + "me-south-1": "801668240914", "sa-east-1": "737474898029", "us-east-1": 
"683313688378", "us-east-2": "257758044811", - "us-gov-west-1": "414596584902", "us-gov-east-1": "237065988967", + "us-gov-west-1": "414596584902", "us-iso-east-1": "833128469047", "us-isob-east-1": "281123927165", "us-west-1": "746614075791", @@ -699,19 +727,19 @@ "eu-central-1": "492215442770", "eu-central-2": "680994064768", "eu-north-1": "662702820516", + "eu-south-1": "978288397137", + "eu-south-2": "104374241257", "eu-west-1": "141502667606", "eu-west-2": "764974769150", "eu-west-3": "659782779980", - "eu-south-1": "978288397137", - "eu-south-2": "104374241257", "il-central-1": "898809789911", - "me-south-1": "801668240914", "me-central-1": "272398656194", + "me-south-1": "801668240914", "sa-east-1": "737474898029", "us-east-1": "683313688378", "us-east-2": "257758044811", - "us-gov-west-1": "414596584902", "us-gov-east-1": "237065988967", + "us-gov-west-1": "414596584902", "us-iso-east-1": "833128469047", "us-isob-east-1": "281123927165", "us-west-1": "746614075791", @@ -742,19 +770,19 @@ "eu-central-1": "492215442770", "eu-central-2": "680994064768", "eu-north-1": "662702820516", + "eu-south-1": "978288397137", + "eu-south-2": "104374241257", "eu-west-1": "141502667606", "eu-west-2": "764974769150", "eu-west-3": "659782779980", - "eu-south-1": "978288397137", - "eu-south-2": "104374241257", "il-central-1": "898809789911", - "me-south-1": "801668240914", "me-central-1": "272398656194", + "me-south-1": "801668240914", "sa-east-1": "737474898029", "us-east-1": "683313688378", "us-east-2": "257758044811", - "us-gov-west-1": "414596584902", "us-gov-east-1": "237065988967", + "us-gov-west-1": "414596584902", "us-iso-east-1": "833128469047", "us-isob-east-1": "281123927165", "us-west-1": "746614075791", @@ -781,19 +809,19 @@ "eu-central-1": "492215442770", "eu-central-2": "680994064768", "eu-north-1": "662702820516", + "eu-south-1": "978288397137", + "eu-south-2": "104374241257", "eu-west-1": "141502667606", "eu-west-2": "764974769150", "eu-west-3": "659782779980", - "eu-south-1": "978288397137", - "eu-south-2": "104374241257", "il-central-1": "898809789911", - "me-south-1": "801668240914", "me-central-1": "272398656194", + "me-south-1": "801668240914", "sa-east-1": "737474898029", "us-east-1": "683313688378", "us-east-2": "257758044811", - "us-gov-west-1": "414596584902", "us-gov-east-1": "237065988967", + "us-gov-west-1": "414596584902", "us-iso-east-1": "833128469047", "us-isob-east-1": "281123927165", "us-west-1": "746614075791", @@ -803,4 +831,4 @@ } } } -} +} \ No newline at end of file From 612b5cee99307e9b83ba2a2a248ab46d84951e84 Mon Sep 17 00:00:00 2001 From: sagemaker-bot Date: Wed, 13 Dec 2023 22:04:54 +0000 Subject: [PATCH 02/76] change: update image_uri_configs 12-13-2023 14:04:54 PST --- .../image_uri_config/huggingface-llm.json | 51 ++++++++++++++++++- 1 file changed, 50 insertions(+), 1 deletion(-) diff --git a/src/sagemaker/image_uri_config/huggingface-llm.json b/src/sagemaker/image_uri_config/huggingface-llm.json index 3431a8e445..352ea3587f 100644 --- a/src/sagemaker/image_uri_config/huggingface-llm.json +++ b/src/sagemaker/image_uri_config/huggingface-llm.json @@ -8,7 +8,9 @@ "0.8": "0.8.2", "0.9": "0.9.3", "1.0": "1.0.3", - "1.1": "1.1.0" + "1.1": "1.1.0", + "1.2": "1.2.0", + "1.3": "1.3.1" }, "versions": { "0.6.0": { @@ -292,6 +294,53 @@ "container_version": { "gpu": "cu121-ubuntu20.04" } + }, + "1.3.1": { + "py_versions": [ + "py310" + ], + "registries": { + "af-south-1": "626614931356", + "il-central-1": "780543022126", + "ap-east-1": "871362719292", + 
"ap-northeast-1": "763104351884", + "ap-northeast-2": "763104351884", + "ap-northeast-3": "364406365360", + "ap-south-1": "763104351884", + "ap-south-2": "772153158452", + "ap-southeast-1": "763104351884", + "ap-southeast-2": "763104351884", + "ap-southeast-3": "907027046896", + "ap-southeast-4": "457447274322", + "ca-central-1": "763104351884", + "cn-north-1": "727897471807", + "cn-northwest-1": "727897471807", + "eu-central-1": "763104351884", + "eu-central-2": "380420809688", + "eu-north-1": "763104351884", + "eu-west-1": "763104351884", + "eu-west-2": "763104351884", + "eu-west-3": "763104351884", + "eu-south-1": "692866216735", + "eu-south-2": "503227376785", + "me-south-1": "217643126080", + "me-central-1": "914824155844", + "sa-east-1": "763104351884", + "us-east-1": "763104351884", + "us-east-2": "763104351884", + "us-gov-east-1": "446045086412", + "us-gov-west-1": "442386744353", + "us-iso-east-1": "886529160074", + "us-isob-east-1": "094389454867", + "us-west-1": "763104351884", + "us-west-2": "763104351884", + "ca-west-1": "204538143572" + }, + "tag_prefix": "2.1.1-tgi1.3.1", + "repository": "huggingface-pytorch-tgi-inference", + "container_version": { + "gpu": "cu121-ubuntu20.04" + } } } } From 1919ae79d9b1cc6402172b16965b3785464c5621 Mon Sep 17 00:00:00 2001 From: ci Date: Thu, 14 Dec 2023 20:08:43 +0000 Subject: [PATCH 03/76] prepare release v2.200.1 --- CHANGELOG.md | 9 +++++++++ VERSION | 2 +- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4e78b1ff91..50ed614047 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,14 @@ # Changelog +## v2.200.1 (2023-12-14) + +### Bug Fixes and Other Changes + + * Merge branch 'master-rba' into local_merge + * Fix user agent tag issue + * update image_uri_configs 12-13-2023 14:04:54 PST + * update image_uri_configs 12-13-2023 12:23:06 PST + ## v2.200.0 (2023-12-13) ### Deprecations and Removals diff --git a/VERSION b/VERSION index 5cec97550a..c7d56de15d 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -2.200.1.dev0 +2.200.1 From 52512b3f4cb5b315071d3229079d040135a0f17a Mon Sep 17 00:00:00 2001 From: ci Date: Thu, 14 Dec 2023 20:08:45 +0000 Subject: [PATCH 04/76] update development version to v2.200.2.dev0 --- VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VERSION b/VERSION index c7d56de15d..d59c8f162b 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -2.200.1 +2.200.2.dev0 From 250767ce9dba6fe1a31a4d26951c4d155345d695 Mon Sep 17 00:00:00 2001 From: qidewenwhen <32910701+qidewenwhen@users.noreply.github.com> Date: Thu, 14 Dec 2023 12:19:26 -0800 Subject: [PATCH 05/76] fix: Move func and args serialization of function step to step level (#4312) --- .../core/pipeline_variables.py | 41 ++++++++++-- .../remote_function/core/serialization.py | 59 +++++++++++------- .../remote_function/core/stored_function.py | 42 +++++++++++++ src/sagemaker/remote_function/job.py | 16 ++--- src/sagemaker/workflow/function_step.py | 21 +++++++ .../sagemaker/workflow/test_step_decorator.py | 58 +++++++++++++++++ .../core/test_pipeline_variables.py | 56 +++++++++++++---- .../core/test_stored_function.py | 62 +++++++++++++++---- .../sagemaker/remote_function/test_job.py | 18 +++--- .../sagemaker/workflow/test_function_step.py | 40 +++++++----- 10 files changed, 328 insertions(+), 85 deletions(-) diff --git a/src/sagemaker/remote_function/core/pipeline_variables.py b/src/sagemaker/remote_function/core/pipeline_variables.py index 2b18e6c50c..269ce94113 100644 --- 
a/src/sagemaker/remote_function/core/pipeline_variables.py +++ b/src/sagemaker/remote_function/core/pipeline_variables.py @@ -77,6 +77,17 @@ class _ExecutionVariable: name: str +@dataclass +class _S3BaseUriIdentifier: + """Identifies that the class refers to function step s3 base uri. + + The s3_base_uri = s3_root_uri + pipeline_name. + This identifier is resolved in function step runtime by SDK. + """ + + NAME = "S3_BASE_URI" + + @dataclass class _DelayedReturn: """Delayed return from a function.""" @@ -155,6 +166,7 @@ def __init__( hmac_key: str, parameter_resolver: _ParameterResolver, execution_variable_resolver: _ExecutionVariableResolver, + s3_base_uri: str, **settings, ): """Resolve delayed return. @@ -164,8 +176,12 @@ def __init__( hmac_key: key used to encrypt serialized and deserialized function and arguments. parameter_resolver: resolver used to pipeline parameters. execution_variable_resolver: resolver used to resolve execution variables. + s3_base_uri (str): the s3 base uri of the function step that + the serialized artifacts will be uploaded to. + The s3_base_uri = s3_root_uri + pipeline_name. **settings: settings to pass to the deserialization function. """ + self._s3_base_uri = s3_base_uri self._parameter_resolver = parameter_resolver self._execution_variable_resolver = execution_variable_resolver # different delayed returns can have the same uri, so we need to dedupe @@ -205,6 +221,8 @@ def _resolve_delayed_return_uri(self, delayed_return: _DelayedReturn): uri.append(self._parameter_resolver.resolve(component)) elif isinstance(component, _ExecutionVariable): uri.append(self._execution_variable_resolver.resolve(component)) + elif isinstance(component, _S3BaseUriIdentifier): + uri.append(self._s3_base_uri) else: uri.append(component) return s3_path_join(*uri) @@ -219,7 +237,12 @@ def _retrieve_child_item(delayed_return: _DelayedReturn, deserialized_obj: Any): def resolve_pipeline_variables( - context: Context, func_args: Tuple, func_kwargs: Dict, hmac_key: str, **settings + context: Context, + func_args: Tuple, + func_kwargs: Dict, + hmac_key: str, + s3_base_uri: str, + **settings, ): """Resolve pipeline variables. @@ -228,6 +251,8 @@ def resolve_pipeline_variables( func_args: function args. func_kwargs: function kwargs. hmac_key: key used to encrypt serialized and deserialized function and arguments. + s3_base_uri: the s3 base uri of the function step that the serialized artifacts + will be uploaded to. The s3_base_uri = s3_root_uri + pipeline_name. **settings: settings to pass to the deserialization function. """ @@ -251,6 +276,7 @@ def resolve_pipeline_variables( hmac_key=hmac_key, parameter_resolver=parameter_resolver, execution_variable_resolver=execution_variable_resolver, + s3_base_uri=s3_base_uri, **settings, ) @@ -289,11 +315,10 @@ def resolve_pipeline_variables( return resolved_func_args, resolved_func_kwargs -def convert_pipeline_variables_to_pickleable(s3_base_uri: str, func_args: Tuple, func_kwargs: Dict): +def convert_pipeline_variables_to_pickleable(func_args: Tuple, func_kwargs: Dict): """Convert pipeline variables to pickleable. Args: - s3_base_uri: s3 base uri where artifacts are stored. func_args: function args. func_kwargs: function kwargs. """ @@ -304,11 +329,19 @@ def convert_pipeline_variables_to_pickleable(s3_base_uri: str, func_args: Tuple, from sagemaker.workflow.function_step import DelayedReturn + # Notes: + # 1. The s3_base_uri = s3_root_uri + pipeline_name, but the two may be unknown + # when defining function steps. 
After step-level arg serialization, + # it's hard to update the s3_base_uri in pipeline compile time. + # Thus set a placeholder: _S3BaseUriIdentifier, and let the runtime job to resolve it. + # 2. For saying s3_root_uri is unknown, it's because when defining function steps, + # the pipeline's sagemaker_session is not passed in, but the default s3_root_uri + # should be retrieved from the pipeline's sagemaker_session. def convert(arg): if isinstance(arg, DelayedReturn): return _DelayedReturn( uri=[ - s3_base_uri, + _S3BaseUriIdentifier(), ExecutionVariables.PIPELINE_EXECUTION_ID._pickleable, arg._step.name, "results", diff --git a/src/sagemaker/remote_function/core/serialization.py b/src/sagemaker/remote_function/core/serialization.py index 943e89636d..821744ee6b 100644 --- a/src/sagemaker/remote_function/core/serialization.py +++ b/src/sagemaker/remote_function/core/serialization.py @@ -161,17 +161,13 @@ def serialize_func_to_s3( Raises: SerializationError: when fail to serialize function to bytes. """ - bytes_to_upload = CloudpickleSerializer.serialize(func) - _upload_bytes_to_s3(bytes_to_upload, f"{s3_uri}/payload.pkl", s3_kms_key, sagemaker_session) - - sha256_hash = _compute_hash(bytes_to_upload, secret_key=hmac_key) - - _upload_bytes_to_s3( - _MetaData(sha256_hash).to_json(), - f"{s3_uri}/metadata.json", - s3_kms_key, - sagemaker_session, + _upload_payload_and_metadata_to_s3( + bytes_to_upload=CloudpickleSerializer.serialize(func), + hmac_key=hmac_key, + s3_uri=s3_uri, + sagemaker_session=sagemaker_session, + s3_kms_key=s3_kms_key, ) @@ -220,17 +216,12 @@ def serialize_obj_to_s3( SerializationError: when fail to serialize object to bytes. """ - bytes_to_upload = CloudpickleSerializer.serialize(obj) - - _upload_bytes_to_s3(bytes_to_upload, f"{s3_uri}/payload.pkl", s3_kms_key, sagemaker_session) - - sha256_hash = _compute_hash(bytes_to_upload, secret_key=hmac_key) - - _upload_bytes_to_s3( - _MetaData(sha256_hash).to_json(), - f"{s3_uri}/metadata.json", - s3_kms_key, - sagemaker_session, + _upload_payload_and_metadata_to_s3( + bytes_to_upload=CloudpickleSerializer.serialize(obj), + hmac_key=hmac_key, + s3_uri=s3_uri, + sagemaker_session=sagemaker_session, + s3_kms_key=s3_kms_key, ) @@ -318,8 +309,32 @@ def serialize_exception_to_s3( """ pickling_support.install() - bytes_to_upload = CloudpickleSerializer.serialize(exc) + _upload_payload_and_metadata_to_s3( + bytes_to_upload=CloudpickleSerializer.serialize(exc), + hmac_key=hmac_key, + s3_uri=s3_uri, + sagemaker_session=sagemaker_session, + s3_kms_key=s3_kms_key, + ) + +def _upload_payload_and_metadata_to_s3( + bytes_to_upload: Union[bytes, io.BytesIO], + hmac_key: str, + s3_uri: str, + sagemaker_session: Session, + s3_kms_key, +): + """Uploads serialized payload and metadata to s3. + + Args: + bytes_to_upload (bytes): Serialized bytes to upload. + hmac_key (str): Key used to calculate hmac-sha256 hash of the serialized obj. + s3_uri (str): S3 root uri to which resulting serialized artifacts will be uploaded. + sagemaker_session (sagemaker.session.Session): + The underlying Boto3 session which AWS service calls are delegated to. + s3_kms_key (str): KMS key used to encrypt artifacts uploaded to S3. 
+ """ _upload_bytes_to_s3(bytes_to_upload, f"{s3_uri}/payload.pkl", s3_kms_key, sagemaker_session) sha256_hash = _compute_hash(bytes_to_upload, secret_key=hmac_key) diff --git a/src/sagemaker/remote_function/core/stored_function.py b/src/sagemaker/remote_function/core/stored_function.py index 52b9e33936..862c67d9ee 100644 --- a/src/sagemaker/remote_function/core/stored_function.py +++ b/src/sagemaker/remote_function/core/stored_function.py @@ -14,6 +14,7 @@ from __future__ import absolute_import import os +from dataclasses import dataclass from typing import Any @@ -36,6 +37,14 @@ JSON_RESULTS_FILE = "results.json" +@dataclass +class _SerializedData: + """Data class to store serialized function and arguments""" + + func: bytes + args: bytes + + class StoredFunction: """Class representing a remote function stored in S3.""" @@ -105,6 +114,38 @@ def save(self, func, *args, **kwargs): s3_kms_key=self.s3_kms_key, ) + def save_pipeline_step_function(self, serialized_data): + """Upload serialized function and arguments to s3. + + Args: + serialized_data (_SerializedData): The serialized function + and function arguments of a function step. + """ + + logger.info( + "Uploading serialized function code to %s", + s3_path_join(self.func_upload_path, FUNCTION_FOLDER), + ) + serialization._upload_payload_and_metadata_to_s3( + bytes_to_upload=serialized_data.func, + hmac_key=self.hmac_key, + s3_uri=s3_path_join(self.func_upload_path, FUNCTION_FOLDER), + sagemaker_session=self.sagemaker_session, + s3_kms_key=self.s3_kms_key, + ) + + logger.info( + "Uploading serialized function arguments to %s", + s3_path_join(self.func_upload_path, ARGUMENTS_FOLDER), + ) + serialization._upload_payload_and_metadata_to_s3( + bytes_to_upload=serialized_data.args, + hmac_key=self.hmac_key, + s3_uri=s3_path_join(self.func_upload_path, ARGUMENTS_FOLDER), + sagemaker_session=self.sagemaker_session, + s3_kms_key=self.s3_kms_key, + ) + def load_and_invoke(self) -> Any: """Load and deserialize the function and the arguments and then execute it.""" @@ -134,6 +175,7 @@ def load_and_invoke(self) -> Any: args, kwargs, hmac_key=self.hmac_key, + s3_base_uri=self.s3_base_uri, sagemaker_session=self.sagemaker_session, ) diff --git a/src/sagemaker/remote_function/job.py b/src/sagemaker/remote_function/job.py index a3d6a1d780..c4570da463 100644 --- a/src/sagemaker/remote_function/job.py +++ b/src/sagemaker/remote_function/job.py @@ -52,11 +52,8 @@ from sagemaker.utils import name_from_base, _tmpdir, resolve_value_from_config from sagemaker.s3 import s3_path_join, S3Uploader from sagemaker import vpc_utils -from sagemaker.remote_function.core.stored_function import StoredFunction -from sagemaker.remote_function.core.pipeline_variables import ( - Context, - convert_pipeline_variables_to_pickleable, -) +from sagemaker.remote_function.core.stored_function import StoredFunction, _SerializedData +from sagemaker.remote_function.core.pipeline_variables import Context from sagemaker.remote_function.runtime_environment.runtime_environment_manager import ( RuntimeEnvironmentManager, _DependencySettings, @@ -695,6 +692,7 @@ def compile( func_args: tuple, func_kwargs: dict, run_info=None, + serialized_data: _SerializedData = None, ) -> dict: """Build the artifacts and generate the training job request.""" from sagemaker.workflow.properties import Properties @@ -732,12 +730,8 @@ def compile( func_step_s3_dir=step_compilation_context.pipeline_build_time, ), ) - converted_func_args, converted_func_kwargs = convert_pipeline_variables_to_pickleable( - 
s3_base_uri=s3_base_uri, - func_args=func_args, - func_kwargs=func_kwargs, - ) - stored_function.save(func, *converted_func_args, **converted_func_kwargs) + + stored_function.save_pipeline_step_function(serialized_data) stopping_condition = { "MaxRuntimeInSeconds": job_settings.max_runtime_in_seconds, diff --git a/src/sagemaker/workflow/function_step.py b/src/sagemaker/workflow/function_step.py index da20fd93d9..4fee8ef269 100644 --- a/src/sagemaker/workflow/function_step.py +++ b/src/sagemaker/workflow/function_step.py @@ -83,6 +83,11 @@ def __init__( func_kwargs (dict): keyword arguments of the python function. **kwargs: Additional arguments to be passed to the `step` decorator. """ + from sagemaker.remote_function.core.pipeline_variables import ( + convert_pipeline_variables_to_pickleable, + ) + from sagemaker.remote_function.core.serialization import CloudpickleSerializer + from sagemaker.remote_function.core.stored_function import _SerializedData super(_FunctionStep, self).__init__( name, StepTypeEnum.TRAINING, display_name, description, depends_on, retry_policies @@ -96,6 +101,21 @@ def __init__( self.__job_settings = None + ( + self._converted_func_args, + self._converted_func_kwargs, + ) = convert_pipeline_variables_to_pickleable( + func_args=self._func_args, + func_kwargs=self._func_kwargs, + ) + + self._serialized_data = _SerializedData( + func=CloudpickleSerializer.serialize(self._func), + args=CloudpickleSerializer.serialize( + (self._converted_func_args, self._converted_func_kwargs) + ), + ) + @property def func(self): """The python function to run as a pipeline step.""" @@ -185,6 +205,7 @@ def arguments(self) -> RequestType: func=self.func, func_args=self.func_args, func_kwargs=self.func_kwargs, + serialized_data=self._serialized_data, ) # Continue to pop job name if not explicitly opted-in via config request_dict = trim_request_dict(request_dict, "TrainingJobName", step_compilation_context) diff --git a/tests/integ/sagemaker/workflow/test_step_decorator.py b/tests/integ/sagemaker/workflow/test_step_decorator.py index bd4eb4c3d1..70424383f1 100644 --- a/tests/integ/sagemaker/workflow/test_step_decorator.py +++ b/tests/integ/sagemaker/workflow/test_step_decorator.py @@ -858,3 +858,61 @@ def cuberoot(x): pipeline.delete() except Exception: pass + + +def test_step_level_serialization( + sagemaker_session, role, pipeline_name, region_name, dummy_container_without_error +): + os.environ["AWS_DEFAULT_REGION"] = region_name + + _EXPECTED_STEP_A_OUTPUT = "This pipeline is a function." + _EXPECTED_STEP_B_OUTPUT = "This generates a function arg." + + step_config = dict( + role=role, + image_uri=dummy_container_without_error, + instance_type=INSTANCE_TYPE, + ) + + # This pipeline function may clash with the pipeline object + # defined below. + # However, if the function and args serialization happen in + # step level, this clash won't happen. 
+ def pipeline(): + return _EXPECTED_STEP_A_OUTPUT + + @step(**step_config) + def generator(): + return _EXPECTED_STEP_B_OUTPUT + + @step(**step_config) + def func_with_collision(var: str): + return f"{pipeline()} {var}" + + step_output_a = generator() + step_output_b = func_with_collision(step_output_a) + + pipeline = Pipeline( # noqa: F811 + name=pipeline_name, + steps=[step_output_b], + sagemaker_session=sagemaker_session, + ) + + try: + create_and_execute_pipeline( + pipeline=pipeline, + pipeline_name=pipeline_name, + region_name=region_name, + role=role, + no_of_steps=2, + last_step_name=get_step(step_output_b).name, + execution_parameters=dict(), + step_status="Succeeded", + step_result_type=str, + step_result_value=f"{_EXPECTED_STEP_A_OUTPUT} {_EXPECTED_STEP_B_OUTPUT}", + ) + finally: + try: + pipeline.delete() + except Exception: + pass diff --git a/tests/unit/sagemaker/remote_function/core/test_pipeline_variables.py b/tests/unit/sagemaker/remote_function/core/test_pipeline_variables.py index d936db22ee..ebe26653b8 100644 --- a/tests/unit/sagemaker/remote_function/core/test_pipeline_variables.py +++ b/tests/unit/sagemaker/remote_function/core/test_pipeline_variables.py @@ -28,6 +28,7 @@ _DelayedReturnResolver, resolve_pipeline_variables, convert_pipeline_variables_to_pickleable, + _S3BaseUriIdentifier, ) from sagemaker.workflow.parameters import ( @@ -39,6 +40,8 @@ from sagemaker.workflow.function_step import DelayedReturn from sagemaker.workflow.properties import Properties +PIPELINE_NAME = "some-pipeline" + @patch("sagemaker.remote_function.core.pipeline_variables.deserialize_obj_from_s3") def test_resolve_delayed_returns(mock_deserializer): @@ -70,6 +73,7 @@ def test_resolve_delayed_returns(mock_deserializer): _ParameterResolver(Context()), _ExecutionVariableResolver(Context()), sagemaker_session=None, + s3_base_uri=f"s3://my-bucket/{PIPELINE_NAME}", ) assert resolver.resolve(delayed_returns[0]) == 1 @@ -99,6 +103,7 @@ def test_deserializer_fails(mock_deserializer): _ParameterResolver(Context()), _ExecutionVariableResolver(Context()), sagemaker_session=None, + s3_base_uri=f"s3://my-bucket/{PIPELINE_NAME}", ) @@ -116,7 +121,12 @@ def test_no_pipeline_variables_to_resolve(mock_deserializer, func_args, func_kwa mock_deserializer.return_value = (1.0, 2.0, 3.0) resolved_args, resolved_kwargs = resolve_pipeline_variables( - Context(), func_args, func_kwargs, hmac_key="1234", sagemaker_session=None + Context(), + func_args, + func_kwargs, + hmac_key="1234", + s3_base_uri="s3://my-bucket", + sagemaker_session=None, ) assert resolved_args == func_args @@ -133,11 +143,19 @@ def test_no_pipeline_variables_to_resolve(mock_deserializer, func_args, func_kwa _ParameterFloat("parameter_2"), _ParameterBoolean("parameter_4"), _DelayedReturn( - uri=["s3://my-bucket/", _ExecutionVariable("ExecutionId"), "sub-folder-1/"], + uri=[ + _S3BaseUriIdentifier(), + _ExecutionVariable("ExecutionId"), + "sub-folder-1/", + ], reference_path=(("__getitem__", 0),), ), _DelayedReturn( - uri=["s3://my-bucket/", _ExecutionVariable("ExecutionId"), "sub-folder-1/"], + uri=[ + _S3BaseUriIdentifier(), + _ExecutionVariable("ExecutionId"), + "sub-folder-1/", + ], reference_path=(("__getitem__", 1),), ), _Properties("Steps.step_name.TrainingJobName"), @@ -154,11 +172,19 @@ def test_no_pipeline_variables_to_resolve(mock_deserializer, func_args, func_kwa "c": _ParameterFloat("parameter_2"), "d": _ParameterBoolean("parameter_4"), "e": _DelayedReturn( - uri=["s3://my-bucket/", _ExecutionVariable("ExecutionId"), 
"sub-folder-1/"], + uri=[ + _S3BaseUriIdentifier(), + _ExecutionVariable("ExecutionId"), + "sub-folder-1/", + ], reference_path=(("__getitem__", 0),), ), "f": _DelayedReturn( - uri=["s3://my-bucket/", _ExecutionVariable("ExecutionId"), "sub-folder-1/"], + uri=[ + _S3BaseUriIdentifier(), + _ExecutionVariable("ExecutionId"), + "sub-folder-1/", + ], reference_path=(("__getitem__", 1),), ), "g": _Properties("Steps.step_name.TrainingJobName"), @@ -184,6 +210,7 @@ def test_resolve_pipeline_variables( expected_resolved_args, expected_resolved_kwargs, ): + s3_base_uri = f"s3://my-bucket/{PIPELINE_NAME}" context = Context( property_references={ "Parameters.parameter_1": "1", @@ -192,20 +219,25 @@ def test_resolve_pipeline_variables( "Parameters.parameter_4": "true", "Execution.ExecutionId": "execution-id", "Steps.step_name.TrainingJobName": "a-cool-name", - } + }, ) mock_deserializer.return_value = (1.0, 2.0, 3.0) resolved_args, resolved_kwargs = resolve_pipeline_variables( - context, func_args, func_kwargs, hmac_key="1234", sagemaker_session=None + context, + func_args, + func_kwargs, + hmac_key="1234", + s3_base_uri=s3_base_uri, + sagemaker_session=None, ) assert resolved_args == expected_resolved_args assert resolved_kwargs == expected_resolved_kwargs mock_deserializer.assert_called_once_with( sagemaker_session=None, - s3_uri="s3://my-bucket/execution-id/sub-folder-1", + s3_uri=f"{s3_base_uri}/execution-id/sub-folder-1", hmac_key="1234", ) @@ -237,15 +269,13 @@ def test_convert_pipeline_variables_to_pickleable(): } converted_args, converted_kwargs = convert_pipeline_variables_to_pickleable( - "base_uri", func_args, func_kwargs + func_args, func_kwargs ) - print(converted_args) - assert converted_args == ( _DelayedReturn( uri=[ - "base_uri", + _S3BaseUriIdentifier(), _ExecutionVariable(name="PipelineExecutionId"), "parent_step", "results", @@ -264,7 +294,7 @@ def test_convert_pipeline_variables_to_pickleable(): assert converted_kwargs == { "a": _DelayedReturn( uri=[ - "base_uri", + _S3BaseUriIdentifier(), _ExecutionVariable(name="PipelineExecutionId"), "parent_step", "results", diff --git a/tests/unit/sagemaker/remote_function/core/test_stored_function.py b/tests/unit/sagemaker/remote_function/core/test_stored_function.py index 78b5a700da..bcc09cb585 100644 --- a/tests/unit/sagemaker/remote_function/core/test_stored_function.py +++ b/tests/unit/sagemaker/remote_function/core/test_stored_function.py @@ -24,10 +24,12 @@ from sagemaker.remote_function.core.stored_function import ( StoredFunction, JSON_SERIALIZED_RESULT_KEY, + _SerializedData, ) from sagemaker.remote_function.core.serialization import ( deserialize_obj_from_s3, serialize_obj_to_s3, + CloudpickleSerializer, ) from sagemaker.remote_function.core.pipeline_variables import ( Context, @@ -308,19 +310,13 @@ def test_load_and_invoke_json_serialization( @patch("sagemaker.s3.S3Uploader.upload_bytes", new=upload_bytes) @patch("sagemaker.s3.S3Downloader.read_bytes", new=read_bytes) -@patch("sagemaker.s3.S3Uploader.upload") -@patch("sagemaker.s3.S3Downloader.download") -def test_save_and_load_with_pipeline_variable( - s3_source_dir_download, s3_source_dir_upload, monkeypatch -): +@patch("sagemaker.s3.S3Uploader.upload", MagicMock()) +@patch("sagemaker.s3.S3Downloader.download", MagicMock()) +def test_save_and_load_with_pipeline_variable(monkeypatch): session = Mock() s3_base_uri = random_s3_uri() - job_settings = Mock() - job_settings.s3_root_uri = s3_base_uri - function_step = _FunctionStep( - name="func_1", display_name=None, description=None, 
job_settings=job_settings - ) + function_step = _FunctionStep(name="func_1", display_name=None, description=None) x = DelayedReturn(function_step=function_step) serialize_obj_to_s3( 3.0, session, f"{s3_base_uri}/execution-id/func_1/results", HMAC_KEY, KMS_KEY @@ -337,12 +333,11 @@ def test_save_and_load_with_pipeline_variable( "Parameters.b": "2.0", "Parameters.c": "3.0", "Execution.PipelineExecutionId": "execution-id", - } + }, ), ) func_args, func_kwargs = convert_pipeline_variables_to_pickleable( - s3_base_uri=s3_base_uri, func_args=(x,), func_kwargs={ "a": ParameterFloat("a"), @@ -350,9 +345,50 @@ def test_save_and_load_with_pipeline_variable( "c": ParameterFloat("c"), }, ) - stored_function.save(quadratic, *func_args, **func_kwargs) + + test_serialized_data = _SerializedData( + func=CloudpickleSerializer.serialize(quadratic), + args=CloudpickleSerializer.serialize((func_args, func_kwargs)), + ) + + stored_function.save_pipeline_step_function(test_serialized_data) stored_function.load_and_invoke() assert deserialize_obj_from_s3( session, s3_uri=f"{s3_base_uri}/results", hmac_key=HMAC_KEY ) == quadratic(3.0, a=1.0, b=2.0, c=3.0) + + +@patch("sagemaker.remote_function.core.serialization._upload_payload_and_metadata_to_s3") +@patch("sagemaker.remote_function.job._JobSettings") +def test_save_pipeline_step_function(mock_job_settings, upload_payload): + session = Mock() + s3_base_uri = random_s3_uri() + mock_job_settings.s3_root_uri = s3_base_uri + + stored_function = StoredFunction( + sagemaker_session=session, + s3_base_uri=s3_base_uri, + s3_kms_key=KMS_KEY, + hmac_key=HMAC_KEY, + context=Context( + step_name="step_name", + execution_id="execution_id", + ), + ) + + func_args, func_kwargs = convert_pipeline_variables_to_pickleable( + func_args=(1,), + func_kwargs={ + "a": 2, + "b": 3, + }, + ) + + test_serialized_data = _SerializedData( + func=CloudpickleSerializer.serialize(quadratic), + args=CloudpickleSerializer.serialize((func_args, func_kwargs)), + ) + stored_function.save_pipeline_step_function(test_serialized_data) + + assert upload_payload.call_count == 2 diff --git a/tests/unit/sagemaker/remote_function/test_job.py b/tests/unit/sagemaker/remote_function/test_job.py index f025276634..1884486f8b 100644 --- a/tests/unit/sagemaker/remote_function/test_job.py +++ b/tests/unit/sagemaker/remote_function/test_job.py @@ -17,9 +17,11 @@ import pytest from mock import patch, Mock, ANY, mock_open +from mock.mock import MagicMock from sagemaker.config import load_sagemaker_config from sagemaker.remote_function.checkpoint_location import CheckpointLocation +from sagemaker.remote_function.core.stored_function import _SerializedData from sagemaker.session_settings import SessionSettings from sagemaker.remote_function.spark_config import SparkConfig @@ -149,6 +151,10 @@ def job_function_with_checkpoint(a, checkpoint_1=None, *, b, checkpoint_2=None): return a + b +def serialized_data(): + return _SerializedData(func=b"serialized_func", args=b"serialized_args") + + @patch("secrets.token_hex", return_value=HMAC_KEY) @patch("sagemaker.remote_function.job.Session", return_value=mock_session()) @patch("sagemaker.remote_function.job.get_execution_role", return_value=DEFAULT_ROLE_ARN) @@ -731,7 +737,7 @@ def test_start_with_complete_job_settings( @patch("sagemaker.workflow.utilities._pipeline_config", MOCKED_PIPELINE_CONFIG) -@patch("secrets.token_hex", return_value=HMAC_KEY) +@patch("secrets.token_hex", MagicMock(return_value=HMAC_KEY)) @patch( 
"sagemaker.remote_function.job._prepare_dependencies_and_pre_execution_scripts", return_value="some_s3_uri", @@ -750,11 +756,9 @@ def test_get_train_args_under_pipeline_context( mock_bootstrap_scripts_upload, mock_user_workspace_upload, mock_user_dependencies_upload, - secret_token, ): from sagemaker.workflow.parameters import ParameterInteger - from sagemaker.remote_function.core.pipeline_variables import _ParameterInteger mock_stored_function = Mock() mock_stored_function_ctr.return_value = mock_stored_function @@ -776,6 +780,7 @@ def test_get_train_args_under_pipeline_context( security_group_ids=["sg"], ) + mocked_serialized_data = serialized_data() s3_base_uri = f"{S3_URI}/{TEST_PIPELINE_NAME}" train_args = _Job.compile( job_settings=job_settings, @@ -784,6 +789,7 @@ def test_get_train_args_under_pipeline_context( func=job_function, func_args=(1, ParameterInteger(name="b", default_value=2)), func_kwargs={"c": 3, "d": ParameterInteger(name="d", default_value=4)}, + serialized_data=mocked_serialized_data, ) mock_stored_function_ctr.assert_called_once_with( @@ -796,11 +802,7 @@ def test_get_train_args_under_pipeline_context( func_step_s3_dir=MOCKED_PIPELINE_CONFIG.pipeline_build_time, ), ) - mock_stored_function.save.assert_called_once_with( - job_function, - *(1, _ParameterInteger(name="b")), - **{"c": 3, "d": _ParameterInteger(name="d")}, - ) + mock_stored_function.save_pipeline_step_function.assert_called_once_with(mocked_serialized_data) local_dependencies_path = mock_runtime_manager().snapshot() mock_python_version = mock_runtime_manager()._current_python_version() diff --git a/tests/unit/sagemaker/workflow/test_function_step.py b/tests/unit/sagemaker/workflow/test_function_step.py index 5e08d7005b..888635ae02 100644 --- a/tests/unit/sagemaker/workflow/test_function_step.py +++ b/tests/unit/sagemaker/workflow/test_function_step.py @@ -19,6 +19,8 @@ from mock import patch, Mock, ANY from typing import List, Tuple +from mock.mock import MagicMock + from sagemaker.workflow.parameters import ParameterInteger from sagemaker.workflow.execution_variables import ExecutionVariables from sagemaker.workflow.functions import Join @@ -105,6 +107,8 @@ def sum(a, b, c, d): assert function_step._job_settings is not None assert mock_job_settings.call_args[1]["image_uri"] == "test_image_uri" + assert function_step._serialized_data.func is not None + assert function_step._serialized_data.args is not None @patch("sagemaker.workflow.utilities._pipeline_config", MOCKED_PIPELINE_CONFIG) @@ -127,6 +131,8 @@ def sum(a, b, c, d): assert function_step.description == "Returns sum of numbers" assert function_step.retry_policies == [] assert function_step.depends_on == [] + assert function_step._serialized_data.func is not None + assert function_step._serialized_data.args is not None @patch("sagemaker.workflow.utilities._pipeline_config", MOCKED_PIPELINE_CONFIG) @@ -261,11 +267,13 @@ def sum(a, b): func=ANY, func_args=(2, 3), func_kwargs={}, + serialized_data=step_output._step._serialized_data, ) mock_job_settings_ctr.assert_called_once() +@patch("sagemaker.remote_function.job._JobSettings", MagicMock()) @pytest.mark.parametrize( "type_hint", [ @@ -278,8 +286,7 @@ def sum(a, b): Tuple[int, ...], ], ) -@patch("sagemaker.remote_function.job._JobSettings") -def test_step_function_with_sequence_return_value(mock_job_settings, type_hint): +def test_step_function_with_sequence_return_value(type_hint): @step def func() -> type_hint: return 1, 2, 3 @@ -366,8 +373,9 @@ def func(): pass 
-@patch("sagemaker.remote_function.job._JobSettings") -def test_step_function_take_in_delayed_return_as_positional_arguments(mock_job_settings): +@patch("sagemaker.remote_function.core.serialization.CloudpickleSerializer.serialize", MagicMock()) +@patch("sagemaker.remote_function.job._JobSettings", MagicMock()) +def test_step_function_take_in_delayed_return_as_positional_arguments(): @step def func_1() -> Tuple: return 1, 2, 3 @@ -390,8 +398,9 @@ def func_2(a, b, c, param_1, param_2): get_step(func_2_output).depends_on = [] -@patch("sagemaker.remote_function.job._JobSettings") -def test_step_function_take_in_delayed_return_as_keyword_arguments(mock_job_settings): +@patch("sagemaker.remote_function.core.serialization.CloudpickleSerializer.serialize", MagicMock()) +@patch("sagemaker.remote_function.job._JobSettings", MagicMock()) +def test_step_function_take_in_delayed_return_as_keyword_arguments(): @step def func_1() -> Tuple: return 1, 2, 3 @@ -414,8 +423,9 @@ def func_2(a, b, c, param_1, param_2): get_step(func_2_output).depends_on = [] -@patch("sagemaker.remote_function.job._JobSettings") -def test_delayed_returns_in_nested_object_are_ignored(mock_job_settings): +@patch("sagemaker.remote_function.core.serialization.CloudpickleSerializer.serialize", MagicMock()) +@patch("sagemaker.remote_function.job._JobSettings", MagicMock()) +def test_delayed_returns_in_nested_object_are_ignored(): @step def func_1() -> Tuple: return 1, 2, 3 @@ -438,8 +448,9 @@ def func_2(data, param_1, param_2): assert get_step(func_2_output).depends_on == [] -@patch("sagemaker.remote_function.job._JobSettings") -def test_unsupported_pipeline_variables_as_function_arguments(mock_job_settings): +@patch("sagemaker.remote_function.core.serialization.CloudpickleSerializer.serialize", MagicMock()) +@patch("sagemaker.remote_function.job._JobSettings", MagicMock()) +def test_unsupported_pipeline_variables_as_function_arguments(): @step def func_1() -> Tuple: return 1, 2, 3 @@ -461,8 +472,9 @@ def func_2(a, b, c, param_1, param_2): assert "Properties attribute is not supported for _FunctionStep" in str(e.value) -@patch("sagemaker.remote_function.job._JobSettings") -def test_both_data_and_execution_dependency_between_steps(mock_job_settings): +@patch("sagemaker.remote_function.core.serialization.CloudpickleSerializer.serialize", MagicMock()) +@patch("sagemaker.remote_function.job._JobSettings", MagicMock()) +def test_both_data_and_execution_dependency_between_steps(): @step def func_0() -> None: pass @@ -491,8 +503,8 @@ def func_2(a, b, c, param_1, param_2): get_step(func_2_output).depends_on = [] -@patch("sagemaker.remote_function.job._JobSettings") -def test_disable_deepcopy_of_delayed_return(mock_job_settings): +@patch("sagemaker.remote_function.job._JobSettings", MagicMock()) +def test_disable_deepcopy_of_delayed_return(): @step def func(): return 1 From 108300f186674161b24382ce00d15c078b4bd078 Mon Sep 17 00:00:00 2001 From: qidewenwhen <32910701+qidewenwhen@users.noreply.github.com> Date: Fri, 15 Dec 2023 09:45:25 -0800 Subject: [PATCH 06/76] fix: Add write permission to job output dirs for remote and step decorator running on non-root job user (#4325) --- .../bootstrap_runtime_environment.py | 13 ++ .../runtime_environment_manager.py | 26 ++++ tests/integ/sagemaker/conftest.py | 5 + .../remote_function/test_decorator.py | 19 +++ .../sagemaker/workflow/test_step_decorator.py | 46 ++++++ .../test_bootstrap_runtime_environment.py | 145 +++++++++++++++++- .../test_runtime_environment_manager.py | 51 ++++++ 7 files changed, 
298 insertions(+), 7 deletions(-) diff --git a/src/sagemaker/remote_function/runtime_environment/bootstrap_runtime_environment.py b/src/sagemaker/remote_function/runtime_environment/bootstrap_runtime_environment.py index 7eeb80e7b2..0fbc926aae 100644 --- a/src/sagemaker/remote_function/runtime_environment/bootstrap_runtime_environment.py +++ b/src/sagemaker/remote_function/runtime_environment/bootstrap_runtime_environment.py @@ -14,6 +14,7 @@ from __future__ import absolute_import import argparse +import getpass import sys import os import shutil @@ -38,6 +39,7 @@ REMOTE_FUNCTION_WORKSPACE = "sm_rf_user_ws" BASE_CHANNEL_PATH = "/opt/ml/input/data" FAILURE_REASON_PATH = "/opt/ml/output/failure" +JOB_OUTPUT_DIRS = ["/opt/ml/output", "/opt/ml/model", "/tmp"] PRE_EXECUTION_SCRIPT_NAME = "pre_exec.sh" JOB_REMOTE_FUNCTION_WORKSPACE = "sagemaker_remote_function_workspace" SCRIPT_AND_DEPENDENCIES_CHANNEL_NAME = "pre_exec_script_and_dependencies" @@ -63,6 +65,17 @@ def main(sys_args=None): RuntimeEnvironmentManager()._validate_python_version(client_python_version, conda_env) + user = getpass.getuser() + if user != "root": + log_message = ( + "The job is running on non-root user: %s. Adding write permissions to the " + "following job output directories: %s." + ) + logger.info(log_message, user, JOB_OUTPUT_DIRS) + RuntimeEnvironmentManager().change_dir_permission( + dirs=JOB_OUTPUT_DIRS, new_permission="777" + ) + if pipeline_execution_id: _bootstrap_runtime_env_for_pipeline_step( client_python_version, func_step_workspace, conda_env, dependency_settings diff --git a/src/sagemaker/remote_function/runtime_environment/runtime_environment_manager.py b/src/sagemaker/remote_function/runtime_environment/runtime_environment_manager.py index c9d9a1c1be..0affa9beac 100644 --- a/src/sagemaker/remote_function/runtime_environment/runtime_environment_manager.py +++ b/src/sagemaker/remote_function/runtime_environment/runtime_environment_manager.py @@ -216,6 +216,32 @@ def run_pre_exec_script(self, pre_exec_script_path: str): pre_exec_script_path, ) + def change_dir_permission(self, dirs: list, new_permission: str): + """Change the permission of given directories + + Args: + dirs (list[str]): A list of directories for permission update. + new_permission (str): The new permission for the given directories. + """ + + _ERROR_MSG_PREFIX = "Failed to change directory permissions due to: " + command = ["sudo", "chmod", "-R", new_permission] + dirs + logger.info("Executing '%s'.", " ".join(command)) + + try: + subprocess.run(command, check=True, stderr=subprocess.PIPE) + except subprocess.CalledProcessError as called_process_err: + err_msg = called_process_err.stderr.decode("utf-8") + raise RuntimeEnvironmentError(f"{_ERROR_MSG_PREFIX} {err_msg}") + except FileNotFoundError as file_not_found_err: + if "[Errno 2] No such file or directory: 'sudo'" in str(file_not_found_err): + raise RuntimeEnvironmentError( + f"{_ERROR_MSG_PREFIX} {file_not_found_err}. " + "Please contact the image owner to install 'sudo' in the job container " + "and provide sudo privilege to the container user." + ) + raise RuntimeEnvironmentError(file_not_found_err) + def _is_file_exists(self, dependencies): """Check whether the dependencies file exists at the given location. 
diff --git a/tests/integ/sagemaker/conftest.py b/tests/integ/sagemaker/conftest.py index 15a1f4082b..2dc9f7df4d 100644 --- a/tests/integ/sagemaker/conftest.py +++ b/tests/integ/sagemaker/conftest.py @@ -68,7 +68,12 @@ "RUN curl 'https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip' -o 'awscliv2.zip' \ && unzip awscliv2.zip \ && ./aws/install\n\n" + "RUN apt install sudo\n" "RUN useradd -ms /bin/bash integ-test-user\n" + # Add the user to sudo group + "RUN usermod -aG sudo integ-test-user\n" + # Ensure passwords are not required for sudo group users + "RUN echo '%sudo ALL= (ALL) NOPASSWD:ALL' >> /etc/sudoers\n" "USER integ-test-user\n" "WORKDIR /home/integ-test-user\n" "COPY {source_archive} ./\n" diff --git a/tests/integ/sagemaker/remote_function/test_decorator.py b/tests/integ/sagemaker/remote_function/test_decorator.py index c1094c3ca5..63ced1dd9c 100644 --- a/tests/integ/sagemaker/remote_function/test_decorator.py +++ b/tests/integ/sagemaker/remote_function/test_decorator.py @@ -747,6 +747,25 @@ def cuberoot(x): assert cuberoot(27) == 3 +def test_with_user_and_workdir_set_in_the_image_client_error_case( + sagemaker_session, dummy_container_with_user_and_workdir, cpu_instance_type +): + client_error_message = "Testing client error in job." + + @remote( + role=ROLE, + image_uri=dummy_container_with_user_and_workdir, + instance_type=cpu_instance_type, + sagemaker_session=sagemaker_session, + ) + def my_func(): + raise RuntimeError(client_error_message) + + with pytest.raises(RuntimeError) as error: + my_func() + assert client_error_message in str(error) + + @pytest.mark.skip def test_decorator_with_spark_job(sagemaker_session, cpu_instance_type): @remote( diff --git a/tests/integ/sagemaker/workflow/test_step_decorator.py b/tests/integ/sagemaker/workflow/test_step_decorator.py index 70424383f1..66f59956c3 100644 --- a/tests/integ/sagemaker/workflow/test_step_decorator.py +++ b/tests/integ/sagemaker/workflow/test_step_decorator.py @@ -860,6 +860,52 @@ def cuberoot(x): pass +def test_with_user_and_workdir_set_in_the_image_client_error_case( + sagemaker_session, role, pipeline_name, region_name, dummy_container_with_user_and_workdir +): + # This test aims to ensure client error in step decorated function + # can be successfully surfaced and the job can be failed. + os.environ["AWS_DEFAULT_REGION"] = region_name + client_error_message = "Testing client error in job." 
+ + @step( + role=role, + image_uri=dummy_container_with_user_and_workdir, + instance_type=INSTANCE_TYPE, + ) + def my_func(): + raise RuntimeError(client_error_message) + + step_a = my_func() + + pipeline = Pipeline( + name=pipeline_name, + steps=[step_a], + sagemaker_session=sagemaker_session, + ) + + try: + _, execution_steps = create_and_execute_pipeline( + pipeline=pipeline, + pipeline_name=pipeline_name, + region_name=region_name, + role=role, + no_of_steps=1, + last_step_name=get_step(step_a).name, + execution_parameters=dict(), + step_status="Failed", + ) + assert ( + f"ClientError: AlgorithmError: RuntimeError('{client_error_message}')" + in execution_steps[0]["FailureReason"] + ) + finally: + try: + pipeline.delete() + except Exception: + pass + + def test_step_level_serialization( sagemaker_session, role, pipeline_name, region_name, dummy_container_without_error ): diff --git a/tests/unit/sagemaker/remote_function/runtime_environment/test_bootstrap_runtime_environment.py b/tests/unit/sagemaker/remote_function/runtime_environment/test_bootstrap_runtime_environment.py index 6a2834a12e..ee83388a15 100644 --- a/tests/unit/sagemaker/remote_function/runtime_environment/test_bootstrap_runtime_environment.py +++ b/tests/unit/sagemaker/remote_function/runtime_environment/test_bootstrap_runtime_environment.py @@ -12,8 +12,9 @@ # language governing permissions and limitations under the License. from __future__ import absolute_import - from mock import patch +from mock.mock import MagicMock + from sagemaker.remote_function.runtime_environment.runtime_environment_manager import ( RuntimeEnvironmentError, _DependencySettings, @@ -78,7 +79,13 @@ def args_for_step(): "sagemaker.remote_function.runtime_environment.bootstrap_runtime_environment." "_bootstrap_runtime_env_for_remote_function" ) -def test_main_success_remote_job( +@patch("getpass.getuser", MagicMock(return_value="root")) +@patch( + "sagemaker.remote_function.runtime_environment.runtime_environment_manager." + "RuntimeEnvironmentManager.change_dir_permission" +) +def test_main_success_remote_job_with_root_user( + change_dir_permission, bootstrap_remote, run_pre_exec_script, bootstrap_runtime, @@ -86,6 +93,8 @@ def test_main_success_remote_job( _exit_process, ): bootstrap.main(args_for_remote()) + + change_dir_permission.assert_not_called() validate_python.assert_called_once_with(TEST_PYTHON_VERSION, TEST_JOB_CONDA_ENV) bootstrap_remote.assert_called_once_with( TEST_PYTHON_VERSION, @@ -114,7 +123,13 @@ def test_main_success_remote_job( "sagemaker.remote_function.runtime_environment.bootstrap_runtime_environment." "_bootstrap_runtime_env_for_pipeline_step" ) -def test_main_success_pipeline_step( +@patch("getpass.getuser", MagicMock(return_value="root")) +@patch( + "sagemaker.remote_function.runtime_environment.runtime_environment_manager." + "RuntimeEnvironmentManager.change_dir_permission" +) +def test_main_success_pipeline_step_with_root_user( + change_dir_permission, bootstrap_step, run_pre_exec_script, bootstrap_runtime, @@ -122,6 +137,7 @@ def test_main_success_pipeline_step( _exit_process, ): bootstrap.main(args_for_step()) + change_dir_permission.assert_not_called() validate_python.assert_called_once_with(TEST_PYTHON_VERSION, TEST_JOB_CONDA_ENV) bootstrap_step.assert_called_once_with( TEST_PYTHON_VERSION, @@ -150,14 +166,25 @@ def test_main_success_pipeline_step( "sagemaker.remote_function.runtime_environment.bootstrap_runtime_environment." 
"_bootstrap_runtime_env_for_remote_function" ) -def test_main_failure_remote_job( - bootstrap_runtime, run_pre_exec_script, write_failure, _exit_process, validate_python +@patch("getpass.getuser", MagicMock(return_value="root")) +@patch( + "sagemaker.remote_function.runtime_environment.runtime_environment_manager." + "RuntimeEnvironmentManager.change_dir_permission" +) +def test_main_failure_remote_job_with_root_user( + change_dir_permission, + bootstrap_runtime, + run_pre_exec_script, + write_failure, + _exit_process, + validate_python, ): runtime_err = RuntimeEnvironmentError("some failure reason") bootstrap_runtime.side_effect = runtime_err bootstrap.main(args_for_remote()) + change_dir_permission.assert_not_called() validate_python.assert_called_once_with(TEST_PYTHON_VERSION, TEST_JOB_CONDA_ENV) run_pre_exec_script.assert_not_called() bootstrap_runtime.assert_called() @@ -181,14 +208,25 @@ def test_main_failure_remote_job( "sagemaker.remote_function.runtime_environment.bootstrap_runtime_environment." "_bootstrap_runtime_env_for_pipeline_step" ) -def test_main_failure_pipeline_step( - bootstrap_runtime, run_pre_exec_script, write_failure, _exit_process, validate_python +@patch("getpass.getuser", MagicMock(return_value="root")) +@patch( + "sagemaker.remote_function.runtime_environment.runtime_environment_manager." + "RuntimeEnvironmentManager.change_dir_permission" +) +def test_main_failure_pipeline_step_with_root_user( + change_dir_permission, + bootstrap_runtime, + run_pre_exec_script, + write_failure, + _exit_process, + validate_python, ): runtime_err = RuntimeEnvironmentError("some failure reason") bootstrap_runtime.side_effect = runtime_err bootstrap.main(args_for_step()) + change_dir_permission.assert_not_called() validate_python.assert_called_once_with(TEST_PYTHON_VERSION, TEST_JOB_CONDA_ENV) run_pre_exec_script.assert_not_called() bootstrap_runtime.assert_called() @@ -196,6 +234,99 @@ def test_main_failure_pipeline_step( _exit_process.assert_called_with(1) +@patch("sys.exit") +@patch( + "sagemaker.remote_function.runtime_environment.runtime_environment_manager." + "RuntimeEnvironmentManager._validate_python_version" +) +@patch( + "sagemaker.remote_function.runtime_environment.runtime_environment_manager." + "RuntimeEnvironmentManager.bootstrap" +) +@patch( + "sagemaker.remote_function.runtime_environment.runtime_environment_manager." + "RuntimeEnvironmentManager.run_pre_exec_script" +) +@patch( + "sagemaker.remote_function.runtime_environment.bootstrap_runtime_environment." + "_bootstrap_runtime_env_for_remote_function" +) +@patch("getpass.getuser", MagicMock(return_value="non_root")) +@patch( + "sagemaker.remote_function.runtime_environment.runtime_environment_manager." 
+ "RuntimeEnvironmentManager.change_dir_permission" +) +def test_main_remote_job_with_non_root_user( + change_dir_permission, + bootstrap_remote, + run_pre_exec_script, + bootstrap_runtime, + validate_python, + _exit_process, +): + bootstrap.main(args_for_remote()) + + change_dir_permission.assert_called_once_with( + dirs=bootstrap.JOB_OUTPUT_DIRS, new_permission="777" + ) + validate_python.assert_called_once_with(TEST_PYTHON_VERSION, TEST_JOB_CONDA_ENV) + bootstrap_remote.assert_called_once_with( + TEST_PYTHON_VERSION, + TEST_JOB_CONDA_ENV, + _DependencySettings(TEST_DEPENDENCY_FILE_NAME), + ) + run_pre_exec_script.assert_not_called() + bootstrap_runtime.assert_not_called() + _exit_process.assert_called_with(0) + + +@patch("sys.exit") +@patch( + "sagemaker.remote_function.runtime_environment.runtime_environment_manager." + "RuntimeEnvironmentManager._validate_python_version" +) +@patch( + "sagemaker.remote_function.runtime_environment.runtime_environment_manager." + "RuntimeEnvironmentManager.bootstrap" +) +@patch( + "sagemaker.remote_function.runtime_environment.runtime_environment_manager." + "RuntimeEnvironmentManager.run_pre_exec_script" +) +@patch( + "sagemaker.remote_function.runtime_environment.bootstrap_runtime_environment." + "_bootstrap_runtime_env_for_pipeline_step" +) +@patch("getpass.getuser", MagicMock(return_value="non_root")) +@patch( + "sagemaker.remote_function.runtime_environment.runtime_environment_manager." + "RuntimeEnvironmentManager.change_dir_permission" +) +def test_main_pipeline_step_with_non_root_user( + change_dir_permission, + bootstrap_step, + run_pre_exec_script, + bootstrap_runtime, + validate_python, + _exit_process, +): + bootstrap.main(args_for_step()) + + change_dir_permission.assert_called_once_with( + dirs=bootstrap.JOB_OUTPUT_DIRS, new_permission="777" + ) + validate_python.assert_called_once_with(TEST_PYTHON_VERSION, TEST_JOB_CONDA_ENV) + bootstrap_step.assert_called_once_with( + TEST_PYTHON_VERSION, + FUNC_STEP_WORKSPACE, + TEST_JOB_CONDA_ENV, + None, + ) + run_pre_exec_script.assert_not_called() + bootstrap_runtime.assert_not_called() + _exit_process.assert_called_with(0) + + @patch("shutil.unpack_archive") @patch("os.getcwd", return_value=CURR_WORKING_DIR) @patch("os.path.exists", return_value=True) diff --git a/tests/unit/sagemaker/remote_function/runtime_environment/test_runtime_environment_manager.py b/tests/unit/sagemaker/remote_function/runtime_environment/test_runtime_environment_manager.py index 516140c4da..afbcfb1ec5 100644 --- a/tests/unit/sagemaker/remote_function/runtime_environment/test_runtime_environment_manager.py +++ b/tests/unit/sagemaker/remote_function/runtime_environment/test_runtime_environment_manager.py @@ -12,12 +12,16 @@ # language governing permissions and limitations under the License. 
from __future__ import absolute_import +import subprocess + import pytest from mock import patch, Mock import sys import shlex import os +from mock.mock import MagicMock + from sagemaker.remote_function.runtime_environment.runtime_environment_manager import ( RuntimeEnvironmentManager, RuntimeEnvironmentError, @@ -413,3 +417,50 @@ def test_run_pre_exec_script_cmd_error(isfile): call_args = popen.call_args[0][0] expected_cmd = ["/bin/bash", "-eu", "path/to/pre_exec.sh"] assert call_args == expected_cmd + + +@patch("subprocess.run") +def test_change_dir_permission(mock_subprocess_run): + RuntimeEnvironmentManager().change_dir_permission(dirs=["a", "b", "c"], new_permission="777") + expected_command = ["sudo", "chmod", "-R", "777", "a", "b", "c"] + assert mock_subprocess_run.called_once_with( + expected_command, check=True, stderr=subprocess.PIPE + ) + + +@patch( + "subprocess.run", + MagicMock(side_effect=FileNotFoundError("[Errno 2] No such file or directory: 'sudo'")), +) +def test_change_dir_permission_and_no_sudo_installed(): + with pytest.raises(RuntimeEnvironmentError) as error: + RuntimeEnvironmentManager().change_dir_permission( + dirs=["a", "b", "c"], new_permission="777" + ) + assert ( + "Please contact the image owner to install 'sudo' in the job container " + "and provide sudo privilege to the container user." + ) in str(error) + + +@patch("subprocess.run", MagicMock(side_effect=FileNotFoundError("Other file not found error"))) +def test_change_dir_permission_and_sudo_installed_but_other_file_not_found_error(): + with pytest.raises(RuntimeEnvironmentError) as error: + RuntimeEnvironmentManager().change_dir_permission( + dirs=["a", "b", "c"], new_permission="777" + ) + assert "Other file not found error" in str(error) + + +@patch("subprocess.run") +def test_change_dir_permission_and_dir_not_exist(mock_subprocess_run): + mock_subprocess_run.side_effect = subprocess.CalledProcessError( + returncode=1, + cmd="sudo chmod ...", + stderr=b"chmod: cannot access ...: No such file or directory", + ) + with pytest.raises(RuntimeEnvironmentError) as error: + RuntimeEnvironmentManager().change_dir_permission( + dirs=["a", "b", "c"], new_permission="777" + ) + assert "chmod: cannot access ...: No such file or directory" in str(error) From e315b9ce8a525b70df5f8a7a035ee44e9c89f2a7 Mon Sep 17 00:00:00 2001 From: Keshav Chandak Date: Fri, 15 Dec 2023 23:16:39 +0530 Subject: [PATCH 07/76] feat: Added update for model package (#4309) Co-authored-by: Keshav Chandak --- src/sagemaker/model.py | 102 ++++++++++++++++-- src/sagemaker/session.py | 88 +++++++++++++-- tests/integ/test_model_package.py | 43 ++++++++ .../sagemaker/model/test_model_package.py | 73 +++++++++++++ 4 files changed, 293 insertions(+), 13 deletions(-) diff --git a/src/sagemaker/model.py b/src/sagemaker/model.py index d9122cacf1..9caca5feff 100644 --- a/src/sagemaker/model.py +++ b/src/sagemaker/model.py @@ -75,6 +75,7 @@ ) from sagemaker.compute_resource_requirements.resource_requirements import ResourceRequirements from sagemaker.enums import EndpointType +from sagemaker.session import get_add_model_package_inference_args LOGGER = logging.getLogger("sagemaker") @@ -485,12 +486,6 @@ def register( if response_types is not None: self.response_types = response_types - if self.content_types is None: - raise ValueError("The supported MIME types for the input data is not set") - - if self.response_types is None: - raise ValueError("The supported MIME types for the output data is not set") - if image_uri is not None: self.image_uri = 
image_uri @@ -2181,7 +2176,7 @@ def update_approval_status(self, approval_status, approval_description=None): """Update the approval status for the model package Args: - approval_status (str or PipelineVariable): Model Approval Status, values can be + approval_status (str): Model Approval Status, values can be "Approved", "Rejected", or "PendingManualApproval". approval_description (str): Optional. Description for the approval status of the model (default: None). @@ -2202,3 +2197,96 @@ def update_approval_status(self, approval_status, approval_description=None): update_approval_args["ApprovalDescription"] = approval_description sagemaker_session.sagemaker_client.update_model_package(**update_approval_args) + + def update_customer_metadata(self, customer_metadata_properties: Dict[str, str]): + """Updating customer metadata properties for the model package + + Args: + customer_metadata_properties (dict[str, str]): + A dictionary of key-value paired metadata properties (default: None). + """ + + update_metadata_args = { + "ModelPackageArn": self.model_package_arn, + "CustomerMetadataProperties": customer_metadata_properties, + } + + sagemaker_session = self.sagemaker_session or sagemaker.Session() + sagemaker_session.sagemaker_client.update_model_package(**update_metadata_args) + + def remove_customer_metadata_properties( + self, customer_metadata_properties_to_remove: List[str] + ): + """Removes the specified keys from customer metadata properties + + Args: + customer_metadata_properties (list[str, str]): + list of keys of customer metadata properties to remove. + """ + + delete_metadata_args = { + "ModelPackageArn": self.model_package_arn, + "CustomerMetadataPropertiesToRemove": customer_metadata_properties_to_remove, + } + + sagemaker_session = self.sagemaker_session or sagemaker.Session() + sagemaker_session.sagemaker_client.update_model_package(**delete_metadata_args) + + def add_inference_specification( + self, + name: str, + containers: Dict = None, + image_uris: List[str] = None, + description: str = None, + content_types: List[str] = None, + response_types: List[str] = None, + inference_instances: List[str] = None, + transform_instances: List[str] = None, + ): + """Additional inference specification to be added for the model package + + Args: + name (str): Name to identify the additional inference specification + containers (dict): The Amazon ECR registry path of the Docker image + that contains the inference code. + image_uris (List[str]): The ECR path where inference code is stored. + description (str): Description for the additional inference specification + content_types (list[str]): The supported MIME types + for the input data. + response_types (list[str]): The supported MIME types + for the output data. + inference_instances (list[str]): A list of the instance + types that are used to generate inferences in real-time (default: None). + transform_instances (list[str]): A list of the instance + types on which a transformation job can be run or on which an endpoint can be + deployed (default: None). 
+ + """ + sagemaker_session = self.sagemaker_session or sagemaker.Session() + if containers is not None and image_uris is not None: + raise ValueError("Cannot have both containers and image_uris.") + if containers is None and image_uris is None: + raise ValueError("Should have either containers or image_uris for inference.") + container_def = [] + if image_uris: + for uri in image_uris: + container_def.append( + { + "Image": uri, + } + ) + else: + container_def = containers + + model_package_update_args = get_add_model_package_inference_args( + model_package_arn=self.model_package_arn, + name=name, + containers=container_def, + content_types=content_types, + description=description, + response_types=response_types, + inference_instances=inference_instances, + transform_instances=transform_instances, + ) + + sagemaker_session.sagemaker_client.update_model_package(**model_package_update_args) diff --git a/src/sagemaker/session.py b/src/sagemaker/session.py index 6b7a8dc2c7..3b2de0239e 100644 --- a/src/sagemaker/session.py +++ b/src/sagemaker/session.py @@ -6557,15 +6557,21 @@ def get_create_model_package_request( if task is not None: request_dict["Task"] = task if containers is not None: - if not all([content_types, response_types]): - raise ValueError( - "content_types and response_types " "must be provided if containers is present." - ) inference_specification = { "Containers": containers, - "SupportedContentTypes": content_types, - "SupportedResponseMIMETypes": response_types, } + if content_types is not None: + inference_specification.update( + { + "SupportedContentTypes": content_types, + } + ) + if response_types is not None: + inference_specification.update( + { + "SupportedResponseMIMETypes": response_types, + } + ) if model_package_group_name is not None: if inference_instances is not None: inference_specification.update( @@ -6598,6 +6604,76 @@ def get_create_model_package_request( return request_dict +def get_add_model_package_inference_args( + model_package_arn, + name, + containers=None, + content_types=None, + response_types=None, + inference_instances=None, + transform_instances=None, + description=None, +): + """Get request dictionary for UpdateModelPackage API for additional inference. + + Args: + model_package_arn (str): Arn for the model package. + name (str): Name to identify the additional inference specification + containers (dict): The Amazon ECR registry path of the Docker image + that contains the inference code. + image_uris (List[str]): The ECR path where inference code is stored. + description (str): Description for the additional inference specification + content_types (list[str]): The supported MIME types + for the input data. + response_types (list[str]): The supported MIME types + for the output data. + inference_instances (list[str]): A list of the instance + types that are used to generate inferences in real-time (default: None). + transform_instances (list[str]): A list of the instance + types on which a transformation job can be run or on which an endpoint can be + deployed (default: None). 
+ """ + + request_dict = {} + if containers is not None: + inference_specification = { + "Containers": containers, + } + + if name is not None: + inference_specification.update({"Name": name}) + + if description is not None: + inference_specification.update({"Description": description}) + if content_types is not None: + inference_specification.update( + { + "SupportedContentTypes": content_types, + } + ) + if response_types is not None: + inference_specification.update( + { + "SupportedResponseMIMETypes": response_types, + } + ) + if inference_instances is not None: + inference_specification.update( + { + "SupportedRealtimeInferenceInstanceTypes": inference_instances, + } + ) + if transform_instances is not None: + inference_specification.update( + { + "SupportedTransformInstanceTypes": transform_instances, + } + ) + request_dict["AdditionalInferenceSpecificationsToAdd"] = [inference_specification] + request_dict.update({"ModelPackageArn": model_package_arn}) + return request_dict + + def update_args(args: Dict[str, Any], **kwargs): """Updates the request arguments dict with the value if populated. diff --git a/tests/integ/test_model_package.py b/tests/integ/test_model_package.py index 641056265e..1554825fc2 100644 --- a/tests/integ/test_model_package.py +++ b/tests/integ/test_model_package.py @@ -17,6 +17,7 @@ from sagemaker.utils import unique_name_from_base from tests.integ import DATA_DIR from sagemaker.xgboost import XGBoostModel +from sagemaker import image_uris _XGBOOST_PATH = os.path.join(DATA_DIR, "xgboost_abalone") @@ -61,3 +62,45 @@ def test_update_approval_model_package(sagemaker_session): sagemaker_session.sagemaker_client.delete_model_package_group( ModelPackageGroupName=model_group_name ) + + +def test_inference_specification_addition(sagemaker_session): + + model_group_name = unique_name_from_base("test-model-group") + + sagemaker_session.sagemaker_client.create_model_package_group( + ModelPackageGroupName=model_group_name + ) + + xgb_model_data_s3 = sagemaker_session.upload_data( + path=os.path.join(_XGBOOST_PATH, "xgb_model.tar.gz"), + key_prefix="integ-test-data/xgboost/model", + ) + model = XGBoostModel( + model_data=xgb_model_data_s3, framework_version="1.3-1", sagemaker_session=sagemaker_session + ) + + model_package = model.register( + content_types=["text/csv"], + response_types=["text/csv"], + inference_instances=["ml.m5.large"], + transform_instances=["ml.m5.large"], + model_package_group_name=model_group_name, + ) + + xgb_image = image_uris.retrieve( + "xgboost", sagemaker_session.boto_region_name, version="1", image_scope="inference" + ) + model_package.add_inference_specification(image_uris=[xgb_image], name="Inference") + desc_model_package = sagemaker_session.sagemaker_client.describe_model_package( + ModelPackageName=model_package.model_package_arn + ) + assert len(desc_model_package["AdditionalInferenceSpecifications"]) == 1 + assert desc_model_package["AdditionalInferenceSpecifications"][0]["Name"] == "Inference" + + sagemaker_session.sagemaker_client.delete_model_package( + ModelPackageName=model_package.model_package_arn + ) + sagemaker_session.sagemaker_client.delete_model_package_group( + ModelPackageGroupName=model_group_name + ) diff --git a/tests/unit/sagemaker/model/test_model_package.py b/tests/unit/sagemaker/model/test_model_package.py index cd2c3d1637..8be561030e 100644 --- a/tests/unit/sagemaker/model/test_model_package.py +++ b/tests/unit/sagemaker/model/test_model_package.py @@ -326,3 +326,76 @@ def 
test_model_package_auto_approve_on_deploy(update_approval_status, sagemaker_ update_approval_status.call_args_list[0][1]["approval_status"] == ModelApprovalStatusEnum.APPROVED ) + + +def test_update_customer_metadata(sagemaker_session): + model_package = ModelPackage( + role="role", + model_package_arn=MODEL_PACKAGE_VERSIONED_ARN, + sagemaker_session=sagemaker_session, + ) + + customer_metadata_to_update = { + "Key": "Value", + } + model_package.update_customer_metadata(customer_metadata_properties=customer_metadata_to_update) + + sagemaker_session.sagemaker_client.update_model_package.assert_called_with( + ModelPackageArn=MODEL_PACKAGE_VERSIONED_ARN, + CustomerMetadataProperties=customer_metadata_to_update, + ) + + +def test_remove_customer_metadata(sagemaker_session): + model_package = ModelPackage( + role="role", + model_package_arn=MODEL_PACKAGE_VERSIONED_ARN, + sagemaker_session=sagemaker_session, + ) + + customer_metadata_to_remove = ["Key"] + + model_package.remove_customer_metadata_properties( + customer_metadata_properties_to_remove=customer_metadata_to_remove + ) + + sagemaker_session.sagemaker_client.update_model_package.assert_called_with( + ModelPackageArn=MODEL_PACKAGE_VERSIONED_ARN, + CustomerMetadataPropertiesToRemove=customer_metadata_to_remove, + ) + + +def test_add_inference_specification(sagemaker_session): + model_package = ModelPackage( + role="role", + model_package_arn=MODEL_PACKAGE_VERSIONED_ARN, + sagemaker_session=sagemaker_session, + ) + + image_uris = ["image_uri"] + + containers = [{"Image": "image_uri"}] + + try: + model_package.add_inference_specification( + image_uris=image_uris, name="Inference", containers=containers + ) + except ValueError as ve: + assert "Cannot have both containers and image_uris." in str(ve) + + try: + model_package.add_inference_specification(name="Inference") + except ValueError as ve: + assert "Should have either containers or image_uris for inference." in str(ve) + + model_package.add_inference_specification(image_uris=image_uris, name="Inference") + + sagemaker_session.sagemaker_client.update_model_package.assert_called_with( + ModelPackageArn=MODEL_PACKAGE_VERSIONED_ARN, + AdditionalInferenceSpecificationsToAdd=[ + { + "Containers": [{"Image": "image_uri"}], + "Name": "Inference", + } + ], + ) From f53b2e6e2421a3bcd5712051dfdb83335e1196cb Mon Sep 17 00:00:00 2001 From: stacicho Date: Tue, 19 Dec 2023 00:35:18 -0800 Subject: [PATCH 08/76] documentation: fix ModelBuilder sample notebook links (#4319) --- doc/overview.rst | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/doc/overview.rst b/doc/overview.rst index 244f0b0b68..319560b5ff 100644 --- a/doc/overview.rst +++ b/doc/overview.rst @@ -1034,7 +1034,7 @@ You can deploy the XGBoost model from the previous example to a SageMaker endpoi initial_instance_count=1 ) -For a sample notebook that demonstrates using ``ModelBuilder`` to build a XGBoost model, see `XGBoost example `_. +For a sample notebook that demonstrates using ``ModelBuilder`` to build a XGBoost model, see `XGBoost example `_. **Triton models**. You can use ``ModelBuilder`` to serve PyTorch models on a Triton inference server. Specify the ``model_server`` parameter as ``ModelServer.TRITON``, pass a model, and include a ``SchemaBuilder`` object which requires sample inputs and outputs from the model. The following snippet shows an example. 
@@ -1056,7 +1056,7 @@ You can use ``ModelBuilder`` to serve PyTorch models on a Triton inference serve initial_instance_count=1 ) -For a sample notebook that demonstrates using ``ModelBuilder`` to build a Triton model, see `Triton example `_. +For a sample notebook that demonstrates using ``ModelBuilder`` to build a Triton model, see `Triton example `_. **Hugging Face models**. @@ -1101,7 +1101,7 @@ Create the ``ModelBuilder`` object and deploy the model onto a SageMaker endpoin instance_type='ml.g5.2xlarge' ) -For a sample notebook that demonstrates using ``ModelBuilder`` to build a Hugging Face model, see `Hugging Face example `_. +For a sample notebook that demonstrates using ``ModelBuilder`` to build a Hugging Face model, see `Hugging Face example `_. Deploy foundation models to SageMaker Endpoints @@ -1135,7 +1135,7 @@ For gated models on Hugging Face Hub, request access and pass the associated key A feature of ``ModelBuilder`` is the ability to run local tuning on the container when you use `LOCAL_CONTAINER` mode. In this case ``ModelBuilder`` tunes the parameter(s) for the underlying model server. This feature can be used by executing `tuned_model=model.tune()`. Before running `tune`, clean up other containers running locally or else you might see an "address already in use" error. -For a sample notebook that demonstrates using ``ModelBuilder`` to build a Hugging Face Hub model, see `Hugging Face Hub example `_. +For a sample notebook that demonstrates using ``ModelBuilder`` to build a Hugging Face Hub model, see `Hugging Face Hub example `_. **JumpStart**. JumpStart also offers a number of pre-trained foundation models. Again, the model ID is required. Deploying a JumpStart model to a SageMaker endpoint is straightforward, as shown in the following example: @@ -1154,24 +1154,24 @@ For a sample notebook that demonstrates using ``ModelBuilder`` to build a Huggin For a list of available JumpStart model IDs, see `Built-in Algorithms with pre-trained Model Table `_. -For a sample notebook that demonstrates using ``ModelBuilder`` to build a JumpStart model, see `JumpStart example `_. +For a sample notebook that demonstrates using ``ModelBuilder`` to build a JumpStart model, see `JumpStart example `_. 
ModelBuilder examples ^^^^^^^^^^^^^^^^^^^^^ For example notebooks that demonstrate the use of ``ModelBuilder`` and its supporting classes, as well as model creation of traditional and foundation models, see the following links: - * `Pytorch example `_ + * `Pytorch example `__ - * `XGBoost example `_ + * `XGBoost example `__ - * `Triton example `_ + * `Triton example `__ - * `Hugging Face example `_ + * `Hugging Face example `__ - * `Hugging Face Hub example `_ + * `Hugging Face Hub example `__ - * `JumpStart example `_ + * `JumpStart example `__ Fine-tune a Model and Deploy to a SageMaker Endpoint From c7ee5d16909350c5d532d12bc23e78a8f8473c2e Mon Sep 17 00:00:00 2001 From: Teng-xu <67929972+Teng-xu@users.noreply.github.com> Date: Tue, 19 Dec 2023 12:44:57 -0800 Subject: [PATCH 09/76] feat: Use specific images for SMP v2 jobs (#4333) * Add check for smp lib * update syntax * Remove unused images * Update repo name and regions * Update account number * Update framework name and check for None distribution * Add unit tests for smp v2 uri * Check enabled * Remove logging * Add cuda version in uri * Update cu121 * Update syntax * Fix black check * Fix black --------- Co-authored-by: huilgolr --- .../image_uri_config/pytorch-smp.json | 37 +++++++++++++ src/sagemaker/image_uris.py | 19 ++++++- .../unit/sagemaker/image_uris/test_smp_v2.py | 53 +++++++++++++++++++ 3 files changed, 107 insertions(+), 2 deletions(-) create mode 100644 src/sagemaker/image_uri_config/pytorch-smp.json create mode 100644 tests/unit/sagemaker/image_uris/test_smp_v2.py diff --git a/src/sagemaker/image_uri_config/pytorch-smp.json b/src/sagemaker/image_uri_config/pytorch-smp.json new file mode 100644 index 0000000000..96afc3cb1c --- /dev/null +++ b/src/sagemaker/image_uri_config/pytorch-smp.json @@ -0,0 +1,37 @@ +{ + "training": { + "processors": [ + "gpu" + ], + "version_aliases": { + "2.0": "2.0.1" + }, + "versions": { + "2.0.1": { + "py_versions": [ + "py310" + ], + "registries": { + "ap-northeast-1": "658645717510", + "ap-northeast-2": "658645717510", + "ap-northeast-3": "658645717510", + "ap-south-1": "658645717510", + "ap-southeast-1": "658645717510", + "ap-southeast-2": "658645717510", + "ca-central-1": "658645717510", + "eu-central-1": "658645717510", + "eu-north-1": "658645717510", + "eu-west-1": "658645717510", + "eu-west-2": "658645717510", + "eu-west-3": "658645717510", + "sa-east-1": "658645717510", + "us-east-1": "658645717510", + "us-east-2": "658645717510", + "us-west-1": "658645717510", + "us-west-2": "658645717510" + }, + "repository": "smdistributed-modelparallel" + } + } + } +} \ No newline at end of file diff --git a/src/sagemaker/image_uris.py b/src/sagemaker/image_uris.py index 267532cb1c..56e4bf346f 100644 --- a/src/sagemaker/image_uris.py +++ b/src/sagemaker/image_uris.py @@ -27,7 +27,10 @@ from sagemaker.jumpstart import artifacts from sagemaker.workflow import is_pipeline_variable from sagemaker.workflow.utilities import override_pipeline_parameter_var -from sagemaker.fw_utils import GRAVITON_ALLOWED_TARGET_INSTANCE_FAMILY, GRAVITON_ALLOWED_FRAMEWORKS +from sagemaker.fw_utils import ( + GRAVITON_ALLOWED_TARGET_INSTANCE_FAMILY, + GRAVITON_ALLOWED_FRAMEWORKS, +) logger = logging.getLogger(__name__) @@ -343,7 +346,8 @@ def _config_for_framework_and_scope(framework, image_scope, accelerator_type=Non if image_scope not in ("eia", "inference"): logger.warning( - "Elastic inference is for inference only. Ignoring image scope: %s.", image_scope + "Elastic inference is for inference only. 
Ignoring image scope: %s.", + image_scope, ) image_scope = "eia" @@ -660,6 +664,17 @@ def get_training_image_uri( container_version = None base_framework_version = None + # Check for smp library + if distribution is not None: + if "torch_distributed" in distribution and "smdistributed" in distribution: + if "modelparallel" in distribution["smdistributed"]: + if distribution["smdistributed"]["modelparallel"].get("enabled", True): + framework = "pytorch-smp" + if "p5" in instance_type: + container_version = "cu121" + else: + container_version = "cu118" + return retrieve( framework, region, diff --git a/tests/unit/sagemaker/image_uris/test_smp_v2.py b/tests/unit/sagemaker/image_uris/test_smp_v2.py new file mode 100644 index 0000000000..634c8a0f7f --- /dev/null +++ b/tests/unit/sagemaker/image_uris/test_smp_v2.py @@ -0,0 +1,53 @@ +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You +# may not use this file except in compliance with the License. A copy of +# the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. This file is +# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific +# language governing permissions and limitations under the License. +from __future__ import absolute_import + +import pytest +from sagemaker import image_uris +from tests.unit.sagemaker.image_uris import expected_uris + +CONTAINER_VERSIONS = {"ml.p4d.24xlarge": "cu118", "ml.p5d.24xlarge": "cu121"} + + +@pytest.mark.parametrize("load_config", ["pytorch-smp.json"], indirect=True) +def test_smp_v2(load_config): + VERSIONS = load_config["training"]["versions"] + PROCESSORS = load_config["training"]["processors"] + distribution = { + "torch_distributed": {"enabled": True}, + "smdistributed": {"modelparallel": {"enabled": True}}, + } + for processor in PROCESSORS: + for version in VERSIONS: + ACCOUNTS = load_config["training"]["versions"][version]["registries"] + PY_VERSIONS = load_config["training"]["versions"][version]["py_versions"] + for py_version in PY_VERSIONS: + for region in ACCOUNTS.keys(): + for instance_type in CONTAINER_VERSIONS.keys(): + uri = image_uris.get_training_image_uri( + region, + framework="pytorch", + framework_version=version, + py_version=py_version, + distribution=distribution, + instance_type=instance_type, + ) + expected = expected_uris.framework_uri( + repo="smdistributed-modelparallel", + fw_version=version, + py_version=f"{py_version}-{CONTAINER_VERSIONS[instance_type]}", + processor=processor, + region=region, + account=ACCOUNTS[region], + ) + assert expected == uri From 39f2cd9ba4fb1118ff8447ee32ca1009fea31b48 Mon Sep 17 00:00:00 2001 From: Gary Wang <38331932+gwang111@users.noreply.github.com> Date: Tue, 19 Dec 2023 12:47:45 -0800 Subject: [PATCH 10/76] Fix: Updated js mb compression logic - ModelBuilder (#4294) Co-authored-by: EC2 Default User --- .../serve/builder/jumpstart_builder.py | 1 + src/sagemaker/serve/builder/tgi_builder.py | 5 +- .../serve/model_server/djl_serving/prepare.py | 62 +++- .../serve/model_server/tgi/prepare.py | 53 +++- .../serve/builder/test_djl_builder.py | 9 + .../sagemaker/serve/model_server/constants.py | 33 +++ .../djl_serving/test_djl_prepare.py | 275 ++++++++++++++++++ .../model_server/tgi/test_tgi_prepare.py | 159 ++++++++++ 8 files changed, 568 insertions(+), 29 deletions(-) create mode 100644 
tests/unit/sagemaker/serve/model_server/constants.py create mode 100644 tests/unit/sagemaker/serve/model_server/djl_serving/test_djl_prepare.py create mode 100644 tests/unit/sagemaker/serve/model_server/tgi/test_tgi_prepare.py diff --git a/src/sagemaker/serve/builder/jumpstart_builder.py b/src/sagemaker/serve/builder/jumpstart_builder.py index 2c087adf81..b7dc5b1da5 100644 --- a/src/sagemaker/serve/builder/jumpstart_builder.py +++ b/src/sagemaker/serve/builder/jumpstart_builder.py @@ -95,6 +95,7 @@ def _is_jumpstart_model_id(self) -> bool: def _create_pre_trained_js_model(self) -> Type[Model]: """Placeholder docstring""" pysdk_model = JumpStartModel(self.model) + pysdk_model.sagemaker_session = self.sagemaker_session self._original_deploy = pysdk_model.deploy pysdk_model.deploy = self._js_builder_deploy_wrapper diff --git a/src/sagemaker/serve/builder/tgi_builder.py b/src/sagemaker/serve/builder/tgi_builder.py index ef25e6ff93..832e3a9258 100644 --- a/src/sagemaker/serve/builder/tgi_builder.py +++ b/src/sagemaker/serve/builder/tgi_builder.py @@ -133,7 +133,10 @@ def _create_tgi_model(self) -> Type[Model]: logger.info("Auto detected %s. Proceeding with the the deployment.", self.image_uri) pysdk_model = HuggingFaceModel( - image_uri=self.image_uri, env=self.env_vars, role=self.role_arn + image_uri=self.image_uri, + env=self.env_vars, + role=self.role_arn, + sagemaker_session=self.sagemaker_session, ) self._original_deploy = pysdk_model.deploy diff --git a/src/sagemaker/serve/model_server/djl_serving/prepare.py b/src/sagemaker/serve/model_server/djl_serving/prepare.py index cda21c93c3..386c5fb66e 100644 --- a/src/sagemaker/serve/model_server/djl_serving/prepare.py +++ b/src/sagemaker/serve/model_server/djl_serving/prepare.py @@ -14,14 +14,14 @@ from __future__ import absolute_import import shutil -import tarfile -import subprocess import json +import tarfile import logging from typing import List from pathlib import Path from sagemaker.utils import _tmpdir +from sagemaker.s3 import S3Downloader from sagemaker.djl_inference import DJLModel from sagemaker.djl_inference.model import _read_existing_serving_properties from sagemaker.serve.utils.local_hardware import _check_disk_space, _check_docker_disk_usage @@ -34,27 +34,57 @@ def _has_serving_properties_file(code_dir: Path) -> bool: - """Placeholder Docstring""" + """Check for existing serving properties in the directory""" return code_dir.joinpath(_SERVING_PROPERTIES_FILE).is_file() -def _members(resources: object, depth: int): - """Placeholder Docstring""" - for member in resources.getmembers(): - member.path = member.path.split("/", depth)[-1] - yield member +def _move_to_code_dir(js_model_dir: str, code_dir: Path): + """Move DJL Jumpstart resources from model to code_dir""" + js_model_resources = Path(js_model_dir).joinpath("model") + for resource in js_model_resources.glob("*"): + try: + shutil.move(resource, code_dir) + except shutil.Error as e: + if "already exists" in str(e): + continue + + +def _extract_js_resource(js_model_dir: str, js_id: str): + """Uncompress the jumpstart resource""" + tmp_sourcedir = Path(js_model_dir).joinpath(f"infer-prepack-{js_id}.tar.gz") + with tarfile.open(str(tmp_sourcedir)) as resources: + resources.extractall(path=js_model_dir) def _copy_jumpstart_artifacts(model_data: str, js_id: str, code_dir: Path): - """Placeholder Docstring""" + """Copy the associated JumpStart Resource into the code directory""" logger.info("Downloading JumpStart artifacts from S3...") - with _tmpdir(directory=str(code_dir)) as 
js_model_dir: - subprocess.run(["aws", "s3", "cp", model_data, js_model_dir]) - logger.info("Uncompressing JumpStart artifacts for faster loading...") - tmp_sourcedir = Path(js_model_dir).joinpath(f"infer-prepack-{js_id}.tar.gz") - with tarfile.open(str(tmp_sourcedir)) as resources: - resources.extractall(path=code_dir, members=_members(resources, 1)) + s3_downloader = S3Downloader() + invalid_model_data_format = False + with _tmpdir(directory=str(code_dir)) as js_model_dir: + if isinstance(model_data, str): + if model_data.endswith(".tar.gz"): + logger.info("Uncompressing JumpStart artifacts for faster loading...") + s3_downloader.download(model_data, js_model_dir) + _extract_js_resource(js_model_dir, js_id) + else: + logger.info("Copying uncompressed JumpStart artifacts...") + s3_downloader.download(model_data, js_model_dir) + elif ( + isinstance(model_data, dict) + and model_data.get("S3DataSource") + and model_data.get("S3DataSource").get("S3Uri") + ): + logger.info("Copying uncompressed JumpStart artifacts...") + s3_downloader.download(model_data.get("S3DataSource").get("S3Uri"), js_model_dir) + else: + invalid_model_data_format = True + if not invalid_model_data_format: + _move_to_code_dir(js_model_dir, code_dir) + + if invalid_model_data_format: + raise ValueError("JumpStart model data compression format is unsupported: %s", model_data) existing_properties = _read_existing_serving_properties(code_dir) config_json_file = code_dir.joinpath("config.json") @@ -70,7 +100,7 @@ def _copy_jumpstart_artifacts(model_data: str, js_id: str, code_dir: Path): def _generate_properties_file( model: DJLModel, code_dir: Path, overwrite_props_from_file: bool, manual_set_props: dict ): - """Placeholder Docstring""" + """Construct serving properties file taking into account of overrides or manual specs""" if _has_serving_properties_file(code_dir): existing_properties = _read_existing_serving_properties(code_dir) else: diff --git a/src/sagemaker/serve/model_server/tgi/prepare.py b/src/sagemaker/serve/model_server/tgi/prepare.py index 6159841ff3..fe1162e505 100644 --- a/src/sagemaker/serve/model_server/tgi/prepare.py +++ b/src/sagemaker/serve/model_server/tgi/prepare.py @@ -1,37 +1,66 @@ +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You +# may not use this file except in compliance with the License. A copy of +# the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. This file is +# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific +# language governing permissions and limitations under the License. 
"""Prepare TgiModel for Deployment""" from __future__ import absolute_import import tarfile -import subprocess import logging from typing import List from pathlib import Path from sagemaker.serve.utils.local_hardware import _check_disk_space, _check_docker_disk_usage from sagemaker.utils import _tmpdir +from sagemaker.s3 import S3Downloader logger = logging.getLogger(__name__) +def _extract_js_resource(js_model_dir: str, code_dir: Path, js_id: str): + """Uncompress the jumpstart resource""" + tmp_sourcedir = Path(js_model_dir).joinpath(f"infer-prepack-{js_id}.tar.gz") + with tarfile.open(str(tmp_sourcedir)) as resources: + resources.extractall(path=code_dir) + + def _copy_jumpstart_artifacts(model_data: str, js_id: str, code_dir: Path) -> bool: - """Placeholder Docstring""" + """Copy the associated JumpStart Resource into the code directory""" logger.info("Downloading JumpStart artifacts from S3...") - with _tmpdir(directory=str(code_dir)) as js_model_dir: - js_model_data_loc = model_data.get("S3DataSource").get("S3Uri") - # TODO: leave this check here until we are sure every js model has moved to uncompressed - if js_model_data_loc.endswith("tar.gz"): - subprocess.run(["aws", "s3", "cp", js_model_data_loc, js_model_dir]) + + s3_downloader = S3Downloader() + if isinstance(model_data, str): + if model_data.endswith(".tar.gz"): logger.info("Uncompressing JumpStart artifacts for faster loading...") - tmp_sourcedir = Path(js_model_dir).joinpath(f"infer-prepack-{js_id}.tar.gz") - with tarfile.open(str(tmp_sourcedir)) as resources: - resources.extractall(path=code_dir) + with _tmpdir(directory=str(code_dir)) as js_model_dir: + s3_downloader.download(model_data, js_model_dir) + _extract_js_resource(js_model_dir, code_dir, js_id) else: - subprocess.run(["aws", "s3", "cp", js_model_data_loc, js_model_dir, "--recursive"]) + logger.info("Copying uncompressed JumpStart artifacts...") + s3_downloader.download(model_data, code_dir) + elif ( + isinstance(model_data, dict) + and model_data.get("S3DataSource") + and model_data.get("S3DataSource").get("S3Uri") + ): + logger.info("Copying uncompressed JumpStart artifacts...") + s3_downloader.download(model_data.get("S3DataSource").get("S3Uri"), code_dir) + else: + raise ValueError("JumpStart model data compression format is unsupported: %s", model_data) + return True def _create_dir_structure(model_path: str) -> tuple: - """Placeholder Docstring""" + """Create the expected model directory structure for the TGI server""" model_path = Path(model_path) if not model_path.exists(): model_path.mkdir(parents=True) diff --git a/tests/unit/sagemaker/serve/builder/test_djl_builder.py b/tests/unit/sagemaker/serve/builder/test_djl_builder.py index 018dc3356e..ee52373dd7 100644 --- a/tests/unit/sagemaker/serve/builder/test_djl_builder.py +++ b/tests/unit/sagemaker/serve/builder/test_djl_builder.py @@ -114,10 +114,12 @@ def test_build_deploy_for_djl_local_container( mode=Mode.LOCAL_CONTAINER, model_server=ModelServer.DJL_SERVING, ) + builder._prepare_for_mode = MagicMock() builder._prepare_for_mode.side_effect = None model = builder.build() + builder.serve_settings.telemetry_opt_out = True assert isinstance(model, HuggingFaceAccelerateModel) assert ( @@ -176,6 +178,7 @@ def test_build_for_djl_local_container_faster_transformer( model_server=ModelServer.DJL_SERVING, ) model = builder.build() + builder.serve_settings.telemetry_opt_out = True assert isinstance(model, FasterTransformerModel) assert ( @@ -211,6 +214,7 @@ def test_build_for_djl_local_container_deepspeed( 
model_server=ModelServer.DJL_SERVING, ) model = builder.build() + builder.serve_settings.telemetry_opt_out = True assert isinstance(model, DeepSpeedModel) assert model.generate_serving_properties() == mock_expected_deepspeed_serving_properties @@ -268,6 +272,7 @@ def test_tune_for_djl_local_container( builder._djl_model_builder_deploy_wrapper = MagicMock() model = builder.build() + builder.serve_settings.telemetry_opt_out = True tuned_model = model.tune() assert tuned_model.generate_serving_properties() == mock_most_performant_serving_properties @@ -317,6 +322,7 @@ def test_tune_for_djl_local_container_deep_ping_ex( builder._prepare_for_mode.side_effect = None model = builder.build() + builder.serve_settings.telemetry_opt_out = True tuned_model = model.tune() assert ( tuned_model.generate_serving_properties() @@ -369,6 +375,7 @@ def test_tune_for_djl_local_container_load_ex( builder._prepare_for_mode.side_effect = None model = builder.build() + builder.serve_settings.telemetry_opt_out = True tuned_model = model.tune() assert ( tuned_model.generate_serving_properties() @@ -421,6 +428,7 @@ def test_tune_for_djl_local_container_oom_ex( builder._prepare_for_mode.side_effect = None model = builder.build() + builder.serve_settings.telemetry_opt_out = True tuned_model = model.tune() assert ( tuned_model.generate_serving_properties() @@ -473,6 +481,7 @@ def test_tune_for_djl_local_container_invoke_ex( builder._prepare_for_mode.side_effect = None model = builder.build() + builder.serve_settings.telemetry_opt_out = True tuned_model = model.tune() assert ( tuned_model.generate_serving_properties() diff --git a/tests/unit/sagemaker/serve/model_server/constants.py b/tests/unit/sagemaker/serve/model_server/constants.py new file mode 100644 index 0000000000..41a0a832cb --- /dev/null +++ b/tests/unit/sagemaker/serve/model_server/constants.py @@ -0,0 +1,33 @@ +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You +# may not use this file except in compliance with the License. A copy of +# the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. This file is +# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific +# language governing permissions and limitations under the License. +from __future__ import absolute_import + +MOCK_MODEL_PATH = "/path/to/mock/model/dir" +MOCK_CODE_DIR = "/path/to/mock/model/dir/code" +MOCK_JUMPSTART_ID = "mock_llm_js_id" +MOCK_TMP_DIR = "tmp123456" +MOCK_COMPRESSED_MODEL_DATA_STR = ( + "s3://jumpstart-cache/to/infer-prepack-huggingface-llm-falcon-7b-bf16.tar.gz" +) +MOCK_UNCOMPRESSED_MODEL_DATA_STR = "s3://jumpstart-cache/to/artifacts/inference-prepack/v1.0.1/" +MOCK_UNCOMPRESSED_MODEL_DATA_STR_FOR_DICT = ( + "s3://jumpstart-cache/to/artifacts/inference-prepack/v1.0.1/dict/" +) +MOCK_UNCOMPRESSED_MODEL_DATA_DICT = { + "S3DataSource": { + "S3Uri": MOCK_UNCOMPRESSED_MODEL_DATA_STR_FOR_DICT, + "S3DataType": "S3Prefix", + "CompressionType": "None", + } +} +MOCK_INVALID_MODEL_DATA_DICT = {} diff --git a/tests/unit/sagemaker/serve/model_server/djl_serving/test_djl_prepare.py b/tests/unit/sagemaker/serve/model_server/djl_serving/test_djl_prepare.py new file mode 100644 index 0000000000..40d3edb251 --- /dev/null +++ b/tests/unit/sagemaker/serve/model_server/djl_serving/test_djl_prepare.py @@ -0,0 +1,275 @@ +# Copyright Amazon.com, Inc. 
or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You +# may not use this file except in compliance with the License. A copy of +# the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. This file is +# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific +# language governing permissions and limitations under the License. +from __future__ import absolute_import + +from unittest import TestCase +from unittest.mock import Mock, PropertyMock, patch, mock_open, call + +from sagemaker.serve.model_server.djl_serving.prepare import ( + _copy_jumpstart_artifacts, + _create_dir_structure, + _move_to_code_dir, + _extract_js_resource, +) +from tests.unit.sagemaker.serve.model_server.constants import ( + MOCK_JUMPSTART_ID, + MOCK_TMP_DIR, + MOCK_COMPRESSED_MODEL_DATA_STR, + MOCK_UNCOMPRESSED_MODEL_DATA_STR, + MOCK_UNCOMPRESSED_MODEL_DATA_STR_FOR_DICT, + MOCK_UNCOMPRESSED_MODEL_DATA_DICT, + MOCK_INVALID_MODEL_DATA_DICT, +) + +MOCK_DJL_JUMPSTART_GLOBED_RESOURCES = ["./inference.py", "./serving.properties", "./config.json"] + + +class DjlPrepareTests(TestCase): + @patch("sagemaker.serve.model_server.djl_serving.prepare._check_disk_space") + @patch("sagemaker.serve.model_server.djl_serving.prepare._check_docker_disk_usage") + @patch("sagemaker.serve.model_server.djl_serving.prepare.Path") + def test_create_dir_structure_from_new(self, mock_path, mock_disk_usage, mock_disk_space): + mock_model_path = Mock() + mock_model_path.exists.return_value = False + mock_code_dir = Mock() + mock_model_path.joinpath.return_value = mock_code_dir + mock_path.return_value = mock_model_path + + ret_model_path, ret_code_dir = _create_dir_structure(mock_model_path) + + mock_model_path.mkdir.assert_called_once_with(parents=True) + mock_model_path.joinpath.assert_called_once_with("code") + mock_code_dir.mkdir.assert_called_once_with(exist_ok=True, parents=True) + mock_disk_space.assert_called_once_with(mock_model_path) + mock_disk_usage.assert_called_once() + + self.assertEquals(ret_model_path, mock_model_path) + self.assertEquals(ret_code_dir, mock_code_dir) + + @patch("sagemaker.serve.model_server.djl_serving.prepare.Path") + def test_create_dir_structure_invalid_path(self, mock_path): + mock_model_path = Mock() + mock_model_path.exists.return_value = True + mock_model_path.is_dir.return_value = False + mock_path.return_value = mock_model_path + + with self.assertRaises(ValueError) as context: + _create_dir_structure(mock_model_path) + + self.assertEquals("model_dir is not a valid directory", str(context.exception)) + + @patch("sagemaker.serve.model_server.djl_serving.prepare.S3Downloader") + @patch("sagemaker.serve.model_server.djl_serving.prepare._tmpdir") + @patch( + "sagemaker.serve.model_server.djl_serving.prepare._read_existing_serving_properties", + return_value={}, + ) + @patch("sagemaker.serve.model_server.djl_serving.prepare._move_to_code_dir") + @patch("builtins.open", new_callable=mock_open, read_data="data") + @patch("json.load", return_value={}) + def test_prepare_djl_js_resources_for_jumpstart_uncompressed_str( + self, + mock_load, + mock_open, + mock_move_to_code_dir, + mock_existing_props, + mock_tmpdir, + mock_s3_downloader, + ): + mock_code_dir = Mock() + mock_config_json_file = Mock() + mock_config_json_file.is_file.return_value = True + mock_code_dir.joinpath.return_value = 
mock_config_json_file + + mock_s3_downloader_obj = Mock() + mock_s3_downloader.return_value = mock_s3_downloader_obj + + mock_tmpdir_obj = Mock() + mock_js_dir = Mock() + mock_js_dir.return_value = MOCK_TMP_DIR + type(mock_tmpdir_obj).__enter__ = PropertyMock(return_value=mock_js_dir) + type(mock_tmpdir_obj).__exit__ = PropertyMock(return_value=Mock()) + mock_tmpdir.return_value = mock_tmpdir_obj + + existing_properties, hf_model_config, success = _copy_jumpstart_artifacts( + MOCK_UNCOMPRESSED_MODEL_DATA_STR, MOCK_JUMPSTART_ID, mock_code_dir + ) + + mock_s3_downloader_obj.download.assert_called_once_with( + MOCK_UNCOMPRESSED_MODEL_DATA_STR, MOCK_TMP_DIR + ) + mock_move_to_code_dir.assert_called_once_with(MOCK_TMP_DIR, mock_code_dir) + mock_code_dir.joinpath.assert_called_once_with("config.json") + self.assertEqual(existing_properties, {}) + self.assertEqual(hf_model_config, {}) + self.assertEqual(success, True) + + @patch("sagemaker.serve.model_server.djl_serving.prepare.S3Downloader") + @patch("sagemaker.serve.model_server.djl_serving.prepare._tmpdir") + @patch( + "sagemaker.serve.model_server.djl_serving.prepare._read_existing_serving_properties", + return_value={}, + ) + @patch("sagemaker.serve.model_server.djl_serving.prepare._move_to_code_dir") + @patch("builtins.open", new_callable=mock_open, read_data="data") + @patch("json.load", return_value={}) + def test_prepare_djl_js_resources_for_jumpstart_uncompressed_dict( + self, + mock_load, + mock_open, + mock_move_to_code_dir, + mock_existing_props, + mock_tmpdir, + mock_s3_downloader, + ): + mock_code_dir = Mock() + mock_config_json_file = Mock() + mock_config_json_file.is_file.return_value = True + mock_code_dir.joinpath.return_value = mock_config_json_file + + mock_s3_downloader_obj = Mock() + mock_s3_downloader.return_value = mock_s3_downloader_obj + + mock_tmpdir_obj = Mock() + mock_js_dir = Mock() + mock_js_dir.return_value = MOCK_TMP_DIR + type(mock_tmpdir_obj).__enter__ = PropertyMock(return_value=mock_js_dir) + type(mock_tmpdir_obj).__exit__ = PropertyMock(return_value=Mock()) + mock_tmpdir.return_value = mock_tmpdir_obj + + existing_properties, hf_model_config, success = _copy_jumpstart_artifacts( + MOCK_UNCOMPRESSED_MODEL_DATA_DICT, MOCK_JUMPSTART_ID, mock_code_dir + ) + + mock_s3_downloader_obj.download.assert_called_once_with( + MOCK_UNCOMPRESSED_MODEL_DATA_STR_FOR_DICT, MOCK_TMP_DIR + ) + mock_move_to_code_dir.assert_called_once_with(MOCK_TMP_DIR, mock_code_dir) + mock_code_dir.joinpath.assert_called_once_with("config.json") + self.assertEqual(existing_properties, {}) + self.assertEqual(hf_model_config, {}) + self.assertEqual(success, True) + + @patch("sagemaker.serve.model_server.djl_serving.prepare._tmpdir") + @patch("sagemaker.serve.model_server.djl_serving.prepare._move_to_code_dir") + def test_prepare_djl_js_resources_for_jumpstart_invalid_model_data( + self, mock_move_to_code_dir, mock_tmpdir + ): + mock_code_dir = Mock() + mock_tmpdir_obj = Mock() + type(mock_tmpdir_obj).__enter__ = PropertyMock(return_value=Mock()) + type(mock_tmpdir_obj).__exit__ = PropertyMock(return_value=Mock()) + mock_tmpdir.return_value = mock_tmpdir_obj + + with self.assertRaises(ValueError) as context: + _copy_jumpstart_artifacts( + MOCK_INVALID_MODEL_DATA_DICT, MOCK_JUMPSTART_ID, mock_code_dir + ) + + assert not mock_move_to_code_dir.called + self.assertTrue( + "JumpStart model data compression format is unsupported" in str(context.exception) + ) + + @patch("sagemaker.serve.model_server.djl_serving.prepare.S3Downloader") + 
@patch("sagemaker.serve.model_server.djl_serving.prepare._extract_js_resource") + @patch("sagemaker.serve.model_server.djl_serving.prepare._tmpdir") + @patch( + "sagemaker.serve.model_server.djl_serving.prepare._read_existing_serving_properties", + return_value={}, + ) + @patch("sagemaker.serve.model_server.djl_serving.prepare._move_to_code_dir") + @patch("builtins.open", new_callable=mock_open, read_data="data") + @patch("json.load", return_value={}) + def test_prepare_djl_js_resources_for_jumpstart_compressed_str( + self, + mock_load, + mock_open, + mock_move_to_code_dir, + mock_existing_props, + mock_tmpdir, + mock_extract_js_resource, + mock_s3_downloader, + ): + mock_code_dir = Mock() + mock_config_json_file = Mock() + mock_config_json_file.is_file.return_value = True + mock_code_dir.joinpath.return_value = mock_config_json_file + + mock_s3_downloader_obj = Mock() + mock_s3_downloader.return_value = mock_s3_downloader_obj + + mock_tmpdir_obj = Mock() + mock_js_dir = Mock() + mock_js_dir.return_value = MOCK_TMP_DIR + type(mock_tmpdir_obj).__enter__ = PropertyMock(return_value=mock_js_dir) + type(mock_tmpdir_obj).__exit__ = PropertyMock(return_value=Mock()) + mock_tmpdir.return_value = mock_tmpdir_obj + + existing_properties, hf_model_config, success = _copy_jumpstart_artifacts( + MOCK_COMPRESSED_MODEL_DATA_STR, MOCK_JUMPSTART_ID, mock_code_dir + ) + + mock_s3_downloader_obj.download.assert_called_once_with( + MOCK_COMPRESSED_MODEL_DATA_STR, MOCK_TMP_DIR + ) + mock_extract_js_resource.assert_called_with(MOCK_TMP_DIR, MOCK_JUMPSTART_ID) + mock_move_to_code_dir.assert_called_once_with(MOCK_TMP_DIR, mock_code_dir) + mock_code_dir.joinpath.assert_called_once_with("config.json") + self.assertEqual(existing_properties, {}) + self.assertEqual(hf_model_config, {}) + self.assertEqual(success, True) + + @patch("sagemaker.serve.model_server.djl_serving.prepare.Path") + @patch("sagemaker.serve.model_server.djl_serving.prepare.shutil") + def test_move_to_code_dir_success(self, mock_shutil, mock_path): + mock_path_obj = Mock() + mock_js_model_resources = Mock() + mock_js_model_resources.glob.return_value = MOCK_DJL_JUMPSTART_GLOBED_RESOURCES + mock_path_obj.joinpath.return_value = mock_js_model_resources + mock_path.return_value = mock_path_obj + + mock_js_model_dir = "" + mock_code_dir = Mock() + _move_to_code_dir(mock_js_model_dir, mock_code_dir) + + mock_path_obj.joinpath.assert_called_once_with("model") + + expected_moves = [ + call("./inference.py", mock_code_dir), + call("./serving.properties", mock_code_dir), + call("./config.json", mock_code_dir), + ] + mock_shutil.move.assert_has_calls(expected_moves) + + @patch("sagemaker.serve.model_server.djl_serving.prepare.Path") + @patch("sagemaker.serve.model_server.djl_serving.prepare.tarfile") + def test_extract_js_resources_success(self, mock_tarfile, mock_path): + mock_path_obj = Mock() + mock_path_obj.joinpath.return_value = Mock() + mock_path.return_value = mock_path_obj + + mock_tar_obj = Mock() + mock_enter = Mock() + mock_resource_obj = Mock() + mock_enter.return_value = mock_resource_obj + type(mock_tar_obj).__enter__ = PropertyMock(return_value=mock_enter) + type(mock_tar_obj).__exit__ = PropertyMock(return_value=Mock()) + mock_tarfile.open.return_value = mock_tar_obj + + js_model_dir = "" + _extract_js_resource(js_model_dir, MOCK_JUMPSTART_ID) + + mock_path.assert_called_once_with(js_model_dir) + mock_path_obj.joinpath.assert_called_once_with(f"infer-prepack-{MOCK_JUMPSTART_ID}.tar.gz") + 
mock_resource_obj.extractall.assert_called_once_with(path=js_model_dir) diff --git a/tests/unit/sagemaker/serve/model_server/tgi/test_tgi_prepare.py b/tests/unit/sagemaker/serve/model_server/tgi/test_tgi_prepare.py new file mode 100644 index 0000000000..c055be1f7d --- /dev/null +++ b/tests/unit/sagemaker/serve/model_server/tgi/test_tgi_prepare.py @@ -0,0 +1,159 @@ +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You +# may not use this file except in compliance with the License. A copy of +# the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. This file is +# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific +# language governing permissions and limitations under the License. +from __future__ import absolute_import + +from unittest import TestCase +from unittest.mock import Mock, PropertyMock, patch + +from sagemaker.serve.model_server.tgi.prepare import ( + _create_dir_structure, + _copy_jumpstart_artifacts, + _extract_js_resource, +) +from tests.unit.sagemaker.serve.model_server.constants import ( + MOCK_JUMPSTART_ID, + MOCK_TMP_DIR, + MOCK_COMPRESSED_MODEL_DATA_STR, + MOCK_UNCOMPRESSED_MODEL_DATA_STR, + MOCK_UNCOMPRESSED_MODEL_DATA_STR_FOR_DICT, + MOCK_UNCOMPRESSED_MODEL_DATA_DICT, + MOCK_INVALID_MODEL_DATA_DICT, +) + + +class TgiPrepareTests(TestCase): + @patch("sagemaker.serve.model_server.tgi.prepare._check_disk_space") + @patch("sagemaker.serve.model_server.tgi.prepare._check_docker_disk_usage") + @patch("sagemaker.serve.model_server.tgi.prepare.Path") + def test_create_dir_structure_from_new(self, mock_path, mock_disk_usage, mock_disk_space): + mock_model_path = Mock() + mock_model_path.exists.return_value = False + mock_code_dir = Mock() + mock_model_path.joinpath.return_value = mock_code_dir + mock_path.return_value = mock_model_path + + ret_model_path, ret_code_dir = _create_dir_structure(mock_model_path) + + mock_model_path.mkdir.assert_called_once_with(parents=True) + mock_model_path.joinpath.assert_called_once_with("code") + mock_code_dir.mkdir.assert_called_once_with(exist_ok=True, parents=True) + mock_disk_space.assert_called_once_with(mock_model_path) + mock_disk_usage.assert_called_once() + + self.assertEquals(ret_model_path, mock_model_path) + self.assertEquals(ret_code_dir, mock_code_dir) + + @patch("sagemaker.serve.model_server.tgi.prepare.Path") + def test_create_dir_structure_invalid_path(self, mock_path): + mock_model_path = Mock() + mock_model_path.exists.return_value = True + mock_model_path.is_dir.return_value = False + mock_path.return_value = mock_model_path + + with self.assertRaises(ValueError) as context: + _create_dir_structure(mock_model_path) + + self.assertEquals("model_dir is not a valid directory", str(context.exception)) + + @patch("sagemaker.serve.model_server.tgi.prepare.S3Downloader") + def test_prepare_tgi_js_resources_for_jumpstart_uncompressed_str(self, mock_s3_downloader): + mock_code_dir = Mock() + mock_s3_downloader_obj = Mock() + mock_s3_downloader.return_value = mock_s3_downloader_obj + + _copy_jumpstart_artifacts( + MOCK_UNCOMPRESSED_MODEL_DATA_STR, MOCK_JUMPSTART_ID, mock_code_dir + ) + + mock_s3_downloader_obj.download.assert_called_once_with( + MOCK_UNCOMPRESSED_MODEL_DATA_STR, mock_code_dir + ) + + @patch("sagemaker.serve.model_server.tgi.prepare.S3Downloader") + def 
test_prepare_tgi_js_resources_for_jumpstart_invalid_model_data(self, mock_s3_downloader): + mock_code_dir = Mock() + mock_s3_downloader_obj = Mock() + mock_s3_downloader.return_value = mock_s3_downloader_obj + + _copy_jumpstart_artifacts( + MOCK_UNCOMPRESSED_MODEL_DATA_DICT, MOCK_JUMPSTART_ID, mock_code_dir + ) + + mock_s3_downloader_obj.download.assert_called_once_with( + MOCK_UNCOMPRESSED_MODEL_DATA_STR_FOR_DICT, mock_code_dir + ) + + def test_prepare_tgi_js_resources_for_jumpstart_invalid_format(self): + mock_code_dir = Mock() + + with self.assertRaises(ValueError) as context: + _copy_jumpstart_artifacts( + MOCK_INVALID_MODEL_DATA_DICT, MOCK_JUMPSTART_ID, mock_code_dir + ) + + self.assertTrue( + "JumpStart model data compression format is unsupported" in str(context.exception) + ) + + @patch("sagemaker.serve.model_server.tgi.prepare.S3Downloader") + @patch("sagemaker.serve.model_server.tgi.prepare._tmpdir") + @patch("sagemaker.serve.model_server.tgi.prepare._extract_js_resource") + def test_prepare_tgi_js_resources_for_jumpstart_compressed_str( + self, + mock_extract_js_resource, + mock_tmpdir, + mock_s3_downloader, + ): + mock_code_dir = Mock() + + mock_s3_downloader_obj = Mock() + mock_s3_downloader.return_value = mock_s3_downloader_obj + + mock_tmpdir_obj = Mock() + mock_js_dir = Mock() + mock_js_dir.return_value = MOCK_TMP_DIR + type(mock_tmpdir_obj).__enter__ = PropertyMock(return_value=mock_js_dir) + type(mock_tmpdir_obj).__exit__ = PropertyMock(return_value=Mock()) + mock_tmpdir.return_value = mock_tmpdir_obj + + _copy_jumpstart_artifacts(MOCK_COMPRESSED_MODEL_DATA_STR, MOCK_JUMPSTART_ID, mock_code_dir) + + mock_s3_downloader_obj.download.assert_called_once_with( + MOCK_COMPRESSED_MODEL_DATA_STR, MOCK_TMP_DIR + ) + mock_extract_js_resource.assert_called_once_with( + MOCK_TMP_DIR, mock_code_dir, MOCK_JUMPSTART_ID + ) + + @patch("sagemaker.serve.model_server.tgi.prepare.Path") + @patch("sagemaker.serve.model_server.tgi.prepare.tarfile") + def test_extract_js_resources_success(self, mock_tarfile, mock_path): + mock_path_obj = Mock() + mock_path_obj.joinpath.return_value = Mock() + mock_path.return_value = mock_path_obj + + mock_tar_obj = Mock() + mock_enter = Mock() + mock_resource_obj = Mock() + mock_enter.return_value = mock_resource_obj + type(mock_tar_obj).__enter__ = PropertyMock(return_value=mock_enter) + type(mock_tar_obj).__exit__ = PropertyMock(return_value=Mock()) + mock_tarfile.open.return_value = mock_tar_obj + + js_model_dir = "" + code_dir = Mock() + _extract_js_resource(js_model_dir, code_dir, MOCK_JUMPSTART_ID) + + mock_path.assert_called_once_with(js_model_dir) + mock_path_obj.joinpath.assert_called_once_with(f"infer-prepack-{MOCK_JUMPSTART_ID}.tar.gz") + mock_resource_obj.extractall.assert_called_once_with(path=code_dir) From b5dfe14bead767226bbaaf0f6274adcc7dae79b2 Mon Sep 17 00:00:00 2001 From: akrishna1995 <38850354+akrishna1995@users.noreply.github.com> Date: Tue, 19 Dec 2023 13:00:57 -0800 Subject: [PATCH 11/76] documentation: SMP v2 doc updates (#1423) (#4336) * doc update for estimator distribution art * add note to the SMP doc and minor fixes * remove subnodes * rm all v1 content as documenting everything in aws docs * fix build errors * fix white spaces * rm smdistributed from TF estimator distribution * rm white spaces * add notes to TF estimator distribution * fix links * incorporate feedback * update example values * fix version numbers in the notes Co-authored-by: Miyoung --- doc/api/training/distributed.rst | 23 +- 
doc/api/training/smd_model_parallel.rst | 43 - .../training/smd_model_parallel_general.rst | 465 --------- .../smd_model_parallel_change_log.rst | 902 ----------------- doc/api/training/smp_versions/archives.rst | 13 - doc/api/training/smp_versions/latest.rst | 35 - .../latest/smd_model_parallel_common_api.rst | 517 ---------- .../latest/smd_model_parallel_pytorch.rst | 944 ------------------ ...model_parallel_pytorch_tensor_parallel.rst | 896 ----------------- .../latest/smd_model_parallel_tensorflow.rst | 165 --- .../smp_versions/model-data-parallel.png | Bin 36777 -> 0 bytes .../v1.1.0/smd_model_parallel_common_api.rst | 485 --------- .../v1.1.0/smd_model_parallel_pytorch.rst | 521 ---------- .../v1.1.0/smd_model_parallel_tensorflow.rst | 164 --- .../v1.10.0/smd_model_parallel_common_api.rst | 538 ---------- .../v1.10.0/smd_model_parallel_pytorch.rst | 883 ---------------- ...model_parallel_pytorch_tensor_parallel.rst | 903 ----------------- .../v1.10.0/smd_model_parallel_tensorflow.rst | 171 ---- .../v1.2.0/smd_model_parallel_common_api.rst | 487 --------- .../v1.2.0/smd_model_parallel_pytorch.rst | 553 ---------- .../v1.2.0/smd_model_parallel_tensorflow.rst | 164 --- .../smp_versions/v1.3.0/add_smd_version.sh | 10 - .../v1.3.0/smd_model_parallel_common_api.rst | 488 --------- .../v1.3.0/smd_model_parallel_pytorch.rst | 572 ----------- .../v1.3.0/smd_model_parallel_tensorflow.rst | 172 ---- .../v1.4.0/smd_model_parallel_common_api.rst | 488 --------- .../v1.4.0/smd_model_parallel_pytorch.rst | 572 ----------- .../v1.4.0/smd_model_parallel_tensorflow.rst | 172 ---- .../v1.5.0/smd_model_parallel_common_api.rst | 488 --------- .../v1.5.0/smd_model_parallel_pytorch.rst | 572 ----------- .../v1.5.0/smd_model_parallel_tensorflow.rst | 172 ---- .../v1.6.0/smd_model_parallel_common_api.rst | 538 ---------- .../v1.6.0/smd_model_parallel_pytorch.rst | 678 ------------- ...model_parallel_pytorch_tensor_parallel.rst | 855 ---------------- .../v1.6.0/smd_model_parallel_tensorflow.rst | 171 ---- .../v1.9.0/smd_model_parallel_common_api.rst | 538 ---------- .../v1.9.0/smd_model_parallel_pytorch.rst | 677 ------------- ...model_parallel_pytorch_tensor_parallel.rst | 876 ---------------- .../v1.9.0/smd_model_parallel_tensorflow.rst | 171 ---- doc/api/training/smp_versions/v1_10_0.rst | 13 - doc/api/training/smp_versions/v1_1_0.rst | 12 - doc/api/training/smp_versions/v1_2_0.rst | 12 - doc/api/training/smp_versions/v1_3_0.rst | 12 - doc/api/training/smp_versions/v1_4_0.rst | 12 - doc/api/training/smp_versions/v1_5_0.rst | 12 - doc/api/training/smp_versions/v1_6_0.rst | 13 - doc/api/training/smp_versions/v1_9_0.rst | 13 - src/sagemaker/pytorch/estimator.py | 68 +- src/sagemaker/tensorflow/estimator.py | 76 +- 49 files changed, 80 insertions(+), 17245 deletions(-) delete mode 100644 doc/api/training/smd_model_parallel.rst delete mode 100644 doc/api/training/smd_model_parallel_general.rst delete mode 100644 doc/api/training/smd_model_parallel_release_notes/smd_model_parallel_change_log.rst delete mode 100644 doc/api/training/smp_versions/archives.rst delete mode 100644 doc/api/training/smp_versions/latest.rst delete mode 100644 doc/api/training/smp_versions/latest/smd_model_parallel_common_api.rst delete mode 100644 doc/api/training/smp_versions/latest/smd_model_parallel_pytorch.rst delete mode 100644 doc/api/training/smp_versions/latest/smd_model_parallel_pytorch_tensor_parallel.rst delete mode 100644 doc/api/training/smp_versions/latest/smd_model_parallel_tensorflow.rst delete mode 100644 
doc/api/training/smp_versions/model-data-parallel.png delete mode 100644 doc/api/training/smp_versions/v1.1.0/smd_model_parallel_common_api.rst delete mode 100644 doc/api/training/smp_versions/v1.1.0/smd_model_parallel_pytorch.rst delete mode 100644 doc/api/training/smp_versions/v1.1.0/smd_model_parallel_tensorflow.rst delete mode 100644 doc/api/training/smp_versions/v1.10.0/smd_model_parallel_common_api.rst delete mode 100644 doc/api/training/smp_versions/v1.10.0/smd_model_parallel_pytorch.rst delete mode 100644 doc/api/training/smp_versions/v1.10.0/smd_model_parallel_pytorch_tensor_parallel.rst delete mode 100644 doc/api/training/smp_versions/v1.10.0/smd_model_parallel_tensorflow.rst delete mode 100644 doc/api/training/smp_versions/v1.2.0/smd_model_parallel_common_api.rst delete mode 100644 doc/api/training/smp_versions/v1.2.0/smd_model_parallel_pytorch.rst delete mode 100644 doc/api/training/smp_versions/v1.2.0/smd_model_parallel_tensorflow.rst delete mode 100755 doc/api/training/smp_versions/v1.3.0/add_smd_version.sh delete mode 100644 doc/api/training/smp_versions/v1.3.0/smd_model_parallel_common_api.rst delete mode 100644 doc/api/training/smp_versions/v1.3.0/smd_model_parallel_pytorch.rst delete mode 100644 doc/api/training/smp_versions/v1.3.0/smd_model_parallel_tensorflow.rst delete mode 100644 doc/api/training/smp_versions/v1.4.0/smd_model_parallel_common_api.rst delete mode 100644 doc/api/training/smp_versions/v1.4.0/smd_model_parallel_pytorch.rst delete mode 100644 doc/api/training/smp_versions/v1.4.0/smd_model_parallel_tensorflow.rst delete mode 100644 doc/api/training/smp_versions/v1.5.0/smd_model_parallel_common_api.rst delete mode 100644 doc/api/training/smp_versions/v1.5.0/smd_model_parallel_pytorch.rst delete mode 100644 doc/api/training/smp_versions/v1.5.0/smd_model_parallel_tensorflow.rst delete mode 100644 doc/api/training/smp_versions/v1.6.0/smd_model_parallel_common_api.rst delete mode 100644 doc/api/training/smp_versions/v1.6.0/smd_model_parallel_pytorch.rst delete mode 100644 doc/api/training/smp_versions/v1.6.0/smd_model_parallel_pytorch_tensor_parallel.rst delete mode 100644 doc/api/training/smp_versions/v1.6.0/smd_model_parallel_tensorflow.rst delete mode 100644 doc/api/training/smp_versions/v1.9.0/smd_model_parallel_common_api.rst delete mode 100644 doc/api/training/smp_versions/v1.9.0/smd_model_parallel_pytorch.rst delete mode 100644 doc/api/training/smp_versions/v1.9.0/smd_model_parallel_pytorch_tensor_parallel.rst delete mode 100644 doc/api/training/smp_versions/v1.9.0/smd_model_parallel_tensorflow.rst delete mode 100644 doc/api/training/smp_versions/v1_10_0.rst delete mode 100644 doc/api/training/smp_versions/v1_1_0.rst delete mode 100644 doc/api/training/smp_versions/v1_2_0.rst delete mode 100644 doc/api/training/smp_versions/v1_3_0.rst delete mode 100644 doc/api/training/smp_versions/v1_4_0.rst delete mode 100644 doc/api/training/smp_versions/v1_5_0.rst delete mode 100644 doc/api/training/smp_versions/v1_6_0.rst delete mode 100644 doc/api/training/smp_versions/v1_9_0.rst diff --git a/doc/api/training/distributed.rst b/doc/api/training/distributed.rst index 21837bc1e4..be050d1011 100644 --- a/doc/api/training/distributed.rst +++ b/doc/api/training/distributed.rst @@ -22,10 +22,19 @@ The SageMaker Distributed Data Parallel Library The SageMaker Distributed Model Parallel Library ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. 
toctree:: - :maxdepth: 2 - - smd_model_parallel - smp_versions/latest - smd_model_parallel_general - smd_model_parallel_release_notes/smd_model_parallel_change_log +.. note:: + + Since the release of the SageMaker model parallelism (SMP) version 2 in December 2023, + this documentation is no longer supported for maintenence. + The live documentation is available at + `SageMaker model parallelism library v2 + `_ + in the *Amazon SageMaker User Guide*. + + The documentation for the SMP library v1.x is archived and available at + `Run distributed training with the SageMaker model parallelism library + `_ + in the *Amazon SageMaker User Guide*, + and the SMP v1.x API reference is available in the + `SageMaker Python SDK v2.199.0 documentation + `_. diff --git a/doc/api/training/smd_model_parallel.rst b/doc/api/training/smd_model_parallel.rst deleted file mode 100644 index 635dcd582d..0000000000 --- a/doc/api/training/smd_model_parallel.rst +++ /dev/null @@ -1,43 +0,0 @@ -The SageMaker Distributed Model Parallel Library Overview ---------------------------------------------------------- - -The Amazon SageMaker distributed model parallel library is a model parallelism library for training -large deep learning models that were previously difficult to train due to GPU memory limitations. -The library automatically and efficiently splits a model across multiple GPUs and instances and coordinates model training, -allowing you to increase prediction accuracy by creating larger models with more parameters. - -You can use the library to automatically partition your existing TensorFlow and PyTorch workloads -across multiple GPUs with minimal code changes. The library's API can be accessed through the Amazon SageMaker SDK. - -.. tip:: - - We recommend that you use this API documentation along with the conceptual guide at - `SageMaker's Distributed Model Parallel - `_ - in the *Amazon SageMaker developer guide*. - The conceptual guide includes the following topics: - - - An overview of model parallelism, and the library's - `core features `_, - and `extended features for PyTorch `_. - - Instructions on how to modify `TensorFlow - `_ - and `PyTorch - `_ - training scripts. - - Instructions on how to `run a distributed training job using the SageMaker Python SDK - and the SageMaker model parallel library - `_. - - `Configuration tips and pitfalls - `_. - - -.. important:: - The model parallel library only supports SageMaker training jobs using CUDA 11. - Make sure you use the pre-built Deep Learning Containers. - If you want to extend or customize your own training image, - you must use a CUDA 11 base image. For more information, see `Extend a Prebuilt Docker - Container that Contains SageMaker's Distributed Model Parallel Library - `_ - and `Create Your Own Docker Container with the SageMaker Distributed Model Parallel Library - `_. diff --git a/doc/api/training/smd_model_parallel_general.rst b/doc/api/training/smd_model_parallel_general.rst deleted file mode 100644 index e626ad9083..0000000000 --- a/doc/api/training/smd_model_parallel_general.rst +++ /dev/null @@ -1,465 +0,0 @@ -.. _sm-sdk-modelparallel-general: - -############################################################# -Run a Distributed Training Job Using the SageMaker Python SDK -############################################################# - -Walk through the following pages to learn about the SageMaker model parallel library's APIs -to configure and enable distributed model parallelism -through an Amazon SageMaker estimator. - -.. 
_sm-sdk-modelparallel-params: - -Configuration Parameters for ``distribution`` -============================================= - -Amazon SageMaker's TensorFlow and PyTorch estimator objects contain a ``distribution`` parameter, -which you can use to enable and specify parameters for SageMaker distributed training. -The SageMaker model parallel library internally uses MPI. -To use model parallelism, both ``smdistributed`` and MPI must be enabled -through the ``distribution`` parameter. - -The following code example is a template of setting up model parallelism for a PyTorch estimator. - -.. code:: python - - import sagemaker - from sagemaker.pytorch import PyTorch - - smp_options = { - "enabled":True, - "parameters": { - ... - } - } - - mpi_options = { - "enabled" : True, - ... - } - - smdmp_estimator = PyTorch( - ... - distribution={ - "smdistributed": {"modelparallel": smp_options}, - "mpi": mpi_options - } - ) - - smdmp_estimator.fit() - -.. tip:: - - This page provides you a complete list of parameters you can use - when you construct a SageMaker estimator and configure for distributed training. - - To find examples of how to construct a SageMaker estimator with the distributed training parameters, see - `Launch a SageMaker Distributed Model Parallel Training Job `_ - in the `SageMaker's Distributed Model Parallel developer guide `_. - -.. contents:: Table of Contents - :depth: 3 - :local: - -Parameters for ``smdistributed`` ----------------------------------- - -You can use the following parameters to initialize the library -configuring a dictionary for ``modelparallel``, which goes -into the ``smdistributed`` option for the ``distribution`` parameter. - -.. note:: - - ``partitions`` for TensorFlow and ``pipeline_parallel_degree`` for PyTorch are required parameters. - All other parameters in the following - table are optional. - -Common Parameters -~~~~~~~~~~~~~~~~~ - -.. list-table:: - :widths: 10 20 10 60 - :header-rows: 1 - - * - Parameter - - Type / Valid values - - Default - - Description - * - ``partitions`` for TensorFlow and PyTorch with smdistributed-modelparallel=v1.6) - - int - - - - **Required.** The number of partitions to split the model into. - In case of ``pipeline_parallel_degree`` for PyTorch, this is the number of devices - over which pipeline parallelism will be performed. - * - ``microbatches`` - - int - - 1 - - The number of microbatches to perform pipelining over. 1 means no pipelining. - Batch size must be divisible by the number of microbatches. - * - ``pipeline`` - - ``"interleaved"`` or ``"simple"`` - - ``"interleaved"`` - - The pipeline schedule. - * - ``optimize`` - - ``"memory"`` or ``"speed"`` - - ``"memory"`` - - Determines the distribution mechanism of transformer layers. - If optimizing ``speed``, there will be less communication across tensor-parallel ranks - and layer normalization will not be distributed. However, there will be duplicate activations - stored across tensor-parallel ranks. - If optimizing ``memory``, there will be no redundant activations stored, - but this will result in more communication overhead across tensor parallel ranks. - * - ``placement_strategy`` - - ``"cluster"``, ``"spread"``, or a permutation of the string ``D``, ``P``, and ``T``. - - ``"cluster"`` - - Determines the mapping of model partitions onto physical devices. - When hybrid model/data parallelism is used, ``cluster`` places a single model replica in - neighboring device IDs. Contrarily, ``spread`` places a model replica as far as possible. 
- For more information, see :ref:`ranking-basics`. - - In case of the permutation letters, ``D`` stands for reduced-data parallelism, - ``P`` stands for pipeline parallelism, - and ``T`` stands for tensor parallelism. - ``spread`` is equivalent to ``"TPD"``, and ``cluster`` is equivalent to ``"DPT"``. - For more information, see :ref:`ranking-basics-tensor-parallelism`. - - Note: For TensorFlow, tensor parallelism is not implemented and - available parameter values are only ``"spread"`` and ``"cluster"``. - * - ``auto_partition`` - - bool - - ``True`` - - Enable auto-partitioning. If disabled, ``default_partition`` parameter must be provided. - * - ``default_partition`` - - int - - ``0`` - - **Required** if ``auto_partition`` is false. The partition ID to place operations/modules - that are not placed in any ``smp.partition`` contexts. - -TensorFlow-specific Parameters -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. list-table:: - :widths: 10 20 10 60 - :header-rows: 1 - - * - Parameter - - Type / Valid values - - Default - - Description - * - ``contiguous`` - - bool - - ``True`` - - Whether the model partitions should be contiguous. If true, each partition forms a connected component in the computational graph, unless the graph itself is not connected. - * - ``horovod`` - - bool - - ``False`` - - Must be set to ``True`` if hybrid model/data parallelism is used and the data parallelism (DP) framework is Horovod. - - -PyTorch-specific Parameters -~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. list-table:: - :widths: 10 20 10 60 - :header-rows: 1 - - * - Parameter - - Type / Valid values - - Default - - Description - * - ``memory_weight`` - - float [0.0, 1.0] - - ``0.2`` if ``optimize`` is ``"speed"``, else ``0.8`` - - The weight of memory balancing in the auto-partitioni ng objective, as opposed to balancing computational load. If 0.0, the library only tries to balance computation; if 1.0 the library only tries to balance the memory use. Any value in between interpolates between these extremes. - * - ``ddp`` - - bool - - ``False`` - - Must be set to True if hybrid model/data parallelism is used with DistributedDataParallel. DistributedDataParallel is used with NCCL backend, and uses the MASTER_PORT provided by SageMaker. - * - ``active_microbatches`` (**smdistributed-modelparallel**>=v1.3) - - int - - ``partitions`` + 2 - - This is the maximum number of microbatches that are simultaneously in execution during pipelining. Jointly scaling batch size and number of microbatches can often mitigate the pipeline bubble overhead, but that can lead to increased memory usage if too many microbatches are simultaneously in execution. In such cases setting the number of active microbatches to a lower number can help control memory usage. By default this is set to two plus the number of partitions of the model. - * - ``deterministic_server`` (**smdistributed-modelparallel**>=v1.3) - - bool - - ``False`` - - Setting this to true ensures that the execution server for pipelining executes requests in the same order across all data parallel ranks. - * - ``offload_activations`` (**smdistributed-modelparallel**>=v1.6) - - bool - - False - - Enables activation - offloading. To improve GPU memory usage, use activation offloading - only when (1) the ``microbatches`` and ``active_microbatches`` are - greater than 1, and (2) activation checkpointing is enabled for at - least one module in the model. - * - ``activation_loading_horizon`` (**smdistributed-modelparallel**>=v1.6) - - int - - 4 - - Specify the number - of pipeline tasks. 
This determines how early the activations should - be loaded back to the GPU, expressed in number of pipeline tasks. - Smaller value indicates that activations are loaded closer in time to - when they are needed for backward pass. Setting this value too small - might improve memory usage, but might potentially cause throughput - loss and GPU bottlenecks during the CPU-to-GPU data transfer. - * - ``tensor_parallel_degree`` (**smdistributed-modelparallel**>=v1.6) - - int - - 1 - - The number of devices over which the tensor parallel modules will be distributed. - If ``tensor_parallel_degree`` is greater than 1, then ``ddp`` must be set to ``True``. - * - ``fp16`` (**smdistributed-modelparallel**>=v1.10) - - bool - - ``False`` - - To run FP16 training, add ``"fp16"'": True`` to the smp configuration. - Other APIs remain the same between FP16 and FP32. - If ``fp16`` is enabled and when user calls ``smp.DistributedModel``, - the model will be wrapped with ``FP16_Module``, which converts the model - to FP16 dtype and deals with forward pass in FP16. - If ``fp16`` is enabled and when user calls ``smp.DistributedOptimizer``, - the optimizer will be wrapped with ``FP16_Optimizer``. - * - ``fp16_params`` (**smdistributed-modelparallel**>=v1.6) - - bool - - ``False`` - - If ``True``, the parameters of the distributed modules will be initialized in FP16. - * - ``shard_optimizer_state`` (**smdistributed-modelparallel**>=v1.6) - - bool - - ``False`` - - If ``True``, the library shards the optimizer state of all parameters across - the data parallel processes which hold the same parameter. - This optimizer state sharding happens in a balanced manner. - Note that when sharding optimizer state, full optimizer saving is not currently supported. - Please save partial optimizer state. For more information about saving and loading checkpoints with - optimizer state sharding, see `Instructions for Checkpointing with Tensor Parallelism `_. - * - ``prescaled_batch`` (**smdistributed-modelparallel**>=v1.6) - - bool - - ``False`` - - If ``True`` and when ``smp.nn.DistributedTransformerLMHead`` is used - (this is typically used for GPT-2 or GPT-3 models), - the library assumes that the devices in the same tensor parallelism group - receive the same input data. Otherwise, it is assumed that they receive - different examples. To learn more, see :ref:`prescaled-batch`. - * - ``skip_tracing`` (**smdistributed-modelparallel**>=v1.6) - - bool - - False - - Skips the initial tracing step. This can be useful in very large models - where even model tracing at the CPU is not possible due to memory constraints. - * - ``sharded_data_parallel_degree`` (**smdistributed-modelparallel**>=v1.11) - - int - - 1 - - To run a training job using sharded data parallelism, add this parameter and specify a number greater than 1. - Sharded data parallelism is a memory-saving distributed training technique that splits the training state of a model (model parameters, gradients, and optimizer states) across GPUs in a data parallel group. - For more information, see `Sharded Data Parallelism - `_. - * - ``sdp_reduce_bucket_size`` (**smdistributed-modelparallel**>=v1.11) - - int - - 5e8 - - Configuration parameter for sharded data parallelism (for ``sharded_data_parallel_degree > 2``). - Specifies the size of PyTorch DDP gradient buckets in number of elements of the default dtype. 
- * - ``sdp_param_persistence_threshold`` (**smdistributed-modelparallel**>=v1.11) - - int - - 1e6 - - Specifies the size of a parameter tensor in number of elements that can persist at each GPU. Sharded data parallelism splits each parameter tensor across GPUs of a data parallel group. If the number of elements in the parameter tensor is smaller than this threshold, the parameter tensor is not split; this helps reduce communication overhead because the parameter tensor is replicated across data-parallel GPUs. - * - ``sdp_max_live_parameters`` (**smdistributed-modelparallel**>=v1.11) - - int - - 1e9 - - Specifies the maximum number of parameters that can simultaneously be in a recombined training state during the forward and backward pass. Parameter fetching with the AllGather operation pauses when the number of active parameters reaches the given threshold. Note that increasing this parameter increases the memory footprint. - * - ``sdp_hierarchical_allgather`` (**smdistributed-modelparallel**>=v1.11) - - bool - - True - - If set to True, the AllGather operation runs hierarchically: it runs within each node first, and then runs across nodes. For multi-node distributed training jobs, the hierarchical AllGather operation is automatically activated. - * - ``sdp_gradient_clipping`` (**smdistributed-modelparallel**>=v1.11) - - float - - 1.0 - - Specifies a threshold for gradient clipping the L2 norm of the gradients before propagating them backward through the model parameters. When sharded data parallelism is activated, gradient clipping is also activated. The default threshold is 1.0. Adjust this parameter if you have the exploding gradients problem. - - -Parameters for ``mpi`` ----------------------- - -For the ``"mpi"`` key, a dict must be passed which contains: - -* ``"enabled"``: Set to ``True`` to launch the training job with MPI. - -* ``"processes_per_host"``: Specifies the number of processes MPI should launch on each host. - In SageMaker a host is a single Amazon EC2 ml instance. The SageMaker distributed model parallel library maintains - a one-to-one mapping between processes and GPUs across model and data parallelism. - This means that SageMaker schedules each process on a single, separate GPU and no GPU contains more than one process. - If you are using PyTorch, you must restrict each process to its own device using - ``torch.cuda.set_device(smp.local_rank())``. To learn more, see - `Modify a PyTorch Training Script - `_. - - .. important:: - ``process_per_host`` must be less than or equal to the number of GPUs per instance, and typically will be equal to - the number of GPUs per instance. - - For example, if you use one instance with 4-way model parallelism and 2-way data parallelism, - then processes_per_host should be 2 x 4 = 8. Therefore, you must choose an instance that has at least 8 GPUs, - such as an ml.p3.16xlarge. - - The following image illustrates how 2-way data parallelism and 4-way model parallelism is distributed across 8 GPUs: - the model is partitioned across 4 GPUs, and each partition is added to 2 GPUs. - - .. image:: smp_versions/model-data-parallel.png - :width: 650 - :alt: 2-way data parallelism and 4-way model parallelism distributed across 8 GPUs - - -* ``"custom_mpi_options"``: Use this key to pass any custom MPI options you might need. - To avoid Docker warnings from contaminating your training logs, we recommend the following flag. - ```--mca btl_vader_single_copy_mechanism none``` - - -.. 
_ranking-basics: - -Ranking Basics without Tensor Parallelism -========================================= - -The library maintains a one-to-one mapping between processes and available GPUs: -for each GPU, there is a corresponding CPU process. Each CPU process -maintains a “rank” assigned by MPI, which is a 0-based unique index for -the process. For instance, if a training job is launched with 4 -``p3dn.24xlarge`` instances using all its GPUs, there are 32 processes -across all instances, and the ranks of these processes range from 0 to -31. - -The ``local_rank`` of a process is the rank of the process among the -processes in the same instance. This can range from 0 up to the number -of GPUs in the instance, but can be lower if fewer processes than GPUs are -launched in the instance. For instance, in the preceding -example, ``local_rank``\ s of the processes will range from 0 to 7, -since there are 8 GPUs in a ``p3dn.24xlarge`` instance. - -When model parallelism is used together with data parallelism (Horovod for TensorFlow -and DDP for PyTorch), the library partitions the set of processes into -disjoint \ ``mp_group``\ s. An ``mp_group`` is a subset of all processes -that together hold a single, partitioned model replica. - -For instance, if -a single node job is launched with 8 local processes with -``partitions=2`` (meaning the model will be split into 2), there are -four \ ``mp_group``\ s. The specific sets of processes that form the -``mp_group``\ s can be adjusted by the ``placement_strategy`` option. - -- If ``placement_strategy`` is ``spread``, then the four - ``mp_group``\ s are ``[0, 4], [1, 5], [2, 6], [3, 7]``. The - ``mp_rank`` is the rank of a process within each ``mp_group``. For example, - the ``mp_rank`` is 0 for the processes 0, 1, 2, and 3, and the ``mp_rank`` is 1 for - the processes 4, 5, 6, and 7. - - Analogously, the library defines ``dp_group``\ s as sets of processes that - all hold the same model partition, and perform data parallelism among - each other. If ``placement_strategy`` is ``spread``, there are two ``dp_group``\ s: - ``[0, 1, 2, 3]`` and ``[4, 5, 6, 7]``. - - Since each process within the ``dp_group`` holds the same partition of - the model, and makes allreduce calls among themselves. Allreduce for - data parallelism does not take place *across* ``dp_group``\ s. - ``dp_rank`` is defined as the rank of a process within its ``dp_group``. - In the preceding example, the \ ``dp_rank`` of process 6 is 2. - -- If ``placement_strategy`` is ``cluster``, the four ``mp_group``\ s - become ``[0, 1], [2, 3], [4, 5], [6, 7]``, and the the two ``dp_group``\ s become - ``[0, 2, 4, 6]`` and ``[1, 3, 5, 7]``. - -.. _ranking-basics-tensor-parallelism: - -Placement Strategy with Tensor Parallelism -========================================== - -In addition to the two placement strategies introduced in the previous section, -the library provides additional placement strategies for extended tensor parallelism features -for PyTorch. The additional placement strategies (parallelism types) are denoted as follows: - -- ``D`` stands for (reduced) data parallelism. -- ``P`` stands for pipeline parallelism. -- ``T`` stands for tensor parallelism. - -With given permutation of the tree letters, the library takes the right-most letter -as the first strategy performs over the global ranks in ascending order. -Contrarily, the parallelism type represented by the left-most letter is performed -over the ranks that are as distant as possible. 
- -- **Example:** Given 8 devices with ``tp_size() == 2``, - ``pp_size() == 2``, ``rdp_size() == 2`` - - - ``placement_strategy: "DPT"`` gives - - ==== ======== ======= ======= - rank rdp_rank pp_rank tp_rank - ==== ======== ======= ======= - 0 0 0 0 - 1 0 0 1 - 2 0 1 0 - 3 0 1 1 - 4 1 0 0 - 5 1 0 1 - 6 1 1 0 - 7 1 1 1 - ==== ======== ======= ======= - - - ``placement_strategy: "PTD"`` gives - - ==== ======== ======= ======= - rank rdp_rank pp_rank tp_rank - ==== ======== ======= ======= - 0 0 0 0 - 1 1 0 0 - 2 0 0 1 - 3 1 0 1 - 4 0 1 0 - 5 1 1 0 - 6 0 1 1 - 7 1 1 1 - ==== ======== ======= ======= - -Because the neighboring ranks are placed on the same instance with -high-bandwidth NVLinks, it is recommended to place the -parallelism type that has higher bandwidth requirements for your model -on the right-most position in the ``placement_strategy`` string. Because -tensor parallelism often requires frequent communication, placing -``T`` in the right-most position is recommended (as in the default -``"cluster"`` strategy). In many large models, keeping the default of -``"cluster"`` would result in the best performance. - - -.. _prescaled-batch: - -Prescaled Batch -=============== - -``prescaled_batch`` is a configuration parameter that can be useful for -``DistributedTransformerLMHead``, which is used for GPT-2 and GPT-3. - -The way tensor parallelism works is that when a module is distributed, -the inputs to the distributed module in different ``tp_rank``\ s gets -shuffled around in a way that is sliced by the hidden dimension and -scaled by the batch dimension. For example, if tensor parallel degree is -8, the inputs to ``DistributedTransformer`` (a tensor with shape -``[B, S, H]`` where ``B``\ =batch size, ``S``\ =sequence length, -``H``\ =hidden width) in different ``tp_rank``\ s will be communicated -around, and the shapes will become ``[8B, S, H/8]``. Each ``tp_rank`` -has the batch from all the peer ``tp_rank``\ s, but only the slice that -interacts with their local partition of the module. - -By default, the library assumes that each ``tp_rank`` gets assigned a -different batch, and performs the communication described above. If -``prescaled_batch`` is true, then the library assumes that the input -batch is already scaled (and is the same across the ``tp_rank``\ s), and -only does the slicing. In the example above, the library assumes that -input tensor has shape ``[8B, S, H]``, and only converts it into -``[8B, S, H/8]``. So if ``prescaled_batch`` is true, it is the user’s -responsibility to feed the same batch to the ``tp_rank``\ s in the same -``TP_GROUP``. This can be done by doing the data sharding based on -``smp.rdp_size()`` and ``smp.rdp_rank()``, instead of ``smp.dp_size()`` -and ``smp.dp_rank()``. When ``prescaled_batch`` is true, the global -batch size is ``smp.rdp_size()`` multiplied by the per-``MP_GROUP`` -batch size. When ``prescaled_batch`` is false, global batch size is -``smp.dp_size()`` multiplied by the per-``PP_GROUP`` batch size. - -If you use pipeline parallelism degree 1, then you can keep -``prescaled_batch`` false (the default option). If you use a pipeline -parallellism degree more than 1, it is recommended to use -``prescaled_batch`` true, so that you can increase per-``MP_GROUP`` -batch size for efficient pipelining, without running into out-of-memory -issues. 
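To make the prescaled-batch sharding rule described above concrete, the following is a minimal sketch of a data-loader helper. The ``smp`` rank and size calls are the ones named in this section; the ``DistributedSampler``/``DataLoader`` wiring and the ``build_data_loader`` name are generic PyTorch patterns assumed for illustration, not APIs prescribed by the library.

.. code:: python

    import smdistributed.modelparallel.torch as smp
    from torch.utils.data import DataLoader
    from torch.utils.data.distributed import DistributedSampler

    def build_data_loader(dataset, batch_size, prescaled_batch):
        """Shard the dataset according to the prescaled_batch rule above."""
        if prescaled_batch:
            # Every tp_rank in a TP_GROUP must receive the same, already-scaled batch,
            # so shard only across the reduced-data-parallel ranks.
            sampler = DistributedSampler(
                dataset, num_replicas=smp.rdp_size(), rank=smp.rdp_rank()
            )
        else:
            # Each process receives its own batch: shard across all data-parallel ranks.
            sampler = DistributedSampler(
                dataset, num_replicas=smp.dp_size(), rank=smp.dp_rank()
            )
        return DataLoader(dataset, batch_size=batch_size, sampler=sampler)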
diff --git a/doc/api/training/smd_model_parallel_release_notes/smd_model_parallel_change_log.rst b/doc/api/training/smd_model_parallel_release_notes/smd_model_parallel_change_log.rst deleted file mode 100644 index 9409d69aad..0000000000 --- a/doc/api/training/smd_model_parallel_release_notes/smd_model_parallel_change_log.rst +++ /dev/null @@ -1,902 +0,0 @@ -############# -Release Notes -############# - -New features, bug fixes, and improvements are regularly made to the SageMaker -model parallelism library. - - -SageMaker Distributed Model Parallel 1.15.0 Release Notes -========================================================= - -*Date: Apr. 27. 2023* - -**Currency Updates** - -* Added support for PyTorch v2.0.0. - Note that the library does not support ``torch.compile`` in this release. - -**New Features** - -* Using sharded data parallelism with tensor parallelism together is now - available for PyTorch 1.13.1. It allows you to train with smaller global batch - sizes while scaling up to large clusters. For more information, see `Sharded - data parallelism with tensor parallelism `_ - in the *Amazon SageMaker Developer Guide*. -* Added support for saving and loading full model checkpoints when using sharded - data parallelism. This is enabled by using the standard checkpointing API, - ``smp.save_checkpoint`` with ``partial=False``. - Before, full checkpoints needed to be created by merging partial checkpoint - files after training finishes. -* `DistributedTransformer `_ - now supports the ALiBi position embeddings. - When using DistributedTransformer, you can set the ``use_alibi`` parameter - to ``True`` to use the Triton-based flash attention kernels. This helps - evaluate sequences longer than those used for training. - -**Bug Fixes** - -* When using tensor parallelism, parameters were initialized multiple times - unncessarily. This release fixed the multiple initialization of parameters - so that each parameter is initialized exactly once. - It not only saves time, but also ensures that the random generator behavior - is similar to the non-tensor parallelism case. - -**Known issues** - -* Model initialization might take longer with PyTorch 2.0 than that with PyTorch 1.13. - -**Migration to AWS Deep Learning Containers** - -This version passed benchmark testing and is migrated to the following AWS Deep Learning Containers (DLC): - -- SageMaker training container for PyTorch v2.0.0 - - .. code:: - - 763104351884.dkr.ecr.us-east-1.amazonaws.com/pytorch-training:2.0.0-gpu-py310-cu118-ubuntu20.04-sagemaker - -- SageMaker training container for PyTorch v1.13.1 - - .. code:: - - 763104351884.dkr.ecr.us-east-1.amazonaws.com/pytorch-training:1.13.1-gpu-py39-cu117-ubuntu20.04-sagemaker - -Binary file of this version of the library for `custom container -`_ users: - -- For PyTorch v2.0.0 - - .. code:: - - https://sagemaker-distributed-model-parallel.s3.us-west-2.amazonaws.com/pytorch-2.0.0/build-artifacts/2023-04-14-20-14/smdistributed_modelparallel-1.15.0-cp310-cp310-linux_x86_64.whl - -- For PyTorch v1.13.1 - - .. code:: - - https://sagemaker-distributed-model-parallel.s3.us-west-2.amazonaws.com/pytorch-1.13.1/build-artifacts/2023-04-17-15-49/smdistributed_modelparallel-1.15.0-cp39-cp39-linux_x86_64.whl - ----- - -Release History -=============== - -SageMaker Distributed Model Parallel 1.14.0 Release Notes ---------------------------------------------------------- - -*Date: Jan. 30. 
2023* - -**Currency Updates** - -* Added support for PyTorch v1.13.1 - -**Improvements** - -* Upgraded the flash-attention (https://github.com/HazyResearch/flash-attention) library to v0.2.6.post1 - -**Migration to AWS Deep Learning Containers** - -This version passed benchmark testing and is migrated to the following AWS Deep Learning Containers (DLC): - -- SageMaker training container for PyTorch v1.13.1 - - .. code:: - - 763104351884.dkr.ecr..amazonaws.com/pytorch-training:1.13.1-gpu-py39-cu117-ubuntu20.04-sagemaker - - -Binary file of this version of the library for `custom container -`_ users: - -- For PyTorch 1.13.1 - - .. code:: - - https://sagemaker-distributed-model-parallel.s3.us-west-2.amazonaws.com/pytorch-1.13.1/build-artifacts/2023-01-19-18-35/smdistributed_modelparallel-1.14.0-cp39-cp39-linux_x86_64.whl - - -SageMaker Distributed Model Parallel 1.13.0 Release Notes ---------------------------------------------------------- - -*Date: Dec. 15. 2022* - -**New Features** - -* Sharded data parallelism now supports a new backend for collectives called *SMDDP Collectives*. - For supported scenarios, SMDDP Collectives are on by default for the AllGather operation. - For more information, see - `Sharded data parallelism with SMDDP Collectives - `_ - in the *Amazon SageMaker Developer Guide*. -* Introduced FlashAttention for DistributedTransformer to improve memory usage and computational - performance of models such as GPT2, GPTNeo, GPTJ, GPTNeoX, BERT, and RoBERTa. - -**Bug Fixes** - -* Fixed initialization of ``lm_head`` in DistributedTransformer to use a provided range - for initialization, when weights are not tied with the embeddings. - -**Improvements** - -* When a module has no parameters, we have introduced an optimization to execute - such a module on the same rank as its parent during pipeline parallelism. - -**Migration to AWS Deep Learning Containers** - -This version passed benchmark testing and is migrated to the following AWS Deep Learning Containers (DLC): - -- SageMaker training container for PyTorch v1.12.1 - - .. code:: - - 763104351884.dkr.ecr..amazonaws.com/pytorch-training:1.12.1-gpu-py38-cu113-ubuntu20.04-sagemaker - - -Binary file of this version of the library for `custom container -`_ users: - -- For PyTorch 1.12.1 - - .. code:: - - https://sagemaker-distributed-model-parallel.s3.us-west-2.amazonaws.com/pytorch-1.12.1/build-artifacts/2022-12-08-21-34/smdistributed_modelparallel-1.13.0-cp38-cp38-linux_x86_64.whl - - -SageMaker Distributed Model Parallel 1.11.0 Release Notes ---------------------------------------------------------- - -*Date: August. 17. 2022* - -**New Features** - -The following new features are added for PyTorch. - -* The library implements sharded data parallelism, which is a memory-saving - distributed training technique that splits the training state of a model - (model parameters, gradients, and optimizer states) across data parallel groups. - With sharded data parallelism, you can reduce the per-GPU memory footprint of - a model by sharding the training state over multiple GPUs. To learn more, - see `Sharded Data Parallelism - `_ - in the *Amazon SageMaker Developer Guide*. - -**Migration to AWS Deep Learning Containers** - -This version passed benchmark testing and is migrated to the following AWS Deep Learning Containers (DLC): - -- DLC for PyTorch 1.12.0 - - .. 
code:: - - 763104351884.dkr.ecr..amazonaws.com/pytorch-training:1.12.0-gpu-py38-cu113-ubuntu20.04-sagemaker - -Binary file of this version of the library for `custom container -`_ users: - -- For PyTorch 1.12.0 - - .. code:: - - https://sagemaker-distributed-model-parallel.s3.us-west-2.amazonaws.com/pytorch-1.12.0/build-artifacts/2022-08-12-16-58/smdistributed_modelparallel-1.11.0-cp38-cp38-linux_x86_64.whl - -SageMaker Distributed Model Parallel 1.10.1 Release Notes ---------------------------------------------------------- - -*Date: August. 8. 2022* - -**Currency Updates** - -* Added support for Transformers v4.21. - - -**Migration to AWS Deep Learning Containers** - -This version passed benchmark testing and is migrated to the following AWS Deep Learning Containers (DLC): - -- DLC for PyTorch 1.11.0 - - .. code:: - - 763104351884.dkr.ecr..amazonaws.com/pytorch-training:1.11.0-gpu-py38-cu113-ubuntu20.04-sagemaker - - -Binary file of this version of the library for `custom container -`_ users: - -- For PyTorch 1.11.0 - - .. code:: - - https://sagemaker-distributed-model-parallel.s3.us-west-2.amazonaws.com/pytorch-1.11.0/build-artifacts/2022-07-28-23-07/smdistributed_modelparallel-1.10.1-cp38-cp38-linux_x86_64.whl - - - -SageMaker Distributed Model Parallel 1.10.0 Release Notes ---------------------------------------------------------- - -*Date: July. 19. 2022* - -**New Features** - -The following new features are added for PyTorch. - -* Added support for FP16 training by implementing smdistributed.modelparallel - modification of Apex FP16_Module and FP16_Optimizer. To learn more, see - `FP16 Training with Model Parallelism - `_. -* New checkpoint APIs for CPU memory usage optimization. To learn more, see - `Checkpointing Distributed Models and Optimizer States - `_. - -**Improvements** - -* The SageMaker distributed model parallel library manages and optimizes CPU - memory by garbage-collecting non-local parameters in general and during checkpointing. -* Changes in the `GPT-2 translate functions - `_ - (``smdistributed.modelparallel.torch.nn.huggingface.gpt2``) - to save memory by not maintaining two copies of weights at the same time. - -**Migration to AWS Deep Learning Containers** - -This version passed benchmark testing and is migrated to the following AWS Deep Learning Containers (DLC): - -- DLC for PyTorch 1.11.0 - - .. code:: - - 763104351884.dkr.ecr..amazonaws.com/pytorch-training:1.11.0-gpu-py38-cu113-ubuntu20.04-sagemaker - -- DLC for PyTorch 1.12.0 - - .. code:: - - 763104351884.dkr.ecr..amazonaws.com/pytorch-training:1.12.0-gpu-py38-cu113-ubuntu20.04-sagemaker - -Binary file of this version of the library for `custom container -`_ users: - -- For PyTorch 1.11.0 - - .. code:: - - https://sagemaker-distributed-model-parallel.s3.us-west-2.amazonaws.com/pytorch-1.11.0/build-artifacts/2022-07-11-19-23/smdistributed_modelparallel-1.10.0-cp38-cp38-linux_x86_64.whl - -- For PyTorch 1.12.0 - - .. code:: - - https://sagemaker-distributed-model-parallel.s3.us-west-2.amazonaws.com/pytorch-1.12.0/build-artifacts/2022-07-11-19-23/smdistributed_modelparallel-1.10.0-cp38-cp38-linux_x86_64.whl - - -SageMaker Distributed Model Parallel 1.9.0 Release Notes --------------------------------------------------------- - -*Date: May. 3. 2022* - -**Currency Updates** - -* Added support for PyTorch 1.11.0 - -**Migration to AWS Deep Learning Containers** - -This version passed benchmark testing and is migrated to the following AWS Deep Learning Containers (DLC): - -- PyTorch 1.11.0 DLC - - .. 
code:: - - 763104351884.dkr.ecr..amazonaws.com/pytorch-training:1.11.0-gpu-py38-cu113-ubuntu20.04-sagemaker - -Binary file of this version of the library for custom container users: - - .. code:: - - https://sagemaker-distributed-model-parallel.s3.us-west-2.amazonaws.com/pytorch-1.11.0/build-artifacts/2022-04-20-17-05/smdistributed_modelparallel-1.9.0-cp38-cp38-linux_x86_64.whl - - - -SageMaker Distributed Model Parallel 1.8.1 Release Notes --------------------------------------------------------- - -*Date: April. 23. 2022* - -**New Features** - -* Added support for more configurations of the Hugging Face Transformers GPT-2 and GPT-J models - with tensor parallelism: ``scale_attn_weights``, ``scale_attn_by_inverse_layer_idx``, - ``reorder_and_upcast_attn``. To learn more about these features, please refer to - the following model configuration classes - in the *Hugging Face Transformers documentation*: - - * `transformers.GPT2Config `_ - * `transformers.GPTJConfig `_ - -* Added support for activation checkpointing of modules which pass keyword value arguments - and arbitrary structures in their forward methods. This helps support - activation checkpointing with Hugging Face Transformers models even - when tensor parallelism is not enabled. - -**Bug Fixes** - -* Fixed a correctness issue with tensor parallelism for GPT-J model - which was due to improper scaling during gradient reduction - for some layer normalization modules. -* Fixed the creation of unnecessary additional processes which take up some - GPU memory on GPU 0 when the :class:`smp.allgather` collective is called. - -**Improvements** - -* Improved activation offloading so that activations are preloaded on a - per-layer basis as opposed to all activations for a micro batch earlier. - This not only improves memory efficiency and performance, but also makes - activation offloading a useful feature for non-pipeline parallelism cases. - -**Migration to AWS Deep Learning Containers** - -This version passed benchmark testing and is migrated to the following AWS Deep Learning Containers: - -* HuggingFace 4.17.0 DLC with PyTorch 1.10.2 - - .. code:: - - 763104351884.dkr.ecr.us-east-1.amazonaws.com/huggingface-pytorch-training:1.10.2-transformers4.17.0-gpu-py38-cu113-ubuntu20.04 - - -* The binary file of this version of the library for custom container users - - .. code:: - - https://sagemaker-distributed-model-parallel.s3.us-west-2.amazonaws.com/pytorch-1.10.0/build-artifacts/2022-04-14-03-58/smdistributed_modelparallel-1.8.1-cp38-cp38-linux_x86_64.whl - - -SageMaker Distributed Model Parallel 1.8.0 Release Notes --------------------------------------------------------- - -*Date: March. 23. 2022* - -**New Features** - -* Added tensor parallelism support for the `GPT-J model - `_. - When using the GPT-J model of Hugging Face Transformers v4.17.0 with - tensor parallelism, the SageMaker model parallel library automatically - replaces the model with a tensor parallel distributed GPT-J model. - For more information, see `Support for Hugging Face Transformer Models - `_ - in the *Amazon SageMaker Model Parallel Training developer guide*. - -**Migration to AWS Deep Learning Containers** - -This version passed benchmark testing and is migrated to the following AWS Deep Learning Containers: - -* HuggingFace 4.17.0 DLC with PyTorch 1.10.2 - - .. 
code:: - - 763104351884.dkr.ecr.us-west-2.amazonaws.com/huggingface-pytorch-training:1.10.2-transformers4.17.0-gpu-py38-cu113-ubuntu20.04 - - -The binary file of this version of the library for custom container users: - - .. code:: - - https://sagemaker-distributed-model-parallel.s3.us-west-2.amazonaws.com/pytorch-1.10.0/build-artifacts/2022-03-12-00-33/smdistributed_modelparallel-1.8.0-cp38-cp38-linux_x86_64.whl - - -SageMaker Distributed Model Parallel 1.7.0 Release Notes --------------------------------------------------------- - -*Date: March. 07. 2022* - -**Currency Updates** - -* Support for PyTorch 1.10.2 -* Support for Hugging Face Transformers 4.16.2 - -**Improvements** - -* Additional support for the :ref:`smdmp-pytorch-tensor-parallel`. - - * Added support for FP32 residual addition to avoid overflow (NaN loss values) - for large models with more than 100 billion parameters when using FP16. - This is integrated to the following module: - - * :class:`smp.nn.DistributedTransformerOutputLayer` - - - * Added support for the following two `NVIDIA Megatron fused kernels - `_: - - * Fusion of attention masking and softmax (``fused_softmax``) - * Fusion of bias addition and Gelu activation (``fused_bias_gelu``) - - To learn more about these options and how to use them, - see the :class:`smp.tensor_parallelism` context manager. - - - -**Migration to AWS Deep Learning Containers** - -This version passed benchmark testing and is migrated to the following AWS Deep Learning Containers: - - -* PyTorch 1.10.2 - - .. code:: - - 763104351884.dkr.ecr..amazonaws.com/pytorch-training:1.10.2-gpu-py38-cu113-ubuntu20.04-sagemaker - - -SageMaker Distributed Model Parallel 1.6.0 Release Notes --------------------------------------------------------- - -*Date: December. 20. 2021* - -**New Features** - -- **PyTorch** - - - Added extended memory-saving features for PyTorch 1.8.1: - - - `Tensor parallelism `_ - - `Optimizer state sharding `_ - - `Activation checkpointing `_ - - `Activation offloading `_ - - For more information, see the following documentation: - - - `SageMaker distributed model parallel developer guide `_ - - `SageMaker distributed model parallel API documentation for v1.6.0 `_ - -**Migration to AWS Deep Learning Containers** - -This version passed benchmark testing and is migrated to the following -AWS Deep Learning Container(s): - -- Deep Learning Container for PyTorch 1.8.1: - - .. code:: - - 763104351884.dkr.ecr..amazonaws.com/pytorch-training:1.8.1-gpu-py36-cu111-ubuntu18.04 - - - -SageMaker Distributed Model Parallel 1.5.0 Release Notes --------------------------------------------------------- - -*Date: November. 03. 2021* - -**New Features** - -- **PyTorch** - - - Currency update for PyTorch 1.10.0 - -**Migration to AWS Deep Learning Containers** - -This version passed benchmark testing and is migrated to the following -AWS Deep Learning Containers: - -- Deep Learning Container for PyTorch 1.10.0: - - .. code:: - - 763104351884.dkr.ecr..amazonaws.com/pytorch-training:1.10.0-gpu-py38-cu113-ubuntu20.04-sagemaker - ----- - -SageMaker Distributed Model Parallel 1.4.0 Release Notes --------------------------------------------------------- - -*Date: June. 29. 2021* - -**New Features** - -- **TensorFlow** - - - Added support for TensorFlow v2.5.0. - - Added support for ``keras.model.fit()``. 
- -**Migration to AWS Deep Learning Containers** - -This version passed benchmark testing and is migrated to the following -AWS Deep Learning Containers: - -- Deep Learning Container for TensorFlow 2.5.0: - - .. code:: - - 763104351884.dkr.ecr..amazonaws.com/tensorflow-training:2.5.0-gpu-py37-cu112-ubuntu18.04-v1.0 - -- Deep Learning Container for PyTorch 1.9.1: - - .. code:: - - 763104351884.dkr.ecr..amazonaws.com/pytorch-training:1.9.1-gpu-py38-cu111-ubuntu20.04 - ----- - -SageMaker Distributed Model Parallel 1.3.1 Release Notes --------------------------------------------------------- - -- New Features -- Bug Fixes -- Known Issues - -**New Features** - -- **TensorFlow** - - - Exposes a new decorator ``register_post_partition_hook``. This allows - invoking the decorated methods just after model partition but before - executing the first step. For example loading a checkpoint. Refer to - the `SageMaker distributed model parallel API - documentation `__ - for more information. - -**Bug Fixes** - -- **PyTorch** - - - Improved memory efficiency when using active microbatches by clearing - activations at end of each microbatch. - -- **TensorFlow** - - - Fixed issue that caused hangs when training some models with XLA - enabled. - -**Known Issues** - -- **PyTorch** - - - A crash was observed when ``optimizer.step()`` was called for certain - optimizers such as AdaDelta, when the partition on which this method - was called has no local parameters assigned to it after partitioning. - This is due to a bug in PyTorch which `has since been - fixed `__. Till that - makes its way to the next release of PyTorch, only call - ``optimizer.step()`` on processes which have at least one local - parameter. This can be checked like this - ``len(list(model.local_parameters())) > 0``. - - - A performance regression still exists when training on SMP with - PyTorch 1.7.1 compared to 1.6. The rootcause was found to be the - slowdown in performance of ``.grad`` method calls in PyTorch 1.7.1 - compared to 1.6. See the related discussion: - https://github.com/pytorch/pytorch/issues/50636. This issue does not - exist with PyTorch 1.8. - ----- - -SageMaker Distributed Model Parallel 1.3.0 Release Notes --------------------------------------------------------- - -- New Features -- Bug Fixes -- Known Issues - -.. _new-features-1: - -**New Features** - -.. _pytorch-2: - -- **PyTorch** - - Add support for PyTorch 1.8 - - - Adds a new method to DistributedModel ``register_comm_hook`` (for - PyTorch 1.8 and newer only). This method behaves the same as the - corresponding method with the same name in - ``torch.DistributedDataParallel`` API. Refer to the `SageMaker - distributed model parallel API - documentation `__ - for more information. - -**Improvements** - -- Adds a configuration ``active_microbatches`` to the SageMaker SDK API - for launching jobs, to control the number of active microbatches - during training. This helps limit memory usage in cases where the - number of microbatches is high. Refer to the `SageMaker Python SDK - parameters API - documentation `__ - for more information. - -- Adds a configuration ``deterministic_server`` to the SageMaker SDK - API for launching jobs, which ensures that the execution server for - pipeline parallelism processes requests in a deterministic order - across data parallel ranks. Refer to the `SageMaker Python SDK - parameters API - documentation `__ - for more information. 
- -- Parameter passing is now supported in ``module.forward`` methods for - DistributedModel and its submodules. This removes the restriction of - having to pass ``nn.Parameter`` to the ``__init__`` call and making - it a member of the module to use it. ## Bug Fixes - -.. _pytorch-3: - -- **PyTorch** - - - Fixed a case where training hangs due to a module having computation - which requires grads that is not used by the final output of the - module. Now such a situtation raises an error with suggestions on - making such computation compatible. - - - Fixed an issue with buffers which caused the buffers to not be on the - correct device after a model is partitioned, and not be synchronized - across steps (when ``broadcast_buffers`` is True). This could have - caused correctness issues in models with buffers. - -.. _known-issues-1: - -**Known Issues** - -.. _pytorch-4: - -- **PyTorch** - - - ``mp_barrier`` and ``get_mp_process_group`` are wrongly marked as - deprecated methods. Ignore the deprecation warning. - - - A crash was observed when ``optimizer.step()`` was called for certain - optimizers such as AdaDelta, when the partition on which this method - was called has no local parameters assigned to it after partitioning. - This is due to a bug in PyTorch which `has since been - fixed `__. Till that - makes its way to the next release of PyTorch, only call - ``optimizer.step()`` on processes which have at least one local - parameter. This can be checked like this - ``len(list(model.local_parameters())) > 0``. - - - A performance regression still exists when training on SMP with - PyTorch 1.7.1 compared to 1.6. The rootcause was found to be the - slowdown in performance of ``.grad`` method calls in PyTorch 1.7.1 - compared to 1.6. See the related discussion: - https://github.com/pytorch/pytorch/issues/50636. This issue does not - exist with PyTorch 1.8. - ----- - -SageMaker Distributed Model Parallel 1.2.0 Release Notes --------------------------------------------------------- - -- New Features -- Bug Fixes -- Known Issues - -.. _new-features-2: - -**New Features** - -.. _pytorch-5: - -- **PyTorch** - - Add support for PyTorch 1.7.1 - - - Adds support for ``gradient_as_bucket_view`` (PyTorch 1.7.1 only), - ``find_unused_parameters`` (PyTorch 1.7.1 only) and - ``broadcast_buffers`` options to ``smp.DistributedModel``. These - options behave the same as the corresponding options (with the same - names) in ``torch.DistributedDataParallel`` API. Refer to the - `SageMaker distributed model parallel API - documentation `__ - for more information. - - - Adds support for ``join`` (PyTorch 1.7.1 only) context manager, which - is to be used in conjunction with an instance of - ``smp.DistributedModel`` to be able to train with uneven inputs - across participating processes. - - - Adds support for ``_register_comm_hook`` (PyTorch 1.7.1 only) which - will register the callable as a communication hook for DDP. NOTE: - Like in DDP, this is an experimental API and subject to change. - -.. _tensorflow-2: - -- **Tensorflow** - - - Adds support for Tensorflow 2.4.1 - -.. _bug-fixes-1: - -**Bug Fixes** - -.. _pytorch-6: - -- **PyTorch** - - - ``Serialization``: Fix a bug with serialization/flattening where - instances of subclasses of dict/OrderedDicts were - serialized/deserialized or internally flattened/unflattened as - regular dicts. - -.. _tensorflow-3: - -- **Tensorflow** - - - Fix a bug that may cause a hang during evaluation when there is no - model input for one partition. - -.. 
_known-issues-2: - -**Known Issues** - -.. _pytorch-7: - -- **PyTorch** - - - A performance regression was observed when training on SMP with - PyTorch 1.7.1 compared to 1.6.0. The rootcause was found to be the - slowdown in performance of ``.grad`` method calls in PyTorch 1.7.1 - compared to 1.6.0. See the related discussion: - https://github.com/pytorch/pytorch/issues/50636. - ----- - -SageMaker Distributed Model Parallel 1.1.0 Release Notes --------------------------------------------------------- - -- New Features -- Bug Fixes -- Improvements -- Performance -- Known Issues - -.. _new-features-3: - -**New Features** - -The following sections describe new feature releases that are common -across frameworks and that are framework specific. - -**Common across frameworks*** - -- Custom slicing support (``smp_slice`` method) for objects passed to ``smp.step`` decorated functions - - To pass an object to ``smp.step`` that contains tensors that needs to be - split across microbatches and is not an instance of list, dict, tuple or - set, you should implement ``smp_slice`` method for the object. - - Below is an example of how to use this with PyTorch: - - .. code-block:: python - - class CustomType: - def __init__(self, tensor): - self.data = tensor - - # SMP will call this to invoke slicing on the object passing in total microbatches (num_mb) - # and the current microbatch index (mb). - def smp_slice(self, num_mb, mb, axis): - dim_size = list(self.data.size())[axis] - - split_size = dim_size // num_mb - sliced_tensor = self.data.narrow(axis, mb * split_size, split_size) - return CustomType(sliced_tensor, self.other) - - custom_obj = CustomType(torch.ones(4,)) - - @smp.step() - def step(custom_obj): - loss = model(custom_obj) - model.backward(loss) - return loss - -.. _pytorch-8: - -- **PyTorch** - - - Add support for smp.DistributedModel.cpu() - - ``smp.DistributedModel.cpu()`` - `allgather `__\ s - parameters and buffers across all ``mp_ranks`` and moves them to the - CPU. - - - Add ``trace_memory_usage`` option to ``smp.DistributedModel`` to measure memory usage per module - - Adds ``trace_memory_usage`` option to ``smp.DistributedModel``. This - attempts to measure memory usage per module during tracing. If this is - disabled, memory usage is estimated through the sizes of tensors - returned from the module. This option is disabled by default. - -.. _bug-fixes-2: - -**Bug Fixes** - -.. _pytorch-9: - -- **PyTorch** - - - ``torch.nn.Sequential``: Fix a bug with ``torch.nn.Sequential`` which - causes a failure with the error message : - ``shouldnt go less than 0, there is a bug`` when the inputs to the - first module don’t require grads. - - - ``smp.DistributedModel``: Fix a bug with ``DistributedModel`` - execution when a module has multiple parents. The bug surfaces with - the error message: - ``actual_parent should be different than module_execution_stack parent only for torch.nn.ModuleList`` - - - ``apex.optimizers.FusedNovoGrad``: Fix a bug with - ``apex.optimizers.FusedNovoGrad`` which surfaces with the error - message: ``KeyError: 'exp_avg_sq'`` - -**Improvements** - -*Usability* - -.. _pytorch-10: - -- **PyTorch** - - - ``smp.DistributedModel``: Improve the error message when the forward - pass on ``smp.DistributedModel`` is called outside the ``smp.step`` - decorated function. - - - ``smp.load``: Add user friendly error messages when loading - checkpoints with ``smp.load``. - -*Partitioning Algorithm* - -.. 
_pytorch-11: - -- **PyTorch** - - - Better memory balancing by taking into account the existing modules - already assigned to the parent, while partitioning the children of a - given module. - -**Performance** - -.. _tensorflow-4: - -- **Tensorflow** - - - Addresses long pre-processing times introduced by SMP XLA optimizer - when dealing with large graphs and large number of microbatches. BERT - (large) preprocessing time goes down from 40 minutes to 6 minutes on - p3.16xlarge. - -.. _known-issues-3: - -**Known Issues** - -.. _pytorch-12: - -- **PyTorch** - - - Serialization for Torch in SMP overwrites instances of dict subclass - to be dict itself, instead of the instances of subclass. One of the - use cases which fails because of this issue is when a user implements - a subclass of OrderedDict with the ``__getitem__`` method. After - serialization/deserialization in SMP, indexing on the object will - lead to errors. A workaround is to use the dict keys to access the - underlying item. diff --git a/doc/api/training/smp_versions/archives.rst b/doc/api/training/smp_versions/archives.rst deleted file mode 100644 index a7426e8aec..0000000000 --- a/doc/api/training/smp_versions/archives.rst +++ /dev/null @@ -1,13 +0,0 @@ -.. _smdmp-pt-version-archive: - -.. toctree:: - :maxdepth: 1 - - v1_10_0.rst - v1_9_0.rst - v1_6_0.rst - v1_5_0.rst - v1_4_0.rst - v1_3_0.rst - v1_2_0.rst - v1_1_0.rst diff --git a/doc/api/training/smp_versions/latest.rst b/doc/api/training/smp_versions/latest.rst deleted file mode 100644 index cec4468c54..0000000000 --- a/doc/api/training/smp_versions/latest.rst +++ /dev/null @@ -1,35 +0,0 @@ -############################################### -Use the Library's API to Adapt Training Scripts -############################################### - -The library provides Common APIs that you can use across frameworks, -as well as framework-specific APIs for TensorFlow and PyTorch. - -Select the latest or one of the previous versions of the API documentation -depending on which version of the library you need to use. -To use the library, reference the -**Common API** documentation alongside the framework specific API documentation. - -Version 1.11.0, 1.13.0, 1.14.0, 1.15.0 (Latest) -=============================================== - -To use the library, reference the Common API documentation alongside the framework specific API documentation. - -.. toctree:: - :maxdepth: 1 - - latest/smd_model_parallel_common_api - latest/smd_model_parallel_pytorch - latest/smd_model_parallel_pytorch_tensor_parallel - latest/smd_model_parallel_tensorflow - -To find archived API documentation for the previous versions of the library, -see the following link: - -Documentation Archive -===================== - -.. toctree:: - :maxdepth: 1 - - archives diff --git a/doc/api/training/smp_versions/latest/smd_model_parallel_common_api.rst b/doc/api/training/smp_versions/latest/smd_model_parallel_common_api.rst deleted file mode 100644 index d1f6b4d45b..0000000000 --- a/doc/api/training/smp_versions/latest/smd_model_parallel_common_api.rst +++ /dev/null @@ -1,517 +0,0 @@ -Common API -========== - -The following SageMaker distribute model parallel APIs are common across all frameworks. - -.. contents:: Table of Contents - :depth: 3 - :local: - -The Library's Core APIs ------------------------ - -This API document assumes you use the following import statement in your training scripts. - -**TensorFlow** - -.. code:: python - - import smdistributed.modelparallel.tensorflow as smp - -**PyTorch** - -.. 
code:: python - - import smdistributed.modelparallel.torch as smp - - -.. function:: smp.init( ) - - Initialize the library. Must be called at the beginning of training script. - -.. function:: @smp.step(non_split_inputs, input_split_axes, [*args, **kwargs]) - - A decorator that must be placed over a function that represents a single - forward and backward pass (for training use cases), or a single forward - pass (for evaluation use cases). Any computation that is defined inside - the ``smp.step``-decorated function is executed in a pipelined manner. - - By default, every tensor input to the function is split across its batch - dimension into a number of microbatches specified while launching the - training job. This behavior can be customized through the arguments to - ``smp.step``, described below. The library then orchestrates the execution of - each microbatch across all partitions, based on the chosen pipeline - type. - - In a typical use case, forward pass and back-propagation are executed - inside an \ ``smp.step``-decorated function and gradients, loss, and - other relevant metrics (such as accuracy, etc.) are returned from - ``smp.step``-decorated function. - - Any gradient post-processing operation, such as gradient clipping and - allreduce, as well as ``optimizer.apply_gradients`` calls (for TF) or - ``optimizer.step`` (for PT) should be applied on the gradients returned - from the ``smp.step`` function, and not inside the ``smp.step`` - function. This is because every operation inside ``smp.step`` is - executed once per microbatch, so having these operations inside - ``smp.step`` can either be inefficient (in the case of allreduce), or - lead to wrong results (in the case of ``apply_gradients`` / - ``optimizer.step``). - - If the objects returned from the ``smp.step``-decorated function contain - ``tf.Tensor``\ s / ``torch.Tensor``\ s, they are converted to - ``StepOutput`` objects. A ``StepOutput`` object encapsulates all - versions of the tensor across different microbatches - (see ``StepOutput`` entry for more information). - - The argument to ``smp.step`` decorated function should either be a tensor - or an instance of list, tuple, dict or set for it to be split across - microbatches. If your object doesn't fall into this category, you can make - the library split your object, by implementing ``smp_slice`` method. - - Below is an example of how to use it with PyTorch. - - .. code:: python - - class CustomType: - def __init__(self, tensor): - self.data = tensor - - # The library will call this to invoke slicing on the object passing in total microbatches (num_mb) - # and the current microbatch index (mb). - def smp_slice(self, num_mb, mb, axis): - dim_size = list(self.data.size())[axis] - - split_size = dim_size // num_mb - sliced_tensor = self.data.narrow(axis, mb * split_size, split_size) - return CustomType(sliced_tensor, self.other) - - custom_obj = CustomType(torch.ones(4,)) - - @smp.step() - def step(custom_obj): - loss = model(custom_obj) - model.backward(loss) - return loss - - - **Important:** ``smp.step`` splits the batch into microbatches, and - executes everything inside the decorated function once per microbatch. - This might affect the behavior of batch normalization, any operation - that explicitly uses the batch size information, or any other Python - code that is expected to run once. 
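
   For orientation, here is a minimal PyTorch-style sketch that puts these
   rules together. It assumes ``model`` is already wrapped in
   ``smp.DistributedModel``, ``optimizer`` is an already-constructed optimizer,
   ``data`` and ``target`` are input tensors, and the loss function is an
   arbitrary illustration.

   .. code:: python

      import smdistributed.modelparallel.torch as smp
      import torch.nn.functional as F

      @smp.step()
      def train_step(data, target):
          output = model(data)            # forward pass; runs once per microbatch
          loss = F.nll_loss(output, target)
          model.backward(loss)            # replaces loss.backward()
          return loss

      # Outside the smp.step-decorated function:
      loss_mb = train_step(data, target)  # StepOutput holding one loss per microbatch
      loss = loss_mb.reduce_mean()        # average the per-microbatch losses
      optimizer.step()                    # gradient application stays outside smp.step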
- - **TensorFlow-specific behavior** - - ``smp.step`` is a wrapper that - inherits from and extends the behavior of ``tf.function``, and as such, - all the caveats that apply to the use of ``tf.function``\ s also apply - to ``smp.step``. In particular, any operation that is inside - ``smp.step`` executes in graph mode, and not eager mode. - - In the first call, ``smp.step`` performs tracing of the wrapped function every time - one of the tensor arguments changes their shape or dtype, or for every - new value of a Python argument, if there is one. Tracing is expensive, - so such scenarios should be avoided as much as possible or, - alternatively, an ``input_signature`` argument must be provided. For - more information on the usage of ``tf.function``, refer to the - TensorFlow documentation: - - - https://www.tensorflow.org/api_docs/python/tf/function\ - - https://www.tensorflow.org/guide/function\ - - Each ``smp.step`` decorated function must have a return value that depends on the - output of ``smp.DistributedModel``. - - **Common parameters** - - - ``non_split_inputs`` (``list``): The list of arguments to the decorated function - that should not be split along the batch dimension. Should be used - for all input tensors that do not have a batch dimension. Should be a - list of argument names as ``str``, as they appear in the signature of - the ``smp.step``-decorated function. By default it is considered an - empty list. - - - ``input_split_axes`` (``dict``): A dict that maps the argument name to its batch - axis. The keys should be the argument names as ``str``, as they - appear in the signature of the ``smp.step``-decorated function.  By - default all batch axes are assumed to be the 0-axis. - - **TensorFlow-only parameters** - - - All arguments of ``tf.function``. Note: - The \ ``experimental_compile`` argument of ``tf.function`` may not - work as expected with ``smp.step``, since it interferes with - pipelining and model partitioning. To enable XLA with the library, you can - instead use \ ``tf.config.optimizer.set_jit(True)``. - - **PyTorch-only parameters** - - - ``detach_outputs`` (``bool``) : If ``True``, calls ``torch.Tensor.detach()`` on - all returned ``torch.Tensor`` outputs. Setting it to ``False`` - increases memory consumption, unless ``detach()`` is manually called - on the returned tensors, because the model graph is not cleared from - memory after the training step. Set to \ ``True`` by default. - - **Returns** - - - The same object(s) returned from the decorated function. All - returned \ ``tf.Tensor``, \ ``tf.Variable``  objects (for TF) or - ``torch.Tensor`` objects (for PT) are wrapped inside - a \ ``StepOutput`` object, even when they are inside a Python - ``list``, ``tuple``, or ``dict``. - - - -.. class:: StepOutput - - - A class that encapsulates all versions of a ``tf.Tensor`` - or \ ``torch.Tensor`` across all microbatches. - - When a particular ``tf.Tensor`` or ``torch.Tensor`` is computed inside - ``smp.step``, different versions of the tensor are computed for each - microbatch. - - When this tensor is returned from ``smp.step`` and is accessed outside - of the decorated function, it appears as a ``StepOutput`` object, which - contains all such versions. For example, - - - In the case of Tensorflow, the gradient for a particular - ``tf.Variable`` is computed on each microbatch individually, and if - this gradient is returned from ``smp.step``, all gradients for this - ``tf.Variable`` become part of the same ``StepOutput`` object. 
The - ``StepOutput`` class offers the following API for commonly-used - post-processing operations on such tensors. - - In the case of PyTorch, the loss for each microbatch is computed - individually and all the ``torch.Tensor``\ s that represent the loss - for different microbatches become part of same ``StepOutput`` object, - if loss is returned from the ``smp.step`` function. - - - The ``StepOutput`` class offers the following API for commonly-used - post-processing operations on tensors. - - .. data:: StepOutput.outputs - - Returns a list of the underlying tensors, indexed by microbatch. - - .. function:: StepOutput.reduce_mean( ) - - Returns a ``tf.Tensor``, ``torch.Tensor`` that averages the constituent ``tf.Tensor`` s - ``torch.Tensor`` s. This is commonly used for averaging loss and gradients across microbatches. - - .. function:: StepOutput.reduce_sum( ) - - Returns a ``tf.Tensor`` / - ``torch.Tensor`` that sums the constituent - ``tf.Tensor``\ s/\ ``torch.Tensor``\ s. - - .. function:: StepOutput.concat( ) - - Returns a - ``tf.Tensor``/``torch.Tensor`` that concatenates tensors along the - batch dimension using ``tf.concat`` / ``torch.cat``. - - .. function:: StepOutput.stack( ) - - Applies ``tf.stack`` / ``torch.stack`` - operation to the list of constituent ``tf.Tensor``\ s / - ``torch.Tensor``\ s. - - **TensorFlow-only methods** - - .. function:: StepOutput.merge( ) - - Returns a ``tf.Tensor`` that - concatenates the constituent ``tf.Tensor``\ s along the batch - dimension. This is commonly used for merging the model predictions - across microbatches. - - .. function:: StepOutput.accumulate(method="variable", var=None) - - Functionally the same as ``StepOutput.reduce_mean()``. However, it is - more memory-efficient, especially for large numbers of microbatches, - since it does not wait for all constituent \ ``tf.Tensor``\ s to be - ready to start averaging them, thereby saving memory. - - In some cases (XLA for example) ``StepOutput.reduce_mean()`` might end - up being more memory-efficient than ``StepOutput.accumulate()``. - - **Parameters** - - - ``method`` (``"add_n"`` or ``"accumulate_n"`` or ``"variable"``): - If ``"add_n"`` or ``"accumulate_n"``, the library uses - ``tf.add_n`` and ``tf.accumulate_n``, respectively, to implement - accumulation. If ``"variable"``, the library uses an internal ``tf.Variable`` - into which to accumulate the tensors. Default is \ ``"variable"``. - Note: Memory usage behavior of these choices can depend on the model - and implementation. - - - ``var``: A ``tf.Variable`` into which, if provided, the library uses to - accumulate the tensors. If \ ``None``, the library internally creates a - variable. If ``method`` is not ``"variable"``, this argument is - ignored. - -.. _mpi_basics: - -MPI Basics ----------- - -The library exposes the following basic MPI primitives to its Python API: - -**Global** - -- ``smp.rank()`` : The global rank of the current process. -- ``smp.size()`` : The total number of processes. -- ``smp.get_world_process_group()`` : - ``torch.distributed.ProcessGroup`` that contains all processes. -- ``smp.CommGroup.WORLD``: The communication group corresponding to all processes. -- ``smp.local_rank()``: The rank among the processes on the current instance. -- ``smp.local_size()``: The total number of processes on the current instance. -- ``smp.get_mp_group()``: The list of ranks over which the current model replica is partitioned. 
-- ``smp.get_dp_group()``: The list of ranks that hold different replicas of the same model partition. - -**Tensor Parallelism** - -- ``smp.tp_rank()`` : The rank of the process within its - tensor-parallelism group. -- ``smp.tp_size()`` : The size of the tensor-parallelism group. -- ``smp.get_tp_process_group()`` : Equivalent to - ``torch.distributed.ProcessGroup`` that contains the processes in the - current tensor-parallelism group. -- ``smp.CommGroup.TP_GROUP`` : The communication group corresponding to - the current tensor parallelism group. - -**Pipeline Parallelism** - -- ``smp.pp_rank()`` : The rank of the process within its - pipeline-parallelism group. -- ``smp.pp_size()`` : The size of the pipeline-parallelism group. -- ``smp.get_pp_process_group()`` : ``torch.distributed.ProcessGroup`` - that contains the processes in the current pipeline-parallelism group. -- ``smp.CommGroup.PP_GROUP`` : The communication group corresponding to - the current pipeline parallelism group. - -**Reduced-Data Parallelism** - -- ``smp.rdp_rank()`` : The rank of the process within its - reduced-data-parallelism group. -- ``smp.rdp_size()`` : The size of the reduced-data-parallelism group. -- ``smp.get_rdp_process_group()`` : ``torch.distributed.ProcessGroup`` - that contains the processes in the current reduced data parallelism - group. -- ``smp.CommGroup.RDP_GROUP`` : The communication group corresponding - to the current reduced data parallelism group. - -**Model Parallelism** - -- ``smp.mp_rank()`` : The rank of the process within its model-parallelism - group. -- ``smp.mp_size()`` : The size of the model-parallelism group. -- ``smp.get_mp_process_group()`` : ``torch.distributed.ProcessGroup`` - that contains the processes in the current model-parallelism group. -- ``smp.CommGroup.MP_GROUP`` : The communication group corresponding to - the current model parallelism group. - -**Data Parallelism** - -- ``smp.dp_rank()`` : The rank of the process within its data-parallelism - group. -- ``smp.dp_size()`` : The size of the data-parallelism group. -- ``smp.get_dp_process_group()`` : ``torch.distributed.ProcessGroup`` - that contains the processes in the current data-parallelism group. -- ``smp.CommGroup.DP_GROUP`` : The communication group corresponding to - the current data-parallelism group. - -.. _communication_api: - -Communication API ------------------ - -The library provides a few communication primitives which can be helpful while -developing the training script. These primitives use the following -``enum`` s as arguments to specify which processes the communication -should involve. -​ - -**Helper structures** - -.. data:: smp.CommGroup - - An ``enum`` that takes the values - ``CommGroup.WORLD``, ``CommGroup.MP_GROUP``, and ``CommGroup.DP_GROUP``. - These values can also be accessed as ``smp.WORLD``, ``smp.MP_GROUP``, - and ``smp.DP_GROUP`` respectively. - - - ``CommGroup.WORLD``: Represents the entire group of processes used in - training - - ``CommGroup.MP_GROUP``: Represents the group of processes that hold - the same model replica as the current process. The processes in a - single ``MP_GROUP`` collectively store an entire replica of the - model. - - ``CommGroup.DP_GROUP``: Represents the group of processes that hold - the same model partition as the current process. The processes in a - single ``DP_GROUP`` perform data parallelism/allreduce among - themselves. - -.. 
data:: smp.RankType - - An ``enum`` that takes the values - ``RankType.WORLD_RANK``, ``RankType.MP_RANK``, and ``RankType.DP_RANK``. - - - ``RankType.WORLD_RANK``: The associated rank is to be interpreted as - the rank of the process across all processes used in training. - - ``RankType.MP_RANK``: The associated rank is to be interpreted as the - rank of the process within the ``MP_GROUP``. - - ``RankType.DP_RANK``: The associated rank is to be interpreted as the - rank of the process within the ``DP_GROUP``. - - -**Communication primitives:** - -.. function:: smp.broadcast(obj, group) - - Sends the object to all processes in the - group. The receiving process must call ``smp.recv_from`` to receive the - sent object. - - **Inputs** - - - ``obj``: An arbitrary picklable Python object that will be broadcast. - - - ``group``: A ``CommGroup`` argument that represents to which group of - processes the object will be sent. - - **Notes** - - - When you use ``broadcast`` on the sender process, there needs - to be an accompanying ``smp.recv_from()`` call on the receiver - processes. - - - This is a synchronous call; the ``broadcast`` statement - returns only after all ranks participating in the call have made a - matching ``recv_from`` call. - - **Example** - - .. code:: python - - if smp.rank() == 0: -     smp.broadcast(something, group=smp.CommGroup.WORLD) - else: -     smp.recv_from(0, rank_type=smp.RankType.WORLD_RANK) - -.. function:: smp.send(obj, dest_rank, rank_type) - - Sends the object ``obj`` to - ``dest_rank``, which is of a type specified by ``rank_type``. - - **Inputs** - - - ``obj``: An arbitrary picklable Python object that will be sent. - - - ``dest_rank`` (``int``): An integer denoting the rank of the receiving process. - - - ``rank_type`` (``enum``): A ``smp.RankType`` ``enum`` that determines how - ``dest_rank`` is to be interpreted. For example if ``dest_rank`` is 1 - and ``rank_type`` is ``MP_RANK``, then ``obj`` is sent to process - with ``mp_rank`` 1 in the ``MP_GROUP`` which contains the current - process. - - **Notes** - - - Note: \ This is a synchronous call; the ``send`` statement returns - only after the destination rank has made a matching - ``recv_from`` call. - -.. function:: smp.recv_from(src_rank, rank_type) - - Receive an object from a peer process. Can be used with a matching - ``smp.send`` or a ``smp.broadcast`` call. - - **Inputs** - - - ``src_rank`` (``int``): An integer denoting rank of the sending process. - - - ``rank_type`` (``enum``): A ``smp.RankType`` ``enum`` that determines how - ``dest_rank`` is to be interpreted. For example if ``src_rank`` is 1 - and ``rank_type`` is ``MP_RANK``, then the object is received from - the process with ``mp_rank`` 1 in the ``MP_GROUP`` which contains the - current process. - - **Returns** - - Returns the python object that is sent by the peer process. - - **Notes** - - - Note: This is a synchronous call; the ``recv_from`` statement returns - only after the source rank has made a matching ``send`` or - ``broadcast`` call, and the object is received. - -.. function:: smp.allgather(obj, group) - - A collective call that gathers all the - submitted objects across all ranks in the specified ``group``. Returns a - list whose ``i``\ th index contains the object submitted by the - ``i``\ th rank in ``group``. - - **Inputs** - - - ``obj``: An arbitrary picklable Python object that will be - allgathered. - - - ``group`` : A ``CommGroup`` argument that represents which group of - processes participate in ``allgather``. 
- - **Notes** - - - Note: This is a synchronous call; the ``allgather`` statement returns - only after all ranks participating in the call have made a matching - ``allgather`` call, and all the objects are received at the current - rank. - - **Examples** - - .. code:: python - - # assuming mp_size() == 2 - - if smp.mp_rank() == 0: -     out = smp.allgather(obj1, smp.CommGroup.MP_GROUP)  # returns [obj1, obj2] - else: -     out = smp.allgather(obj2, smp.CommGroup.MP_GROUP)  # returns [obj1, obj2] - -.. function:: smp.barrier(group=smp.WORLD) - - A statement that hangs until all - processes in the specified group reach the barrier statement, similar to - ``MPI_Barrier()``. - - **Inputs** - - - ``group``: An ``smp.CommGroup`` ``enum`` that specifies the group of - processes participating in the barrier call. Defaults to - ``smp.WORLD``. - - **Examples** - - - Assume there are 8 processes and 2 model partitions, and - therefore 4 \ ``mp_group``\ s, and 2 ``dp_group``\ s. If - the \ ``barrier`` call is passed the value ``smp.MP_GROUP`` for its - group argument, then each process only waits until the other process - of its own ``mp_group`` reaches that point. It does not wait for - processes outside that ``mp_group``. - -.. function:: smp.dp_barrier() - - Same as passing ``smp.DP_GROUP``\ to ``smp.barrier()``. - Waits for the processes in the same \ ``dp_group`` as - the current process to reach the same point in execution. - -.. function:: smp.mp_barrier() - - Same as passing ``smp.MP_GROUP`` to - ``smp.barrier()``. Waits for the processes in the same ``mp_group`` as - the current process to reach the same point in execution. diff --git a/doc/api/training/smp_versions/latest/smd_model_parallel_pytorch.rst b/doc/api/training/smp_versions/latest/smd_model_parallel_pytorch.rst deleted file mode 100644 index 05357e673b..0000000000 --- a/doc/api/training/smp_versions/latest/smd_model_parallel_pytorch.rst +++ /dev/null @@ -1,944 +0,0 @@ -PyTorch API -=========== - -To use the PyTorch-specific APIs for SageMaker distributed model parallism, -import the ``smdistributed.modelparallel.torch`` package at the top of your training script. - -.. code:: python - - import smdistributed.modelparallel.torch as smp - - -.. tip:: - - Refer to - `Modify a PyTorch Training Script - `_ - to learn how to use the following API in your PyTorch training script. - -.. contents:: Topics - :depth: 1 - :local: - -smdistributed.modelparallel.torch.DistributedModel -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -.. class:: smdistributed.modelparallel.torch.DistributedModel - - A sub-class of ``torch.nn.Module`` which specifies the model to be - partitioned. Accepts a ``torch.nn.Module`` object ``module`` which is - the model to be partitioned. The returned ``DistributedModel`` object - internally manages model parallelism and data parallelism. Only one - model in the training script can be wrapped with - ``smdistributed.modelparallel.torch.DistributedModel``. - - **Example:** - - .. code:: python - - import smdistributed.modelparallel.torch as smp - - model = smp.DistributedModel(model) - - **Important**: The ``__call__`` and  ``backward`` method calls on the - ``smdistributed.modelparallel.torch.DistributedModel`` object (in the following example, the object - is \ ``model``) can only be made inside a ``smdistributed.modelparallel.torch.step``-decorated - function. 
- - Since ``DistributedModel``  is a ``torch.nn.Module``, a forward pass can - be performed by calling the \ ``DistributedModel`` object on the input - tensors. - - .. code:: python - - predictions = model(inputs)   # model is a smp.DistributedModel object - - For a backward pass, one needs to call the backward function on - the \ ``DistributedModel`` object, with tensors and gradients as - arguments, replacing the PyTorch operations \ ``torch.Tensor.backward`` - or ``torch.autograd.backward``. - - The API for ``model.backward`` is very similar to - ``torch.autograd.backward``. For example, the following - ``backward`` calls: - - .. code:: python - - torch.autograd.backward(loss) or loss.backward() - - should be replaced with: - - .. code:: python - - model.backward(loss) # loss is a tensor with only one element as its data - - Similarly, for non-scalar tensors, replace the following - ``backward`` call containing incoming gradient arguments: - - .. code:: python - - torch.autograd.backward(outputs, out_grads) - - with the following line: - - .. code:: python - - model.backward(outputs, out_grads) - - In these examples, all ``__call__``  and ``backward`` method calls on - the model objects (``model(inputs)`` and ``model.backward(loss)``) must be made inside - a ``smdistributed.modelparallel.torch.step``-decorated function. - - **Using DDP** - - If DDP is enabled with the SageMaker model parallel library, do not not place a PyTorch - ``DistributedDataParallel`` wrapper around the ``DistributedModel`` because - the ``DistributedModel`` wrapper will also handle data parallelism. - - Unlike the original DDP wrapper, when you use ``DistributedModel``, - model parameters and buffers are not immediately broadcast across - processes when the wrapper is called. Instead, the broadcast is deferred to the first call of the - ``smdistributed.modelparallel.torch.step``-decorated function when the partition is done. - - **Parameters** - - - ``module`` (``torch.nn.Module``): Module to be distributed (data parallelism and model parallelism). - - - ``trace_device`` (``"cpu"`` or ``"gpu"``) (default: ``"gpu"``) - Whether to perform the tracing step on the GPU or CPU. The tracing step gathers - information on the order of execution of modules, the shapes of - intermediate outputs, and execution times, to be used by the - partitioning algorithm. If ``trace_device`` is set to GPU, accurate - module execution times can be gathered during tracing for potentially - improved partitioning decision. However, if the model is too large to - fit in a single GPU, then ``trace_device`` should be set to CPU. - - - ``trace_execution_times`` (``bool``) (default: ``False``): If ``True``, - the library profiles the execution time of each module during tracing, and uses - it in the partitioning decision. This improves the partitioning - decision, but it might make the tracing slower. It may also introduce - some degree of non-determinism in partitioning results, because of the - inherent randomness in module execution times. Must be ``False`` if - ``trace_device`` is ``"cpu"``. - - - ``overlapping_allreduce`` (``bool``) (default: ``True``): This is only - applicable for hybrid data parallelism/model parallelism use cases (when - ``ddp`` is set to ``True`` while launching training). The library uses this flag - to decide whether to do overlapping allreduce whenever a parameter - gradients are ready. This leads to overlapping of communication and - computation and can improve performance. 
If this is set to ``False`` , - allreduce is performed at the end of the step. - - - ``backward_passes_per_step`` (``int``) (default: 1): This is only - applicable for hybrid data parallelism/model parallelism use cases (when - ``ddp`` is set to ``True`` in config). This parameter indicates the - number of backward passes to perform before calling allreduce on DDP. - This allows accumulating updates over multiple mini-batches before - reducing and applying them. - - - ``average_grads_across_microbatches`` (``bool``) (default: ``True``): - Whether or not the computed gradients should be averaged across - microbatches. If ``False``, the computed gradients will be summed across - microbatches, but not divided by the number of microbatches. In typical - use case where the computed loss is averaged over the mini-batch, this - should be left as ``True``. If you use a loss function that only sums - the per-sample loss across the batch (and not divide by the batch size), - then this must be set to ``False`` for correctness. - - - ``bucket_cap_mb`` (default: 25): \ ``DistributedDataParallel`` buckets - parameters into multiple buckets so that gradient reduction of each - bucket can potentially overlap with backward - computation. \ ``bucket_cap_mb``\ controls the bucket size in MegaBytes - (MB). - - - ``trace_memory_usage`` (default: False): When set to True, the library attempts - to measure memory usage per module during tracing. If this is disabled, - memory usage will be estimated through the sizes of tensors returned from - the module. - - - ``broadcast_buffers`` (default: True): Flag to be used with ``ddp=True``. - This parameter is forwarded to the underlying ``DistributedDataParallel`` wrapper. - Please see: `broadcast_buffer `__. - - - ``gradient_as_bucket_view`` (default: False): To be - used with ``ddp=True``. This parameter is forwarded to the underlying - ``DistributedDataParallel`` wrapper. Please see `gradient_as_bucket_view `__. - - **Properties** - - - ``partitioned``: Is ``True`` if the model is partitioned, ``False`` - otherwise. Initialized to ``False`` when ``DistributedModel`` is first - created. It becomes be ``True`` during the first call - to ``smdistributed.modelparallel.torch.step``-decorated function. Once the model is partitioned, the - local parameters or local ``state_dict`` can be fetched using the - following methods. - - **Methods** - - .. function:: backward(tensors, grad_tensors) - - Triggers a distributed backward - pass across model partitions. Example usage provided in the previous - section. The API is very similar - to https://pytorch.org/docs/stable/autograd.html#torch.autograd.backward. - ``retain_grad`` and ``create_graph``  flags are not supported. - - .. function:: local_buffers( ) - - Returns an iterator over buffers for the modules in - the partitioned model that have been assigned to the current process. - - .. function:: local_named_buffers( ) - - Returns an iterator over buffers for the - modules in the partitioned model that have been assigned to the current - process. This yields both the name of the buffer as well as the buffer - itself. - - .. function:: local_parameters( ) - - Returns an iterator over parameters for the - modules in the partitioned model that have been assigned to the current - process. - - .. function:: local_named_parameters( ) - - Returns an iterator over parameters for - the modules in the partitioned model that have been assigned to the - current process. 
This yields both the name of the parameter as well as - the parameter itself. - - .. function:: local_modules( ) - - Returns an iterator over the modules in the - partitioned model that have been assigned to the current process. - - .. function:: local_named_modules( ) - - Returns an iterator over the modules in the - partitioned model that have been assigned to the current process. This - yields both the name of the module as well as the module itself. - - .. function:: local_state_dict( ) - - Returns the ``state_dict`` that contains local - parameters that belong to the current \ ``mp_rank``. This ``state_dict`` - contains a key \ ``_smp_is_partial`` to indicate this is a - partial \ ``state_dict``, which indicates whether the - ``state_dict`` contains elements corresponding to only the current - partition, or to the entire model. - - .. function:: state_dict( ) - - Returns the ``state_dict`` that contains parameters - for the entire model. It first collects the \ ``local_state_dict``  and - gathers and merges the \ ``local_state_dict`` from all ``mp_rank``\ s to - create a full ``state_dict``. Please note that this needs to be called on all ranks with - ``dp_rank()==0`` to ensure the gather happens properly. - If it is only called on all such ranks, it can hang. - - .. function:: load_state_dict( ) - - Same as the ``torch.module.load_state_dict()`` , - except: It first gathers and merges the ``state_dict``\ s across - ``mp_rank``\ s, if they are partial. The actual loading happens after the - model partition so that each rank knows its local parameters. - - .. function:: register_post_partition_hook(hook) - - Registers a callable ``hook`` to - be executed after the model is partitioned. This is useful in situations - where an operation needs to be executed after the model partition during - the first call to ``smdistributed.modelparallel.torch.step``, but before the actual execution of the - first forward pass. Returns a ``RemovableHandle`` object ``handle``, - which can be used to remove the hook by calling ``handle.remove()``. - - .. function:: cpu( ) - - Allgathers parameters and buffers across all ``mp_rank``\ s and moves them - to the CPU. - - .. function:: join( ) - - A context manager to be used in conjunction with an instance of - ``smdistributed.modelparallel.torch.DistributedModel`` to be able to train with uneven inputs across - participating processes. This is only supported when ``ddp=True``. This will use the join with the wrapped - ``DistributedDataParallel`` instance. For more information, see: - `join `__ - in the PyTorch documentation. - - .. function:: register_comm_hook( state, callable ) - - **Available for PyTorch 1.8.1 only** - Registers a communication hook which is an enhancement that provides - a flexible hook ``callable`` to users where they can specify how - gradients are aggregated across multiple workers. This method will be called on the wrapped ``DistributedDataParallel`` instance. - - Please note that when you register a comm hook you have full control of how the gradients are processed. - When using only data parallelism with Torch DDP you are expected to average grads across data parallel replicas within the hook. - Similarly, when using DistributedModel you have to averaging grads across data parallel replicas within the hook. - In addition to that, you also have to average grads across microbatches within the hook unless you explicitly desire to not average based on your loss function. 
- See ``average_grads_across_microbatches`` for more information about averaging grads across microbatches. - - This is only supported when ``ddp=True`` and ``overlapping_allreduce=True`` (default). - For more information, see: - `register_comm_hook `__ - in the PyTorch documentation. - - **Behavior of** ``smdistributed.modelparallel.torch.DistributedModel`` **with Tensor Parallelism** - - When a model is wrapped by ``smdistributed.modelparallel.torch.DistributedModel``, the library - immediately traverses the modules of the model object, and replaces the - modules that are supported for tensor parallelism with their distributed - counterparts. This replacement happens in place. If there are no other - references to the original modules in the script, they are - garbage-collected. The module attributes that previously referred to the - original submodules now refer to the distributed versions of those - submodules. - - **Example:** - - .. code:: python - - # register DistributedSubmodule as the distributed version of Submodule - # (note this is a hypothetical example, smp.nn.DistributedSubmodule does not exist) - import smdistributed.modelparallel.torch as smp - - smp.tp_register_with_module(Submodule, smp.nn.DistributedSubmodule) - - class MyModule(nn.Module): - def __init__(self): - ... - - self.submodule = Submodule() - ... - - # enabling tensor parallelism for the entire model - with smp.tensor_parallelism(): - model = MyModule() - - # here model.submodule is still a Submodule object - assert isinstance(model.submodule, Submodule) - - model = smp.DistributedModel(model) - - # now model.submodule is replaced with an equivalent instance - # of smp.nn.DistributedSubmodule - assert isinstance(model.module.submodule, smp.nn.DistributedSubmodule) - - If ``pipeline_parallel_degree`` (equivalently, ``partitions``) is 1, the - placement of model partitions into GPUs and the initial broadcast of - model parameters and buffers across data-parallel ranks take place - immediately. This is because it does not need to wait for the model - partition when ``smdistributed.modelparallel.torch.DistributedModel`` wrapper is called. For other - cases with ``pipeline_parallel_degree`` greater than 1, the broadcast - and device placement will be deferred until the first call of an - ``smdistributed.modelparallel.torch.step``-decorated function happens. This is because the first - ``smdistributed.modelparallel.torch.step``-decorated function call is when the model partitioning - happens if pipeline parallelism is enabled. - - Because of the module replacement during the ``smdistributed.modelparallel.torch.DistributedModel`` - call, any ``load_state_dict`` calls on the model, as well as any direct - access to model parameters, such as during the optimizer creation, - should be done **after** the ``smdistributed.modelparallel.torch.DistributedModel`` call. - - Since the broadcast of the model parameters and buffers happens - immediately during ``smdistributed.modelparallel.torch.DistributedModel`` call when the degree of - pipeline parallelism is 1, using ``@smp.step`` decorators is not - required when tensor parallelism is used by itself (without pipeline - parallelism). - - For more information about the library's tensor parallelism APIs for PyTorch, - see :ref:`smdmp-pytorch-tensor-parallel`. 
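To make the ordering constraint above concrete, the following is a minimal sketch rather than a complete training script: ``MyModel`` and ``checkpoint`` are placeholders, and the ``smp.init()`` call and training loop are omitted.

.. code:: python

    import torch
    import smdistributed.modelparallel.torch as smp

    model = MyModel()

    # Module replacement happens here; with pipeline_parallel_degree == 1,
    # parameter broadcast and device placement also happen during this call.
    model = smp.DistributedModel(model)

    # Create the optimizer only after wrapping, so that it is built from
    # the (possibly replaced) distributed parameters.
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
    optimizer = smp.DistributedOptimizer(optimizer)

    # Any load_state_dict call should likewise happen after wrapping.
    model.load_state_dict(checkpoint["model_state_dict"])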
- - **Additional Methods of** ``smdistributed.modelparallel.torch.DistributedModel`` **for Tensor Parallelism** - - The following are the new methods of ``smdistributed.modelparallel.torch.DistributedModel``, in - addition to the ones listed in the - `documentation `__. - - .. function:: distributed_modules() - - - An iterator that runs over the set of distributed - (tensor-parallelized) modules in the model - - .. function:: is_distributed_parameter(param) - - - Returns ``True`` if the given ``nn.Parameter`` is distributed over - tensor-parallel ranks. - - .. function:: is_distributed_buffer(buf) - - - Returns ``True`` if the given buffer is distributed over - tensor-parallel ranks. - - .. function:: is_scaled_batch_parameter(param) - - - Returns ``True`` if the given ``nn.Parameter`` is operates on the - scaled batch (batch over the entire ``TP_GROUP``, and not only the - local batch). - - .. function:: is_scaled_batch_buffer(buf) - - - Returns ``True`` if the parameter corresponding to the given - buffer operates on the scaled batch (batch over the entire - ``TP_GROUP``, and not only the local batch). - - .. function:: default_reducer_named_parameters() - - - Returns an iterator that runs over ``(name, param)`` tuples, for - ``param`` that is allreduced over the ``DP_GROUP``. - - .. function:: scaled_batch_reducer_named_parameters() - - - Returns an iterator that runs over ``(name, param)`` tuples, for - ``param`` that is allreduced over the ``RDP_GROUP``. - -smdistributed.modelparallel.torch.DistributedOptimizer -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -.. class:: smdistributed.modelparallel.torch.DistributedOptimizer(optimizer, static_loss_scale=1.0, dynamic_loss_scale=False, **dynamic_loss_args) - - An optimizer wrapper for saving and loading optimizer states. - - :param optimizer: An optimizer object. - :type optimizer: object - :param static_loss_scale: Effective only for FP16 training. The default value is ``1.0``. - :type static_loss_scale: float - :param dynamic_loss_scale: Effective only for FP16 training. Set to ``True`` to - use dynamic loss scale. The default value is ``False``. - :type dynamic_loss_scale: boolean - :param dynamic_loss_args: Effective only for FP16 training. - If ``dynamic_loss_scale=True``, you can configure additional scale - parameters for dynamic loss scale. - The following list shows available parameters. - - * ``"init_scale"``: Default is ``2**32`` - * ``"scale_factor"``: Default is ``2.`` - * ``"scale_window"``: Default is ``1000`` - * ``"min_scale"``: Default is ``1`` - * ``"delayed_shift"``: Default is ``1`` - * ``"consecutive_hysteresis"``: Default is ``False`` - :type dynamic_loss_args: dict - - **Example usage of an FP32 Optimizer:** - - .. code:: python - - optimizer = torch.optim.AdaDelta(...) - optimizer = smdistributed.modelparallel.torch.DistributedOptimizer(optimizer) - - **Example usage of an FP16 Optimizer with static loss scale:** - - .. code:: python - - optimizer = torch.optim.AdaDelta(...) - optimizer = smdistributed.modelparallel.torch.DistributedOptimizer( - optimizer, - static_loss_scale=1.0 - ) - - **Example usage of an FP16 Optimizer with dynamic loss scale:** - - .. code:: python - - optimizer = torch.optim.AdaDelta(...) - optimizer = smdistributed.modelparallel.torch.DistributedOptimizer( - optimizer, - static_loss_scale=None, - dynamic_loss_scale=True, - dynamic_loss_args={ - "scale_window": 1000, - "min_scale": 1, - "delayed_shift": 2 - } - ) - - .. 
tip:: - - After you modify training scripts with - :class:`smdistributed.modelparallel.torch.DistributedModel` and - :class:`smdistributed.modelparallel.torch.DistributedOptimizer`, - use the SageMaker PyTorch estimator's distribution configuration to enable FP16 training. - You simply need to add ``"fp16": True`` to the ``smp_options`` config dictionary's - ``"parameters"`` key as shown in - `Using the SageMaker TensorFlow and PyTorch Estimators - `_. - For more information about available parameters for the ``smp_options`` config, - see :ref:`sm-sdk-modelparallel-general`. - - This wrapper returns an ``optimizer`` object with the following methods overridden: - - .. method:: state_dict( ) - - Returns the ``state_dict`` that contains optimizer state for the entire model. - It first collects the ``local_state_dict`` and gathers and merges - the ``local_state_dict`` from all ``mp_rank``\ s to create a full - ``state_dict``. - - .. method:: load_state_dict( ) - - Same as the ``torch.optimizer.load_state_dict()`` , except: - - - It first gathers and merges the local ``state_dict``\ s if they are - partial. - - The actual loading happens after the model partition so that each - rank knows its local parameters. - - .. method:: local_state_dict( ) - - Returns the ``state_dict`` that contains the - local optimizer state that belongs to the current \ ``mp_rank``. This - ``state_dict`` contains a key \ ``_smp_is_partial`` to indicate this is - a partial \ ``state_dict``, which indicates whether the - ``state_dict`` contains elements corresponding to only the current - partition, or to the entire model. - -smdistributed.modelparallel.torch.nn.FlashAttentionLayer -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -.. function:: smdistributed.modelparallel.torch.nn.FlashAttentionLayer(attention_dropout_prob=0.1, attention_head_size=None, scale_attention_scores=True, scale_attn_by_layer_idx=False, layer_idx=None, scale=None, triton_flash_attention=False, use_alibi=False) - - This class supports - `FlashAttention `_ - for PyTorch 2.0. - It takes the ``qkv`` matrix as an argument through its ``forward`` class method, - computes attention scores and probabilities, - and then operates the matrix multiplication with value layers. - - Through this class, the smp library supports - custom attention masks such as Attention with - Linear Biases (ALiBi), and you can activate them by setting - ``triton_flash_attention`` and ``use_alibi`` to ``True``. - - Note that the Triton flash attention does not support dropout - on the attention probabilities. It uses standard lower triangular - causal mask when causal mode is enabled. It also runs only - on P4d and P4de instances, with fp16 or bf16. - - This class computes the scale factor to apply when computing attention. - By default, ``scale`` is set to ``None``, and it's automatically calculated. - When ``scale_attention_scores`` is ``True`` (which is default), you must pass a value - to ``attention_head_size``. When ``scale_attn_by_layer_idx`` is ``True``, - you must pass a value to ``layer_idx``. If both factors are used, they are - multiplied as follows: ``(1/(sqrt(attention_head_size) * (layer_idx+1)))``. - This scale calculation can be bypassed if you specify a custom scaling - factor to ``scale``. In other words, if you specify a value to ``scale``, the set of parameters - (``scale_attention_scores``, ``attention_head_size``, ``scale_attn_by_layer_idx``, ``layer_idx``) - is overridden and ignored. 
- - **Parameters** - - * ``attention_dropout_prob`` (float): (default: 0.1) specifies dropout probability - to apply to attention. - * ``attention_head_size`` (int): Required when ``scale_attention_scores`` is True. - When ``scale_attention_scores`` is passed, this contributes - ``1/sqrt(attention_head_size)`` to the scale factor. - * ``scale_attention_scores`` (boolean): (default: True) determines whether - to multiply 1/sqrt(attention_head_size) to the scale factor. - * ``layer_idx`` (int): Required when ``scale_attn_by_layer_idx`` is ``True``. - The layer id to use for scaling attention by layer id. - It contributes 1/(layer_idx + 1) to the scaling factor. - * ``scale_attn_by_layer_idx`` (boolean): (default: False) determines whether - to multiply 1/(layer_idx + 1) to the scale factor. - * ``scale`` (float) (default: None): If passed, this scale factor will be - applied bypassing the all of the previous arguments. - * ``triton_flash_attention`` (bool): (default: False) If passed, Triton - implementation of flash attention will be used. This is necessary to supports - Attention with Linear Biases (ALiBi) (see next arg). Note that this version - of the kernel doesn’t support dropout. - * ``use_alibi`` (bool): (default: False) If passed, it enables Attention with - Linear Biases (ALiBi) using the mask provided. - - .. method:: forward(self, qkv, attn_mask=None, causal=False) - - Returns a single ``torch.Tensor`` ``(batch_size x num_heads x seq_len x head_size)``, - which represents the output of attention computation. - - **Parameters** - - * ``qkv``: ``torch.Tensor`` in the form of ``(batch_size x seqlen x 3 x num_heads x head_size)``. - * ``attn_mask``: ``torch.Tensor`` in the form of ``(batch_size x 1 x 1 x seqlen)``. - By default it is ``None``, and usage of this mask needs ``triton_flash_attention`` - and ``use_alibi`` to be set. See how to generate the mask in the following code snippet. - * ``causal``: When passed, it uses the standard lower triangular mask. The default is ``False``. - - When using ALiBi, it needs an attention mask prepared like the following. - - .. code:: python - - def generate_alibi_attn_mask(attention_mask, batch_size, seq_length, - num_attention_heads, alibi_bias_max=8): - - device, dtype = attention_mask.device, attention_mask.dtype - alibi_attention_mask = torch.zeros( - 1, num_attention_heads, 1, seq_length, dtype=dtype, device=device - ) - - alibi_bias = torch.arange(1 - seq_length, 1, dtype=dtype, device=device).view( - 1, 1, 1, seq_length - ) - m = torch.arange(1, num_attention_heads + 1, dtype=dtype, device=device) - m.mul_(alibi_bias_max / num_attention_heads) - alibi_bias = alibi_bias * (1.0 / (2 ** m.view(1, num_attention_heads, 1, 1))) - - alibi_attention_mask.add_(alibi_bias) - alibi_attention_mask = alibi_attention_mask[..., :seq_length, :seq_length] - if attention_mask is not None and attention_mask.bool().any(): - alibi_attention_mask.masked_fill( - attention_mask.bool().view(batch_size, 1, 1, seq_length), float("-inf") - ) - - return alibi_attention_mask - -smdistributed.modelparallel.torch Context Managers and Util Functions -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -.. function:: smdistributed.modelparallel.torch.model_creation(tensor_parallelism=False, dtype=None, **tensor_parallel_config) - - Context manager to create a ``torch`` model. 
This API combines both the - :class:`smdistributed.modelparallel.torch.tensor_parallelism` and - :class:`smdistributed.modelparallel.torch.delay_param_initialization` decorators, - so you can simply use this single context when creating the torch model. - - :param tensor_parallelism: Whether to enable tensor parallelism during model creation. - :type tensor_parallelism: boolean - :param dtype: The dtype to use when creating the model. It has the following rules. - - * If dtype is specified, it will be used during model creation. - * If dtype is not specified, the default dtype will be used during model creation, - which is usually FP32. This is for the best performance on CPU. - * Any model that causes out-of-memory problems with FP32 initialization - is recommended to be created with - :class:`smdistributed.modelparallel.torch.delayed_parameter_initialization`. - * ``FP16_Module`` casts the model back to FP16 if FP16 training is enabled - with the ``smp`` config. For more inforamtion about FP16 training - in SageMaker with the model parallel library, see `FP16 Training - `_ - in the *Amazon SageMaker Developer Guide*. - - :type dtype: ``torch.dtype`` - :param tensor_parallel_config: kwargs to specifiy other tensor parallel configs. - This is not used if ``tensor_parallelism`` is ``False``. - :type tensor_parallel_config: dict - - **Example Usage:** - - .. code:: python - - import smdistributed.modelparallel.torch as smp - - with smp.model_creation( - tensor_parallelism=smp.tp_size() > 1, - dtype=torch.float16 if args.fp16 else torch.get_default_dtype() - ): - model = MyModel(...) - -.. function:: smdistributed.modelparallel.torch.partition(index) - - :param index: The index of the partition. - :type index: int - - A context manager which places all modules defined inside into the - partition with ID ``index``.  The ``index`` argument must be less than - the number of partitions. - - Use ``smdistributed.modelparallel.torch.partition`` to implement manual partitioning. - If ``"auto_partition"`` is ``True``, then the - ``smdistributed.modelparallel.torch.partition`` contexts are ignored. Any module that is not placed in - any ``smdistributed.modelparallel.torch.partition`` context is placed in the - ``default_partition`` defined through the SageMaker Python SDK. - - When ``smdistributed.modelparallel.torch.partition`` contexts are nested, the innermost context - overrides the rest (see the following example). In PyTorch, manual - partitioning should be done inside the module \ ``__init__``, and the - partition assignment applies to the modules that are *created* inside - the ``smdistributed.modelparallel.torch.partition`` context. - - Example: - - .. code:: python - - import smdistributed.modelparallel.torch as smp - - class Model(torch.nn.Module): -     def __init__(self): -         with smp.partition(1): -             self.child0 = Child0()            # child0 on partition 1 -             with smp.partition(2): -                 self.child1 = Child1()        # child1 on partition 2 -             self.child2 = Child2()            # child2 on partition 1 -         self.child3 = Child3()                # child3 on default_partition - -.. data:: smdistributed.modelparallel.torch.amp.GradScaler - - `Torch AMP Gradscaler `__ - currently doesn’t work with the library. ``smdistributed.modelparallel.torch.amp.GradScaler`` replaces - ``torch.amp.GradScaler`` and provides the same functionality. - -.. 
function:: smdistributed.modelparallel.torch.delay_param_initialization(enabled=True) - - If enabled, it delays the initialization of parameters - to save CPU memory. That is, parameter initialization takes place - after the model is partitioned on GPUs. - -.. function:: smdistributed.modelparallel.torch.get_world_process_group( ) - - Returns a ``torch.distributed`` ``ProcessGroup`` that consists of all - processes, which can be used with the ``torch.distributed`` API. - Requires ``"ddp": True`` in SageMaker Python SDK parameters. - -.. function:: smdistributed.modelparallel.torch.get_mp_process_group( ) - - Returns a ``torch.distributed`` ``ProcessGroup`` that consists of the - processes in the ``MP_GROUP`` which contains the current process, which - can be used with the \ ``torch.distributed`` API. Requires - ``"ddp": True`` in SageMaker Python SDK parameters. - -.. function:: smdistributed.modelparallel.torch.get_dp_process_group( ) - - Returns a ``torch.distributed`` ``ProcessGroup`` that consists of the - processes in the ``DP_GROUP`` which contains the current process, which - can be used with the \ ``torch.distributed`` API. Requires - ``"ddp": True`` in SageMaker Python SDK parameters. - -.. function:: smdistributed.modelparallel.torch.is_initialized( ) - - Returns ``True`` if ``smdistributed.modelparallel.torch.init`` has already been called for the - process, and ``False`` otherwise. - -.. function::smp.is_tracing( ) - - Returns ``True`` if the current process is running the tracing step, and - ``False`` otherwise. - -.. data:: smdistributed.modelparallel.torch.nn.FusedLayerNorm - - `Apex Fused Layer Norm `__ is currently not - supported by the library. ``smdistributed.modelparallel.torch.nn.FusedLayerNorm`` replaces ``apex`` - ``FusedLayerNorm`` and provides the same functionality. This requires - ``apex`` to be installed on the system. - -.. data:: smdistributed.modelparallel.torch.optimizers.FusedNovoGrad - - - `Fused Novo Grad optimizer `__ is - currently not supported by the library. ``smdistributed.modelparallel.torch.optimizers.FusedNovoGrad`` replaces ``apex`` ``FusedNovoGrad`` - optimizer and provides the same functionality. This requires ``apex`` to - be installed on the system. - -.. data:: smdistributed.modelparallel.torch.optimizers.FusedLamb - - - `FusedLamb optimizer `__ - currently doesn’t work with the library. ``smdistributed.modelparallel.torch.optimizers.FusedLamb`` replaces - ``apex`` ``FusedLamb`` optimizer and provides the same functionality. - This requires ``apex`` to be installed on the system. - -.. _pytorch_saving_loading: - -smdistributed.modelparallel.torch APIs for Saving and Loading -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -.. function:: smdistributed.modelparallel.torch.save(obj, f, partial=True, pickel_module=picklemodule, pickle_protocol=2, ) - - Saves an object. This operation is similar to `torch.save() - `_, except that - it has an additional keyword argument, ``partial``, and accepts only - string type for the argument ``f`` (file). If ``partial=True``, each - ``mp_rank`` saves a separate checkpoint file and the library adds an ``mp_rank`` - index to your saved file. - - **Parameters** - - - ``obj`` (dict): A saved object. - - ``f`` (str): A string containing a file name. - - ``partial`` (bool, default= ``True``):  When set to ``True``, each - ``mp_rank`` saves a separate checkpoint file and the library adds an - ``mp_rank`` index to the saved file. 
If you want to be able to load - and further train a model that you save with ``smdistributed.modelparallel.torch.save()``, you must - set ``partial=True``. - - ``pickle_module`` (picklemodule, default = module ``"pickle"`` from ``"/opt/conda/lib/python3.6/pickle.py"``): - A module used for pickling metadata and objects. - - ``pickle_protocol`` (int, default=2): Can be specified to - override the default protocol. - -.. function:: smdistributed.modelparallel.torch.load(f, map_location, pickle_module, pickle_load_args, partial=True) - - Loads an object saved with ``smdistributed.modelparallel.torch.save()`` from a file. - - Similar to `torch.load() `__, - except it has an additional keyword argument, ``partial``, and accepts - only string type for the argument ``f`` (file). If \ ``partial=True``, - then each ``mp_rank`` loads a separate checkpoint file. - - **Parameters** - - - ``f`` (string): A string containing a file name. - - ``map_location`` (function): A function, - `torch.device `__, - a string, or a dict specifying how to remap storage locations. - - ``pickle_module`` (pickle module): A module used for unpickling - metadata and objects (has to match the ``pickle_module`` used to - serialize the file). - - ``pickle_load_args`` (Python 3 only): Optional keyword arguments - passed to ``pickle_module.load()`` and ``pickle_module.Unpickler()``. - - ``partial`` (bool, default= ``True``): When set to ``True``, each - ``mp_rank`` loads the checkpoint corresponding to the ``mp_rank``. - Should be used when loading a model trained with the library. - -.. function:: smdistributed.modelparallel.torch.save_checkpoint(path, tag, partial=True, model=None, optimizer=None, user_content=None, translate_if_full=True, num_kept_partial_checkpoints=None) - - Saves a checkpoint. While :class:`smdistributed.modelparallel.torch.save` saves - model and optimizer objects, - this function checkpoints model and optimizer and saves the checkpoints as separate files. - It creates checkpoint folders in the following structure. - - .. code:: text - - - path - - ${tag}_partial (folder for partial checkpoint) - - model_rankinfo.pt - - optimizer_rankinfo.pt - - fp16_states_rankinfo.pt - - user_content.pt - - $tag (checkpoint file for full checkpoint) - - user_content_$tag (user_content file for full checkpoint) - - newest (a file that indicates the newest checkpoint) - - **Parameters** - - * ``path`` (str) (required): Path to save the checkpoint. The library creates - the directory if it does not already exist. - For example, ``/opt/ml/checkpoint/model_parallel``. - * ``tag`` (str) (required): A tag for the current checkpoint, usually the train - steps. Note: tag needs to be the same across all ranks (GPU workers). - When ``partial=False`` this will be the checkpoint file name. - * ``partial`` (boolean) (default: True): Whether to save the partial checkpoint. - * ``model`` (:class:`smdistributed.modelparallel.torch.DistributedModel`) - (default: None): The model to save. It needs to be an ``smp.DistributedModel`` object. - * ``optimizer`` (:class:`smdistributed.modelparallel.torch.DistributedOptimizer`) - (default: None): The optimizer to save. It needs to be an ``smp.DistributedOptimizer`` object. - * ``user_content`` (any) (default: None): User-defined content to save. - * ``translate_if_full`` (boolean) (default: True): Whether to translate the - full ``state_dict`` to a Hugging Face ``state_dict`` if possible. - * ``num_kept_partial_checkpoints`` (int) (default: None): The maximum number - of partial checkpoints to keep on disk.
- -.. function:: smdistributed.modelparallel.torch.resume_from_checkpoint(path, tag=None, partial=True, strict=True, load_optimizer=True, load_sharded_optimizer_state=True, translate_function=None) - - While :class:`smdistributed.modelparallel.torch.load` loads saved - model and optimizer objects, this function resumes from a saved checkpoint file. - - **Parameters** - - * ``path`` (str) (required): Path to load the checkpoint. - * ``tag`` (str) (default: None): Tag of the checkpoint to resume. If not provided, - the library tries to locate the newest checkpoint from the saved newest file. - * ``partial`` (boolean) (default: True): Whether to load the partial checkpoint. - * ``strict`` (boolean) (default: True): Load strictly; no extra or - missing keys are allowed. - * ``load_optimizer`` (boolean) (default: True): Whether to load ``optimizer``. - * ``load_sharded_optimizer_state`` (boolean) (default: True): Whether to load - the sharded optimizer state of a model. - It can be used only when you activate - the `sharded data parallelism - `_ - feature of the SageMaker model parallel library. - When this is ``False``, the library only loads the FP16 - states, such as FP32 master parameters and the loss scaling factor, - not the sharded optimizer states. - * ``translate_function`` (function) (default: None): Function to translate the full - checkpoint into smdistributed.modelparallel format. - For supported models, this is not required. - - **Example usage** - - .. code:: python - - # Save - smp.save_checkpoint( - checkpoint_dir, - tag=f"total_steps{total_steps}", - partial=True, - model=model, - optimizer=optimizer, - user_content=user_content, - num_kept_partial_checkpoints=args.num_kept_checkpoints) - - # Load: this will automatically load the newest checkpoint - user_content = smp.resume_from_checkpoint(path, partial=partial) - -.. _pytorch_saving_loading_instructions: - -General instruction on saving and loading ------------------------------------------ - -The library can save partial or full checkpoints. - -- For partial checkpoints, each ``mp_rank`` saves its own checkpoint - file with only the parameters that belong to that rank. -- For full checkpoints, the library saves a single checkpoint that contains - the entire model's parameters. - -When **saving** using ``smdistributed.modelparallel.torch.save()``, each rank only holds its own -parameters. If you want to save the full model, there will be some -communication between the ranks to create the full model. If you save -checkpoints often, you should save partial checkpoints for best -performance. - -When **loading** using ``smdistributed.modelparallel.torch.load()``, the library can load either partial -or full checkpoints, or full checkpoints saved by a non-model-parallel model. If you -want to resume training with a non-model-parallel model or do inference, you need -a full checkpoint. - -The following is an example of how you can save and load a checkpoint: - -.. code:: python - - import smdistributed.modelparallel.torch as smp - # Original model and optimizer - model = MyModel(...) - optimizer = MyOpt(...)
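    # The wrappers below must be applied before loading any state_dict or
    # otherwise accessing model parameters (see the DistributedModel notes above).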
- - # model parallel wrapper - model = smp.DistributedModel(model) - optimizer = smp.DistributedOptimizer(optimizer) - - # To save, always save on dp_rank 0 to avoid data racing - if partial: -     # To save the partial model on each mp rank -     # the library will create `checkpoint.pt_{mprank}` for each mp rank -     if save_partial_model: -         if smp.dp_rank() == 0: -             model_dict = model.local_state_dict() # save the partial model -             opt_dict = optimizer.local_state_dict() # save the partial optimizer state -             smp.save( -                 {"model_state_dict": model_dict, "optimizer_state_dict": opt_dict}, -                 f"/checkpoint.pt", -                 partial=True, -             ) - -     # To save the full model -     if save_full_model: -         if smp.dp_rank() == 0: -             model_dict = model.state_dict() # save the full model -             opt_dict = optimizer.state_dict() # save the full optimizer state -             smp.save( -                 {"model_state_dict": model_dict, "optimizer_state_dict": opt_dict}, -                 "/checkpoint.pt", -                 partial=False, -             ) - - # To load, load on all ranks. - # The only difference for partial/full loading is the partial flag in smp.load - # Load partial checkpoint - if partial_checkpoint: -    checkpoint = smp.load("/checkpoint.pt", partial=True) -    model.load_state_dict(checkpoint["model_state_dict"]) -    optimizer.load_state_dict(checkpoint["optimizer_state_dict"]) - # Load full checkpoint - if full_checkpoint: -    checkpoint = smp.load("/checkpoint.pt", partial=False) -    model.load_state_dict(checkpoint["model_state_dict"]) -    optimizer.load_state_dict(checkpoint["optimizer_state_dict"]) diff --git a/doc/api/training/smp_versions/latest/smd_model_parallel_pytorch_tensor_parallel.rst b/doc/api/training/smp_versions/latest/smd_model_parallel_pytorch_tensor_parallel.rst deleted file mode 100644 index 2c2a7b1f2f..0000000000 --- a/doc/api/training/smp_versions/latest/smd_model_parallel_pytorch_tensor_parallel.rst +++ /dev/null @@ -1,896 +0,0 @@ -.. _smdmp-pytorch-tensor-parallel: - -PyTorch API for Tensor Parallelism -================================== - -SageMaker distributed tensor parallelism works by replacing specific submodules -in the model with their distributed implementations. The distributed modules -have their parameters and optimizer states partitioned across tensor-parallel -ranks. This is to compute the same output as it would have been computed by -the original modules. Since tensor parallelism occurs across data-parallel -ranks, a rank might collect slices of the activations corresponding to the -data shards on other devices that are part of the same tensor parallelism group. - -You can enable or disable tensor parallelism for specific parts of the model. -Within the enabled parts, the replacements with distributed modules will take -place on a best-effort basis for those module supported for tensor parallelism. -Alternatively, you can directly import and use the library’s distributed -modules in the model definition. - -Some of the supported modules (such as ``smdistributed.modelparallel.torch.nn.Transformer``) are high-level -blocks that contain many operations. 
Because custom implementations -(as opposed to the built-in PyTorch modules) are typically used for these -high-level blocks, the library offers an API that you can use to register -specific distributed versions with such custom modules (provided that they -are functionally equivalent). This allows the library to automatically replace -the occurrences of such PyTorch modules with their distributed counterparts -provided by the library. -For more information, see the following topics. - -.. contents:: Topics - :depth: 3 - :local: - -.. _registering-tp-modules: - -Registering Tensor Parallelism Distributed Modules --------------------------------------------------- - -Although PyTorch natively provides some of the commonly used (and -tensor-parallelizable) building blocks such as Transformer, users often -use custom implementations for such higher-level modules. To distribute -such modules with tensor parallelism, you need to register the -distributed modules to the custom module implementation in your class, -so that the library knows how to distribute the custom module. When you -register the distributed modules, make sure the custom module that you -use is functionally equivalent to the distributed module. You can verify -this by taking a look at the equivalent reference implementations in the -:ref:`smdmp-tp-appendix`. -These implementations are functionally equivalent to their distributed -versions in ``smdistributed.modelparallel.torch.nn`` module. - -.. class:: smdistributed.modelparallel.torch.tp_register(dist_module, init_hook=None, forward_hook=None, return_hook=None) - - - A decorator class that registers the ``dist_module`` class with - the module class that it is attached to. The hooks can be used to - adapt to different interfaces used with ``__init__`` and - ``forward`` methods. - - **Arguments:** - - - ``dist_module``: A subclass of ``smdistributed.modelparallel.torch.nn.DistributedModule`` - that implements the distributed version of the module class the - decorator is attached to. Any distributed module class defined - in ``smdistributed.modelparallel.torch.nn`` module can be used. - - ``init_hook``: A callable that translates the arguments of the - original module ``__init__`` method to an ``(args, kwargs)`` - tuple compatible with the arguments of the corresponding - distributed module ``__init__`` method. Must return a tuple, - whose first element is an iterable representing the positional - arguments, and second element is a ``dict`` representing the - keyword arguments. The input signature of the ``init_hook`` - must **exactly** match the signature of the original - ``__init__`` method (including argument order and default - values), except it must exclude ``self``. - - ``forward_hook``: A callable that translates the arguments of - the original module ``forward`` method to an ``(args, kwargs)`` - tuple compatible with the arguments of the corresponding - distributed module ``forward`` method. Must return a tuple, - whose first element is an iterable representing the positional - arguments, and second element is a ``dict`` representing the - keyword arguments. The input signature of the ``init_hook`` - must **exactly** match the signature of the original - ``forward`` method (including argument order and default - values), except it must exclude ``self``. - - ``return_hook``: A callable that translates the object returned - from the distributed module to the return object expected of - the original module. - - - **Example:** - - .. 
code:: python - - import smdistributed.modelparallel.torch as smp - - init_hook = lambda config: ((), config.to_dict()) - - # register smp.nn.DistributedTransformer - # as the distributed version of MyTransformer - @smp.tp_register(smp.nn.DistributedTransformer, init_hook=init_hook) - class MyTransformer(nn.Module): - def __init__(self, config): - ... - - def forward(self, hidden_states, attention_mask): - ... - -.. function:: smdistributed.modelparallel.torch.tp_register_with_module(module_cls, dist_module, init_hook=None, forward_hook=None, return_hook=None) - - - When you do not have direct access to model definition code, you - can use this API to similarly register a distributed module with - an existing module class. - - - **Arguments:** - - - ``module_cls``: The existing module class that will be - distributed. - - ``dist_module``: A subclass of ``smdistributed.modelparallel.torch.nn.DistributedModule`` - that implements the distributed version of the module class the - decorator is attached to. Any distributed module class defined - in ``smdistributed.modelparallel.torch.nn`` module can be used. - - ``init_hook``: A callable that translates the arguments of the - original module ``__init__`` method to an ``(args, kwargs)`` - tuple compatible with the arguments of the corresponding - distributed module ``__init__`` method. Must return a tuple, - whose first element is an iterable representing the positional - arguments, and second element is a ``dict`` representing the - keyword arguments. The input signature of the ``init_hook`` - must **exactly** match the signature of the original - ``__init__`` method (including argument order and default - values), except it must exclude ``self``. - - ``forward_hook``: A callable that translates the arguments of - the original module ``forward`` method to an ``(args, kwargs)`` - tuple compatible with the arguments of the corresponding - distributed module ``forward`` method. Must return a tuple, - whose first element is an iterable representing the positional - arguments, and second element is a ``dict`` representing the - keyword arguments. The input signature of the ``init_hook`` - must **exactly** match the signature of the original - ``forward`` method (including argument order and default - values), except it must exclude ``self``. - - ``return_hook``: A callable that translates the object returned - from the distributed module to the return object expected of - the original module. - - - **Example:** - - .. code:: python - - import smdistributed.modelparallel.torch as smp - - from somelibrary import MyTransformer - - init_hook = lambda config: ((), config.to_dict()) - - # register smp.nn.DistributedTransformer as the distributed version of MyTransformer - smp.tp_register_with_module(MyTransformer, - smp.nn.DistributedTransformer, - init_hook=init_hook) - -.. _smdmp-supported-modules-for-tp: - -Supported Modules for Tensor Parallelism ----------------------------------------- - -The following modules are supported for tensor parallelism. - -.. contents:: Topics - :depth: 3 - :local: - -.. 
_tp-module-api: - -Tensor Parallelism Module APIs -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -- :class:`smdistributed.modelparallel.torch.nn.DistributedLinear` (implements ``nn.Linear``) -- :class:`smdistributed.modelparallel.torch.nn.DistributedTransformerLMHead` -- :class:`smdistributed.modelparallel.torch.nn.DistributedTransformer` -- :class:`smdistributed.modelparallel.torch.nn.DistributedTransformerLayer` -- :class:`smdistributed.modelparallel.torch.nn.DistributedAttentionLayer` -- :class:`smdistributed.modelparallel.torch.nn.DistributedTransformerOutputLayer` -- :class:`smdistributed.modelparallel.torch.nn.DistributedEmbedding` - -.. class:: smdistributed.modelparallel.torch.nn.DistributedLinear(in_features, out_features) - - Tensor-parallel implementation of the ``nn.Linear`` class. - Functionally equivalent to an ``nn.Linear`` module with the same - ``in_features`` and ``out_features``. In other words, - ``in_features`` and ``out_features`` are the number of *global* - channels across tensor-parallel ranks. - - For more information about what's the reference implementation of this module, - see :ref:`smdmp-tp-appendix`. - - - - **Arguments:** - - - ``in_features``: The total number of input channels for the - linear layer across all tensor-parallel ranks. - - ``out_features``: The total number of output channels for the - linear layer across all tensor-parallel ranks. - -.. class:: smdistributed.modelparallel.torch.nn.DistributedTransformerLMHead(num_layers=12, num_attention_heads=32, attention_head_size=32, hidden_size=1024, intermediate_size=4096, vocab_size=30522, num_positions=1024, attention_dropout_prob=0.1, hidden_dropout_prob=0.1, activation="gelu", layernorm_epsilon=1e-5, num_token_types=0, causal_mask_size=None, add_cross_attention=False, add_lm_head=True, initializer_range=0.02, use_normal_initialization=False, pre_layernorm=False, post_layernorm=True) - - Constructs a distributed transformer model, including embeddings - and a single LM head. A word embedding of size - ``(vocab_size, hidden_size)`` is created, as well as a positional - embedding of size ``(num_positions, hidden_size)``, and the - embeddings are added together. If ``num_token_types`` is larger - than 0, a separate embedding of size - ``(num_token_types, hidden_size)`` is created, and further added - on top. - - - The embeddings are fed through a ``DistributedTransformer``, and - if ``add_lm_head`` is ``True``, the output passes through a single - LM head, which is a linear module without bias whose weight is - tied to the word embeddings. - - See :class:`smdistributed.modelparallel.torch.nn.DistributedTransformerLayer` for descriptions of the rest - of the arguments. - - **Methods:** - - - ``forward(self, inputs)`` - - - If ``add_cross_attention`` is ``True``, ``inputs`` must be a - tuple - ``(input_ids, attention_mask, token_type_ids, position_ids, cross_states, cross_states, cross_mask, labels)``. - - Otherwise, ``inputs`` must be a tuple - ``(input_ids, attention_mask, token_type_ids, position_ids, labels)``. - - If ``token_type_ids`` is ``None``, token type embedding will - not be used. - - ``input_ids`` is assumed to be of shape ``[N, S]``, where - ``N`` is the batch size and ``S`` is sequence length. - - ``attention_mask`` is assumed to be a 0-1 tensor of shape - ``[N, S]``, where 1 represents a masked position. - -.. 
class:: smdistributed.modelparallel.torch.nn.DistributedTransformer(num_layers=12, num_attention_heads=32, attention_head_size=32, hidden_size=1024, intermediate_size=4096, attention_dropout_prob=0.1, hidden_dropout_prob=0.1, activation="gelu", layernorm_epsilon=1e-5, initializer_range=0.02, use_normal_initialization=False, causal_mask_size=None, add_cross_attention=False, pre_layernorm=False, post_layernorm=True) - - A sequence of :class:`smdistributed.modelparallel.torch.nn.DistributedTransformerLayer`\ s, whose - number is given by ``num_layers`` argument. For the other - arguments and methods, refer to - :class:`smdistributed.modelparallel.torch.nn.DistributedTransformerLayer`. - - If both ``pre_layernorm`` and ``post_layernorm`` are ``True``, - layer normalization is applied to both the input and the output of - the ``DistributedTransformer``, in addition to the intermediate - attention and transformer-output layers. - -.. class:: smdistributed.modelparallel.torch.nn.DistributedTransformerLayer(num_attention_heads=32, attention_head_size=32, hidden_size=1024, intermediate_size=4096, attention_dropout_prob=0.1, hidden_dropout_prob=0.1, activation="gelu", layernorm_epsilon=1e-5, initializer_range=0.02, use_normal_initialization=False, causal_mask_size=None, add_cross_attention=False, pre_layernorm=False, post_layernorm=True) - - Tensor-parallel implementation of a single transformer layer. - Number of attention heads, hidden size, and intermediate size - refer to the global quantities across all tensor-parallel ranks. - - For more information about what's the reference implementation of this module, - see :ref:`smdmp-tp-appendix`. - - - **Arguments:** - - - ``num_attention_heads``: The total number of attention heads - across tensor-parallel ranks - - ``attention_head_size``: The number of channels of a single - attention head. - - ``hidden_size``: The hidden dimension of the transformer. The - input tensor ``hidden_states`` is assumed to have its last - dimension size equal to ``hidden_size``. - - ``intermediate_size``: The number of output channels in the - first linear transformation of the transformer output layer. - ``DistributedTransformerOutputLayer`` first maps - ``hidden_size`` dimensions of its input tensor into - ``intermediate_size`` dimensions, and then maps it back into - ``hidden_size`` dimensions. - - ``attention_dropout_prob``: The dropout probability applied to - the attention probabilities. - - ``hidden_dropout_prob``: The dropout probability used in - dropout layers other than the one applied to the attention - probabilities. - - ``activation``: Choice of activation function to use at the - output layer. Must be ``"gelu"`` or ``"relu"``. - - ``layernorm_epsilon``: The epsilon added to the denominator of - layer normalization for numerical stability. - - ``initializer_range``: If ``use_normal_initialization`` is - ``True``, the standard deviation of the normal random variable - to initialize the weights with. - - ``use_normal_initialization``: If ``True``, the weights are - initialized with normal distribution with standard deviation - given by ``initializer_range``. Otherwise, default PyTorch - initialization is used. - - ``causal_mask_size``: If ``None``, no causal mask is used on - attentions. Otherwise, should be set to maximum sequence length - to apply a causal mask to the attention scores. This is used, - for instance, in GPT-2. - - ``add_cross_attention``: If ``True``, a cross-attention layer - will be added after the self-attention block. 
The - cross-attention layer computes the attention keys and values - based on the ``cross_states`` input (instead of - ``hidden_states`` input, as in self-attention. This is used in - the decoder block of encoder-decoder architectures. For - encoder-only architectures that only use self-attention, this - should be kept ``False``. - - ``pre_layernorm``: If ``True``, inserts layer normalization at - the input. At least one of ``pre_layernorm`` and - ``post_layernorm`` must be ``True``. - - ``post_layernorm``: If ``True``, inserts layer normalization at - the output. At least one of ``pre_layernorm`` and - ``post_layernorm`` must be ``True``. - - ``use_alibi`` (bool, default False): Activates Attention with - Linear Biases (ALiBi) for attention computation. - ALiBi facilitates efficient extrapolation on input sequences - and thus improves training efficiency. - The library enables ALiBi by using the `Triton - flash attention kernel - `_. - Refer to https://arxiv.org/abs/2108.12409 for more - details on the technique. - (Available from - the SageMaker model parallelism library v1.15.0.) - - ``alibi_bias_max`` (int, default 8): Defines the ALiBi base - value for mask generation. (Available from - the SageMaker model parallelism library v1.15.0.) - - - **Methods:** - - - ``forward(self, inputs)``: Forward pass for the transformer - layer. - - - **Arguments:** - - - If ``add_cross_attention=False``, ``inputs`` must be a - tuple ``(hidden_states, attention_mask)``, where - ``hidden_states`` is assumed to be a tensor of dimensions - ``[N, S, H]``, where ``N`` is batch size, ``S`` is - sequence length, and ``H`` is ``hidden_size``. - ``attention_mask`` is assumed to be a tensor of - dimensions ``[N, 1, 1, S]``, where ``N`` is the batch - size, and ``S`` is the sequence length. - - If ``add_cross_attention=True``, ``inputs`` must be a - tuple - ``(hidden_states, cross_states, attention_mask, cross_mask)``, - where ``hidden_states`` is assumed to be a tensor of - dimensions ``[N, S_1, H]``, where ``N`` is batch size, - ``S_1`` is sequence length, and ``H`` is ``hidden_size``. - ``cross_states`` is assumed to be a tensor of size - ``[N, S_2, H]``, similarly interpreted. - ``attention_mask`` is assumed to be a tensor of - dimensions ``[N, 1, 1, S_1]``, where ``N`` is the batch - size, and ``S_1`` is the sequence length, and - ``cross_mask`` is assumed to be a tensor of size - ``[N, 1, 1, S_2]``. Keys and values for the attention - heads in the cross-attention layer (but not the - self-attention layer) are computed using - ``cross_states``, and ``cross_mask`` is applied as the - attention mask in the cross-attention layer (but not the - self-attention layer). - - - **Returns:** - - - If ``add_cross_attention=False``, a tuple - ``(hidden_states, attention_mask)``, where - ``hidden_states`` is the output of the transformer, and - ``attention_mask`` is the same the ``attention_mask`` - argument. - - If ``add_cross_attention=True``, a tuple - ``(hidden_states, cross_states, attention_mask, cross_mask)``, - where ``hidden_states`` is the output of the transformer, - and the next three tensors are the same as the input - arguments. - -.. 
class:: smdistributed.modelparallel.torch.nn.DistributedAttentionLayer(num_attention_heads=32, attention_head_size=32, hidden_size=1024, attention_dropout_prob=0.1, hidden_dropout_prob=0.1, layernorm_epsilon=1e-5, initializer_range=0.02, use_normal_initialization=False, cross_attention=False, causal_mask_size=None, pre_layernorm=False, post_layernorm=True) - - A distributed implementation for the attention block. Includes the - computation of the self- or cross-attention (context layer), - followed by a linear mapping and dropout, which is optionally - followed by the residual-connection and layer normalization. - - For more information about what's the reference implementation of this module, - see :ref:`smdmp-tp-appendix`. - - - **Arguments:** - - - See :class:`smdistributed.modelparallel.torch.nn.DistributedTransformerLayer` for descriptions of the - arguments. - - ``cross_attention``: If ``True``, it computes the attentions - with respect to the ``cross_states`` tensor of the ``forward`` - method input tuple. (Default: ``False``) - - - **Methods:** - - - ``forward(self, inputs)``: Forward pass for the attention - layer. - - - **Arguments:** - - - If ``cross_attention=False``, ``inputs`` must be a tuple - ``(hidden_states, attention_mask)``, where - ``hidden_states`` is assumed to be a tensor of dimensions - ``[N, S, H]``, where ``N`` is batch size, ``S`` is - sequence length, and ``H`` is ``hidden_size``. - ``attention_mask`` is assumed to be a tensor of - dimensions ``[N, 1, 1, S]``, where ``N`` is the - batch size, and ``S`` is the sequence length. - - If ``cross_attention=True``, ``inputs`` must be a tuple - ``(hidden_states, cross_states, attention_mask)``, where - ``hidden_states`` is assumed to be a tensor of dimensions - ``[N, S_1, H]``, where ``N`` is batch size, ``S_1`` is - sequence length, and ``H`` is ``hidden_size``. - ``cross_states`` is assumed to be a tensor of size - ``[N, S_2, H]``, similarly interpreted. - ``attention_mask`` is assumed to be a tensor of - dimensions ``[N, 1, 1, S_2]``, where ``N`` is the batch - size, and ``S_2`` is the sequence length. Keys and values - for the attention heads are computed using - ``cross_states``. - - - **Returns:** - - - A single tensor that is the output of the attention - layer. - -.. class:: smdistributed.modelparallel.torch.nn.DistributedTransformerOutputLayer(hidden_size=1024, intermediate_size=4096, hidden_dropout_prob=0.1, activation="gelu", layernorm_epsilon=1e-5, initializer_range=0.02, use_normal_initialization=False, pre_layernorm=False, post_layernorm=True, fp32_residual_addition=False) - - - Distributed implementation of a single transformer output layer. A - single :class:`smdistributed.modelparallel.torch.nn.DistributedTransformerLayer` with - ``add_cross_attention=False`` consists of a single - ``DistributedAttentionLayer`` immediately followed by a single - ``DistributedTransformerOutputLayer``. The latter linearly maps - the last channel of the input tensor from ``hidden_size`` to - ``intermediate_size``, and then maps it back to ``hidden_size``. - - For more information about what's the reference implementation of this module, - see :ref:`smdmp-tp-appendix`. - - - **Arguments:** - - - See :class:`smdistributed.modelparallel.torch.nn.DistributedTransformerLayer` for descriptions of the - arguments. - - ``fp32_residual_addition``: Set to ``True`` if you want to avoid overflow - (NaN loss values) for large models with more than 100 billion parameters - when using FP16. (Default: False) - -.. 
class:: smdistributed.modelparallel.torch.nn.DistributedEmbedding(num_embeddings,embedding_dim, padding_idx=None, max_norm=None, norm_type=2.0, scale_grad_by_freq=False, sparse=False, _weight=None, initializer_range=0.02, _skip_allgather=False,_skip_scatter_and_merge=False,) - - - Distributed implementation of a single Embedding Layer. Currently - only supports splitting across the embedding_dim. - - **Arguments:** - - - See :class:`smdistributed.modelparallel.torch.nn.DistributedEmbedding` for descriptions of the - arguments. - -.. _enabling-tp: - -Enabling Tensor Parallelism -~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -There are two ways tensor parallelism can be enabled. - -First, you can use -the distributed module implementations in ``smdistributed.modelparallel.torch.nn`` module directly in -your model definition. See :ref:`smdmp-supported-modules-for-tp` -for a complete list of built-in distributed modules. Here is an example -of how this can be done: - -.. code:: python - - import torch.nn as nn - import smdistributed.modelparallel.torch as smp - - class TransformerModel: - def __init__(self): - self.embedding = nn.Embedding(vocab_size, hidden_size) - - # directly instantiate smp.nn.DistributedTransformer and use it - self.encoder = smp.nn.DistributedTransformer(num_layers, hidden_size, **kwargs) - - self.pooler = nn.Linear(hidden_size, hidden_size) - - def forward(self, hidden_states): - emb_out = self.embedding(hidden_states) - enc_out = self.encoder(emb_out) - return self.pooler(enc_out) - -Second, you can enable tensor parallelism for specific modules or blocks -of code, which will automatically enable tensor parallelism for the -supported modules within that scope. To do this, you can use the -following API: - -.. decorator:: smdistributed.modelparallel.torch.tensor_parallelism(enabled=True, **kwargs) - - - A context manager that enables or disables tensor parallelism for - any supported module that is created inside. If there are nested - contexts, the innermost overrides the rest. If there are - multiple supported modules created within the context, where one - is the submodule of the other, only the outermost module will be - distributed. If a supported module shares weights with another - (supported or unsupported) module, or if its hyperparameters do - not support distribution (e.g., not divisible by the tensor - parallelism degree), tensor parallelism will **not** be enabled - for this module even if this API is used. - - **Example:** - - .. code:: python - - import smdistributed.modelparallel.torch as smp - - with smp.tensor_parallelism(): - self.m0 = nn.Linear(20, 20) # will be distributed - with smp.tensor_parallelism(enabled=False): - self.m1 = nn.Linear(20, 20) # will not be distributed - - - ``kwargs`` - Keyword arguments that can be used to modify the configurations of - the distributed modules created inside the context. - If a keyword argument provided through it matches any ``__init__`` method arguments - of a ``DistributedModule`` that substitutes a module created inside - the ``smdistributed.modelparallel.torch.tensor_parallelism`` context, this keyword will override - the value defined in the ``init_hook``. - - - (*For v1.7.0 and later*) Through the following additional keyword arguments, - the library supports `NVIDIA Megatron’s fused kernels - `_ - - - ``fused_softmax`` (bool) - Fusion of attention masking and softmax. - By default, it is set to ``True``. 
You can deactivate it by setting - ``fused_softmax=False`` in the ``smdistributed.modelparallel.torch.tensor_parallelism`` context manager. - - ``fused_bias_gelu`` (bool) - Fusion of bias addition and Gelu activation. - By default, it is set to ``False``. You can activate it by setting - ``fused_bias_gelu=True`` in the ``smdistributed.modelparallel.torch.tensor_parallelism`` context manager. - - - -.. function:: smdistributed.modelparallel.torch.set_tensor_parallelism(module, enabled=True, **kwargs) - - - Enables or disables tensor parallelism for the supported - submodules of ``module``. If enabling, the outermost supported - modules will be distributed. If disabling, tensor parallelism will - be disabled for the entire module subtree of ``module``. Unlike - the context manager, this API can be used after the model creation - (but before wrapping with :class:`smdistributed.modelparallel.torch.DistributedModel`), so direct - access to model definition code is not required. If a supported - module shares weights with another (supported or unsupported) - module, or if its hyperparameters do not support distribution - (e.g., not divisible by the tensor parallelism degree), tensor - parallelism will **not** be enabled for this module. - - Keyword arguments ``kwargs`` can be used to modify the - configurations of the distributed modules created inside the - context. If a keyword argument provided here matches any - ``__init__`` method arguments of a :class:`smdistributed.modelparallel.torch.DistributedModel` that - substitutes a module created inside the ``smdistributed.modelparallel.torch.tensor_parallelism`` - context, this keyword will override the value defined in the - ``init_hook``. - - **Example:** - - .. code:: python - - import smdistributed.modelparallel.torch as smp - - model = MyModel() - smp.set_tensor_parallelism(model.encoder, True) - smp.set_tensor_parallelism(model.encoder.embedding, True) - - # outermost supported submodules in model.encoder will be distributed, except for - # model.encoder.embedding - model = smp.DistributedModel(model) - optimizer = smp.DistributedOptimizer(optimizer) - -.. _activation-checkpointing-api: - -Activation Checkpointing APIs ------------------------------ - -``smdistributed.modelparallel`` provides three APIs to enable -activation checkpointing: one for checkpointing modules, -one for checkpointing sequential modules, and -one for checkpointing pretrained models. - -For a conceptual guide and examples, see -`Activation Checkpointing `_ -in the *SageMaker's Distributed Model Parallel developer guide*. - -.. class:: smdistributed.modelparallel.torch.patches.checkpoint.checkpoint(module, *args, preserve_rng_state=True) - - - Checkpoints the module passed. Throws error if, during manual - partitioning, all children of module are not on same rank as the - module itself, i.e. the module tree is split across multiple - partitions. During auto-partitioning, if the module is split - across multiple partitions, then this call is ignored(with a - warning). Note that this call applies to the module instance only, - not to the module class. - - - **Arguments:** - - - ``module (Instance of nn.Module)``: The module to be - checkpointed. Note that unlike native checkpointing in - PyTorch’s, activation checkpointing in - ``smdistributed.modelparallel`` is at the granularity of a - module. A generic function cannot be passed here. - - ``args``: Tuple containing inputs to the module. 
- - ``preserve_rng_state (bool, default=True)``: Omit stashing and - restoring the RNG state during each checkpoint. - -.. class:: smdistributed.modelparallel.torch.patches.checkpoint.checkpoint_sequential(sequential_module, input, strategy="each", preserve_rng_state=True, pack_args_as_tuple=False) - - - Checkpoints the modules inside - `nn.Sequential `__. - This can be used even if different layers that are part of the - sequential container lie on different partitions. Each layer part - of the sequential module that is checkpointed must lie completely - within one partition. If this is not the case during manual - partitioning, then an error will be thrown. If this is not the - case during auto partitioning, a warning will be raised and this - module will be run without checkpointing. - - - **Arguments** - - - ``sequential_module (nn.Sequential)``: the sequential module to - be checkpointed. - - ``input (torch.Tensor or a tuple of torch.Tensors)``: input to - the module, which can be a tensor or a tuple of tensors. If a - tuple is passed, then pack_args_as_tuple should be set to True. - - ``strategy (string, default=“each”)`` : Strategy determines how - many layers part of the sequential module need to be grouped - together for one checkpointing call. This determines how much - memory can be reduced. It can take the following values - - - ``each`` : The default is to checkpoint each module inside - the sequential separately. - - ``contiguous``: Groups consecutive layers on the same - partition together. For example, if a sequential consists of - [a, b, c, d] where a,b are on pp_rank0 and c,d are on - pp_rank 1, then this strategy would checkpoint a,b together - and then c,d together. This means effectively, inputs of a, - outputs of b, inputs of c, and outputs of d are in memory; - the reamining activations are recomputed. - - ``group_2, group_3, group_4, etc:`` More generally, - ``group_x`` where x is an integer. This strategy provides - more flexibility in how many layers to group together. - ``group_x`` groups x layers together on a best effort basis. - It can group x layers together if there are x layers - consecutively on the same partition. For example: - [a,b,c,d,e] where a,b are on pp_rank0 and c,d,e are on - pp_rank 1. If the strategy is ``group_3,`` then a,b are - checkpointed together on pp_rank0 and c,d,e are checkpointed - together on pp_rank1. - - - ``preserve_rng_state (bool, default=True)``: Set to ``False`` - to omit stashing and restoring the RNG state during each - checkpoint. - - ``pack_args_as_tuple (bool, default=False)``: To ensure that - backward works correctly, the autograd function has to unpack - any tuples received. If the checkpointed layer takes a tuple as - input, then this needs to be set to True. - -.. class:: smdistributed.modelparallel.torch.set_activation_checkpointing(module, preserve_rng_state=True, pack_args_as_tuple=False, strategy="each") - - - This API is recommended when importing pretrained models from - libraries, such as PyTorch and Hugging Face Transformers. This is - particularly useful when you don’t have access to the model - definition code and not be able to replace a module call with - checkpoint. - - - **Arguments**: - - - ``module (Instance of nn.Module or nn.Sequential)``: The module - to checkpoint. - - ``preserve_rng_state (bool, default=True)``: Set to ``False`` - to omit stashing and restoring the RNG state during each - checkpoint. 
- - ``pack_args_as_tuple (bool, default=False)``: *Can only be - passed when module is a sequential module.* To ensure that - backward works correctly, the autograd function has to unpack - any tuples received. If the layer checkpointed takes a tuple as - input, then this needs to be set to True. - - ``strategy: (string, default=“each”)``: *Can only be passed - when module is a sequential module.* Strategy determines how - many layers part of the sequential module need to be grouped - together for one checkpointing call. - - This determines how much memory can be reduced. It can take the - following values - - - ``each`` : The default is to checkpoint each module inside - the sequential separately. - - ``contiguous``: Groups consecutive layers on the same - partition together. For example if a sequential consists of - ``[a, b, c, d]`` where ``a, b`` are on ``pp_rank0`` and ``c, d`` are on - ``pp_rank 1``, then this strategy would checkpoint a,b together - and then ``c, d`` together. This means effectively, the inputs of - ``a``, outputs of ``b``, inputs of ``c``, and outputs of ``d`` are in - memory, and the rest of the activations are recomputed. - - ``group_2, group_3, group_4, etc:`` More generally, - ``group_x`` where x is an integer. This strategy provides - more flexibility in how many layers to group together. - ``group_x`` groups x number of layers together on a best - effort basis if there are x layers consecutively in the same - partition. **Example**: Assume a module with layers ``[a, b, - c, d, e]``. The layers a and b are on pp_rank0, and ``c``, ``d``, and - ``e`` are on ``pp_rank 1``. If the strategy is ``group_3,`` then ``a``, - ``b`` are checkpointed together on ``pp_rank0``, and ``c``, ``d``, ``e`` are - checkpointed together on ``pp_rank1``. - -.. _smdmp-tp-appendix: - -Appendix: Reference Implementations for Modules ------------------------------------------------ - -The following are reference implementations for transformer-related -modules. Note that this is not the actual ``smdistributed`` source code, -but the distributed implementations provided in the library are the -distributed versions of these reference implementations, and can be used -to determine whether the distributed modules perform the same operations -as the custom modules in your script. - -To keep the implementations simple, we only assume keyword arguments, -and assume the existence of a method ``parse_args(kwargs)``, which -parses the arguments to ``__init__`` methods and sets the relevant -attributes of the module, such as ``hidden_size`` and -``num_attention_heads``. - -``smdistributed.modelparallel.torch.nn.DistributedTransformer`` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. code:: python - - class Transformer(nn.Module): - def __init__(self, **kwargs): - super(Transformer, self).__init__() - self.parse_args(kwargs) - - self.layers = [] - for l in range(self.num_layers): - self.layers.append(TransformerLayer(**kwargs)) - - self.seq_layers = nn.Sequential(*self.layers) - - def forward(self, inp): - return self.seq_layers(inp) - -``smdistributed.modelparallel.torch.nn.DistributedTransformerLayer`` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. 
code:: python - - class TransformerLayer(nn.Module): - def __init__(self, **kwargs): - super(TransformerLayer, self).__init__() - self.parse_args(kwargs) - - self.attention = AttentionLayer(**kwargs) - self.output = TransformerOutputLayer(**kwargs) - - if self.add_cross_attention: - self.cross_attention = AttentionLayer(cross_attention=True, **kwargs) - - def forward(self, inp): - if self.add_cross_attention: - hidden_states, cross_states, attention_mask, cross_mask = inp - else: - hidden_states, attention_mask = inp - - attention_output = self.attention((hidden_states, attention_mask)) - if self.add_cross_attention: - attention_output = self.cross_attention((attention_output, - cross_states, - cross_mask)) - - output = self.output(attention_output) - - if self.add_cross_attention: - return output, cross_states, attention_mask, cross_mask - else: - return output, attention_mask - -``smdistributed.modelparallel.torch.nn.DistributedAttentionLayer`` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. code:: python - - class AttentionLayer(nn.Module): - def __init__(self, **kwargs): - super(AttentionLayer, self).__init__() - self.parse_args(kwargs) - self.attention_head_size = self.hidden_size // self.num_attention_heads - - self.query = nn.Linear(self.hidden_size, self.hidden_size) - self.key = nn.Linear(self.hidden_size, self.hidden_size) - self.value = nn.Linear(self.hidden_size, self.hidden_size) - self.dense = nn.Linear(self.hidden_size, self.hidden_size) - - self.dropout1 = nn.Dropout(self.attention_dropout_prob) - self.dropout2 = nn.Dropout(self.hidden_dropout_prob) - - if self.pre_layernorm: - self.pre_layernorm = nn.LayerNorm(self.hidden_size, - eps=self.layernorm_epsilon) - - if self.post_layernorm: - self.layernorm = nn.LayerNorm(self.hidden_size, - eps=self.layernorm_epsilon) - - def transpose(self, tensor, key=False): - shape = tensor.size()[:-1] + - (self.num_attention_heads, self.attention_head_size) - tensor = torch.reshape(tensor, shape) - if key: - return tensor.permute(0, 2, 3, 1) - else: - return tensor.permute(0, 2, 1, 3) - - def forward(self, inp): - if self.cross_attention: - hidden_states, cross_states, attention_mask = inp - else: - hidden_states, attention_mask = inp - - if self.pre_layernorm: - norm_states = self.pre_layernorm(hidden_states) - else: - norm_states = hidden_states - - query_layer = self.query(norm_states) - - if self.cross_attention: - key_layer = self.key(cross_states) - value_layer = self.value(cross_states) - else: - key_layer = self.key(norm_states) - value_layer = self.value(norm_states) - - query_layer = self.transpose(query_layer) - key_layer = self.transpose(key_layer, key=True) - value_layer = self.transpose(value_layer) - - attention_scores = torch.matmul(query_layer, key_layer) - attention_scores = attention_scores / math.sqrt(self.attention_head_size) - - if not self.cross_attention and self.causal_mask is not None: - attention_scores = self.apply_causal_mask(attention_scores) - - attention_scores = attention_scores + attention_mask - - attention_probs = F.softmax(attention_scores, dim=-1) - attention_probs = self.dropout1(attention_probs) - - context_layer = torch.matmul(attention_probs, value_layer) - context_layer = context_layer.permute(0, 2, 1, 3) - new_context_layer_shape = context_layer.size()[:-2] + \ - (self.local_attention_size,) - context_layer = torch.reshape(context_layer, new_context_layer_shape) - - self_attention = self.dense(context_layer) - self_attention = self.dropout2(self_attention) - - if 
self.post_layernorm: - return self.layernorm(self_attention + hidden_states) - else: - return self_attention - -``smdistributed.modelparallel.torch.nn.DistributedTransformerOutputLayer`` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. code:: python - - class TransformerOutputLayer(nn.Module): - def __init__(self, **kwargs): - super(TransformerOutputLayer, self).__init__() - self.parse_args(kwargs) - - self.dense1 = nn.Linear(self.hidden_size, self.intermediate_size) - self.dense2 = nn.Linear(self.intermediate_size, self.hidden_size) - - self.dropout = nn.Dropout(self.attention_dropout_prob) - - if self.pre_layernorm: - self.pre_layernorm = nn.LayerNorm(self.hidden_size, - eps=self.layernorm_epsilon) - - if self.post_layernorm: - self.layernorm = nn.LayerNorm(self.hidden_size, - eps=self.layernorm_epsilon) - - def forward(self, inp): - if self.pre_layernorm: - norm_inp = self.pre_layernorm(inp) - else: - norm_inp = inp - - dense1_output = self.dense1(norm_inp) - if self.activation == "gelu": - act_output = F.gelu(dense1_output) - else: - act_output = F.relu(dense1_output) - - dense2_output = self.dense2(act_output) - output = self.dropout(dense2_output) - - if self.post_layernorm: - return self.layernorm(inp + output) - else: - return output diff --git a/doc/api/training/smp_versions/latest/smd_model_parallel_tensorflow.rst b/doc/api/training/smp_versions/latest/smd_model_parallel_tensorflow.rst deleted file mode 100644 index 7f21f7a557..0000000000 --- a/doc/api/training/smp_versions/latest/smd_model_parallel_tensorflow.rst +++ /dev/null @@ -1,165 +0,0 @@ -TensorFlow API -============== - -To use the TensorFlow-specific APIs for SageMaker distributed model parallism, -you need to add the following import statement at the top of your training script. - -.. code:: python - - import smdistributed.modelparallel.tensorflow as smp - -.. tip:: - - Refer to - `Modify a TensorFlow Training Script - `_ - to learn how to use the following APIs in your TensorFlow training script. - -.. class:: smp.DistributedModel - :noindex: - - A sub-class of the Keras \ ``Model`` class, which defines the model to - be partitioned. Model definition is done by sub-classing - ``smp.DistributedModel`` class, and implementing the ``call()`` method, - in the same way as the Keras model sub-classing API. Any operation that - is part of the \ ``smp.DistributedModel.call()`` method is subject to - partitioning, meaning that every operation placed inside executes in - exactly one of the devices (the operations outside run on all devices). - - - Similar to the regular Keras API, the forward pass is done by directly - calling the model object on the input tensors. For example: - - .. code:: python - - predictions = model(inputs)   # model is a smp.DistributedModel object - - However, ``model()`` calls can only be made inside a - ``smp.step``-decorated function. - - The outputs from a ``smp.DistributedModel`` are available in all ranks, - regardless of which rank computed the last operation. - - **Methods:** - - .. function:: save_model(save_path="/opt/ml/model") - - **Inputs** - - ``save_path`` (``string``): A path to save an unpartitioned model with latest training weights. - - Saves the entire, - unpartitioned model with the latest trained weights to ``save_path`` in - TensorFlow ``SavedModel`` format. Defaults to ``"/opt/ml/model"``, which - SageMaker monitors to upload the model artifacts to Amazon S3. - -.. 
function:: smp.partition(index) - - **Inputs** - - - ``index`` (``int``): The index of the partition. - - A context manager which places all operations defined inside into the - partition whose ID is equal to ``index``. When - ``smp.partition`` contexts are nested, the innermost context overrides - the rest. The ``index`` argument must be smaller than the number of - partitions. - - ``smp.partition`` is used in the manual partitioning API; - if \ ``"auto_partition"`` parameter is set to ``True`` while launching - training, then ``smp.partition`` contexts are ignored. Any operation - that is not placed in any ``smp.partition`` context is placed in the - ``default_partition``, as shown in the following example: - - .. code:: python - - # auto_partition: False - # default_partition: 0 - smp.init() - [...] - x = tf.constant(1.2)                     # placed in partition 0 - with smp.partition(1): -     y = tf.add(x, tf.constant(2.3))      # placed in partition 1 -     with smp.partition(3): -         z = tf.reduce_sum(y)             # placed in partition 3 - - -.. function:: register_post_partition_hook(hook) - - Registers a callable ``hook`` to - be executed after the model is partitioned. This is useful in situations - where an operation needs to be executed after the model partition during - the first call to ``smp.step``, but before the actual execution of the - first forward pass. - - .. code:: python - - @smp.register_post_partition_hook - def test_eager(): - # All statements here will be executed right after partition but before the first forward pass - tf.print("Entered hook through eager context") - -.. class:: smp.CheckpointManager - - - A subclass of TensorFlow - `CheckpointManager `__, - which is used to manage checkpoints. The usage is similar to TensorFlow - ``CheckpointManager``. - - The following returns a ``CheckpointManager`` object. - - .. code:: python - - smp.CheckpointManager(checkpoint, -                       directory="/opt/ml/checkpoints", -                       max_to_keep=None, -                       checkpoint_name="ckpt") - - **Parameters** - - - ``checkpoint``: A `tf.train.Checkpoint - `__ instance - that represents a model checkpoint. - - - ``directory``: (``str``) The path to a directory in which to write - checkpoints. A file named "checkpoint" is also written to this - directory (in a human-readable text format) which contains the state - of the ``CheckpointManager``. Defaults to - ``"/opt/ml/checkpoints"``, which is the directory that SageMaker - monitors for uploading the checkpoints to Amazon S3. - - ``max_to_keep`` (``int``): The number of checkpoints to keep. If - ``None``, all checkpoints are kept. - - ``checkpoint_name`` (``str``): Custom name for the checkpoint file. - Defaults to ``"ckpt"``. - - - **Methods:** - - .. function:: save( ) - - Saves a new checkpoint in the specified directory. Internally uses ``tf.train.CheckpointManager.save()``. - - .. function:: restore( ) - - Restores the latest checkpoint in the specified directory. - Internally uses ``tf.train.CheckpointManager.restore()``. - - - **Examples:** - - .. code:: python - - checkpoint = tf.train.Checkpoint(optimizer=optimizer, model=model) - ckpt_manager = smp.CheckpointManager(checkpoint, max_to_keep=5)  # use /opt/ml/checkpoints - - for inputs in train_ds: -     loss = train_step(inputs) -     # [...] -     ckpt_manager.save()  # save a new checkpoint in /opt/ml/checkpoints - - .. 
code:: python - - for step, inputs in enumerate(train_ds): -     if step == 0: -         ckpt_manager.restore() -     loss = train_step(inputs) diff --git a/doc/api/training/smp_versions/model-data-parallel.png b/doc/api/training/smp_versions/model-data-parallel.png deleted file mode 100644 index 089b84673a7a7d24e515f49e8e042384f7268895..0000000000000000000000000000000000000000 GIT binary patch (36777 bytes of binary image data omitted)
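The deleted TensorFlow API page above documents ``smp.DistributedModel``, ``@smp.step``, ``save_model``, and ``smp.CheckpointManager`` separately. The sketch below, which is editorial and not part of the removed files, pulls those pieces into one minimal training loop. It assumes a ``train_ds`` dataset of ``(images, labels)`` batches, a placeholder two-layer model, and a TF2 Keras optimizer that still exposes ``get_gradients``; only the ``smp.*`` calls are taken from the documentation above and below.

.. code:: python

    import tensorflow as tf
    import smdistributed.modelparallel.tensorflow as smp

    smp.init()

    class Model(smp.DistributedModel):
        def __init__(self):
            super().__init__()
            self.dense1 = tf.keras.layers.Dense(64, activation="relu")
            self.dense2 = tf.keras.layers.Dense(10)

        def call(self, x):
            # Everything inside call() is subject to partitioning.
            return self.dense2(self.dense1(x))

    model = Model()
    optimizer = tf.keras.optimizers.Adam()
    loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

    checkpoint = tf.train.Checkpoint(optimizer=optimizer, model=model)
    ckpt_manager = smp.CheckpointManager(checkpoint)   # writes to /opt/ml/checkpoints

    @smp.step
    def forward_backward(images, labels):
        # Runs once per microbatch; model() may only be called inside smp.step.
        predictions = model(images)
        loss = loss_fn(labels, predictions)
        grads = optimizer.get_gradients(loss, model.trainable_variables)
        return grads, loss

    @tf.function
    def train_step(images, labels):
        grads, loss = forward_backward(images, labels)
        # Gradient post-processing runs outside smp.step, on StepOutput objects.
        grads = [g.accumulate() for g in grads]
        optimizer.apply_gradients(zip(grads, model.trainable_variables))
        return loss.reduce_mean()

    for images, labels in train_ds:
        train_step(images, labels)
        ckpt_manager.save()          # partitioned checkpoint under /opt/ml/checkpoints

    model.save_model()               # unpartitioned model saved to /opt/ml/model

The split between ``forward_backward`` and ``train_step`` follows the common API reference later in this patch: per-microbatch work stays inside ``smp.step``, while gradient averaging and ``optimizer.apply_gradients`` run once per step on the returned ``StepOutput`` objects.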
zr5~HUcHLLoyD;*Av1!P^`6!E?retuqwB{`v6HB45Pu1P*mfy|398U9jV4;@_g_7I1>@+o70c9Pny!Ar71CdWS;U85IbF;{F3DxPS~+_xab*UAdF5THAL&EuO_){#mnlO6OtN;;FQnd80Y9Y{`+}V^O*9hBalw zky)4AxzEv=5htrEp#;><-QMRlmF4%Fs?R`mc?X8JhQAO=e zbzV5a%n#XeoT_WDO7r#w9U&8O(@U}I@@tuF@jnemhb5ePuqN?Y0^7~)RdFJYi}#nE zW+UANF5mCziTA?=G3ltLHOKX zIP?C)4j*T=L{9?+v+Rk!pldzO$jkmZifrDFiN6+{;6mNxV|ys`-3&Dhh7#MtYvY#9-n@SMbBj` z*~3&G*76fw?m|jZ)Ma^JF-FEWZj&!59}gZ+pNMiv=+WpOVsCC~uAlj-_+~O=UBkG5 zc!JeC3sqZ_Q%jCUkRX(eaUT#;B_qBJ5!5M6WG2U-H5AS0i=iZfbrqhu9bPfV_h2fh zb$9Nl$4)AUll4A7Y(ll>kJUn$$8a}Q#f7P2niq>0 zeLgI`GY*|*^uq9&uiC&S%`E2FoF4JZOoWOgdLyq`$v6C%{>X+ixz9I;rH<3aTUt$X zkAHi|Do^R%8JDFqFd%3RL(;;>lEH}XLK!Eiq|#st@l$~^YW1zGAK6QPp=7mZ@g_H{ z?xBQPMZuJ|FwM#L@E9kLmT$%C)4gwcEPSNDY0wr9CjNAajq%iIi|&9lM3m~$fD5Or z+g~IXZ(dZh=M6_rZ@FjJvxwr*@l$QT?N20KGtl}y8E#DAD>5D8qESc;Ab6lsb;lod z5l`YgsRLfM^B&Ky%G0*fWv005;6cZ5q{G%klu3dvo;EadY4hcbY|a%aA*L(%%(Cg> z41Ks_M#1`aYz@z;sT^Z!b+|sY|5S^%zL@F zMbv-Z2)#dtNF1Qd7jB<eQL4fnHXy6oBs(@k83t2`ROsh73xRkQL>QG*Rp|lkEP(9cb8u|w7w=zB8e{-QcwR!q5S)_||s9b@$ zsMFfo+UgYEo3VIw9EF;za$VG`%rkXG%yqWN!uQlLE;?JN!UmpH@nboa(u4XUwcjFc(MQjixFX_ISWarZu7q zsen!MVXttcZnZ&r2Rvd;{o*x7c(WGc!1NO>&XX9kT&-}Vg!eP|bzxOR*SaeQxE(r3 z(S{@EcG7JIs%npQiJ?-L{3Xl{5Ki{X-_AS;N2b=BWt)d1pDE~s+%m#{C+SzWfFrge zQ*QG^obATp2)r8un}8}}Vf2t1V&TSwjP!!@g!$ORaAdxZVaT?ObUyi2IuC7x6TibD zuRJy!Ddp|rAMBVibd!RG6ny&)+EcBq4}1S!J6;#*ws04pRK{m#3w>Wb5Hj|I5GbfRC!?!YS)& z*e%eBqV6n>vqWa0Jqnjw+@Mi+CprgYr2n^#n8=)g_Skp{Z-obN2NrxZp%HxYiMPRn z>HBH_a3oKL*&E$(B%Pu}z40qF!)e239&PQ|7Vg$>58-0n5X7@L3?*s7x*wx}H{afN zEY8=(V?hj6@I#Gf7JmG<-HuG=8sRC~-?Y55wHvTuoW9?66k~Dl`>23Bd!SrFvd_ua z2dOiAt&yC>UdN$7q13f$Q{cJmqYUZPU$JW>3PWFBN50kiXQ*^jL{sQjaFW&u;7nw~ z*Mv!72qC^ikCTQIe%_nT!3!F`M0a=g$hn}n3HQmWs;a0N8RdL=Qvy~g+?qG<%0GVm zIO2GUVA5sPz2l8O6ZcinvZSl$1aYX3bComv@W&?&LiXrsp2Ohz&x>6nz-B=76L$#3 zrGS54e3NiD$fM$TUi|YZ`1t?FK3I>SfVzRt)6=6tm8jT?;?yAy{wU_60<=Z3%^|49 zc>$cKr7B&g$iNf(6j3pGA(;LOcEW3#;SnT9Eq~C^(9D0%Wd$tf$;yf=u|f;e`7J&X zgR}_18tR_=bILh~5|K!dCQ5#E0mb8Y9Moo#ANN1-#XW-RCEY)N16C3aNldHOUC3gp zgRX#kjtWOF*L7gC4FA0f6-jNN>SgC1s7|Lq0Ivot7QS%`{&dCP%#0H-ElbG-BH3iy z7Uhw0-0?IGyLNqojS0m9sL($E*E#xdp&&Fgv>u$LV^UJE6osWkLupX1d_NoAM$RnI zmt&v>MRbat3DV9JwHCgfAUE{_FHX8`AyH9kaw^8F>YF+z<>ikpjZ|E>x((@mQK6K` z`$c`N(tVpy1d``R&8Fph(1C_VMy|h1EIWGi=;@aC3yvis{>pSYv6O*1u^A@|%W%%~Bv79|}CBS3wU zOvnbK&-uuudCy`b8rcF@A-Dbo+_cA?h~=Wdx}54~f-jhz=d<7h9MA7>*$+I-8hvjNPtYH8egqKoe8M6GYJT?fmJ_g+`-ASM9TiCPc&7@WLx zFhB1|EFd~R`1Uv?QiGwI&NRqri9k}M=2ZJCexpw67Zxk|IoDXrm#O{oaW0zY0kss) zCs1YcdD)DY&ej8MMtA4nJ$Sr7Pq}`w+ZyV;ws&F1sV`66SxV$iR6kF zaEr%}K9cjLL4WhN>dFq+fbzK4(%xU3C%|C!>x8uuLgM3xh9D|iFliPi%P#}lZDJwS zr_GC?EOGtX&g>CJZN)5L7CTx2G9JsY5vVHV<$vlMozZqiWwxJU*=Uu!Lp)AmKXP*Dm#}sDf$V}%fEzqDYASMW#A zk<3<$WTzxvGy%Whf^wR&*LRM0s3CHpoDEoohrt7VYzK33K3jYL>p+1yMkQxD(g99J zW&y^Y>*f&}R-yMVPRacQ5{}h!~|&d5e5^h7dyHce{C+GpjCd_AjxS zYN!}+F6Erh(cR5ucsyivEH3EDh#tA3Ds{SV^Mwmw^LLI!#$R~CDxzmdMf=Kzasf~I zx%>JeUhkkcMbVgtgzMbH(ubDqYN|+97DSwppYsmu>AnM-)X*Uq>5#A3wd2wvDK6Es zla)Wi)5?YAMIBe+h047r@3UXsd=DNG@kR;Z#L@&b`i%m%6Ka@h~(6J zIaKTQ4xV`%F&Y_leXSM*zig6ciWKnJ9WHu>UiT>YT8VOS$PXu*8T%R2#8YA~M5QeB zvz7D3K&3Z+rG|U|`5WxWl1WIwR~I>VYV?$U@l|oI_BHjiBW^`;-P~lT?FnutMR6L1 zWU$`C;>L6rwEvw@>&Ib2U0Sj-)B3aB`LpRM7g0qxIcGy5|128#NPSo{&}9|= zw|M?J775Kz60B9v2?BACzO#LJzq-lE98md6CE4m436bQd> z)lRsWp=V>lS`5|L)cpKvS^kKu^553#bpG2MNBM|VCg_i!zdA9w{ae2zeEil?vI}+0Re@(?9`FsoOUn0Dhcw575K7H}P=|Q23BvTKbcE zf@_`-$(Asx8t{vFiR%I zYY$0*Lb7Bjg$@sWqW%X3$Y$?=pGH7m;a~l2T(-t zK~IMmqT0^p&HE%IBy8Fj)vo@>;WOw052)VA-l%NpWi^4{lW1#PzF4Sk{rLX?%(6}A diff --git a/doc/api/training/smp_versions/v1.1.0/smd_model_parallel_common_api.rst 
b/doc/api/training/smp_versions/v1.1.0/smd_model_parallel_common_api.rst deleted file mode 100644 index 8a8e87252e..0000000000 --- a/doc/api/training/smp_versions/v1.1.0/smd_model_parallel_common_api.rst +++ /dev/null @@ -1,485 +0,0 @@ -.. admonition:: Contents - - - :ref:`communication_api` - - :ref:`mpi_basics` - -Common API -========== - -The following SageMaker distribute model parallel APIs are common across all frameworks. - -**Important**: This API document assumes you use the following import statement in your training scripts. - -**TensorFlow** - -.. code:: python - - import smdistributed.modelparallel.tensorflow as smp - -**PyTorch** - -.. code:: python - - import smdistributed.modelparallel.torch as smp - - -.. function:: smp.init( ) - :noindex: - - Initialize the library. Must be called at the beginning of training script. - -.. function:: @smp.step(non_split_inputs, input_split_axes, [*args, **kwargs]) - :noindex: - - A decorator that must be placed over a function that represents a single - forward and backward pass (for training use cases), or a single forward - pass (for evaluation use cases). Any computation that is defined inside - the ``smp.step``-decorated function is executed in a pipelined manner. - - By default, every tensor input to the function is split across its batch - dimension into a number of microbatches specified while launching the - training job. This behavior can be customized through the arguments to - ``smp.step``, described below. The library then orchestrates the execution of - each microbatch across all partitions, based on the chosen pipeline - type. - - In a typical use case, forward pass and back-propagation are executed - inside an \ ``smp.step``-decorated function and gradients, loss, and - other relevant metrics (such as accuracy, etc.) are returned from - ``smp.step``-decorated function. - - Any gradient post-processing operation, such as gradient clipping and - allreduce, as well as ``optimizer.apply_gradients`` calls (for TF) or - ``optimizer.step`` (for PT) should be applied on the gradients returned - from the ``smp.step`` function, and not inside the ``smp.step`` - function. This is because every operation inside ``smp.step`` is - executed once per microbatch, so having these operations inside - ``smp.step`` can either be inefficient (in the case of allreduce), or - lead to wrong results (in the case of ``apply_gradients`` / - ``optimizer.step``). - - If the objects returned from the ``smp.step``-decorated function contain - ``tf.Tensor``\ s / ``torch.Tensor``\ s, they are converted to - ``StepOutput`` objects. A ``StepOutput`` object encapsulates all - versions of the tensor across different microbatches - (see ``StepOutput`` entry for more information). - - The argument to ``smp.step`` decorated function should either be a tensor - or an instance of list, tuple, dict or set for it to be split across - microbatches. If your object doesn't fall into this category, you can make - the library split your object, by implementing ``smp_slice`` method. - - Below is an example of how to use it with PyTorch. - - .. code:: python - - class CustomType: - def __init__(self, tensor): - self.data = tensor - - # The library will call this to invoke slicing on the object passing in total microbatches (num_mb) - # and the current microbatch index (mb). 
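- # For illustration only (added comment, not in the deleted file): with num_mb=2 and
- # axis=0, a CustomType wrapping a tensor of shape (4,) is split into two slices of
- # shape (2,); microbatch 0 receives elements 0-1 and microbatch 1 receives elements 2-3.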
- def smp_slice(self, num_mb, mb, axis): - dim_size = list(self.data.size())[axis] - - split_size = dim_size // num_mb - sliced_tensor = self.data.narrow(axis, mb * split_size, split_size) - return CustomType(sliced_tensor, self.other) - - custom_obj = CustomType(torch.ones(4,)) - - @smp.step() - def step(custom_obj): - loss = model(custom_obj) - model.backward(loss) - return loss - - - **Important:** ``smp.step`` splits the batch into microbatches, and - executes everything inside the decorated function once per microbatch. - This might affect the behavior of batch normalization, any operation - that explicitly uses the batch size information, or any other Python - code that is expected to run once. - - **TensorFlow-specific behavior** - - ``smp.step`` is a wrapper that - inherits from and extends the behavior of ``tf.function``, and as such, - all the caveats that apply to the use of ``tf.function``\ s also apply - to ``smp.step``. In particular, any operation that is inside - ``smp.step`` executes in graph mode, and not eager mode. - - In the first call, ``smp.step`` performs tracing of the wrapped function every time - one of the tensor arguments changes their shape or dtype, or for every - new value of a Python argument, if there is one. Tracing is expensive, - so such scenarios should be avoided as much as possible or, - alternatively, an ``input_signature`` argument must be provided. For - more information on the usage of ``tf.function``, refer to the - TensorFlow documentation: - - - https://www.tensorflow.org/api_docs/python/tf/function\ - - https://www.tensorflow.org/guide/function\ - - **Common parameters** - - - ``non_split_inputs`` (``list``): The list of arguments to the decorated function - that should not be split along the batch dimension. Should be used - for all input tensors that do not have a batch dimension. Should be a - list of argument names as ``str``, as they appear in the signature of - the ``smp.step``-decorated function. By default it is considered an - empty list. - - - ``input_split_axes`` (``dict``): A dict that maps the argument name to its batch - axis. The keys should be the argument names as ``str``, as they - appear in the signature of the ``smp.step``-decorated function.  By - default all batch axes are assumed to be the 0-axis. - - **TensorFlow-only parameters** - - - All arguments of ``tf.function``. Note: - The \ ``experimental_compile`` argument of ``tf.function`` may not - work as expected with ``smp.step``, since it interferes with - pipelining and model partitioning. To enable XLA with the library, you can - instead use \ ``tf.config.optimizer.set_jit(True)``. - - **PyTorch-only parameters** - - - ``detach_outputs`` (``bool``) : If ``True``, calls ``torch.Tensor.detach()`` on - all returned ``torch.Tensor`` outputs. Setting it to ``False`` - increases memory consumption, unless ``detach()`` is manually called - on the returned tensors, because the model graph is not cleared from - memory after the training step. Set to \ ``True`` by default. - - **Returns** - - - The same object(s) returned from the decorated function. All - returned \ ``tf.Tensor``, \ ``tf.Variable``  objects (for TF) or - ``torch.Tensor`` objects (for PT) are wrapped inside - a \ ``StepOutput`` object, even when they are inside a Python - ``list``, ``tuple``, or ``dict``. - - - -.. class:: StepOutput - :noindex: - - - A class that encapsulates all versions of a ``tf.Tensor`` - or \ ``torch.Tensor`` across all microbatches. 
- - When a particular ``tf.Tensor`` or ``torch.Tensor`` is computed inside - ``smp.step``, different versions of the tensor are computed for each - microbatch. - - When this tensor is returned from ``smp.step`` and is accessed outside - of the decorated function, it appears as a ``StepOutput`` object, which - contains all such versions. For example, - - - In the case of Tensorflow, the gradient for a particular - ``tf.Variable`` is computed on each microbatch individually, and if - this gradient is returned from ``smp.step``, all gradients for this - ``tf.Variable`` become part of the same ``StepOutput`` object. The - ``StepOutput`` class offers the following API for commonly-used - post-processing operations on such tensors. - - In the case of PyTorch, the loss for each microbatch is computed - individually and all the ``torch.Tensor``\ s that represent the loss - for different microbatches become part of same ``StepOutput`` object, - if loss is returned from the ``smp.step`` function. - - - The ``StepOutput`` class offers the following API for commonly-used - post-processing operations on tensors. - - .. data:: StepOutput.outputs - :noindex: - - Returns a list of the underlying tensors, indexed by microbatch. - - .. function:: StepOutput.reduce_mean( ) - :noindex: - - Returns a ``tf.Tensor``, ``torch.Tensor`` that averages the constituent ``tf.Tensor`` s - ``torch.Tensor`` s. This is commonly used for averaging loss and gradients across microbatches. - - .. function:: StepOutput.reduce_sum( ) - :noindex: - - Returns a ``tf.Tensor`` / - ``torch.Tensor`` that sums the constituent - ``tf.Tensor``\ s/\ ``torch.Tensor``\ s. - - .. function:: StepOutput.concat( ) - :noindex: - - Returns a - ``tf.Tensor``/``torch.Tensor`` that concatenates tensors along the - batch dimension using ``tf.concat`` / ``torch.cat``. - - .. function:: StepOutput.stack( ) - :noindex: - - Applies ``tf.stack`` / ``torch.stack`` - operation to the list of constituent ``tf.Tensor``\ s / - ``torch.Tensor``\ s. - - **TensorFlow-only methods** - - .. function:: StepOutput.merge( ) - :noindex: - - Returns a ``tf.Tensor`` that - concatenates the constituent ``tf.Tensor``\ s along the batch - dimension. This is commonly used for merging the model predictions - across microbatches. - - .. function:: StepOutput.accumulate(method="variable", var=None) - :noindex: - - Functionally the same as ``StepOutput.reduce_mean()``. However, it is - more memory-efficient, especially for large numbers of microbatches, - since it does not wait for all constituent \ ``tf.Tensor``\ s to be - ready to start averaging them, thereby saving memory. - - In some cases (XLA for example) ``StepOutput.reduce_mean()`` might end - up being more memory-efficient than ``StepOutput.accumulate()``. - - **Parameters** - - - ``method`` (``"add_n"`` or ``"accumulate_n"`` or ``"variable"``): - If ``"add_n"`` or ``"accumulate_n"``, the library uses - ``tf.add_n`` and ``tf.accumulate_n``, respectively, to implement - accumulation. If ``"variable"``, the library uses an internal ``tf.Variable`` - into which to accumulate the tensors. Default is \ ``"variable"``. - Note: Memory usage behavior of these choices can depend on the model - and implementation. - - - ``var``: A ``tf.Variable`` into which, if provided, the library uses to - accumulate the tensors. If \ ``None``, the library internally creates a - variable. If ``method`` is not ``"variable"``, this argument is - ignored. - -.. 
_mpi_basics: - :noindex: - -MPI Basics -^^^^^^^^^^ - -The library exposes the following basic MPI primitives to its Python API: - -- ``smp.rank()``: The rank of the current process. -- ``smp.size()``: The total number of processes. -- ``smp.mp_rank()``: The rank of the process among the processes that - hold the current model replica. -- ``smp.dp_rank()``: The rank of the process among the processes that - hold different replicas of the same model partition. -- ``smp.dp_size()``: The total number of model replicas. -- ``smp.local_rank()``: The rank among the processes on the current - instance. -- ``smp.local_size()``: The total number of processes on the current - instance. -- ``smp.get_mp_group()``: The list of ranks over which the current - model replica is partitioned. -- ``smp.get_dp_group()``: The list of ranks that hold different - replicas of the same model partition. - -.. _communication_api: - :noindex: - -Communication API -^^^^^^^^^^^^^^^^^ - -The library provides a few communication primitives which can be helpful while -developing the training script. These primitives use the following -``enum`` s as arguments to specify which processes the communication -should involve. -​ - -**Helper structures** - -.. data:: smp.CommGroup - :noindex: - - An ``enum`` that takes the values - ``CommGroup.WORLD``, ``CommGroup.MP_GROUP``, and ``CommGroup.DP_GROUP``. - These values can also be accessed as ``smp.WORLD``, ``smp.MP_GROUP``, - and ``smp.DP_GROUP`` respectively. - - - ``CommGroup.WORLD``: Represents the entire group of processes used in - training - - ``CommGroup.MP_GROUP``: Represents the group of processes that hold - the same model replica as the current process. The processes in a - single ``MP_GROUP`` collectively store an entire replica of the - model. - - ``CommGroup.DP_GROUP``: Represents the group of processes that hold - the same model partition as the current process. The processes in a - single ``DP_GROUP`` perform data parallelism/allreduce among - themselves. - -.. data:: smp.RankType - :noindex: - - An ``enum`` that takes the values - ``RankType.WORLD_RANK``, ``RankType.MP_RANK``, and ``RankType.DP_RANK``. - - - ``RankType.WORLD_RANK``: The associated rank is to be interpreted as - the rank of the process across all processes used in training. - - ``RankType.MP_RANK``: The associated rank is to be interpreted as the - rank of the process within the ``MP_GROUP``. - - ``RankType.DP_RANK``: The associated rank is to be interpreted as the - rank of the process within the ``DP_GROUP``. - - -**Communication primitives:** - -.. function:: smp.broadcast(obj, group) - :noindex: - - Sends the object to all processes in the - group. The receiving process must call ``smp.recv_from`` to receive the - sent object. - - **Inputs** - - - ``obj``: An arbitrary picklable Python object that will be broadcast. - - - ``group``: A ``CommGroup`` argument that represents to which group of - processes the object will be sent. - - **Notes** - - - When you use ``broadcast`` on the sender process, there needs - to be an accompanying ``smp.recv_from()`` call on the receiver - processes. - - - This is a synchronous call; the ``broadcast`` statement - returns only after all ranks participating in the call have made a - matching ``recv_from`` call. - - **Example** - - .. code:: python - - if smp.rank() == 0: -     smp.broadcast(something, group=smp.CommGroup.WORLD) - else: -     smp.recv_from(0, rank_type=smp.RankType.WORLD_RANK) - -.. 
function:: smp.send(obj, dest_rank, rank_type) - :noindex: - - Sends the object ``obj`` to - ``dest_rank``, which is of a type specified by ``rank_type``. - - **Inputs** - - - ``obj``: An arbitrary picklable Python object that will be sent. - - - ``dest_rank`` (``int``): An integer denoting the rank of the receiving process. - - - ``rank_type`` (``enum``): A ``smp.RankType`` ``enum`` that determines how - ``dest_rank`` is to be interpreted. For example if ``dest_rank`` is 1 - and ``rank_type`` is ``MP_RANK``, then ``obj`` is sent to process - with ``mp_rank`` 1 in the ``MP_GROUP`` which contains the current - process. - - **Notes** - - - Note: \ This is a synchronous call; the ``send`` statement returns - only after the destination rank has made a matching - ``recv_from`` call. - -.. function:: smp.recv_from(src_rank, rank_type) - :noindex: - - Receive an object from a peer process. Can be used with a matching - ``smp.send`` or a ``smp.broadcast`` call. - - **Inputs** - - - ``src_rank`` (``int``): An integer denoting rank of the sending process. - - - ``rank_type`` (``enum``): A ``smp.RankType`` ``enum`` that determines how - ``dest_rank`` is to be interpreted. For example if ``src_rank`` is 1 - and ``rank_type`` is ``MP_RANK``, then the object is received from - the process with ``mp_rank`` 1 in the ``MP_GROUP`` which contains the - current process. - - **Returns** - - Returns the python object that is sent by the peer process. - - **Notes** - - - Note: This is a synchronous call; the ``recv_from`` statement returns - only after the source rank has made a matching ``send`` or - ``broadcast`` call, and the object is received. - -.. function:: smp.allgather(obj, group) - :noindex: - - A collective call that gathers all the - submitted objects across all ranks in the specified ``group``. Returns a - list whose ``i``\ th index contains the object submitted by the - ``i``\ th rank in ``group``. - - **Inputs** - - - ``obj``: An arbitrary picklable Python object that will be - allgathered. - - - ``group`` : A ``CommGroup`` argument that represents which group of - processes participate in ``allgather``. - - **Notes** - - - Note: This is a synchronous call; the ``allgather`` statement returns - only after all ranks participating in the call have made a matching - ``allgather`` call, and all the objects are received at the current - rank. - - **Examples** - - .. code:: python - - # assuming mp_size() == 2 - - if smp.mp_rank() == 0: -     out = smp.allgather(obj1, smp.CommGroup.MP_GROUP)  # returns [obj1, obj2] - else: -     out = smp.allgather(obj2, smp.CommGroup.MP_GROUP)  # returns [obj1, obj2] - -.. function:: smp.barrier(group=smp.WORLD) - :noindex: - - A statement that hangs until all - processes in the specified group reach the barrier statement, similar to - ``MPI_Barrier()``. - - **Inputs** - - - ``group``: An ``smp.CommGroup`` ``enum`` that specifies the group of - processes participating in the barrier call. Defaults to - ``smp.WORLD``. - - **Examples** - - - Assume there are 8 processes and 2 model partitions, and - therefore 4 \ ``mp_group``\ s, and 2 ``dp_group``\ s. If - the \ ``barrier`` call is passed the value ``smp.MP_GROUP`` for its - group argument, then each process only waits until the other process - of its own ``mp_group`` reaches that point. It does not wait for - processes outside that ``mp_group``. - -.. function:: smp.dp_barrier() - :noindex: - - Same as passing ``smp.DP_GROUP``\ to ``smp.barrier()``. 
- Waits for the processes in the same \ ``dp_group`` as - the current process to reach the same point in execution. - -.. function:: smp.mp_barrier() - :noindex: - - Same as passing ``smp.MP_GROUP`` to - ``smp.barrier()``. Waits for the processes in the same ``mp_group`` as - the current process to reach the same point in execution. diff --git a/doc/api/training/smp_versions/v1.1.0/smd_model_parallel_pytorch.rst b/doc/api/training/smp_versions/v1.1.0/smd_model_parallel_pytorch.rst deleted file mode 100644 index 3b822d79e9..0000000000 --- a/doc/api/training/smp_versions/v1.1.0/smd_model_parallel_pytorch.rst +++ /dev/null @@ -1,521 +0,0 @@ -.. admonition:: Contents - - - :ref:`pytorch_saving_loading` - - :ref:`pytorch_saving_loading_instructions` - -PyTorch API -=========== - -**Supported versions: 1.6.0** - -This API document assumes you use the following import statements in your training scripts. - -.. code:: python - - import smdistributed.modelparallel.torch as smp - - -.. tip:: - - Refer to - `Modify a PyTorch Training Script - `_ - to learn how to use the following API in your PyTorch training script. - -.. class:: smp.DistributedModel - :noindex: - - A sub-class of ``torch.nn.Module`` which specifies the model to be - partitioned. Accepts a ``torch.nn.Module`` object ``module`` which is - the model to be partitioned. The returned ``DistributedModel`` object - internally manages model parallelism and data parallelism. Only one - model in the training script can be wrapped with - ``smp.DistributedModel``. - - - **Example:** - - .. code:: python - - model = smp.DistributedModel(model) - - **Important**: The ``__call__`` and  ``backward`` method calls on the - ``smp.DistributedModel`` object (in the following example, the object - is \ ``model``) can only be made inside a ``smp.step``-decorated - function. - - - Since ``DistributedModel``  is a ``torch.nn.Module``, a forward pass can - be performed by calling the \ ``DistributedModel`` object on the input - tensors. - - .. code:: python - - predictions = model(inputs)   # model is a smp.DistributedModel object - - For a backward pass, one needs to call the backward function on - the \ ``DistributedModel`` object, with tensors and gradients as - arguments, replacing the PyTorch operations \ ``torch.Tensor.backward`` - or ``torch.autograd.backward``. - - - The API for ``model.backward`` is very similar to - ``torch.autograd.backward``. For example, the following - ``backward`` calls: - - .. code:: python - - torch.autograd.backward(loss) or loss.backward() - - should be replaced with: - - .. code:: python - - model.backward(loss) # loss is a tensor with only one element as its data - - Similarly, for non-scalar tensors, replace the following - ``backward`` call containing incoming gradient arguments: - - .. code:: python - - torch.autograd.backward(outputs, out_grads) - - with the following line: - - .. code:: python - - model.backward(outputs, out_grads) - - In these examples, all ``__call__``  and ``backward`` method calls on - the model objects (``model(inputs)`` and ``model.backward(loss)``) must be made inside - a ``smp.step``-decorated function. - - **Parameters** - - - ``module`` (``torch.nn.Module``): Module to be distributed (data parallelism and model parallelism). - - - ``trace_device`` (``"cpu"`` or ``"gpu"``) (default: ``"gpu"``) - Whether to perform the tracing step on the GPU or CPU. 
The tracing step gathers - information on the order of execution of modules, the shapes of - intermediate outputs, and execution times, to be used by the - partitioning algorithm. If ``trace_device`` is set to GPU, accurate - module execution times can be gathered during tracing for potentially - improved partitioning decision. However, if the model is too large to - fit in a single GPU, then ``trace_device`` should be set to CPU. - - - ``trace_execution_times`` (``bool``) (default: ``False``): If ``True``, - the library profiles the execution time of each module during tracing, and uses - it in the partitioning decision. This improves the partitioning - decision, but it might make the tracing slower. It may also introduce - some degree of non-determinism in partitioning results, because of the - inherent randomness in module execution times. Must be ``False`` if - ``trace_device`` is ``"cpu"``. - - - ``overlapping_allreduce`` (``bool``) (default: ``True``): This is only - applicable for hybrid data parallelism/model parallelism use cases (when - ``ddp`` is set to ``True`` while launching training). The library uses this flag - to decide whether to do overlapping allreduce whenever a parameter - gradients are ready. This leads to overlapping of communication and - computation and can improve performance. If this is set to ``False`` , - allreduce is performed at the end of the step. - - - ``backward_passes_per_step`` (``int``) (default: 1): This is only - applicable for hybrid data parallelism/model parallelism use cases (when - ``ddp`` is set to ``True`` in config). This parameter indicates the - number of backward passes to perform before calling allreduce on DDP. - This allows accumulating updates over multiple mini-batches before - reducing and applying them. - - - ``average_grads_across_microbatches`` (``bool``) (default: ``True``): - Whether or not the computed gradients should be averaged across - microbatches. If ``False``, the computed gradients will be summed across - microbatches, but not divided by the number of microbatches. In typical - use case where the computed loss is averaged over the mini-batch, this - should be left as ``True``. If you use a loss function that only sums - the per-sample loss across the batch (and not divide by the batch size), - then this must be set to ``False`` for correctness. - - - ``bucket_cap_mb`` (default: 25): \ ``DistributedDataParallel`` buckets - parameters into multiple buckets so that gradient reduction of each - bucket can potentially overlap with backward - computation. \ ``bucket_cap_mb``\ controls the bucket size in MegaBytes - (MB). - - - ``trace_memory_usage`` (default: False): When set to True, the library attempts - to measure memory usage per module during tracing. If this is disabled, - memory usage will be estimated through the sizes of tensors returned from - the module. - - **Properties** - - - ``partitioned``: Is ``True`` if the model is partitioned, ``False`` - otherwise. Initialized to ``False`` when ``DistributedModel`` is first - created. It becomes be ``True`` during the first call - to ``smp.step``-decorated function. Once the model is partitioned, the - local parameters or local ``state_dict`` can be fetched using the - following methods. - - **Methods** - - .. function:: backward(tensors, grad_tensors) - :noindex: - - Triggers a distributed backward - pass across model partitions. Example usage provided in the previous - section. 
The API is very similar - to https://pytorch.org/docs/stable/autograd.html#torch.autograd.backward. - ``retain_grad`` and ``create_graph``  flags are not supported. - - .. function:: local_buffers( ) - :noindex: - - Returns an iterator over buffers for the modules in - the partitioned model that have been assigned to the current process. - - .. function:: local_named_buffers( ) - :noindex: - - Returns an iterator over buffers for the - modules in the partitioned model that have been assigned to the current - process. This yields both the name of the buffer as well as the buffer - itself. - - .. function:: local_parameters( ) - :noindex: - - Returns an iterator over parameters for the - modules in the partitioned model that have been assigned to the current - process. - - .. function:: local_named_parameters( ) - :noindex: - - Returns an iterator over parameters for - the modules in the partitioned model that have been assigned to the - current process. This yields both the name of the parameter as well as - the parameter itself. - - .. function:: local_modules( ) - :noindex: - - Returns an iterator over the modules in the - partitioned model that have been assigned to the current process. - - .. function:: local_named_modules( ) - :noindex: - - Returns an iterator over the modules in the - partitioned model that have been assigned to the current process. This - yields both the name of the module as well as the module itself. - - .. function:: local_state_dict( ) - :noindex: - - Returns the ``state_dict`` that contains local - parameters that belong to the current \ ``mp_rank``. This ``state_dict`` - contains a key \ ``_smp_is_partial`` to indicate this is a - partial \ ``state_dict``, which indicates whether the - ``state_dict`` contains elements corresponding to only the current - partition, or to the entire model. - - .. function:: state_dict( ) - :noindex: - - Returns the ``state_dict`` that contains parameters - for the entire model. It first collects the \ ``local_state_dict``  and - gathers and merges the \ ``local_state_dict`` from all ``mp_rank``\ s to - create a full ``state_dict``. - - .. function:: load_state_dict( ) - :noindex: - - Same as the ``torch.module.load_state_dict()`` , - except: It first gathers and merges the ``state_dict``\ s across - ``mp_rank``\ s, if they are partial. The actual loading happens after the - model partition so that each rank knows its local parameters. - - .. function:: register_post_partition_hook(hook) - :noindex: - - Registers a callable ``hook`` to - be executed after the model is partitioned. This is useful in situations - where an operation needs to be executed after the model partition during - the first call to ``smp.step`` but before the actual execution of the - first forward pass. Returns a ``RemovableHandle`` object ``handle``, - which can be used to remove the hook by calling ``handle.remove()``. - - .. function:: cpu( ) - :noindex: - - Allgathers parameters and buffers across all ``mp_rank``\ s and moves them - to the CPU. - -.. class:: smp.DistributedOptimizer - :noindex: - - **Parameters** - - ``optimizer`` - - An optimizer wrapper for saving/loading optimizer states. This wrapper - returns ``optimizer`` with the following methods overridden: - - .. function:: state_dict( ) - :noindex: - - Returns the ``state_dict`` that contains optimizer state for the entire model. - It first collects the ``local_state_dict`` and gathers and merges - the ``local_state_dict`` from all ``mp_rank``s to create a full - ``state_dict``. 
Please note that this needs to be called on all ranks with - ``dp_rank()==0`` to ensure the gather happens properly. - If it is only called on all such ranks, it can hang. - - .. function:: load_state_dict( ) - :noindex: - - Same as the ``torch.optimizer.load_state_dict()`` , except: - - - It first gathers and merges the local ``state_dict``\ s if they are - partial. - - The actual loading happens after the model partition so that each - rank knows its local parameters. - - .. function:: local_state_dict( ) - :noindex: - - Returns the ``state_dict`` that contains the - local optimizer state that belongs to the current \ ``mp_rank``. This - ``state_dict`` contains a key \ ``_smp_is_partial`` to indicate this is - a partial \ ``state_dict``, which indicates whether the - ``state_dict`` contains elements corresponding to only the current - partition, or to the entire model. - - ​ -.. function:: smp.partition(index) - :noindex: - - **Inputs** - - - ``index`` (int) - The index of the partition. - - A context manager which places all modules defined inside into the - partition with ID ``index``.  The ``index`` argument must be less than - the number of partitions. - - Use ``smp.partition`` to implement manual partitioning. - If ``"auto_partition"`` is ``True``, then the - ``smp.partition`` contexts are ignored. Any module that is not placed in - any ``smp.partition`` context is placed in the - ``default_partition`` defined through the SageMaker Python SDK. - - When ``smp.partition`` contexts are nested, the innermost context - overrides the rest (see the following example). In PyTorch, manual - partitioning should be done inside the module \ ``__init__``, and the - partition assignment applies to the modules that are *created* inside - the ``smp.partition`` context. - - Example: - - .. code:: python - - class Model(torch.nn.Module): -     def __init__(self): -         with smp.partition(1): -             self.child0 = Child0()            # child0 on partition 1 -             with smp.partition(2): -                 self.child1 = Child1()        # child1 on partition 2 -             self.child2 = Child2()            # child2 on partition 1 -         self.child3 = Child3()                # child3 on default_partition - -.. function:: smp.get_world_process_group( ) - :noindex: - - Returns a ``torch.distributed`` ``ProcessGroup`` that consists of all - processes, which can be used with the ``torch.distributed`` API. - Requires ``"ddp": True`` in SageMaker Python SDK parameters. - -.. function:: smp.get_mp_process_group( ) - :noindex: - - Returns a ``torch.distributed`` ``ProcessGroup`` that consists of the - processes in the ``MP_GROUP`` which contains the current process, which - can be used with the \ ``torch.distributed`` API. Requires - ``"ddp": True`` in SageMaker Python SDK parameters. - -.. function:: smp.get_dp_process_group( ) - :noindex: - - Returns a ``torch.distributed`` ``ProcessGroup`` that consists of the - processes in the ``DP_GROUP`` which contains the current process, which - can be used with the \ ``torch.distributed`` API. Requires - ``"ddp": True`` in SageMaker Python SDK parameters. - -.. function:: smp.is_initialized( ) - :noindex: - - Returns ``True`` if ``smp.init`` has already been called for the - process, and ``False`` otherwise. - -.. function::smp.is_tracing( ) - :noindex: - - Returns ``True`` if the current process is running the tracing step, and - ``False`` otherwise. - -.. 
data:: smp.nn.FusedLayerNorm - :noindex: - - `Apex Fused Layer Norm `__ is currently not - supported by the library. ``smp.nn.FusedLayerNorm`` replaces ``apex`` - ``FusedLayerNorm`` and provides the same functionality. This requires - ``apex`` to be installed on the system. - -.. data:: smp.optimizers.FusedNovoGrad - :noindex: - - `Fused Novo Grad optimizer `__ is - currently not supported by the library. ``smp.optimizers.FusedNovoGrad`` replaces ``apex`` ``FusedNovoGrad`` - optimizer and provides the same functionality. This requires ``apex`` to - be installed on the system. - -.. data:: smp.optimizers.FusedLamb - :noindex: - - `FusedLamb optimizer `__ - currently doesn’t work with the library. ``smp.optimizers.FusedLamb`` replaces - ``apex`` ``FusedLamb`` optimizer and provides the same functionality. - This requires ``apex`` to be installed on the system. - -.. data:: smp.amp.GradScaler - :noindex: - - `Torch AMP Gradscaler `__ - currently doesn’t work with the library. ``smp.amp.GradScaler`` replaces - ``torch.amp.GradScaler`` and provides the same functionality. - -.. _pytorch_saving_loading: - :noindex: - -APIs for Saving and Loading -^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -.. function:: smp.save( ) - :noindex: - - Saves an object. This operation is similar to ``torch.save()``, except - it has an additional keyword argument, ``partial``, and accepts only - string type for the argument ``f`` (file). If ``partial=True``, each - ``mp_rank`` saves a separate checkpoint file and the library adds an ``mp_rank`` - index to your saved file. - - **Parameters** - - - ``obj`` (dict): A saved object. - - ``f`` (str): A string containing a file name. - - ``partial`` (bool, default= ``True``):  When set to ``True``, each - ``mp_rank`` saves a separate checkpoint file and the library adds an - ``mp_rank`` index to the saved file. If you want to be able to load - and further train a model that you save with ``smp.save()``, you must - set ``partial=True``. - - ``pickle_module`` (picklemodule, default = module ``"pickle"`` from ``"/opt/conda/lib/python3.6/pickle.py"``): - A module used for pickling metadata and objects. - - ``pickle_protocol``  (int, default=2): Can be specified to - override the defaultprotocol. - -.. function:: smp.load( ) - :noindex: - - Loads an object saved with ``smp.save()`` from a file. - - Similar to, `torch.load() `__, - except it has an additional keyword argument, ``partial``, and accepts - only string type for the argument ``f`` (file). If \ ``partial=True``, - then each ``mp_rank`` loads a separate checkpoint file. - - **Parameters** - - - ``f`` (string): A string containing a file name. - - ``map_location`` (function): A function - `torch.device `__, - a string, or a dict specifying how to remap storage locations. - - ``pickle_module`` (pickle module): A module used for unpickling - metadata and objects (has to match the \ ``pickle_module``\ used to - serialize file). - - ``pickle_load_args`` (Python 3 only): Optional keyword arguments - passed to ``pickle_module.load()`` and ``pickle_module.Unpickler()``. - - ``partial`` (bool, default= ``True``): When set to ``True``, each - ``mp_rank`` loads the checkpoint corresponding to the ``mp_rank``. - Should be used when loading a model trained with the library. - -.. _pytorch_saving_loading_instructions: - :noindex: - -General Instruction For Saving and Loading -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -The library can save partial or full checkpoints. 
- -- For partial checkpoints, each ``mp_rank`` saves its own checkpoint - file with only the parameters that belong to that rank. -- For full checkpoints, the library saves a single checkpoint that contains - entire model parameters. - -When **saving** using ``smp.save()``, each rank only holds its own -parameters. If you want to save the full model, there will be some -communication between the ranks to create the full model. If you save -checkpoints often, you should save partial checkpoints for best -performance. - -When **loading** using ``smp.load()``, the library can load either partial or | -full checkpoints or full checkpoints saved by a non-model-parallel model. If you -want to resume training with a non-model-parallel model or do inference, you need -a full checkpoint. - -The following is an example of how you can save and load a checkpoint: - -.. code:: python - - # Original model and optimizer - model = MyModel(...) - optimizer = MyOpt(...) - - # model parallel wrapper - model = smp.DistributedModel(model) - optimizer = smp.DistributedOptimizer(optimizer) - - # To save, always save on dp_rank 0 to avoid data racing - if partial: -     # To save the partial model on each mp rank -     # the library will create `checkpoint.pt_{mprank}` for each mp rank -     if save_partial_model: -         if smp.dp_rank() == 0: -             model_dict = model.local_state_dict() # save the partial model -             opt_dict = optimizer.local_state_dict() # save the partial optimizer state -             smp.save( -                 {"model_state_dict": model_dict, "optimizer_state_dict": opt_dict}, -                 f"/checkpoint.pt", -                 partial=True, -             ) - -     # To save the full model -     if save_full_model: -         if smp.dp_rank() == 0: -             model_dict = model.state_dict() # save the full model -             opt_dict = optimizer.state_dict() # save the full optimizer state -             smp.save( -                 {"model_state_dict": model_dict, "optimizer_state_dict": opt_dict}, -                 "/checkpoint.pt", -                 partial=False, -             ) - - # To load, load on all ranks. - # The only difference for partial/full loading is the partial flag in smp.load - # Load partial checkpoint - if partial_checkpoint: -    checkpoint = smp.load("/checkpoint.pt", partial=True) -    model.load_state_dict(checkpoint["model_state_dict"]) -    optimizer.load_state_dict(checkpoint["optimizer_state_dict"]) - # Load full checkpoint - if full_checkpoint: -    checkpoint = smp.load("/checkpoint.pt", partial=False) -    model.load_state_dict(checkpoint["model_state_dict"]) -    optimizer.load_state_dict(checkpoint["optimizer_state_dict"]) diff --git a/doc/api/training/smp_versions/v1.1.0/smd_model_parallel_tensorflow.rst b/doc/api/training/smp_versions/v1.1.0/smd_model_parallel_tensorflow.rst deleted file mode 100644 index 252c60d16b..0000000000 --- a/doc/api/training/smp_versions/v1.1.0/smd_model_parallel_tensorflow.rst +++ /dev/null @@ -1,164 +0,0 @@ -TensorFlow API -============== - -**Supported version: 2.3.1** - -**Important**: This API document assumes you use the following import statement in your training scripts. - -.. code:: python - - import smdistributed.modelparallel.tensorflow as smp - -.. tip:: - - Refer to - `Modify a TensorFlow Training Script - `_ - to learn how to use the following API in your TensorFlow training script. - -.. 
class:: smp.DistributedModel - :noindex: - - A sub-class of the Keras \ ``Model`` class, which defines the model to - be partitioned. Model definition is done by sub-classing - ``smp.DistributedModel`` class, and implementing the ``call()`` method, - in the same way as the Keras model sub-classing API. Any operation that - is part of the \ ``smp.DistributedModel.call()`` method is subject to - partitioning, meaning that every operation placed inside executes in - exactly one of the devices (the operations outside run on all devices). - - - Similar to the regular Keras API, the forward pass is done by directly - calling the model object on the input tensors. For example: - - .. code:: python - - predictions = model(inputs)   # model is a smp.DistributedModel object - - However, ``model()`` calls can only be made inside a - ``smp.step``-decorated function. - - The outputs from a ``smp.DistributedModel`` are available in all ranks, - regardless of which rank computed the last operation. - - **Methods:** - - .. function:: save_model(save_path="/opt/ml/model") - :noindex: - - **Inputs** - - ``save_path`` (``string``): A path to save an unpartitioned model with latest training weights. - - Saves the entire, - unpartitioned model with the latest trained weights to ``save_path`` in - TensorFlow ``SavedModel`` format. Defaults to ``"/opt/ml/model"``, which - SageMaker monitors to upload the model artifacts to Amazon S3. - -.. function:: smp.partition(index) - :noindex: - - **Inputs** - - - ``index`` (``int``): The index of the partition. - - A context manager which places all operations defined inside into the - partition whose ID is equal to ``index``. When - ``smp.partition`` contexts are nested, the innermost context overrides - the rest. The ``index`` argument must be smaller than the number of - partitions. - - ``smp.partition`` is used in the manual partitioning API; - if \ ``"auto_partition"`` parameter is set to ``True`` while launching - training, then ``smp.partition`` contexts are ignored. Any operation - that is not placed in any ``smp.partition`` context is placed in the - ``default_partition``, as shown in the following example: - - .. code:: python - - # auto_partition: False - # default_partition: 0 - smp.init() - [...] - x = tf.constant(1.2)                     # placed in partition 0 - with smp.partition(1): -     y = tf.add(x, tf.constant(2.3))      # placed in partition 1 -     with smp.partition(3): -         z = tf.reduce_sum(y)             # placed in partition 3 - - ​ - -.. class:: smp.CheckpointManager - :noindex: - - A subclass of TensorFlow - `CheckpointManager `__, - which is used to manage checkpoints. The usage is similar to TensorFlow - ``CheckpointManager``. - - The following returns a ``CheckpointManager`` object. - - .. code:: python - - smp.CheckpointManager(checkpoint, -                       directory="/opt/ml/checkpoints", -                       max_to_keep=None, -                       checkpoint_name="ckpt") - - - **Important:** ``smp.CheckpointManager.restore()`` must be called after - the first training step. This is because the first call of the - ``smp.step`` function constructs and partitions the model, which must - take place before the checkpoint restore. Calling it before the first - ``smp.step`` call might result in hangs or unexpected behavior. - - **Parameters** - - - ``checkpoint``: A `tf.train.Checkpoint - `__ instance - that represents a model checkpoint. - - - ``directory``: (``str``) The path to a directory in which to write - checkpoints. 
A file named "checkpoint" is also written to this - directory (in a human-readable text format) which contains the state - of the ``CheckpointManager``. Defaults to - ``"/opt/ml/checkpoints"``, which is the directory that SageMaker - monitors for uploading the checkpoints to Amazon S3. - - ``max_to_keep`` (``int``): The number of checkpoints to keep. If - ``None``, all checkpoints are kept. - - ``checkpoint_name`` (``str``): Custom name for the checkpoint file. - Defaults to ``"ckpt"``. - - - **Methods:** - - .. function:: save( ) - :noindex: - - Saves a new checkpoint in the specified directory. Internally uses ``tf.train.CheckpointManager.save()``. - - .. function:: restore( ) - :noindex: - - Restores the latest checkpoint in the specified directory. - Internally uses ``tf.train.CheckpointManager.restore()``. - - - **Examples:** - - .. code:: python - - checkpoint = tf.train.Checkpoint(optimizer=optimizer, model=model) - ckpt_manager = smp.CheckpointManager(checkpoint, max_to_keep=5)  # use /opt/ml/checkpoints - - for inputs in train_ds: -     loss = train_step(inputs) -     # [...] -     ckpt_manager.save()  # save a new checkpoint in /opt/ml/checkpoints - - .. code:: python - - for step, inputs in enumerate(train_ds): -     if step == 1:                    # NOTE: restore occurs on the second step -         ckpt_manager.restore() -     loss = train_step(inputs) - diff --git a/doc/api/training/smp_versions/v1.10.0/smd_model_parallel_common_api.rst b/doc/api/training/smp_versions/v1.10.0/smd_model_parallel_common_api.rst deleted file mode 100644 index b4713b2707..0000000000 --- a/doc/api/training/smp_versions/v1.10.0/smd_model_parallel_common_api.rst +++ /dev/null @@ -1,538 +0,0 @@ -Common API -========== - -The following SageMaker distribute model parallel APIs are common across all frameworks. - -.. contents:: Table of Contents - :depth: 3 - :local: - -The Library's Core APIs ------------------------ - -This API document assumes you use the following import statement in your training scripts. - -**TensorFlow** - -.. code:: python - - import smdistributed.modelparallel.tensorflow as smp - -**PyTorch** - -.. code:: python - - import smdistributed.modelparallel.torch as smp - - -.. function:: smp.init( ) - :noindex: - - Initialize the library. Must be called at the beginning of training script. - -.. function:: @smp.step(non_split_inputs, input_split_axes, [*args, **kwargs]) - :noindex: - - A decorator that must be placed over a function that represents a single - forward and backward pass (for training use cases), or a single forward - pass (for evaluation use cases). Any computation that is defined inside - the ``smp.step``-decorated function is executed in a pipelined manner. - - By default, every tensor input to the function is split across its batch - dimension into a number of microbatches specified while launching the - training job. This behavior can be customized through the arguments to - ``smp.step``, described below. The library then orchestrates the execution of - each microbatch across all partitions, based on the chosen pipeline - type. - - In a typical use case, forward pass and back-propagation are executed - inside an \ ``smp.step``-decorated function and gradients, loss, and - other relevant metrics (such as accuracy, etc.) are returned from - ``smp.step``-decorated function. 
- - Any gradient post-processing operation, such as gradient clipping and - allreduce, as well as ``optimizer.apply_gradients`` calls (for TF) or - ``optimizer.step`` (for PT) should be applied on the gradients returned - from the ``smp.step`` function, and not inside the ``smp.step`` - function. This is because every operation inside ``smp.step`` is - executed once per microbatch, so having these operations inside - ``smp.step`` can either be inefficient (in the case of allreduce), or - lead to wrong results (in the case of ``apply_gradients`` / - ``optimizer.step``). - - If the objects returned from the ``smp.step``-decorated function contain - ``tf.Tensor``\ s / ``torch.Tensor``\ s, they are converted to - ``StepOutput`` objects. A ``StepOutput`` object encapsulates all - versions of the tensor across different microbatches - (see ``StepOutput`` entry for more information). - - The argument to ``smp.step`` decorated function should either be a tensor - or an instance of list, tuple, dict or set for it to be split across - microbatches. If your object doesn't fall into this category, you can make - the library split your object, by implementing ``smp_slice`` method. - - Below is an example of how to use it with PyTorch. - - .. code:: python - - class CustomType: - def __init__(self, tensor): - self.data = tensor - - # The library will call this to invoke slicing on the object passing in total microbatches (num_mb) - # and the current microbatch index (mb). - def smp_slice(self, num_mb, mb, axis): - dim_size = list(self.data.size())[axis] - - split_size = dim_size // num_mb - sliced_tensor = self.data.narrow(axis, mb * split_size, split_size) - return CustomType(sliced_tensor, self.other) - - custom_obj = CustomType(torch.ones(4,)) - - @smp.step() - def step(custom_obj): - loss = model(custom_obj) - model.backward(loss) - return loss - - - **Important:** ``smp.step`` splits the batch into microbatches, and - executes everything inside the decorated function once per microbatch. - This might affect the behavior of batch normalization, any operation - that explicitly uses the batch size information, or any other Python - code that is expected to run once. - - **TensorFlow-specific behavior** - - ``smp.step`` is a wrapper that - inherits from and extends the behavior of ``tf.function``, and as such, - all the caveats that apply to the use of ``tf.function``\ s also apply - to ``smp.step``. In particular, any operation that is inside - ``smp.step`` executes in graph mode, and not eager mode. - - In the first call, ``smp.step`` performs tracing of the wrapped function every time - one of the tensor arguments changes their shape or dtype, or for every - new value of a Python argument, if there is one. Tracing is expensive, - so such scenarios should be avoided as much as possible or, - alternatively, an ``input_signature`` argument must be provided. For - more information on the usage of ``tf.function``, refer to the - TensorFlow documentation: - - - https://www.tensorflow.org/api_docs/python/tf/function\ - - https://www.tensorflow.org/guide/function\ - - Each ``smp.step`` decorated function must have a return value that depends on the - output of ``smp.DistributedModel``. - - **Common parameters** - - - ``non_split_inputs`` (``list``): The list of arguments to the decorated function - that should not be split along the batch dimension. Should be used - for all input tensors that do not have a batch dimension. 
Should be a - list of argument names as ``str``, as they appear in the signature of - the ``smp.step``-decorated function. By default it is considered an - empty list. - - - ``input_split_axes`` (``dict``): A dict that maps the argument name to its batch - axis. The keys should be the argument names as ``str``, as they - appear in the signature of the ``smp.step``-decorated function.  By - default all batch axes are assumed to be the 0-axis. - - **TensorFlow-only parameters** - - - All arguments of ``tf.function``. Note: - The \ ``experimental_compile`` argument of ``tf.function`` may not - work as expected with ``smp.step``, since it interferes with - pipelining and model partitioning. To enable XLA with the library, you can - instead use \ ``tf.config.optimizer.set_jit(True)``. - - **PyTorch-only parameters** - - - ``detach_outputs`` (``bool``) : If ``True``, calls ``torch.Tensor.detach()`` on - all returned ``torch.Tensor`` outputs. Setting it to ``False`` - increases memory consumption, unless ``detach()`` is manually called - on the returned tensors, because the model graph is not cleared from - memory after the training step. Set to \ ``True`` by default. - - **Returns** - - - The same object(s) returned from the decorated function. All - returned \ ``tf.Tensor``, \ ``tf.Variable``  objects (for TF) or - ``torch.Tensor`` objects (for PT) are wrapped inside - a \ ``StepOutput`` object, even when they are inside a Python - ``list``, ``tuple``, or ``dict``. - - - -.. class:: StepOutput - :noindex: - - - A class that encapsulates all versions of a ``tf.Tensor`` - or \ ``torch.Tensor`` across all microbatches. - - When a particular ``tf.Tensor`` or ``torch.Tensor`` is computed inside - ``smp.step``, different versions of the tensor are computed for each - microbatch. - - When this tensor is returned from ``smp.step`` and is accessed outside - of the decorated function, it appears as a ``StepOutput`` object, which - contains all such versions. For example, - - - In the case of Tensorflow, the gradient for a particular - ``tf.Variable`` is computed on each microbatch individually, and if - this gradient is returned from ``smp.step``, all gradients for this - ``tf.Variable`` become part of the same ``StepOutput`` object. The - ``StepOutput`` class offers the following API for commonly-used - post-processing operations on such tensors. - - In the case of PyTorch, the loss for each microbatch is computed - individually and all the ``torch.Tensor``\ s that represent the loss - for different microbatches become part of same ``StepOutput`` object, - if loss is returned from the ``smp.step`` function. - - - The ``StepOutput`` class offers the following API for commonly-used - post-processing operations on tensors. - - .. data:: StepOutput.outputs - :noindex: - - Returns a list of the underlying tensors, indexed by microbatch. - - .. function:: StepOutput.reduce_mean( ) - :noindex: - - Returns a ``tf.Tensor``, ``torch.Tensor`` that averages the constituent ``tf.Tensor`` s - ``torch.Tensor`` s. This is commonly used for averaging loss and gradients across microbatches. - - .. function:: StepOutput.reduce_sum( ) - :noindex: - - Returns a ``tf.Tensor`` / - ``torch.Tensor`` that sums the constituent - ``tf.Tensor``\ s/\ ``torch.Tensor``\ s. - - .. function:: StepOutput.concat( ) - :noindex: - - Returns a - ``tf.Tensor``/``torch.Tensor`` that concatenates tensors along the - batch dimension using ``tf.concat`` / ``torch.cat``. - - .. 
function:: StepOutput.stack( ) - :noindex: - - Applies ``tf.stack`` / ``torch.stack`` - operation to the list of constituent ``tf.Tensor``\ s / - ``torch.Tensor``\ s. - - **TensorFlow-only methods** - - .. function:: StepOutput.merge( ) - :noindex: - - Returns a ``tf.Tensor`` that - concatenates the constituent ``tf.Tensor``\ s along the batch - dimension. This is commonly used for merging the model predictions - across microbatches. - - .. function:: StepOutput.accumulate(method="variable", var=None) - :noindex: - - Functionally the same as ``StepOutput.reduce_mean()``. However, it is - more memory-efficient, especially for large numbers of microbatches, - since it does not wait for all constituent \ ``tf.Tensor``\ s to be - ready to start averaging them, thereby saving memory. - - In some cases (XLA for example) ``StepOutput.reduce_mean()`` might end - up being more memory-efficient than ``StepOutput.accumulate()``. - - **Parameters** - - - ``method`` (``"add_n"`` or ``"accumulate_n"`` or ``"variable"``): - If ``"add_n"`` or ``"accumulate_n"``, the library uses - ``tf.add_n`` and ``tf.accumulate_n``, respectively, to implement - accumulation. If ``"variable"``, the library uses an internal ``tf.Variable`` - into which to accumulate the tensors. Default is \ ``"variable"``. - Note: Memory usage behavior of these choices can depend on the model - and implementation. - - - ``var``: A ``tf.Variable`` into which, if provided, the library uses to - accumulate the tensors. If \ ``None``, the library internally creates a - variable. If ``method`` is not ``"variable"``, this argument is - ignored. - -.. _mpi_basics: - :noindex: - -MPI Basics ----------- - -The library exposes the following basic MPI primitives to its Python API: - -**Global** - -- ``smp.rank()`` : The global rank of the current process. -- ``smp.size()`` : The total number of processes. -- ``smp.get_world_process_group()`` : - ``torch.distributed.ProcessGroup`` that contains all processes. -- ``smp.CommGroup.WORLD``: The communication group corresponding to all processes. -- ``smp.local_rank()``: The rank among the processes on the current instance. -- ``smp.local_size()``: The total number of processes on the current instance. -- ``smp.get_mp_group()``: The list of ranks over which the current model replica is partitioned. -- ``smp.get_dp_group()``: The list of ranks that hold different replicas of the same model partition. - -**Tensor Parallelism** - -- ``smp.tp_rank()`` : The rank of the process within its - tensor-parallelism group. -- ``smp.tp_size()`` : The size of the tensor-parallelism group. -- ``smp.get_tp_process_group()`` : Equivalent to - ``torch.distributed.ProcessGroup`` that contains the processes in the - current tensor-parallelism group. -- ``smp.CommGroup.TP_GROUP`` : The communication group corresponding to - the current tensor parallelism group. - -**Pipeline Parallelism** - -- ``smp.pp_rank()`` : The rank of the process within its - pipeline-parallelism group. -- ``smp.pp_size()`` : The size of the pipeline-parallelism group. -- ``smp.get_pp_process_group()`` : ``torch.distributed.ProcessGroup`` - that contains the processes in the current pipeline-parallelism group. -- ``smp.CommGroup.PP_GROUP`` : The communication group corresponding to - the current pipeline parallelism group. - -**Reduced-Data Parallelism** - -- ``smp.rdp_rank()`` : The rank of the process within its - reduced-data-parallelism group. -- ``smp.rdp_size()`` : The size of the reduced-data-parallelism group. 
-- ``smp.get_rdp_process_group()`` : ``torch.distributed.ProcessGroup`` - that contains the processes in the current reduced data parallelism - group. -- ``smp.CommGroup.RDP_GROUP`` : The communication group corresponding - to the current reduced data parallelism group. - -**Model Parallelism** - -- ``smp.mp_rank()`` : The rank of the process within its model-parallelism - group. -- ``smp.mp_size()`` : The size of the model-parallelism group. -- ``smp.get_mp_process_group()`` : ``torch.distributed.ProcessGroup`` - that contains the processes in the current model-parallelism group. -- ``smp.CommGroup.MP_GROUP`` : The communication group corresponding to - the current model parallelism group. - -**Data Parallelism** - -- ``smp.dp_rank()`` : The rank of the process within its data-parallelism - group. -- ``smp.dp_size()`` : The size of the data-parallelism group. -- ``smp.get_dp_process_group()`` : ``torch.distributed.ProcessGroup`` - that contains the processes in the current data-parallelism group. -- ``smp.CommGroup.DP_GROUP`` : The communication group corresponding to - the current data-parallelism group. - -.. _communication_api: - :noindex: - -Communication API ------------------ - -The library provides a few communication primitives which can be helpful while -developing the training script. These primitives use the following -``enum`` s as arguments to specify which processes the communication -should involve. -​ - -**Helper structures** - -.. data:: smp.CommGroup - :noindex: - - An ``enum`` that takes the values - ``CommGroup.WORLD``, ``CommGroup.MP_GROUP``, and ``CommGroup.DP_GROUP``. - These values can also be accessed as ``smp.WORLD``, ``smp.MP_GROUP``, - and ``smp.DP_GROUP`` respectively. - - - ``CommGroup.WORLD``: Represents the entire group of processes used in - training - - ``CommGroup.MP_GROUP``: Represents the group of processes that hold - the same model replica as the current process. The processes in a - single ``MP_GROUP`` collectively store an entire replica of the - model. - - ``CommGroup.DP_GROUP``: Represents the group of processes that hold - the same model partition as the current process. The processes in a - single ``DP_GROUP`` perform data parallelism/allreduce among - themselves. - -.. data:: smp.RankType - :noindex: - - An ``enum`` that takes the values - ``RankType.WORLD_RANK``, ``RankType.MP_RANK``, and ``RankType.DP_RANK``. - - - ``RankType.WORLD_RANK``: The associated rank is to be interpreted as - the rank of the process across all processes used in training. - - ``RankType.MP_RANK``: The associated rank is to be interpreted as the - rank of the process within the ``MP_GROUP``. - - ``RankType.DP_RANK``: The associated rank is to be interpreted as the - rank of the process within the ``DP_GROUP``. - - -**Communication primitives:** - -.. function:: smp.broadcast(obj, group) - :noindex: - - Sends the object to all processes in the - group. The receiving process must call ``smp.recv_from`` to receive the - sent object. - - **Inputs** - - - ``obj``: An arbitrary picklable Python object that will be broadcast. - - - ``group``: A ``CommGroup`` argument that represents to which group of - processes the object will be sent. - - **Notes** - - - When you use ``broadcast`` on the sender process, there needs - to be an accompanying ``smp.recv_from()`` call on the receiver - processes. - - - This is a synchronous call; the ``broadcast`` statement - returns only after all ranks participating in the call have made a - matching ``recv_from`` call. - - **Example** - - .. 
code:: python - - if smp.rank() == 0: -     smp.broadcast(something, group=smp.CommGroup.WORLD) - else: -     smp.recv_from(0, rank_type=smp.RankType.WORLD_RANK) - -.. function:: smp.send(obj, dest_rank, rank_type) - :noindex: - - Sends the object ``obj`` to - ``dest_rank``, which is of a type specified by ``rank_type``. - - **Inputs** - - - ``obj``: An arbitrary picklable Python object that will be sent. - - - ``dest_rank`` (``int``): An integer denoting the rank of the receiving process. - - - ``rank_type`` (``enum``): A ``smp.RankType`` ``enum`` that determines how - ``dest_rank`` is to be interpreted. For example if ``dest_rank`` is 1 - and ``rank_type`` is ``MP_RANK``, then ``obj`` is sent to process - with ``mp_rank`` 1 in the ``MP_GROUP`` which contains the current - process. - - **Notes** - - - Note: \ This is a synchronous call; the ``send`` statement returns - only after the destination rank has made a matching - ``recv_from`` call. - -.. function:: smp.recv_from(src_rank, rank_type) - :noindex: - - Receive an object from a peer process. Can be used with a matching - ``smp.send`` or a ``smp.broadcast`` call. - - **Inputs** - - - ``src_rank`` (``int``): An integer denoting rank of the sending process. - - - ``rank_type`` (``enum``): A ``smp.RankType`` ``enum`` that determines how - ``dest_rank`` is to be interpreted. For example if ``src_rank`` is 1 - and ``rank_type`` is ``MP_RANK``, then the object is received from - the process with ``mp_rank`` 1 in the ``MP_GROUP`` which contains the - current process. - - **Returns** - - Returns the python object that is sent by the peer process. - - **Notes** - - - Note: This is a synchronous call; the ``recv_from`` statement returns - only after the source rank has made a matching ``send`` or - ``broadcast`` call, and the object is received. - -.. function:: smp.allgather(obj, group) - :noindex: - - A collective call that gathers all the - submitted objects across all ranks in the specified ``group``. Returns a - list whose ``i``\ th index contains the object submitted by the - ``i``\ th rank in ``group``. - - **Inputs** - - - ``obj``: An arbitrary picklable Python object that will be - allgathered. - - - ``group`` : A ``CommGroup`` argument that represents which group of - processes participate in ``allgather``. - - **Notes** - - - Note: This is a synchronous call; the ``allgather`` statement returns - only after all ranks participating in the call have made a matching - ``allgather`` call, and all the objects are received at the current - rank. - - **Examples** - - .. code:: python - - # assuming mp_size() == 2 - - if smp.mp_rank() == 0: -     out = smp.allgather(obj1, smp.CommGroup.MP_GROUP)  # returns [obj1, obj2] - else: -     out = smp.allgather(obj2, smp.CommGroup.MP_GROUP)  # returns [obj1, obj2] - -.. function:: smp.barrier(group=smp.WORLD) - :noindex: - - A statement that hangs until all - processes in the specified group reach the barrier statement, similar to - ``MPI_Barrier()``. - - **Inputs** - - - ``group``: An ``smp.CommGroup`` ``enum`` that specifies the group of - processes participating in the barrier call. Defaults to - ``smp.WORLD``. - - **Examples** - - - Assume there are 8 processes and 2 model partitions, and - therefore 4 \ ``mp_group``\ s, and 2 ``dp_group``\ s. If - the \ ``barrier`` call is passed the value ``smp.MP_GROUP`` for its - group argument, then each process only waits until the other process - of its own ``mp_group`` reaches that point. It does not wait for - processes outside that ``mp_group``. - -.. 
function:: smp.dp_barrier() - :noindex: - - Same as passing ``smp.DP_GROUP``\ to ``smp.barrier()``. - Waits for the processes in the same \ ``dp_group`` as - the current process to reach the same point in execution. - -.. function:: smp.mp_barrier() - :noindex: - - Same as passing ``smp.MP_GROUP`` to - ``smp.barrier()``. Waits for the processes in the same ``mp_group`` as - the current process to reach the same point in execution. diff --git a/doc/api/training/smp_versions/v1.10.0/smd_model_parallel_pytorch.rst b/doc/api/training/smp_versions/v1.10.0/smd_model_parallel_pytorch.rst deleted file mode 100644 index 7a81e6ddfe..0000000000 --- a/doc/api/training/smp_versions/v1.10.0/smd_model_parallel_pytorch.rst +++ /dev/null @@ -1,883 +0,0 @@ -PyTorch API -=========== - -To use the PyTorch-specific APIs for SageMaker distributed model parallism, -import the ``smdistributed.modelparallel.torch`` package at the top of your training script. - -.. code:: python - - import smdistributed.modelparallel.torch as smp - - -.. tip:: - - Refer to - `Modify a PyTorch Training Script - `_ - to learn how to use the following API in your PyTorch training script. - -.. contents:: Topics - :depth: 1 - :local: - -smdistributed.modelparallel.torch.DistributedModel -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -.. class:: smdistributed.modelparallel.torch.DistributedModel - :noindex: - - A sub-class of ``torch.nn.Module`` which specifies the model to be - partitioned. Accepts a ``torch.nn.Module`` object ``module`` which is - the model to be partitioned. The returned ``DistributedModel`` object - internally manages model parallelism and data parallelism. Only one - model in the training script can be wrapped with - ``smdistributed.modelparallel.torch.DistributedModel``. - - **Example:** - - .. code:: python - - import smdistributed.modelparallel.torch as smp - - model = smp.DistributedModel(model) - - **Important**: The ``__call__`` and  ``backward`` method calls on the - ``smdistributed.modelparallel.torch.DistributedModel`` object (in the following example, the object - is \ ``model``) can only be made inside a ``smdistributed.modelparallel.torch.step``-decorated - function. - - Since ``DistributedModel``  is a ``torch.nn.Module``, a forward pass can - be performed by calling the \ ``DistributedModel`` object on the input - tensors. - - .. code:: python - - predictions = model(inputs)   # model is a smp.DistributedModel object - - For a backward pass, one needs to call the backward function on - the \ ``DistributedModel`` object, with tensors and gradients as - arguments, replacing the PyTorch operations \ ``torch.Tensor.backward`` - or ``torch.autograd.backward``. - - The API for ``model.backward`` is very similar to - ``torch.autograd.backward``. For example, the following - ``backward`` calls: - - .. code:: python - - torch.autograd.backward(loss) or loss.backward() - - should be replaced with: - - .. code:: python - - model.backward(loss) # loss is a tensor with only one element as its data - - Similarly, for non-scalar tensors, replace the following - ``backward`` call containing incoming gradient arguments: - - .. code:: python - - torch.autograd.backward(outputs, out_grads) - - with the following line: - - .. code:: python - - model.backward(outputs, out_grads) - - In these examples, all ``__call__``  and ``backward`` method calls on - the model objects (``model(inputs)`` and ``model.backward(loss)``) must be made inside - a ``smdistributed.modelparallel.torch.step``-decorated function. 
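A minimal sketch of these requirements is shown below. It assumes ``smp.init()`` has already been called earlier in the script; the stand-in ``torch.nn.Linear`` module and the cross-entropy loss are illustrative only.

.. code:: python

   import torch
   import smdistributed.modelparallel.torch as smp

   # A small stand-in module; any torch.nn.Module can be wrapped the same way.
   model = smp.DistributedModel(torch.nn.Linear(10, 2))

   @smp.step
   def forward_backward(inputs, labels):
       outputs = model(inputs)                               # __call__ inside smp.step
       loss = torch.nn.functional.cross_entropy(outputs, labels)
       model.backward(loss)                                  # replaces loss.backward()
       # For non-scalar tensors, pass the incoming gradients explicitly:
       #     model.backward(outputs, out_grads)
       return loss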
- - **Using DDP** - - If DDP is enabled with the SageMaker model parallel library, do not not place a PyTorch - ``DistributedDataParallel`` wrapper around the ``DistributedModel`` because - the ``DistributedModel`` wrapper will also handle data parallelism. - - Unlike the original DDP wrapper, when you use ``DistributedModel``, - model parameters and buffers are not immediately broadcast across - processes when the wrapper is called. Instead, the broadcast is deferred to the first call of the - ``smdistributed.modelparallel.torch.step``-decorated function when the partition is done. - - **Parameters** - - - ``module`` (``torch.nn.Module``): Module to be distributed (data parallelism and model parallelism). - - - ``trace_device`` (``"cpu"`` or ``"gpu"``) (default: ``"gpu"``) - Whether to perform the tracing step on the GPU or CPU. The tracing step gathers - information on the order of execution of modules, the shapes of - intermediate outputs, and execution times, to be used by the - partitioning algorithm. If ``trace_device`` is set to GPU, accurate - module execution times can be gathered during tracing for potentially - improved partitioning decision. However, if the model is too large to - fit in a single GPU, then ``trace_device`` should be set to CPU. - - - ``trace_execution_times`` (``bool``) (default: ``False``): If ``True``, - the library profiles the execution time of each module during tracing, and uses - it in the partitioning decision. This improves the partitioning - decision, but it might make the tracing slower. It may also introduce - some degree of non-determinism in partitioning results, because of the - inherent randomness in module execution times. Must be ``False`` if - ``trace_device`` is ``"cpu"``. - - - ``overlapping_allreduce`` (``bool``) (default: ``True``): This is only - applicable for hybrid data parallelism/model parallelism use cases (when - ``ddp`` is set to ``True`` while launching training). The library uses this flag - to decide whether to do overlapping allreduce whenever a parameter - gradients are ready. This leads to overlapping of communication and - computation and can improve performance. If this is set to ``False`` , - allreduce is performed at the end of the step. - - - ``backward_passes_per_step`` (``int``) (default: 1): This is only - applicable for hybrid data parallelism/model parallelism use cases (when - ``ddp`` is set to ``True`` in config). This parameter indicates the - number of backward passes to perform before calling allreduce on DDP. - This allows accumulating updates over multiple mini-batches before - reducing and applying them. - - - ``average_grads_across_microbatches`` (``bool``) (default: ``True``): - Whether or not the computed gradients should be averaged across - microbatches. If ``False``, the computed gradients will be summed across - microbatches, but not divided by the number of microbatches. In typical - use case where the computed loss is averaged over the mini-batch, this - should be left as ``True``. If you use a loss function that only sums - the per-sample loss across the batch (and not divide by the batch size), - then this must be set to ``False`` for correctness. - - - ``bucket_cap_mb`` (default: 25): \ ``DistributedDataParallel`` buckets - parameters into multiple buckets so that gradient reduction of each - bucket can potentially overlap with backward - computation. \ ``bucket_cap_mb``\ controls the bucket size in MegaBytes - (MB). 
- - - ``trace_memory_usage`` (default: False): When set to True, the library attempts - to measure memory usage per module during tracing. If this is disabled, - memory usage will be estimated through the sizes of tensors returned from - the module. - - - ``broadcast_buffers`` (default: True): Flag to be used with ``ddp=True``. - This parameter is forwarded to the underlying ``DistributedDataParallel`` wrapper. - Please see: `broadcast_buffer `__. - - - ``gradient_as_bucket_view`` (default: False): To be - used with ``ddp=True``. This parameter is forwarded to the underlying - ``DistributedDataParallel`` wrapper. Please see `gradient_as_bucket_view `__. - - **Properties** - - - ``partitioned``: Is ``True`` if the model is partitioned, ``False`` - otherwise. Initialized to ``False`` when ``DistributedModel`` is first - created. It becomes be ``True`` during the first call - to ``smdistributed.modelparallel.torch.step``-decorated function. Once the model is partitioned, the - local parameters or local ``state_dict`` can be fetched using the - following methods. - - **Methods** - - .. function:: backward(tensors, grad_tensors) - :noindex: - - Triggers a distributed backward - pass across model partitions. Example usage provided in the previous - section. The API is very similar - to https://pytorch.org/docs/stable/autograd.html#torch.autograd.backward. - ``retain_grad`` and ``create_graph``  flags are not supported. - - .. function:: local_buffers( ) - :noindex: - - Returns an iterator over buffers for the modules in - the partitioned model that have been assigned to the current process. - - .. function:: local_named_buffers( ) - :noindex: - - Returns an iterator over buffers for the - modules in the partitioned model that have been assigned to the current - process. This yields both the name of the buffer as well as the buffer - itself. - - .. function:: local_parameters( ) - :noindex: - - Returns an iterator over parameters for the - modules in the partitioned model that have been assigned to the current - process. - - .. function:: local_named_parameters( ) - :noindex: - - Returns an iterator over parameters for - the modules in the partitioned model that have been assigned to the - current process. This yields both the name of the parameter as well as - the parameter itself. - - .. function:: local_modules( ) - :noindex: - - Returns an iterator over the modules in the - partitioned model that have been assigned to the current process. - - .. function:: local_named_modules( ) - :noindex: - - Returns an iterator over the modules in the - partitioned model that have been assigned to the current process. This - yields both the name of the module as well as the module itself. - - .. function:: local_state_dict( ) - :noindex: - - Returns the ``state_dict`` that contains local - parameters that belong to the current \ ``mp_rank``. This ``state_dict`` - contains a key \ ``_smp_is_partial`` to indicate this is a - partial \ ``state_dict``, which indicates whether the - ``state_dict`` contains elements corresponding to only the current - partition, or to the entire model. - - .. function:: state_dict( ) - :noindex: - - Returns the ``state_dict`` that contains parameters - for the entire model. It first collects the \ ``local_state_dict``  and - gathers and merges the \ ``local_state_dict`` from all ``mp_rank``\ s to - create a full ``state_dict``. Please note that this needs to be called on all ranks with - ``dp_rank()==0`` to ensure the gather happens properly. 
- If it is only called on all such ranks, it can hang. - - .. function:: load_state_dict( ) - :noindex: - - Same as the ``torch.module.load_state_dict()`` , - except: It first gathers and merges the ``state_dict``\ s across - ``mp_rank``\ s, if they are partial. The actual loading happens after the - model partition so that each rank knows its local parameters. - - .. function:: register_post_partition_hook(hook) - :noindex: - - Registers a callable ``hook`` to - be executed after the model is partitioned. This is useful in situations - where an operation needs to be executed after the model partition during - the first call to ``smdistributed.modelparallel.torch.step``, but before the actual execution of the - first forward pass. Returns a ``RemovableHandle`` object ``handle``, - which can be used to remove the hook by calling ``handle.remove()``. - - .. function:: cpu( ) - :noindex: - - Allgathers parameters and buffers across all ``mp_rank``\ s and moves them - to the CPU. - - .. function:: join( ) - :noindex: - - A context manager to be used in conjunction with an instance of - ``smdistributed.modelparallel.torch.DistributedModel`` to be able to train with uneven inputs across - participating processes. This is only supported when ``ddp=True``. This will use the join with the wrapped - ``DistributedDataParallel`` instance. For more information, see: - `join `__ - in the PyTorch documentation. - - .. function:: register_comm_hook( state, callable ) - :noindex: - - **Available for PyTorch 1.8.1 only** - Registers a communication hook which is an enhancement that provides - a flexible hook ``callable`` to users where they can specify how - gradients are aggregated across multiple workers. This method will be called on the wrapped ``DistributedDataParallel`` instance. - - Please note that when you register a comm hook you have full control of how the gradients are processed. - When using only data parallelism with Torch DDP you are expected to average grads across data parallel replicas within the hook. - Similarly, when using DistributedModel you have to averaging grads across data parallel replicas within the hook. - In addition to that, you also have to average grads across microbatches within the hook unless you explicitly desire to not average based on your loss function. - See ``average_grads_across_microbatches`` for more information about averaging grads across microbatches. - - This is only supported when ``ddp=True`` and ``overlapping_allreduce=True`` (default). - For more information, see: - `register_comm_hook `__ - in the PyTorch documentation. - - **Behavior of** ``smdistributed.modelparallel.torch.DistributedModel`` **with Tensor Parallelism** - - When a model is wrapped by ``smdistributed.modelparallel.torch.DistributedModel``, the library - immediately traverses the modules of the model object, and replaces the - modules that are supported for tensor parallelism with their distributed - counterparts. This replacement happens in place. If there are no other - references to the original modules in the script, they are - garbage-collected. The module attributes that previously referred to the - original submodules now refer to the distributed versions of those - submodules. - - **Example:** - - .. 
code:: python - - # register DistributedSubmodule as the distributed version of Submodule - # (note this is a hypothetical example, smp.nn.DistributedSubmodule does not exist) - import smdistributed.modelparallel.torch as smp - - smp.tp_register_with_module(Submodule, smp.nn.DistributedSubmodule) - - class MyModule(nn.Module): - def __init__(self): - ... - - self.submodule = Submodule() - ... - - # enabling tensor parallelism for the entire model - with smp.tensor_parallelism(): - model = MyModule() - - # here model.submodule is still a Submodule object - assert isinstance(model.submodule, Submodule) - - model = smp.DistributedModel(model) - - # now model.submodule is replaced with an equivalent instance - # of smp.nn.DistributedSubmodule - assert isinstance(model.module.submodule, smp.nn.DistributedSubmodule) - - If ``pipeline_parallel_degree`` (equivalently, ``partitions``) is 1, the - placement of model partitions into GPUs and the initial broadcast of - model parameters and buffers across data-parallel ranks take place - immediately. This is because it does not need to wait for the model - partition when ``smdistributed.modelparallel.torch.DistributedModel`` wrapper is called. For other - cases with ``pipeline_parallel_degree`` greater than 1, the broadcast - and device placement will be deferred until the first call of an - ``smdistributed.modelparallel.torch.step``-decorated function happens. This is because the first - ``smdistributed.modelparallel.torch.step``-decorated function call is when the model partitioning - happens if pipeline parallelism is enabled. - - Because of the module replacement during the ``smdistributed.modelparallel.torch.DistributedModel`` - call, any ``load_state_dict`` calls on the model, as well as any direct - access to model parameters, such as during the optimizer creation, - should be done **after** the ``smdistributed.modelparallel.torch.DistributedModel`` call. - - Since the broadcast of the model parameters and buffers happens - immediately during ``smdistributed.modelparallel.torch.DistributedModel`` call when the degree of - pipeline parallelism is 1, using ``@smp.step`` decorators is not - required when tensor parallelism is used by itself (without pipeline - parallelism). - - For more information about the library's tensor parallelism APIs for PyTorch, - see :ref:`smdmp-pytorch-tensor-parallel`. - - **Additional Methods of** ``smdistributed.modelparallel.torch.DistributedModel`` **for Tensor Parallelism** - - The following are the new methods of ``smdistributed.modelparallel.torch.DistributedModel``, in - addition to the ones listed in the - `documentation `__. - - .. function:: distributed_modules() - :noindex: - - - An iterator that runs over the set of distributed - (tensor-parallelized) modules in the model - - .. function:: is_distributed_parameter(param) - :noindex: - - - Returns ``True`` if the given ``nn.Parameter`` is distributed over - tensor-parallel ranks. - - .. function:: is_distributed_buffer(buf) - :noindex: - - - Returns ``True`` if the given buffer is distributed over - tensor-parallel ranks. - - .. function:: is_scaled_batch_parameter(param) - :noindex: - - - Returns ``True`` if the given ``nn.Parameter`` is operates on the - scaled batch (batch over the entire ``TP_GROUP``, and not only the - local batch). - - .. 
function:: is_scaled_batch_buffer(buf) - :noindex: - - - Returns ``True`` if the parameter corresponding to the given - buffer operates on the scaled batch (batch over the entire - ``TP_GROUP``, and not only the local batch). - - .. function:: default_reducer_named_parameters() - :noindex: - - - Returns an iterator that runs over ``(name, param)`` tuples, for - ``param`` that is allreduced over the ``DP_GROUP``. - - .. function:: scaled_batch_reducer_named_parameters() - :noindex: - - - Returns an iterator that runs over ``(name, param)`` tuples, for - ``param`` that is allreduced over the ``RDP_GROUP``. - -smdistributed.modelparallel.torch.DistributedOptimizer -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -.. class:: smdistributed.modelparallel.torch.DistributedOptimizer(optimizer, static_loss_scale=1.0, dynamic_loss_scale=False, **dynamic_loss_args) - :noindex: - - An optimizer wrapper for saving and loading optimizer states. - - :param optimizer: An optimizer object. - :type optimizer: object - :param static_loss_scale: Effective only for FP16 training. The default value is ``1.0``. - :type static_loss_scale: float - :param dynamic_loss_scale: Effective only for FP16 training. Set to ``True`` to - use dynamic loss scale. The default value is ``False``. - :type dynamic_loss_scale: boolean - :param dynamic_loss_args: Effective only for FP16 training. - If ``dynamic_loss_scale=True``, you can configure additional scale - parameters for dynamic loss scale. - The following list shows available parameters. - - * ``"init_scale"``: Default is ``2**32`` - * ``"scale_factor"``: Default is ``2.`` - * ``"scale_window"``: Default is ``1000`` - * ``"min_scale"``: Default is ``1`` - * ``"delayed_shift"``: Default is ``1`` - * ``"consecutive_hysteresis"``: Default is ``False`` - :type dynamic_loss_args: dict - - **Example usage of an FP32 Optimizer:** - - .. code:: python - - optimizer = torch.optim.AdaDelta(...) - optimizer = smdistributed.modelparallel.torch.DistributedOptimizer(optimizer) - - **Example usage of an FP16 Optimizer with static loss scale:** - - .. code:: python - - optimizer = torch.optim.AdaDelta(...) - optimizer = smdistributed.modelparallel.torch.DistributedOptimizer( - optimizer, - static_loss_scale=1.0 - ) - - **Example usage of an FP16 Optimizer with dynamic loss scale:** - - .. code:: python - - optimizer = torch.optim.AdaDelta(...) - optimizer = smdistributed.modelparallel.torch.DistributedOptimizer( - optimizer, - static_loss_scale=None, - dynamic_loss_scale=True, - dynamic_loss_args={ - "scale_window": 1000, - "min_scale": 1, - "delayed_shift": 2 - } - ) - - .. tip:: - - After you modify training scripts with - :class:`smdistributed.modelparallel.torch.DistributedModel` and - :class:`smdistributed.modelparallel.torch.DistributedOptimizer`, - use the SageMaker PyTorch estimator's distribution configuration to enable FP16 training. - You simply need to add ``"fp16": True`` to the ``smp_options`` config dictionary's - ``"parameters"`` key as shown in - `Using the SageMaker TensorFlow and PyTorch Estimators - `_. - For more information about available parameters for the ``smp_options`` config, - see :ref:`sm-sdk-modelparallel-general`. - - This wrapper returns an ``optimizer`` object with the following methods overridden: - - .. method:: state_dict( ) - :noindex: - - Returns the ``state_dict`` that contains optimizer state for the entire model. 
-      It first collects the ``local_state_dict`` and gathers and merges
-      the ``local_state_dict`` from all ``mp_rank``\ s to create a full
-      ``state_dict``.
-
-   .. method:: load_state_dict( )
-      :noindex:
-
-      Same as ``torch.optimizer.load_state_dict()``, except:
-
-      - It first gathers and merges the local ``state_dict``\ s if they are
-        partial.
-      - The actual loading happens after the model partition so that each
-        rank knows its local parameters.
-
-   .. method:: local_state_dict( )
-      :noindex:
-
-      Returns the ``state_dict`` that contains the
-      local optimizer state that belongs to the current ``mp_rank``. This
-      ``state_dict`` contains a key ``_smp_is_partial``, which indicates whether the
-      ``state_dict`` contains elements corresponding to only the current
-      partition, or to the entire model.
-
-
-smdistributed.modelparallel.torch Context Managers and Util Functions
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-.. function:: smdistributed.modelparallel.torch.model_creation(tensor_parallelism=False, dtype=None, **tensor_parallel_config)
-   :noindex:
-
-   Context manager to create a ``torch`` model. This API combines both the
-   :class:`smdistributed.modelparallel.torch.tensor_parallelism` and
-   :class:`smdistributed.modelparallel.torch.delay_param_initialization` decorators,
-   so you can simply use this single context when creating the torch model.
-
-   :param tensor_parallelism: Whether to enable tensor parallelism during model creation.
-   :type tensor_parallelism: boolean
-   :param dtype: The dtype to use when creating the model. It has the following rules.
-
-     * If dtype is specified, it will be used during model creation.
-     * If dtype is not specified, the default dtype will be used during model creation,
-       which is usually FP32. This is for the best performance on CPU.
-     * Any model that causes out-of-memory problems with FP32 initialization
-       is recommended to be created with
-       :class:`smdistributed.modelparallel.torch.delayed_parameter_initialization`.
-     * ``FP16_Module`` casts the model back to FP16 if FP16 training is enabled
-       with the ``smp`` config. For more information about FP16 training
-       in SageMaker with the model parallel library, see `FP16 Training
-       `_
-       in the *Amazon SageMaker Developer Guide*.
-
-   :type dtype: ``torch.dtype``
-   :param tensor_parallel_config: kwargs to specify other tensor parallel configs.
-     This is not used if ``tensor_parallelism`` is ``False``.
-   :type tensor_parallel_config: dict
-
-   **Example Usage:**
-
-   .. code:: python
-
-      import smdistributed.modelparallel.torch as smp
-
-      with smp.model_creation(
-          tensor_parallelism=smp.tp_size() > 1,
-          dtype=torch.float16 if args.fp16 else torch.get_default_dtype()
-      ):
-          model = MyModel(...)
-
-.. function:: smdistributed.modelparallel.torch.partition(index)
-   :noindex:
-
-   :param index: The index of the partition.
-   :type index: int
-
-   A context manager which places all modules defined inside into the
-   partition with ID ``index``. The ``index`` argument must be less than
-   the number of partitions.
-
-   Use ``smdistributed.modelparallel.torch.partition`` to implement manual partitioning.
-   If ``"auto_partition"`` is ``True``, then the
-   ``smdistributed.modelparallel.torch.partition`` contexts are ignored. Any module that is not placed in
-   any ``smdistributed.modelparallel.torch.partition`` context is placed in the
-   ``default_partition`` defined through the SageMaker Python SDK.
- - When ``smdistributed.modelparallel.torch.partition`` contexts are nested, the innermost context - overrides the rest (see the following example). In PyTorch, manual - partitioning should be done inside the module \ ``__init__``, and the - partition assignment applies to the modules that are *created* inside - the ``smdistributed.modelparallel.torch.partition`` context. - - Example: - - .. code:: python - - import smdistributed.modelparallel.torch as smp - - class Model(torch.nn.Module): -     def __init__(self): -         with smp.partition(1): -             self.child0 = Child0()            # child0 on partition 1 -             with smp.partition(2): -                 self.child1 = Child1()        # child1 on partition 2 -             self.child2 = Child2()            # child2 on partition 1 -         self.child3 = Child3()                # child3 on default_partition - -.. data:: smdistributed.modelparallel.torch.amp.GradScaler - :noindex: - - `Torch AMP Gradscaler `__ - currently doesn’t work with the library. ``smdistributed.modelparallel.torch.amp.GradScaler`` replaces - ``torch.amp.GradScaler`` and provides the same functionality. - -.. function:: smdistributed.modelparallel.torch.delay_param_initialization(enabled=True) - :noindex: - - If enabled, it delays the initialization of parameters - to save CPU memory. That is, parameter initialization takes place - after the model is partitioned on GPUs. - -.. function:: smdistributed.modelparallel.torch.get_world_process_group( ) - :noindex: - - Returns a ``torch.distributed`` ``ProcessGroup`` that consists of all - processes, which can be used with the ``torch.distributed`` API. - Requires ``"ddp": True`` in SageMaker Python SDK parameters. - -.. function:: smdistributed.modelparallel.torch.get_mp_process_group( ) - :noindex: - - Returns a ``torch.distributed`` ``ProcessGroup`` that consists of the - processes in the ``MP_GROUP`` which contains the current process, which - can be used with the \ ``torch.distributed`` API. Requires - ``"ddp": True`` in SageMaker Python SDK parameters. - -.. function:: smdistributed.modelparallel.torch.get_dp_process_group( ) - :noindex: - - Returns a ``torch.distributed`` ``ProcessGroup`` that consists of the - processes in the ``DP_GROUP`` which contains the current process, which - can be used with the \ ``torch.distributed`` API. Requires - ``"ddp": True`` in SageMaker Python SDK parameters. - -.. function:: smdistributed.modelparallel.torch.is_initialized( ) - :noindex: - - Returns ``True`` if ``smdistributed.modelparallel.torch.init`` has already been called for the - process, and ``False`` otherwise. - -.. function::smp.is_tracing( ) - :noindex: - - Returns ``True`` if the current process is running the tracing step, and - ``False`` otherwise. - -.. data:: smdistributed.modelparallel.torch.nn.FusedLayerNorm - :noindex: - - `Apex Fused Layer Norm `__ is currently not - supported by the library. ``smdistributed.modelparallel.torch.nn.FusedLayerNorm`` replaces ``apex`` - ``FusedLayerNorm`` and provides the same functionality. This requires - ``apex`` to be installed on the system. - -.. data:: smdistributed.modelparallel.torch.optimizers.FusedNovoGrad - :noindex: - - - `Fused Novo Grad optimizer `__ is - currently not supported by the library. ``smdistributed.modelparallel.torch.optimizers.FusedNovoGrad`` replaces ``apex`` ``FusedNovoGrad`` - optimizer and provides the same functionality. This requires ``apex`` to - be installed on the system. - -.. 
data:: smdistributed.modelparallel.torch.optimizers.FusedLamb
-   :noindex:
-
-   `FusedLamb optimizer `__
-   currently doesn’t work with the library. ``smdistributed.modelparallel.torch.optimizers.FusedLamb`` replaces
-   the ``apex`` ``FusedLamb`` optimizer and provides the same functionality.
-   This requires ``apex`` to be installed on the system.
-
-.. _pytorch_saving_loading:
-   :noindex:
-
-smdistributed.modelparallel.torch APIs for Saving and Loading
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-.. function:: smdistributed.modelparallel.torch.save(obj, f, partial=True, pickle_module=picklemodule, pickle_protocol=2)
-   :noindex:
-
-   Saves an object. This operation is similar to `torch.save()
-   `_, except that
-   it has an additional keyword argument, ``partial``, and accepts only
-   string type for the argument ``f`` (file). If ``partial=True``, each
-   ``mp_rank`` saves a separate checkpoint file and the library adds an ``mp_rank``
-   index to your saved file.
-
-   **Parameters**
-
-   - ``obj`` (dict): The object to save.
-   - ``f`` (str): A string containing a file name.
-   - ``partial`` (bool, default=``True``): When set to ``True``, each
-     ``mp_rank`` saves a separate checkpoint file and the library adds an
-     ``mp_rank`` index to the saved file. If you want to be able to load
-     and further train a model that you save with ``smdistributed.modelparallel.torch.save()``, you must
-     set ``partial=True``.
-   - ``pickle_module`` (picklemodule, default = module ``"pickle"`` from ``"/opt/conda/lib/python3.6/pickle.py"``):
-     A module used for pickling metadata and objects.
-   - ``pickle_protocol`` (int, default=2): Can be specified to
-     override the default protocol.
-
-.. function:: smdistributed.modelparallel.torch.load(f, map_location, pickle_module, pickle_load_args, partial=True)
-   :noindex:
-
-   Loads an object saved with ``smdistributed.modelparallel.torch.save()`` from a file.
-
-   Similar to `torch.load() `__,
-   except it has an additional keyword argument, ``partial``, and accepts
-   only string type for the argument ``f`` (file). If ``partial=True``,
-   then each ``mp_rank`` loads a separate checkpoint file.
-
-   **Parameters**
-
-   - ``f`` (string): A string containing a file name.
-   - ``map_location`` (function): A function, a
-     `torch.device `__,
-     a string, or a dict specifying how to remap storage locations.
-   - ``pickle_module`` (pickle module): A module used for unpickling
-     metadata and objects (has to match the ``pickle_module`` used to
-     serialize the file).
-   - ``pickle_load_args`` (Python 3 only): Optional keyword arguments
-     passed to ``pickle_module.load()`` and ``pickle_module.Unpickler()``.
-   - ``partial`` (bool, default=``True``): When set to ``True``, each
-     ``mp_rank`` loads the checkpoint corresponding to the ``mp_rank``.
-     Should be used when loading a model trained with the library.
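-
-   The following is a minimal sketch of how ``smp.save()`` and ``smp.load()``
-   are typically paired for a partial checkpoint; the checkpoint path and the
-   surrounding training-script variables (such as ``model``) are illustrative
-   assumptions, not part of this reference.
-
-   .. code:: python
-
-      import smdistributed.modelparallel.torch as smp
-
-      # Save a partial checkpoint: each mp_rank writes only its own shard,
-      # and the library appends the mp_rank index to the file name.
-      if smp.dp_rank() == 0:
-          smp.save(
-              {"model_state_dict": model.local_state_dict()},
-              "/opt/ml/checkpoint/model.pt",
-              partial=True,
-          )
-
-      # Load on all ranks; each mp_rank reads back its own shard.
-      checkpoint = smp.load("/opt/ml/checkpoint/model.pt", partial=True)
-      model.load_state_dict(checkpoint["model_state_dict"])
-
-   A fuller end-to-end example is given in the general saving and loading
-   instructions later in this document.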
-.. function:: smdistributed.modelparallel.torch.save_checkpoint(path, tag, partial=True, model=None, optimizer=None, user_content=None, translate_if_full=True, num_kept_partial_checkpoints=None)
-   :noindex:
-
-   Saves a checkpoint. While :class:`smdistributed.modelparallel.torch.save` saves
-   model and optimizer objects,
-   this function checkpoints model and optimizer and saves the checkpoints as separate files.
-   It creates checkpoint folders in the following structure.
-
-   .. code:: text
-
-      - path
-        - ${tag}_partial (folder for partial checkpoint)
-          - model_rankinfo.pt
-          - optimizer_rankinfo.pt
-          - fp16_states_rankinfo.pt
-          - user_content.pt
-        - $tag (checkpoint file for full checkpoint)
-        - user_content_$tag (user_content file for full checkpoint)
-        - newest (a file that indicates the newest checkpoint)
-
-   **Parameters**
-
-   * ``path`` (str) (required): Path to save the checkpoint. The library creates
-     the directory if it does not already exist.
-     For example, ``/opt/ml/checkpoint/model_parallel``.
-   * ``tag`` (str) (required): A tag for the current checkpoint, usually the train
-     steps. Note: tag needs to be the same across all ranks (GPU workers).
-     When ``partial=False``, this will be the checkpoint file name.
-   * ``partial`` (boolean) (default: True): Whether to save the partial checkpoint.
-   * ``model`` (:class:`smdistributed.modelparallel.torch.DistributedModel`)
-     (default: None): The model to save. It needs to be an ``smp.DistributedModel`` object.
-   * ``optimizer`` (:class:`smdistributed.modelparallel.torch.DistributedOptimizer`)
-     (default: None): The optimizer to save. It needs to be an ``smp.DistributedOptimizer`` object.
-   * ``user_content`` (any) (default: None): User-defined content to save.
-   * ``translate_if_full`` (boolean) (default: True): Whether to translate the
-     full ``state_dict`` to an HF ``state_dict`` if possible.
-   * ``num_kept_partial_checkpoints`` (int) (default: None): The maximum number
-     of partial checkpoints to keep on disk.
-
-.. function:: smdistributed.modelparallel.torch.resume_from_checkpoint(path, tag=None, partial=True, strict=True, load_optimizer_states=True, translate_function=None)
-   :noindex:
-
-   While :class:`smdistributed.modelparallel.torch.load` loads saved
-   model and optimizer objects, this function resumes from a saved checkpoint file.
-
-   **Parameters**
-
-   * ``path`` (str) (required): Path to load the checkpoint from.
-   * ``tag`` (str) (default: None): Tag of the checkpoint to resume. If not provided,
-     the library tries to locate the newest checkpoint from the saved newest file.
-   * ``partial`` (boolean) (default: True): Whether to load the partial checkpoint.
-   * ``strict`` (boolean) (default: True): Load with strict load; no extra or
-     missing keys are allowed.
-   * ``load_optimizer_states`` (boolean) (default: True): Whether to load ``optimizer_states``.
-   * ``translate_function`` (function) (default: None): Function to translate the full
-     checkpoint into the smdistributed.modelparallel format.
-     For supported models, this is not required.
-
-   **Example usage**
-
-   .. code:: python
-
-      # Save
-      smp.save_checkpoint(
-          checkpoint_dir,
-          tag=f"total_steps{total_steps}",
-          partial=True,
-          model=model,
-          optimizer=optimizer,
-          user_content=user_content,
-          num_kept_partial_checkpoints=args.num_kept_checkpoints)
-
-      # Load: this will automatically load the newest checkpoint
-      user_content = smp.resume_from_checkpoint(path, partial=partial)
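-
-   To resume from a specific checkpoint rather than the newest one, pass the
-   ``tag`` argument described above. A minimal sketch follows; the tag value
-   and the ``checkpoint_dir`` variable are illustrative assumptions.
-
-   .. code:: python
-
-      # Resume from the partial checkpoint saved at a particular training step
-      user_content = smp.resume_from_checkpoint(
-          checkpoint_dir,
-          tag="total_steps1000",
-          partial=True,
-      )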
-.. _pytorch_saving_loading_instructions:
-   :noindex:
-
-General instruction on saving and loading
------------------------------------------
-
-The library can save partial or full checkpoints.
-
-- For partial checkpoints, each ``mp_rank`` saves its own checkpoint
-  file with only the parameters that belong to that rank.
-- For full checkpoints, the library saves a single checkpoint that contains
-  the entire model parameters.
-
-When **saving** using ``smdistributed.modelparallel.torch.save()``, each rank only holds its own
-parameters. If you want to save the full model, there will be some
-communication between the ranks to create the full model. If you save
-checkpoints often, you should save partial checkpoints for best
-performance.
-
-When **loading** using ``smdistributed.modelparallel.torch.load()``, the library can load either partial
-or full checkpoints, or full checkpoints saved by a non-model-parallel model. If you
-want to resume training with a non-model-parallel model or do inference, you need
-a full checkpoint.
-
-The following is an example of how you can save and load a checkpoint:
-
-.. code:: python
-
-   import smdistributed.modelparallel.torch as smp
-   # Original model and optimizer
-   model = MyModel(...)
-   optimizer = MyOpt(...)
-
-   # model parallel wrapper
-   model = smp.DistributedModel(model)
-   optimizer = smp.DistributedOptimizer(optimizer)
-
-   # To save, always save on dp_rank 0 to avoid a data race
-   if partial:
-       # To save the partial model on each mp rank
-       # the library will create `checkpoint.pt_{mprank}` for each mp rank
-       if save_partial_model:
-           if smp.dp_rank() == 0:
-               model_dict = model.local_state_dict()  # save the partial model
-               opt_dict = optimizer.local_state_dict()  # save the partial optimizer state
-               smp.save(
-                   {"model_state_dict": model_dict, "optimizer_state_dict": opt_dict},
-                   "/checkpoint.pt",
-                   partial=True,
-               )
-
-       # To save the full model
-       if save_full_model:
-           if smp.dp_rank() == 0:
-               model_dict = model.state_dict()  # save the full model
-               opt_dict = optimizer.state_dict()  # save the full optimizer state
-               smp.save(
-                   {"model_state_dict": model_dict, "optimizer_state_dict": opt_dict},
-                   "/checkpoint.pt",
-                   partial=False,
-               )
-
-   # To load, load on all ranks.
-   # The only difference for partial/full loading is the partial flag in smp.load
-   # Load partial checkpoint
-   if partial_checkpoint:
-       checkpoint = smp.load("/checkpoint.pt", partial=True)
-       model.load_state_dict(checkpoint["model_state_dict"])
-       optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
-   # Load full checkpoint
-   if full_checkpoint:
-       checkpoint = smp.load("/checkpoint.pt", partial=False)
-       model.load_state_dict(checkpoint["model_state_dict"])
-       optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
diff --git a/doc/api/training/smp_versions/v1.10.0/smd_model_parallel_pytorch_tensor_parallel.rst b/doc/api/training/smp_versions/v1.10.0/smd_model_parallel_pytorch_tensor_parallel.rst
deleted file mode 100644
index 96231b55fe..0000000000
--- a/doc/api/training/smp_versions/v1.10.0/smd_model_parallel_pytorch_tensor_parallel.rst
+++ /dev/null
@@ -1,903 +0,0 @@
-.. _smdmp-pytorch-tensor-parallel:
-   :noindex:
-
-PyTorch API for Tensor Parallelism
-==================================
-
-SageMaker distributed tensor parallelism works by replacing specific submodules
-in the model with their distributed implementations. The distributed modules
-have their parameters and optimizer states partitioned across tensor-parallel
-ranks, so that they compute the same output as the original modules would have.
-Since tensor parallelism occurs across data-parallel ranks, a rank might collect
-slices of the activations corresponding to the data shards on other devices that
-are part of the same tensor parallelism group.
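-
-For example, after the library is initialized in a training script launched with a
-tensor-parallel degree in the SageMaker configuration, you can inspect the
-tensor-parallel group that the distributed modules are partitioned over. The
-following is a minimal illustrative sketch, not a required step; the printed
-message is only an example.
-
-.. code:: python
-
-   import smdistributed.modelparallel.torch as smp
-
-   smp.init()
-
-   # Size of the tensor-parallel group and this process's rank within it.
-   # Distributed modules partition their parameters across these ranks.
-   print(f"tp_size={smp.tp_size()}, tp_rank={smp.tp_rank()}")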
- -You can enable or disable tensor parallelism for specific parts of the model. -Within the enabled parts, the replacements with distributed modules will take -place on a best-effort basis for those module supported for tensor parallelism. -Alternatively, you can directly import and use the library’s distributed -modules in the model definition. - -Some of the supported modules (such as ``smdistributed.modelparallel.torch.nn.Transformer``) are high-level -blocks that contain many operations. Because custom implementations -(as opposed to the built-in PyTorch modules) are typically used for these -high-level blocks, the library offers an API that you can use to register -specific distributed versions with such custom modules (provided that they -are functionally equivalent). This allows the library to automatically replace -the occurrences of such PyTorch modules with their distributed counterparts -provided by the library. -For more information, see the following topics. - -.. contents:: Topics - :depth: 3 - :local: - -.. _registering-tp-modules: - :noindex: - -Registering Tensor Parallelism Distributed Modules --------------------------------------------------- - -Although PyTorch natively provides some of the commonly used (and -tensor-parallelizable) building blocks such as Transformer, users often -use custom implementations for such higher-level modules. To distribute -such modules with tensor parallelism, you need to register the -distributed modules to the custom module implementation in your class, -so that the library knows how to distribute the custom module. When you -register the distributed modules, make sure the custom module that you -use is functionally equivalent to the distributed module. You can verify -this by taking a look at the equivalent reference implementations in the -:ref:`smdmp-tp-appendix`. -These implementations are functionally equivalent to their distributed -versions in ``smdistributed.modelparallel.torch.nn`` module. - -.. class:: smdistributed.modelparallel.torch.tp_register(dist_module, init_hook=None, forward_hook=None, return_hook=None) - :noindex: - - - A decorator class that registers the ``dist_module`` class with - the module class that it is attached to. The hooks can be used to - adapt to different interfaces used with ``__init__`` and - ``forward`` methods. - - **Arguments:** - - - ``dist_module``: A subclass of ``smdistributed.modelparallel.torch.nn.DistributedModule`` - that implements the distributed version of the module class the - decorator is attached to. Any distributed module class defined - in ``smdistributed.modelparallel.torch.nn`` module can be used. - - ``init_hook``: A callable that translates the arguments of the - original module ``__init__`` method to an ``(args, kwargs)`` - tuple compatible with the arguments of the corresponding - distributed module ``__init__`` method. Must return a tuple, - whose first element is an iterable representing the positional - arguments, and second element is a ``dict`` representing the - keyword arguments. The input signature of the ``init_hook`` - must **exactly** match the signature of the original - ``__init__`` method (including argument order and default - values), except it must exclude ``self``. - - ``forward_hook``: A callable that translates the arguments of - the original module ``forward`` method to an ``(args, kwargs)`` - tuple compatible with the arguments of the corresponding - distributed module ``forward`` method. 
Must return a tuple, - whose first element is an iterable representing the positional - arguments, and second element is a ``dict`` representing the - keyword arguments. The input signature of the ``init_hook`` - must **exactly** match the signature of the original - ``forward`` method (including argument order and default - values), except it must exclude ``self``. - - ``return_hook``: A callable that translates the object returned - from the distributed module to the return object expected of - the original module. - - - **Example:** - - .. code:: python - - import smdistributed.modelparallel.torch as smp - - init_hook = lambda config: ((), config.to_dict()) - - # register smp.nn.DistributedTransformer - # as the distributed version of MyTransformer - @smp.tp_register(smp.nn.DistributedTransformer, init_hook=init_hook) - class MyTransformer(nn.Module): - def __init__(self, config): - ... - - def forward(self, hidden_states, attention_mask): - ... - -.. function:: smdistributed.modelparallel.torch.tp_register_with_module(module_cls, dist_module, init_hook=None, forward_hook=None, return_hook=None) - :noindex: - - - When you do not have direct access to model definition code, you - can use this API to similarly register a distributed module with - an existing module class. - - - **Arguments:** - - - ``module_cls``: The existing module class that will be - distributed. - - ``dist_module``: A subclass of ``smdistributed.modelparallel.torch.nn.DistributedModule`` - that implements the distributed version of the module class the - decorator is attached to. Any distributed module class defined - in ``smdistributed.modelparallel.torch.nn`` module can be used. - - ``init_hook``: A callable that translates the arguments of the - original module ``__init__`` method to an ``(args, kwargs)`` - tuple compatible with the arguments of the corresponding - distributed module ``__init__`` method. Must return a tuple, - whose first element is an iterable representing the positional - arguments, and second element is a ``dict`` representing the - keyword arguments. The input signature of the ``init_hook`` - must **exactly** match the signature of the original - ``__init__`` method (including argument order and default - values), except it must exclude ``self``. - - ``forward_hook``: A callable that translates the arguments of - the original module ``forward`` method to an ``(args, kwargs)`` - tuple compatible with the arguments of the corresponding - distributed module ``forward`` method. Must return a tuple, - whose first element is an iterable representing the positional - arguments, and second element is a ``dict`` representing the - keyword arguments. The input signature of the ``init_hook`` - must **exactly** match the signature of the original - ``forward`` method (including argument order and default - values), except it must exclude ``self``. - - ``return_hook``: A callable that translates the object returned - from the distributed module to the return object expected of - the original module. - - - **Example:** - - .. code:: python - - import smdistributed.modelparallel.torch as smp - - from somelibrary import MyTransformer - - init_hook = lambda config: ((), config.to_dict()) - - # register smp.nn.DistributedTransformer as the distributed version of MyTransformer - smp.tp_register_with_module(MyTransformer, - smp.nn.DistributedTransformer, - init_hook=init_hook) - -.. 
_smdmp-supported-modules-for-tp: - :noindex: - -Supported Modules for Tensor Parallelism ----------------------------------------- - -The following modules are supported for tensor parallelism. - -.. contents:: Topics - :depth: 3 - :local: - -.. _tp-module-api: - :noindex: - -Tensor Parallelism Module APIs -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -- :class:`smdistributed.modelparallel.torch.nn.DistributedLinear` (implements ``nn.Linear``) -- :class:`smdistributed.modelparallel.torch.nn.DistributedTransformerLMHead` -- :class:`smdistributed.modelparallel.torch.nn.DistributedTransformer` -- :class:`smdistributed.modelparallel.torch.nn.DistributedTransformerLayer` -- :class:`smdistributed.modelparallel.torch.nn.DistributedAttentionLayer` -- :class:`smdistributed.modelparallel.torch.nn.DistributedTransformerOutputLayer` -- :class:`smdistributed.modelparallel.torch.nn.DistributedEmbedding` - -.. class:: smdistributed.modelparallel.torch.nn.DistributedLinear(in_features, out_features) - :noindex: - - Tensor-parallel implementation of the ``nn.Linear`` class. - Functionally equivalent to an ``nn.Linear`` module with the same - ``in_features`` and ``out_features``. In other words, - ``in_features`` and ``out_features`` are the number of *global* - channels across tensor-parallel ranks. - - For more information about what's the reference implementation of this module, - see :ref:`smdmp-tp-appendix`. - - - - **Arguments:** - - - ``in_features``: The total number of input channels for the - linear layer across all tensor-parallel ranks. - - ``out_features``: The total number of output channels for the - linear layer across all tensor-parallel ranks. - -.. class:: smdistributed.modelparallel.torch.nn.DistributedTransformerLMHead(num_layers=12, num_attention_heads=32, attention_head_size=32, hidden_size=1024, intermediate_size=4096, vocab_size=30522, num_positions=1024, attention_dropout_prob=0.1, hidden_dropout_prob=0.1, activation="gelu", layernorm_epsilon=1e-5, num_token_types=0, causal_mask_size=None, add_cross_attention=False, add_lm_head=True, initializer_range=0.02, use_normal_initialization=False, pre_layernorm=False, post_layernorm=True) - :noindex: - - Constructs a distributed transformer model, including embeddings - and a single LM head. A word embedding of size - ``(vocab_size, hidden_size)`` is created, as well as a positional - embedding of size ``(num_positions, hidden_size)``, and the - embeddings are added together. If ``num_token_types`` is larger - than 0, a separate embedding of size - ``(num_token_types, hidden_size)`` is created, and further added - on top. - - - The embeddings are fed through a ``DistributedTransformer``, and - if ``add_lm_head`` is ``True``, the output passes through a single - LM head, which is a linear module without bias whose weight is - tied to the word embeddings. - - See :class:`smdistributed.modelparallel.torch.nn.DistributedTransformerLayer` for descriptions of the rest - of the arguments. - - **Methods:** - - - ``forward(self, inputs)`` - - - If ``add_cross_attention`` is ``True``, ``inputs`` must be a - tuple - ``(input_ids, attention_mask, token_type_ids, position_ids, cross_states, cross_states, cross_mask, labels)``. - - Otherwise, ``inputs`` must be a tuple - ``(input_ids, attention_mask, token_type_ids, position_ids, labels)``. - - If ``token_type_ids`` is ``None``, token type embedding will - not be used. - - ``input_ids`` is assumed to be of shape ``[N, S]``, where - ``N`` is the batch size and ``S`` is sequence length. 
- - ``attention_mask`` is assumed to be a 0-1 tensor of shape - ``[N, S]``, where 1 represents a masked position. - -.. class:: smdistributed.modelparallel.torch.nn.DistributedTransformer(num_layers=12, num_attention_heads=32, attention_head_size=32, hidden_size=1024, intermediate_size=4096, attention_dropout_prob=0.1, hidden_dropout_prob=0.1, activation="gelu", layernorm_epsilon=1e-5, initializer_range=0.02, use_normal_initialization=False, causal_mask_size=None, add_cross_attention=False, pre_layernorm=False, post_layernorm=True) - :noindex: - - A sequence of :class:`smdistributed.modelparallel.torch.nn.DistributedTransformerLayer`\ s, whose - number is given by ``num_layers`` argument. For the other - arguments and methods, refer to - :class:`smdistributed.modelparallel.torch.nn.DistributedTransformerLayer`. - - If both ``pre_layernorm`` and ``post_layernorm`` are ``True``, - layer normalization is applied to both the input and the output of - the ``DistributedTransformer``, in addition to the intermediate - attention and transformer-output layers. - -.. class:: smdistributed.modelparallel.torch.nn.DistributedTransformerLayer(num_attention_heads=32, attention_head_size=32, hidden_size=1024, intermediate_size=4096, attention_dropout_prob=0.1, hidden_dropout_prob=0.1, activation="gelu", layernorm_epsilon=1e-5, initializer_range=0.02, use_normal_initialization=False, causal_mask_size=None, add_cross_attention=False, pre_layernorm=False, post_layernorm=True) - :noindex: - - Tensor-parallel implementation of a single transformer layer. - Number of attention heads, hidden size, and intermediate size - refer to the global quantities across all tensor-parallel ranks. - - For more information about what's the reference implementation of this module, - see :ref:`smdmp-tp-appendix`. - - - **Arguments:** - - - ``num_attention_heads``: The total number of attention heads - across tensor-parallel ranks - - ``attention_head_size``: The number of channels of a single - attention head. - - ``hidden_size``: The hidden dimension of the transformer. The - input tensor ``hidden_states`` is assumed to have its last - dimension size equal to ``hidden_size``. - - ``intermediate_size``: The number of output channels in the - first linear transformation of the transformer output layer. - ``DistributedTransformerOutputLayer`` first maps - ``hidden_size`` dimensions of its input tensor into - ``intermediate_size`` dimensions, and then maps it back into - ``hidden_size`` dimensions. - - ``attention_dropout_prob``: The dropout probability applied to - the attention probabilities. - - ``hidden_dropout_prob``: The dropout probability used in - dropout layers other than the one applied to the attention - probabilities. - - ``activation``: Choice of activation function to use at the - output layer. Must be ``"gelu"`` or ``"relu"``. - - ``layernorm_epsilon``: The epsilon added to the denominator of - layer normalization for numerical stability. - - ``initializer_range``: If ``use_normal_initialization`` is - ``True``, the standard deviation of the normal random variable - to initialize the weights with. - - ``use_normal_initialization``: If ``True``, the weights are - initialized with normal distribution with standard deviation - given by ``initializer_range``. Otherwise, default PyTorch - initialization is used. - - ``causal_mask_size``: If ``None``, no causal mask is used on - attentions. Otherwise, should be set to maximum sequence length - to apply a causal mask to the attention scores. 
This is used, - for instance, in GPT-2. - - ``add_cross_attention``: If ``True``, a cross-attention layer - will be added after the self-attention block. The - cross-attention layer computes the attention keys and values - based on the ``cross_states`` input (instead of - ``hidden_states`` input, as in self-attention. This is used in - the decoder block of encoder-decoder architectures. For - encoder-only architectures that only use self-attention, this - should be kept ``False``. - - ``pre_layernorm``: If ``True``, inserts layer normalization at - the input. At least one of ``pre_layernorm`` and - ``post_layernorm`` must be ``True``. - - ``post_layernorm``: If ``True``, inserts layer normalization at - the output. At least one of ``pre_layernorm`` and - ``post_layernorm`` must be ``True``. - - - **Methods:** - - - ``forward(self, inputs)``: Forward pass for the transformer - layer. - - - **Arguments:** - - - If ``add_cross_attention=False``, ``inputs`` must be a - tuple ``(hidden_states, attention_mask)``, where - ``hidden_states`` is assumed to be a tensor of dimensions - ``[N, S, H]``, where ``N`` is batch size, ``S`` is - sequence length, and ``H`` is ``hidden_size``. - ``attention_mask`` is assumed to be a tensor of - dimensions ``[N, 1, 1, S]``, where ``N`` is the batch - size, and ``S`` is the sequence length. - - If ``add_cross_attention=True``, ``inputs`` must be a - tuple - ``(hidden_states, cross_states, attention_mask, cross_mask)``, - where ``hidden_states`` is assumed to be a tensor of - dimensions ``[N, S_1, H]``, where ``N`` is batch size, - ``S_1`` is sequence length, and ``H`` is ``hidden_size``. - ``cross_states`` is assumed to be a tensor of size - ``[N, S_2, H]``, similarly interpreted. - ``attention_mask`` is assumed to be a tensor of - dimensions ``[N, 1, 1, S_1]``, where ``N`` is the batch - size, and ``S_1`` is the sequence length, and - ``cross_mask`` is assumed to be a tensor of size - ``[N, 1, 1, S_2]``. Keys and values for the attention - heads in the cross-attention layer (but not the - self-attention layer) are computed using - ``cross_states``, and ``cross_mask`` is applied as the - attention mask in the cross-attention layer (but not the - self-attention layer). - - - **Returns:** - - - If ``add_cross_attention=False``, a tuple - ``(hidden_states, attention_mask)``, where - ``hidden_states`` is the output of the transformer, and - ``attention_mask`` is the same the ``attention_mask`` - argument. - - If ``add_cross_attention=True``, a tuple - ``(hidden_states, cross_states, attention_mask, cross_mask)``, - where ``hidden_states`` is the output of the transformer, - and the next three tensors are the same as the input - arguments. - -.. class:: smdistributed.modelparallel.torch.nn.DistributedAttentionLayer(num_attention_heads=32, attention_head_size=32, hidden_size=1024, attention_dropout_prob=0.1, hidden_dropout_prob=0.1, layernorm_epsilon=1e-5, initializer_range=0.02, use_normal_initialization=False, cross_attention=False, causal_mask_size=None, pre_layernorm=False, post_layernorm=True) - :noindex: - - A distributed implementation for the attention block. Includes the - computation of the self- or cross-attention (context layer), - followed by a linear mapping and dropout, which is optionally - followed by the residual-connection and layer normalization. - - For more information about what's the reference implementation of this module, - see :ref:`smdmp-tp-appendix`. 
- - - **Arguments:** - - - See :class:`smdistributed.modelparallel.torch.nn.DistributedTransformerLayer` for descriptions of the - arguments. - - ``cross_attention``: If ``True``, it computes the attentions - with respect to the ``cross_states`` tensor of the ``forward`` - method input tuple. (Default: ``False``) - - - **Methods:** - - - ``forward(self, inputs)``: Forward pass for the attention - layer. - - - **Arguments:** - - - If ``cross_attention=False``, ``inputs`` must be a tuple - ``(hidden_states, attention_mask)``, where - ``hidden_states`` is assumed to be a tensor of dimensions - ``[N, S, H]``, where ``N`` is batch size, ``S`` is - sequence length, and ``H`` is ``hidden_size``. - ``attention_mask`` is assumed to be a tensor of - dimensions ``[N, 1, 1, S]``, where ``N`` is the - batch size, and ``S`` is the sequence length. - - If ``cross_attention=True``, ``inputs`` must be a tuple - ``(hidden_states, cross_states, attention_mask)``, where - ``hidden_states`` is assumed to be a tensor of dimensions - ``[N, S_1, H]``, where ``N`` is batch size, ``S_1`` is - sequence length, and ``H`` is ``hidden_size``. - ``cross_states`` is assumed to be a tensor of size - ``[N, S_2, H]``, similarly interpreted. - ``attention_mask`` is assumed to be a tensor of - dimensions ``[N, 1, 1, S_2]``, where ``N`` is the batch - size, and ``S_2`` is the sequence length. Keys and values - for the attention heads are computed using - ``cross_states``. - - - **Returns:** - - - A single tensor that is the output of the attention - layer. - -.. class:: smdistributed.modelparallel.torch.nn.DistributedTransformerOutputLayer(hidden_size=1024, intermediate_size=4096, hidden_dropout_prob=0.1, activation="gelu", layernorm_epsilon=1e-5, initializer_range=0.02, use_normal_initialization=False, pre_layernorm=False, post_layernorm=True, fp32_residual_addition=False) - :noindex: - - - Distributed implementation of a single transformer output layer. A - single :class:`smdistributed.modelparallel.torch.nn.DistributedTransformerLayer` with - ``add_cross_attention=False`` consists of a single - ``DistributedAttentionLayer`` immediately followed by a single - ``DistributedTransformerOutputLayer``. The latter linearly maps - the last channel of the input tensor from ``hidden_size`` to - ``intermediate_size``, and then maps it back to ``hidden_size``. - - For more information about what's the reference implementation of this module, - see :ref:`smdmp-tp-appendix`. - - - **Arguments:** - - - See :class:`smdistributed.modelparallel.torch.nn.DistributedTransformerLayer` for descriptions of the - arguments. - - ``fp32_residual_addition``: Set to ``True`` if you want to avoid overflow - (NaN loss values) for large models with more than 100 billion parameters - when using FP16. (Default: False) - -.. class:: smdistributed.modelparallel.torch.nn.DistributedEmbedding(num_embeddings,embedding_dim, padding_idx=None, max_norm=None, norm_type=2.0, scale_grad_by_freq=False, sparse=False, _weight=None, initializer_range=0.02, _skip_allgather=False,_skip_scatter_and_merge=False,) - :noindex: - - - Distributed implementation of a single Embedding Layer. Currently - only supports splitting across the embedding_dim. - - **Arguments:** - - - See :class:`smdistributed.modelparallel.torch.nn.DistributedEmbedding` for descriptions of the - arguments. - -.. _enabling-tp: - :noindex: - -Enabling Tensor Parallelism -~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -There are two ways tensor parallelism can be enabled. 
- -First, you can use -the distributed module implementations in ``smdistributed.modelparallel.torch.nn`` module directly in -your model definition. See :ref:`smdmp-supported-modules-for-tp` -for a complete list of built-in distributed modules. Here is an example -of how this can be done: - -.. code:: python - - import torch.nn as nn - import smdistributed.modelparallel.torch as smp - - class TransformerModel: - def __init__(self): - self.embedding = nn.Embedding(vocab_size, hidden_size) - - # directly instantiate smp.nn.DistributedTransformer and use it - self.encoder = smp.nn.DistributedTransformer(num_layers, hidden_size, **kwargs) - - self.pooler = nn.Linear(hidden_size, hidden_size) - - def forward(self, hidden_states): - emb_out = self.embedding(hidden_states) - enc_out = self.encoder(emb_out) - return self.pooler(enc_out) - -Second, you can enable tensor parallelism for specific modules or blocks -of code, which will automatically enable tensor parallelism for the -supported modules within that scope. To do this, you can use the -following API: - -.. decorator:: smdistributed.modelparallel.torch.tensor_parallelism(enabled=True, **kwargs) - :noindex: - - - A context manager that enables or disables tensor parallelism for - any supported module that is created inside. If there are nested - contexts, the innermost overrides the rest. If there are - multiple supported modules created within the context, where one - is the submodule of the other, only the outermost module will be - distributed. If a supported module shares weights with another - (supported or unsupported) module, or if its hyperparameters do - not support distribution (e.g., not divisible by the tensor - parallelism degree), tensor parallelism will **not** be enabled - for this module even if this API is used. - - **Example:** - - .. code:: python - - import smdistributed.modelparallel.torch as smp - - with smp.tensor_parallelism(): - self.m0 = nn.Linear(20, 20) # will be distributed - with smp.tensor_parallelism(enabled=False): - self.m1 = nn.Linear(20, 20) # will not be distributed - - - ``kwargs`` - Keyword arguments that can be used to modify the configurations of - the distributed modules created inside the context. - If a keyword argument provided through it matches any ``__init__`` method arguments - of a ``DistributedModule`` that substitutes a module created inside - the ``smdistributed.modelparallel.torch.tensor_parallelism`` context, this keyword will override - the value defined in the ``init_hook``. - - - (*For v1.7.0 and later*) Through the following additional keyword arguments, - the library supports `NVIDIA Megatron’s fused kernels - `_ - - - ``fused_softmax`` (bool) - Fusion of attention masking and softmax. - By default, it is set to ``True``. You can deactivate it by setting - ``fused_softmax=False`` in the ``smdistributed.modelparallel.torch.tensor_parallelism`` context manager. - - ``fused_bias_gelu`` (bool) - Fusion of bias addition and Gelu activation. - By default, it is set to ``False``. You can activate it by setting - ``fused_bias_gelu=True`` in the ``smdistributed.modelparallel.torch.tensor_parallelism`` context manager. - - - -.. function:: smdistributed.modelparallel.torch.set_tensor_parallelism(module, enabled=True, **kwargs) - :noindex: - - - Enables or disables tensor parallelism for the supported - submodules of ``module``. If enabling, the outermost supported - modules will be distributed. If disabling, tensor parallelism will - be disabled for the entire module subtree of ``module``. 
Unlike - the context manager, this API can be used after the model creation - (but before wrapping with :class:`smdistributed.modelparallel.torch.DistributedModel`), so direct - access to model definition code is not required. If a supported - module shares weights with another (supported or unsupported) - module, or if its hyperparameters do not support distribution - (e.g., not divisible by the tensor parallelism degree), tensor - parallelism will **not** be enabled for this module. - - Keyword arguments ``kwargs`` can be used to modify the - configurations of the distributed modules created inside the - context. If a keyword argument provided here matches any - ``__init__`` method arguments of a :class:`smdistributed.modelparallel.torch.DistributedModel` that - substitutes a module created inside the ``smdistributed.modelparallel.torch.tensor_parallelism`` - context, this keyword will override the value defined in the - ``init_hook``. - - **Example:** - - .. code:: python - - import smdistributed.modelparallel.torch as smp - - model = MyModel() - smp.set_tensor_parallelism(model.encoder, True) - smp.set_tensor_parallelism(model.encoder.embedding, True) - - # outermost supported submodules in model.encoder will be distributed, except for - # model.encoder.embedding - model = smp.DistributedModel(model) - optimizer = smp.DistributedOptimizer(optimizer) - -.. _activation-checkpointing-api: - :noindex: - -Activation Checkpointing APIs ------------------------------ - -``smdistributed.modelparallel`` provides three APIs to enable -activation checkpointing: one for checkpointing modules, -one for checkpointing sequential modules, and -one for checkpointing pretrained models. - -For a conceptual guide and examples, see -`Activation Checkpointing `_ -in the *SageMaker's Distributed Model Parallel developer guide*. - -.. class:: smdistributed.modelparallel.torch.patches.checkpoint.checkpoint(module, *args, preserve_rng_state=True) - :noindex: - - - Checkpoints the module passed. Throws error if, during manual - partitioning, all children of module are not on same rank as the - module itself, i.e. the module tree is split across multiple - partitions. During auto-partitioning, if the module is split - across multiple partitions, then this call is ignored(with a - warning). Note that this call applies to the module instance only, - not to the module class. - - - **Arguments:** - - - ``module (Instance of nn.Module)``: The module to be - checkpointed. Note that unlike native checkpointing in - PyTorch’s, activation checkpointing in - ``smdistributed.modelparallel`` is at the granularity of a - module. A generic function cannot be passed here. - - ``args``: Tuple containing inputs to the module. - - ``preserve_rng_state (bool, default=True)``: Omit stashing and - restoring the RNG state during each checkpoint. - -.. class:: smdistributed.modelparallel.torch.patches.checkpoint.checkpoint_sequential(sequential_module, input, strategy="each", preserve_rng_state=True, pack_args_as_tuple=False) - :noindex: - - - Checkpoints the modules inside - `nn.Sequential `__. - This can be used even if different layers that are part of the - sequential container lie on different partitions. Each layer part - of the sequential module that is checkpointed must lie completely - within one partition. If this is not the case during manual - partitioning, then an error will be thrown. If this is not the - case during auto partitioning, a warning will be raised and this - module will be run without checkpointing. 
-
-   - **Arguments**
-
-     - ``sequential_module (nn.Sequential)``: The sequential module to
-       be checkpointed.
-     - ``input (torch.Tensor or a tuple of torch.Tensors)``: Input to
-       the module, which can be a tensor or a tuple of tensors. If a
-       tuple is passed, then ``pack_args_as_tuple`` should be set to True.
-     - ``strategy (string, default="each")``: The strategy determines how
-       many layers of the sequential module are grouped
-       together for one checkpointing call. This determines how much
-       memory can be reduced. It can take the following values:
-
-       - ``each``: The default is to checkpoint each module inside
-         the sequential separately.
-       - ``contiguous``: Groups consecutive layers on the same
-         partition together. For example, if a sequential consists of
-         [a, b, c, d] where a, b are on pp_rank 0 and c, d are on
-         pp_rank 1, then this strategy would checkpoint a, b together
-         and then c, d together. This means effectively, inputs of a,
-         outputs of b, inputs of c, and outputs of d are in memory;
-         the remaining activations are recomputed.
-       - ``group_2, group_3, group_4, etc.``: More generally,
-         ``group_x`` where x is an integer. This strategy provides
-         more flexibility in how many layers to group together.
-         ``group_x`` groups x layers together on a best-effort basis.
-         It can group x layers together if there are x layers
-         consecutively on the same partition. For example:
-         [a, b, c, d, e] where a, b are on pp_rank 0 and c, d, e are on
-         pp_rank 1. If the strategy is ``group_3``, then a, b are
-         checkpointed together on pp_rank 0 and c, d, e are checkpointed
-         together on pp_rank 1.
-
-     - ``preserve_rng_state (bool, default=True)``: Set to ``False``
-       to omit stashing and restoring the RNG state during each
-       checkpoint.
-     - ``pack_args_as_tuple (bool, default=False)``: To ensure that
-       backward works correctly, the autograd function has to unpack
-       any tuples received. If the checkpointed layer takes a tuple as
-       input, then this needs to be set to True.
-
-.. class:: smdistributed.modelparallel.torch.set_activation_checkpointing(module, preserve_rng_state=True, pack_args_as_tuple=False, strategy="each")
-   :noindex:
-
-   This API is recommended when importing pretrained models from
-   libraries, such as PyTorch and Hugging Face Transformers. This is
-   particularly useful when you don’t have access to the model
-   definition code and are not able to replace a module call with
-   checkpoint.
-
-   - **Arguments**:
-
-     - ``module (Instance of nn.Module or nn.Sequential)``: The module
-       to checkpoint.
-     - ``preserve_rng_state (bool, default=True)``: Set to ``False``
-       to omit stashing and restoring the RNG state during each
-       checkpoint.
-     - ``pack_args_as_tuple (bool, default=False)``: *Can only be
-       passed when module is a sequential module.* To ensure that
-       backward works correctly, the autograd function has to unpack
-       any tuples received. If the checkpointed layer takes a tuple as
-       input, then this needs to be set to True.
-     - ``strategy (string, default="each")``: *Can only be passed
-       when module is a sequential module.* The strategy determines how
-       many layers of the sequential module are grouped
-       together for one checkpointing call.
-
-       This determines how much memory can be reduced. It can take the
-       following values:
-
-       - ``each``: The default is to checkpoint each module inside
-         the sequential separately.
-       - ``contiguous``: Groups consecutive layers on the same
-         partition together.
For example if a sequential consists of - ``[a, b, c, d]`` where ``a, b`` are on ``pp_rank0`` and ``c, d`` are on - ``pp_rank 1``, then this strategy would checkpoint a,b together - and then ``c, d`` together. This means effectively, the inputs of - ``a``, outputs of ``b``, inputs of ``c``, and outputs of ``d`` are in - memory, and the rest of the activations are recomputed. - - ``group_2, group_3, group_4, etc:`` More generally, - ``group_x`` where x is an integer. This strategy provides - more flexibility in how many layers to group together. - ``group_x`` groups x number of layers together on a best - effort basis if there are x layers consecutively in the same - partition. **Example**: Assume a module with layers ``[a, b, - c, d, e]``. The layers a and b are on pp_rank0, and ``c``, ``d``, and - ``e`` are on ``pp_rank 1``. If the strategy is ``group_3,`` then ``a``, - ``b`` are checkpointed together on ``pp_rank0``, and ``c``, ``d``, ``e`` are - checkpointed together on ``pp_rank1``. - -.. _smdmp-tp-appendix: - :noindex: - -Appendix: Reference Implementations for Modules ------------------------------------------------ - -The following are reference implementations for transformer-related -modules. Note that this is not the actual ``smdistributed`` source code, -but the distributed implementations provided in the library are the -distributed versions of these reference implementations, and can be used -to determine whether the distributed modules perform the same operations -as the custom modules in your script. - -To keep the implementations simple, we only assume keyword arguments, -and assume the existence of a method ``parse_args(kwargs)``, which -parses the arguments to ``__init__`` methods and sets the relevant -attributes of the module, such as ``hidden_size`` and -``num_attention_heads``. - -``smdistributed.modelparallel.torch.nn.DistributedTransformer`` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. code:: python - - class Transformer(nn.Module): - def __init__(self, **kwargs): - super(Transformer, self).__init__() - self.parse_args(kwargs) - - self.layers = [] - for l in range(self.num_layers): - self.layers.append(TransformerLayer(**kwargs)) - - self.seq_layers = nn.Sequential(*self.layers) - - def forward(self, inp): - return self.seq_layers(inp) - -``smdistributed.modelparallel.torch.nn.DistributedTransformerLayer`` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. code:: python - - class TransformerLayer(nn.Module): - def __init__(self, **kwargs): - super(TransformerLayer, self).__init__() - self.parse_args(kwargs) - - self.attention = AttentionLayer(**kwargs) - self.output = TransformerOutputLayer(**kwargs) - - if self.add_cross_attention: - self.cross_attention = AttentionLayer(cross_attention=True, **kwargs) - - def forward(self, inp): - if self.add_cross_attention: - hidden_states, cross_states, attention_mask, cross_mask = inp - else: - hidden_states, attention_mask = inp - - attention_output = self.attention((hidden_states, attention_mask)) - if self.add_cross_attention: - attention_output = self.cross_attention((attention_output, - cross_states, - cross_mask)) - - output = self.output(attention_output) - - if self.add_cross_attention: - return output, cross_states, attention_mask, cross_mask - else: - return output, attention_mask - -``smdistributed.modelparallel.torch.nn.DistributedAttentionLayer`` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. 
code:: python - - class AttentionLayer(nn.Module): - def __init__(self, **kwargs): - super(AttentionLayer, self).__init__() - self.parse_args(kwargs) - self.attention_head_size = self.hidden_size // self.num_attention_heads - - self.query = nn.Linear(self.hidden_size, self.hidden_size) - self.key = nn.Linear(self.hidden_size, self.hidden_size) - self.value = nn.Linear(self.hidden_size, self.hidden_size) - self.dense = nn.Linear(self.hidden_size, self.hidden_size) - - self.dropout1 = nn.Dropout(self.attention_dropout_prob) - self.dropout2 = nn.Dropout(self.hidden_dropout_prob) - - if self.pre_layernorm: - self.pre_layernorm = nn.LayerNorm(self.hidden_size, - eps=self.layernorm_epsilon) - - if self.post_layernorm: - self.layernorm = nn.LayerNorm(self.hidden_size, - eps=self.layernorm_epsilon) - - def transpose(self, tensor, key=False): - shape = tensor.size()[:-1] + - (self.num_attention_heads, self.attention_head_size) - tensor = torch.reshape(tensor, shape) - if key: - return tensor.permute(0, 2, 3, 1) - else: - return tensor.permute(0, 2, 1, 3) - - def forward(self, inp): - if self.cross_attention: - hidden_states, cross_states, attention_mask = inp - else: - hidden_states, attention_mask = inp - - if self.pre_layernorm: - norm_states = self.pre_layernorm(hidden_states) - else: - norm_states = hidden_states - - query_layer = self.query(norm_states) - - if self.cross_attention: - key_layer = self.key(cross_states) - value_layer = self.value(cross_states) - else: - key_layer = self.key(norm_states) - value_layer = self.value(norm_states) - - query_layer = self.transpose(query_layer) - key_layer = self.transpose(key_layer, key=True) - value_layer = self.transpose(value_layer) - - attention_scores = torch.matmul(query_layer, key_layer) - attention_scores = attention_scores / math.sqrt(self.attention_head_size) - - if not self.cross_attention and self.causal_mask is not None: - attention_scores = self.apply_causal_mask(attention_scores) - - attention_scores = attention_scores + attention_mask - - attention_probs = F.softmax(attention_scores, dim=-1) - attention_probs = self.dropout1(attention_probs) - - context_layer = torch.matmul(attention_probs, value_layer) - context_layer = context_layer.permute(0, 2, 1, 3) - new_context_layer_shape = context_layer.size()[:-2] + \ - (self.local_attention_size,) - context_layer = torch.reshape(context_layer, new_context_layer_shape) - - self_attention = self.dense(context_layer) - self_attention = self.dropout2(self_attention) - - if self.post_layernorm: - return self.layernorm(self_attention + hidden_states) - else: - return self_attention - -``smdistributed.modelparallel.torch.nn.DistributedTransformerOutputLayer`` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. 
code:: python - - class TransformerOutputLayer(nn.Module): - def __init__(self, **kwargs): - super(TransformerOutputLayer, self).__init__() - self.parse_args(kwargs) - - self.dense1 = nn.Linear(self.hidden_size, self.intermediate_size) - self.dense2 = nn.Linear(self.intermediate_size, self.hidden_size) - - self.dropout = nn.Dropout(self.attention_dropout_prob) - - if self.pre_layernorm: - self.pre_layernorm = nn.LayerNorm(self.hidden_size, - eps=self.layernorm_epsilon) - - if self.post_layernorm: - self.layernorm = nn.LayerNorm(self.hidden_size, - eps=self.layernorm_epsilon) - - def forward(self, inp): - if self.pre_layernorm: - norm_inp = self.pre_layernorm(inp) - else: - norm_inp = inp - - dense1_output = self.dense1(norm_inp) - if self.activation == "gelu": - act_output = F.gelu(dense1_output) - else: - act_output = F.relu(dense1_output) - - dense2_output = self.dense2(act_output) - output = self.dropout(dense2_output) - - if self.post_layernorm: - return self.layernorm(inp + output) - else: - return output diff --git a/doc/api/training/smp_versions/v1.10.0/smd_model_parallel_tensorflow.rst b/doc/api/training/smp_versions/v1.10.0/smd_model_parallel_tensorflow.rst deleted file mode 100644 index 6630371b94..0000000000 --- a/doc/api/training/smp_versions/v1.10.0/smd_model_parallel_tensorflow.rst +++ /dev/null @@ -1,171 +0,0 @@ -TensorFlow API -============== - -To use the TensorFlow-specific APIs for SageMaker distributed model parallism, -you need to add the following import statement at the top of your training script. - -.. code:: python - - import smdistributed.modelparallel.tensorflow as smp - -.. tip:: - - Refer to - `Modify a TensorFlow Training Script - `_ - to learn how to use the following APIs in your TensorFlow training script. - -.. class:: smp.DistributedModel - :noindex: - - A sub-class of the Keras \ ``Model`` class, which defines the model to - be partitioned. Model definition is done by sub-classing - ``smp.DistributedModel`` class, and implementing the ``call()`` method, - in the same way as the Keras model sub-classing API. Any operation that - is part of the \ ``smp.DistributedModel.call()`` method is subject to - partitioning, meaning that every operation placed inside executes in - exactly one of the devices (the operations outside run on all devices). - - - Similar to the regular Keras API, the forward pass is done by directly - calling the model object on the input tensors. For example: - - .. code:: python - - predictions = model(inputs)   # model is a smp.DistributedModel object - - However, ``model()`` calls can only be made inside a - ``smp.step``-decorated function. - - The outputs from a ``smp.DistributedModel`` are available in all ranks, - regardless of which rank computed the last operation. - - **Methods:** - - .. function:: save_model(save_path="/opt/ml/model") - :noindex: - - **Inputs** - - ``save_path`` (``string``): A path to save an unpartitioned model with latest training weights. - - Saves the entire, - unpartitioned model with the latest trained weights to ``save_path`` in - TensorFlow ``SavedModel`` format. Defaults to ``"/opt/ml/model"``, which - SageMaker monitors to upload the model artifacts to Amazon S3. - -.. function:: smp.partition(index) - :noindex: - - **Inputs** - - - ``index`` (``int``): The index of the partition. - - A context manager which places all operations defined inside into the - partition whose ID is equal to ``index``. When - ``smp.partition`` contexts are nested, the innermost context overrides - the rest. 
The ``index`` argument must be smaller than the number of - partitions. - - ``smp.partition`` is used in the manual partitioning API; - if \ ``"auto_partition"`` parameter is set to ``True`` while launching - training, then ``smp.partition`` contexts are ignored. Any operation - that is not placed in any ``smp.partition`` context is placed in the - ``default_partition``, as shown in the following example: - - .. code:: python - - # auto_partition: False - # default_partition: 0 - smp.init() - [...] - x = tf.constant(1.2)                     # placed in partition 0 - with smp.partition(1): -     y = tf.add(x, tf.constant(2.3))      # placed in partition 1 -     with smp.partition(3): -         z = tf.reduce_sum(y)             # placed in partition 3 - - -.. function:: register_post_partition_hook(hook) - :noindex: - - Registers a callable ``hook`` to - be executed after the model is partitioned. This is useful in situations - where an operation needs to be executed after the model partition during - the first call to ``smp.step``, but before the actual execution of the - first forward pass. - - .. code:: python - - @smp.register_post_partition_hook - def test_eager(): - # All statements here will be executed right after partition but before the first forward pass - tf.print("Entered hook through eager context") - -.. class:: smp.CheckpointManager - :noindex: - - - A subclass of TensorFlow - `CheckpointManager `__, - which is used to manage checkpoints. The usage is similar to TensorFlow - ``CheckpointManager``. - - The following returns a ``CheckpointManager`` object. - - .. code:: python - - smp.CheckpointManager(checkpoint, -                       directory="/opt/ml/checkpoints", -                       max_to_keep=None, -                       checkpoint_name="ckpt") - - **Parameters** - - - ``checkpoint``: A `tf.train.Checkpoint - `__ instance - that represents a model checkpoint. - - - ``directory``: (``str``) The path to a directory in which to write - checkpoints. A file named "checkpoint" is also written to this - directory (in a human-readable text format) which contains the state - of the ``CheckpointManager``. Defaults to - ``"/opt/ml/checkpoints"``, which is the directory that SageMaker - monitors for uploading the checkpoints to Amazon S3. - - ``max_to_keep`` (``int``): The number of checkpoints to keep. If - ``None``, all checkpoints are kept. - - ``checkpoint_name`` (``str``): Custom name for the checkpoint file. - Defaults to ``"ckpt"``. - - - **Methods:** - - .. function:: save( ) - :noindex: - - Saves a new checkpoint in the specified directory. Internally uses ``tf.train.CheckpointManager.save()``. - - .. function:: restore( ) - :noindex: - - Restores the latest checkpoint in the specified directory. - Internally uses ``tf.train.CheckpointManager.restore()``. - - - **Examples:** - - .. code:: python - - checkpoint = tf.train.Checkpoint(optimizer=optimizer, model=model) - ckpt_manager = smp.CheckpointManager(checkpoint, max_to_keep=5)  # use /opt/ml/checkpoints - - for inputs in train_ds: -     loss = train_step(inputs) -     # [...] -     ckpt_manager.save()  # save a new checkpoint in /opt/ml/checkpoints - - .. 
code:: python - - for step, inputs in enumerate(train_ds): -     if step == 0: -         ckpt_manager.restore() -     loss = train_step(inputs) diff --git a/doc/api/training/smp_versions/v1.2.0/smd_model_parallel_common_api.rst b/doc/api/training/smp_versions/v1.2.0/smd_model_parallel_common_api.rst deleted file mode 100644 index 533611ef5e..0000000000 --- a/doc/api/training/smp_versions/v1.2.0/smd_model_parallel_common_api.rst +++ /dev/null @@ -1,487 +0,0 @@ -.. admonition:: Contents - - - :ref:`communication_api` - - :ref:`mpi_basics` - -Common API -========== - -The following SageMaker distribute model parallel APIs are common across all frameworks. - -**Important**: This API document assumes you use the following import statement in your training scripts. - -**TensorFlow** - -.. code:: python - - import smdistributed.modelparallel.tensorflow as smp - -**PyTorch** - -.. code:: python - - import smdistributed.modelparallel.torch as smp - - -.. function:: smp.init( ) - :noindex: - - Initialize the library. Must be called at the beginning of training script. - -.. function:: @smp.step(non_split_inputs, input_split_axes, [*args, **kwargs]) - :noindex: - - A decorator that must be placed over a function that represents a single - forward and backward pass (for training use cases), or a single forward - pass (for evaluation use cases). Any computation that is defined inside - the ``smp.step``-decorated function is executed in a pipelined manner. - - By default, every tensor input to the function is split across its batch - dimension into a number of microbatches specified while launching the - training job. This behavior can be customized through the arguments to - ``smp.step``, described below. The library then orchestrates the execution of - each microbatch across all partitions, based on the chosen pipeline - type. - - In a typical use case, forward pass and back-propagation are executed - inside an \ ``smp.step``-decorated function and gradients, loss, and - other relevant metrics (such as accuracy, etc.) are returned from - ``smp.step``-decorated function. - - Any gradient post-processing operation, such as gradient clipping and - allreduce, as well as ``optimizer.apply_gradients`` calls (for TF) or - ``optimizer.step`` (for PT) should be applied on the gradients returned - from the ``smp.step`` function, and not inside the ``smp.step`` - function. This is because every operation inside ``smp.step`` is - executed once per microbatch, so having these operations inside - ``smp.step`` can either be inefficient (in the case of allreduce), or - lead to wrong results (in the case of ``apply_gradients`` / - ``optimizer.step``). - - If the objects returned from the ``smp.step``-decorated function contain - ``tf.Tensor``\ s / ``torch.Tensor``\ s, they are converted to - ``StepOutput`` objects. A ``StepOutput`` object encapsulates all - versions of the tensor across different microbatches - (see ``StepOutput`` entry for more information). - - The argument to ``smp.step`` decorated function should either be a tensor - or an instance of list, tuple, dict or set for it to be split across - microbatches. If your object doesn't fall into this category, you can make - the library split your object, by implementing ``smp_slice`` method. - - Below is an example of how to use it with PyTorch. - - .. 
code:: python - - class CustomType: - def __init__(self, tensor): - self.data = tensor - - # The library will call this to invoke slicing on the object passing in total microbatches (num_mb) - # and the current microbatch index (mb). - def smp_slice(self, num_mb, mb, axis): - dim_size = list(self.data.size())[axis] - - split_size = dim_size // num_mb - sliced_tensor = self.data.narrow(axis, mb * split_size, split_size) - return CustomType(sliced_tensor, self.other) - - custom_obj = CustomType(torch.ones(4,)) - - @smp.step() - def step(custom_obj): - loss = model(custom_obj) - model.backward(loss) - return loss - - - **Important:** ``smp.step`` splits the batch into microbatches, and - executes everything inside the decorated function once per microbatch. - This might affect the behavior of batch normalization, any operation - that explicitly uses the batch size information, or any other Python - code that is expected to run once. - - **TensorFlow-specific behavior** - - ``smp.step`` is a wrapper that - inherits from and extends the behavior of ``tf.function``, and as such, - all the caveats that apply to the use of ``tf.function``\ s also apply - to ``smp.step``. In particular, any operation that is inside - ``smp.step`` executes in graph mode, and not eager mode. - - In the first call, ``smp.step`` performs tracing of the wrapped function every time - one of the tensor arguments changes their shape or dtype, or for every - new value of a Python argument, if there is one. Tracing is expensive, - so such scenarios should be avoided as much as possible or, - alternatively, an ``input_signature`` argument must be provided. For - more information on the usage of ``tf.function``, refer to the - TensorFlow documentation: - - - https://www.tensorflow.org/api_docs/python/tf/function\ - - https://www.tensorflow.org/guide/function\ - - Each ``smp.step`` decorated function must have a return value that depends on the - output of ``smp.DistributedModel``. - - **Common parameters** - - - ``non_split_inputs`` (``list``): The list of arguments to the decorated function - that should not be split along the batch dimension. Should be used - for all input tensors that do not have a batch dimension. Should be a - list of argument names as ``str``, as they appear in the signature of - the ``smp.step``-decorated function. By default it is considered an - empty list. - - - ``input_split_axes`` (``dict``): A dict that maps the argument name to its batch - axis. The keys should be the argument names as ``str``, as they - appear in the signature of the ``smp.step``-decorated function.  By - default all batch axes are assumed to be the 0-axis. - - **TensorFlow-only parameters** - - - All arguments of ``tf.function``. Note: - The \ ``experimental_compile`` argument of ``tf.function`` may not - work as expected with ``smp.step``, since it interferes with - pipelining and model partitioning. To enable XLA with the library, you can - instead use \ ``tf.config.optimizer.set_jit(True)``. - - **PyTorch-only parameters** - - - ``detach_outputs`` (``bool``) : If ``True``, calls ``torch.Tensor.detach()`` on - all returned ``torch.Tensor`` outputs. Setting it to ``False`` - increases memory consumption, unless ``detach()`` is manually called - on the returned tensors, because the model graph is not cleared from - memory after the training step. Set to \ ``True`` by default. - - **Returns** - - - The same object(s) returned from the decorated function. 
All - returned \ ``tf.Tensor``, \ ``tf.Variable``  objects (for TF) or - ``torch.Tensor`` objects (for PT) are wrapped inside - a \ ``StepOutput`` object, even when they are inside a Python - ``list``, ``tuple``, or ``dict``. - - - -.. class:: StepOutput - :noindex: - - A class that encapsulates all versions of a ``tf.Tensor`` - or \ ``torch.Tensor`` across all microbatches. - - When a particular ``tf.Tensor`` or ``torch.Tensor`` is computed inside - ``smp.step``, different versions of the tensor are computed for each - microbatch. - - When this tensor is returned from ``smp.step`` and is accessed outside - of the decorated function, it appears as a ``StepOutput`` object, which - contains all such versions. For example, - - - In the case of Tensorflow, the gradient for a particular - ``tf.Variable`` is computed on each microbatch individually, and if - this gradient is returned from ``smp.step``, all gradients for this - ``tf.Variable`` become part of the same ``StepOutput`` object. The - ``StepOutput`` class offers the following API for commonly-used - post-processing operations on such tensors. - - In the case of PyTorch, the loss for each microbatch is computed - individually and all the ``torch.Tensor``\ s that represent the loss - for different microbatches become part of same ``StepOutput`` object, - if loss is returned from the ``smp.step`` function. - - - The ``StepOutput`` class offers the following API for commonly-used - post-processing operations on tensors. - - .. data:: StepOutput.outputs - :noindex: - - Returns a list of the underlying tensors, indexed by microbatch. - - .. function:: StepOutput.reduce_mean( ) - :noindex: - - Returns a ``tf.Tensor``, ``torch.Tensor`` that averages the constituent ``tf.Tensor`` s - ``torch.Tensor`` s. This is commonly used for averaging loss and gradients across microbatches. - - .. function:: StepOutput.reduce_sum( ) - :noindex: - - Returns a ``tf.Tensor`` / - ``torch.Tensor`` that sums the constituent - ``tf.Tensor``\ s/\ ``torch.Tensor``\ s. - - .. function:: StepOutput.concat( ) - :noindex: - - Returns a - ``tf.Tensor``/``torch.Tensor`` that concatenates tensors along the - batch dimension using ``tf.concat`` / ``torch.cat``. - - .. function:: StepOutput.stack( ) - :noindex: - - Applies ``tf.stack`` / ``torch.stack`` - operation to the list of constituent ``tf.Tensor``\ s / - ``torch.Tensor``\ s. - - **TensorFlow-only methods** - - .. function:: StepOutput.merge( ) - :noindex: - - Returns a ``tf.Tensor`` that - concatenates the constituent ``tf.Tensor``\ s along the batch - dimension. This is commonly used for merging the model predictions - across microbatches. - - .. function:: StepOutput.accumulate(method="variable", var=None) - :noindex: - - Functionally the same as ``StepOutput.reduce_mean()``. However, it is - more memory-efficient, especially for large numbers of microbatches, - since it does not wait for all constituent \ ``tf.Tensor``\ s to be - ready to start averaging them, thereby saving memory. - - In some cases (XLA for example) ``StepOutput.reduce_mean()`` might end - up being more memory-efficient than ``StepOutput.accumulate()``. - - **Parameters** - - - ``method`` (``"add_n"`` or ``"accumulate_n"`` or ``"variable"``): - If ``"add_n"`` or ``"accumulate_n"``, the library uses - ``tf.add_n`` and ``tf.accumulate_n``, respectively, to implement - accumulation. If ``"variable"``, the library uses an internal ``tf.Variable`` - into which to accumulate the tensors. Default is \ ``"variable"``. 
- Note: Memory usage behavior of these choices can depend on the model - and implementation. - - - ``var``: A ``tf.Variable`` into which, if provided, the library uses to - accumulate the tensors. If \ ``None``, the library internally creates a - variable. If ``method`` is not ``"variable"``, this argument is - ignored. - -.. _mpi_basics: - :noindex: - -MPI Basics -^^^^^^^^^^ - -The library exposes the following basic MPI primitives to its Python API: - -- ``smp.rank()``: The rank of the current process. -- ``smp.size()``: The total number of processes. -- ``smp.mp_rank()``: The rank of the process among the processes that - hold the current model replica. -- ``smp.dp_rank()``: The rank of the process among the processes that - hold different replicas of the same model partition. -- ``smp.dp_size()``: The total number of model replicas. -- ``smp.local_rank()``: The rank among the processes on the current - instance. -- ``smp.local_size()``: The total number of processes on the current - instance. -- ``smp.get_mp_group()``: The list of ranks over which the current - model replica is partitioned. -- ``smp.get_dp_group()``: The list of ranks that hold different - replicas of the same model partition. - -.. _communication_api: - :noindex: - -Communication API -^^^^^^^^^^^^^^^^^ - -The library provides a few communication primitives which can be helpful while -developing the training script. These primitives use the following -``enum`` s as arguments to specify which processes the communication -should involve. -​ - -**Helper structures** - -.. data:: smp.CommGroup - :noindex: - - An ``enum`` that takes the values - ``CommGroup.WORLD``, ``CommGroup.MP_GROUP``, and ``CommGroup.DP_GROUP``. - These values can also be accessed as ``smp.WORLD``, ``smp.MP_GROUP``, - and ``smp.DP_GROUP`` respectively. - - - ``CommGroup.WORLD``: Represents the entire group of processes used in - training - - ``CommGroup.MP_GROUP``: Represents the group of processes that hold - the same model replica as the current process. The processes in a - single ``MP_GROUP`` collectively store an entire replica of the - model. - - ``CommGroup.DP_GROUP``: Represents the group of processes that hold - the same model partition as the current process. The processes in a - single ``DP_GROUP`` perform data parallelism/allreduce among - themselves. - -.. data:: smp.RankType - :noindex: - - An ``enum`` that takes the values - ``RankType.WORLD_RANK``, ``RankType.MP_RANK``, and ``RankType.DP_RANK``. - - - ``RankType.WORLD_RANK``: The associated rank is to be interpreted as - the rank of the process across all processes used in training. - - ``RankType.MP_RANK``: The associated rank is to be interpreted as the - rank of the process within the ``MP_GROUP``. - - ``RankType.DP_RANK``: The associated rank is to be interpreted as the - rank of the process within the ``DP_GROUP``. - - -**Communication primitives:** - -.. function:: smp.broadcast(obj, group) - :noindex: - - Sends the object to all processes in the - group. The receiving process must call ``smp.recv_from`` to receive the - sent object. - - **Inputs** - - - ``obj``: An arbitrary picklable Python object that will be broadcast. - - - ``group``: A ``CommGroup`` argument that represents to which group of - processes the object will be sent. - - **Notes** - - - When you use ``broadcast`` on the sender process, there needs - to be an accompanying ``smp.recv_from()`` call on the receiver - processes. 
- - - This is a synchronous call; the ``broadcast`` statement - returns only after all ranks participating in the call have made a - matching ``recv_from`` call. - - **Example** - - .. code:: python - - if smp.rank() == 0: -     smp.broadcast(something, group=smp.CommGroup.WORLD) - else: -     smp.recv_from(0, rank_type=smp.RankType.WORLD_RANK) - -.. function:: smp.send(obj, dest_rank, rank_type) - :noindex: - - Sends the object ``obj`` to - ``dest_rank``, which is of a type specified by ``rank_type``. - - **Inputs** - - - ``obj``: An arbitrary picklable Python object that will be sent. - - - ``dest_rank`` (``int``): An integer denoting the rank of the receiving process. - - - ``rank_type`` (``enum``): A ``smp.RankType`` ``enum`` that determines how - ``dest_rank`` is to be interpreted. For example if ``dest_rank`` is 1 - and ``rank_type`` is ``MP_RANK``, then ``obj`` is sent to process - with ``mp_rank`` 1 in the ``MP_GROUP`` which contains the current - process. - - **Notes** - - - Note: \ This is a synchronous call; the ``send`` statement returns - only after the destination rank has made a matching - ``recv_from`` call. - -.. function:: smp.recv_from(src_rank, rank_type) - :noindex: - - Receive an object from a peer process. Can be used with a matching - ``smp.send`` or a ``smp.broadcast`` call. - - **Inputs** - - - ``src_rank`` (``int``): An integer denoting rank of the sending process. - - - ``rank_type`` (``enum``): A ``smp.RankType`` ``enum`` that determines how - ``dest_rank`` is to be interpreted. For example if ``src_rank`` is 1 - and ``rank_type`` is ``MP_RANK``, then the object is received from - the process with ``mp_rank`` 1 in the ``MP_GROUP`` which contains the - current process. - - **Returns** - - Returns the python object that is sent by the peer process. - - **Notes** - - - Note: This is a synchronous call; the ``recv_from`` statement returns - only after the source rank has made a matching ``send`` or - ``broadcast`` call, and the object is received. - -.. function:: smp.allgather(obj, group) - :noindex: - - A collective call that gathers all the - submitted objects across all ranks in the specified ``group``. Returns a - list whose ``i``\ th index contains the object submitted by the - ``i``\ th rank in ``group``. - - **Inputs** - - - ``obj``: An arbitrary picklable Python object that will be - allgathered. - - - ``group`` : A ``CommGroup`` argument that represents which group of - processes participate in ``allgather``. - - **Notes** - - - Note: This is a synchronous call; the ``allgather`` statement returns - only after all ranks participating in the call have made a matching - ``allgather`` call, and all the objects are received at the current - rank. - - **Examples** - - .. code:: python - - # assuming mp_size() == 2 - - if smp.mp_rank() == 0: -     out = smp.allgather(obj1, smp.CommGroup.MP_GROUP)  # returns [obj1, obj2] - else: -     out = smp.allgather(obj2, smp.CommGroup.MP_GROUP)  # returns [obj1, obj2] - -.. function:: smp.barrier(group=smp.WORLD) - :noindex: - - A statement that hangs until all - processes in the specified group reach the barrier statement, similar to - ``MPI_Barrier()``. - - **Inputs** - - - ``group``: An ``smp.CommGroup`` ``enum`` that specifies the group of - processes participating in the barrier call. Defaults to - ``smp.WORLD``. - - **Examples** - - - Assume there are 8 processes and 2 model partitions, and - therefore 4 \ ``mp_group``\ s, and 2 ``dp_group``\ s. 
If - the \ ``barrier`` call is passed the value ``smp.MP_GROUP`` for its - group argument, then each process only waits until the other process - of its own ``mp_group`` reaches that point. It does not wait for - processes outside that ``mp_group``. - -.. function:: smp.dp_barrier() - :noindex: - - Same as passing ``smp.DP_GROUP``\ to ``smp.barrier()``. - Waits for the processes in the same \ ``dp_group`` as - the current process to reach the same point in execution. - -.. function:: smp.mp_barrier() - :noindex: - - Same as passing ``smp.MP_GROUP`` to - ``smp.barrier()``. Waits for the processes in the same ``mp_group`` as - the current process to reach the same point in execution. diff --git a/doc/api/training/smp_versions/v1.2.0/smd_model_parallel_pytorch.rst b/doc/api/training/smp_versions/v1.2.0/smd_model_parallel_pytorch.rst deleted file mode 100644 index 7e09d64262..0000000000 --- a/doc/api/training/smp_versions/v1.2.0/smd_model_parallel_pytorch.rst +++ /dev/null @@ -1,553 +0,0 @@ -.. admonition:: Contents - - - :ref:`pytorch_saving_loading` - - :ref:`pytorch_saving_loading_instructions` - -PyTorch API -=========== - -**Supported versions: 1.7.1, 1.6.0** - -This API document assumes you use the following import statements in your training scripts. - -.. code:: python - - import smdistributed.modelparallel.torch as smp - - -.. tip:: - - Refer to - `Modify a PyTorch Training Script - `_ - to learn how to use the following API in your PyTorch training script. - -.. class:: smp.DistributedModel - :noindex: - - A sub-class of ``torch.nn.Module`` which specifies the model to be - partitioned. Accepts a ``torch.nn.Module`` object ``module`` which is - the model to be partitioned. The returned ``DistributedModel`` object - internally manages model parallelism and data parallelism. Only one - model in the training script can be wrapped with - ``smp.DistributedModel``. - - **Example:** - - .. code:: python - - model = smp.DistributedModel(model) - - **Important**: The ``__call__`` and  ``backward`` method calls on the - ``smp.DistributedModel`` object (in the following example, the object - is \ ``model``) can only be made inside a ``smp.step``-decorated - function. - - - Since ``DistributedModel``  is a ``torch.nn.Module``, a forward pass can - be performed by calling the \ ``DistributedModel`` object on the input - tensors. - - .. code:: python - - predictions = model(inputs)   # model is a smp.DistributedModel object - - For a backward pass, one needs to call the backward function on - the \ ``DistributedModel`` object, with tensors and gradients as - arguments, replacing the PyTorch operations \ ``torch.Tensor.backward`` - or ``torch.autograd.backward``. - - - The API for ``model.backward`` is very similar to - ``torch.autograd.backward``. For example, the following - ``backward`` calls: - - .. code:: python - - torch.autograd.backward(loss) or loss.backward() - - should be replaced with: - - .. code:: python - - model.backward(loss) # loss is a tensor with only one element as its data - - Similarly, for non-scalar tensors, replace the following - ``backward`` call containing incoming gradient arguments: - - .. code:: python - - torch.autograd.backward(outputs, out_grads) - - with the following line: - - .. code:: python - - model.backward(outputs, out_grads) - - In these examples, all ``__call__``  and ``backward`` method calls on - the model objects (``model(inputs)`` and ``model.backward(loss)``) must be made inside - a ``smp.step``-decorated function. 
- - **Using DDP** - - If DDP is enabled, do not not place a PyTorch - ``DistributedDataParallel`` wrapper around the ``DistributedModel`` because - the ``DistributedModel`` wrapper will also handle data parallelism. - - Unlike the original DDP wrapper, when you use ``DistributedModel``, - model parameters and buffers are not immediately broadcast across - processes when the wrapper is called. Instead, the broadcast is deferred to the first call of the - ``smp.step``-decorated function when the partition is done. - - **Parameters** - - - ``module`` (``torch.nn.Module``): Module to be distributed (data parallelism and model parallelism). - - - ``trace_device`` (``"cpu"`` or ``"gpu"``) (default: ``"gpu"``) - Whether to perform the tracing step on the GPU or CPU. The tracing step gathers - information on the order of execution of modules, the shapes of - intermediate outputs, and execution times, to be used by the - partitioning algorithm. If ``trace_device`` is set to GPU, accurate - module execution times can be gathered during tracing for potentially - improved partitioning decision. However, if the model is too large to - fit in a single GPU, then ``trace_device`` should be set to CPU. - - - ``trace_execution_times`` (``bool``) (default: ``False``): If ``True``, - the library profiles the execution time of each module during tracing, and uses - it in the partitioning decision. This improves the partitioning - decision, but it might make the tracing slower. It may also introduce - some degree of non-determinism in partitioning results, because of the - inherent randomness in module execution times. Must be ``False`` if - ``trace_device`` is ``"cpu"``. - - - ``overlapping_allreduce`` (``bool``) (default: ``True``): This is only - applicable for hybrid data parallelism/model parallelism use cases (when - ``ddp`` is set to ``True`` while launching training). The library uses this flag - to decide whether to do overlapping allreduce whenever a parameter - gradients are ready. This leads to overlapping of communication and - computation and can improve performance. If this is set to ``False`` , - allreduce is performed at the end of the step. - - - ``backward_passes_per_step`` (``int``) (default: 1): This is only - applicable for hybrid data parallelism/model parallelism use cases (when - ``ddp`` is set to ``True`` in config). This parameter indicates the - number of backward passes to perform before calling allreduce on DDP. - This allows accumulating updates over multiple mini-batches before - reducing and applying them. - - - ``average_grads_across_microbatches`` (``bool``) (default: ``True``): - Whether or not the computed gradients should be averaged across - microbatches. If ``False``, the computed gradients will be summed across - microbatches, but not divided by the number of microbatches. In typical - use case where the computed loss is averaged over the mini-batch, this - should be left as ``True``. If you use a loss function that only sums - the per-sample loss across the batch (and not divide by the batch size), - then this must be set to ``False`` for correctness. - - - ``bucket_cap_mb`` (default: 25): \ ``DistributedDataParallel`` buckets - parameters into multiple buckets so that gradient reduction of each - bucket can potentially overlap with backward - computation. \ ``bucket_cap_mb``\ controls the bucket size in MegaBytes - (MB). - - - ``trace_memory_usage`` (default: False): When set to True, the library attempts - to measure memory usage per module during tracing. 
-     If this is disabled, memory usage will be estimated through the sizes of
-     tensors returned from the module.
-
-   - ``broadcast_buffers`` (default: True): Flag to be used with ``ddp=True``.
-     This parameter is forwarded to the underlying ``DistributedDataParallel`` wrapper.
-     Please see: `broadcast_buffer `__.
-
-   - ``gradient_as_bucket_view (PyTorch 1.7.1 only)`` (default: False): To be
-     used with ``ddp=True``. This parameter is forwarded to the underlying
-     ``DistributedDataParallel`` wrapper. Please see `gradient_as_bucket_view `__.
-
-   **Properties**
-
-   - ``partitioned``: Is ``True`` if the model is partitioned, ``False``
-     otherwise. Initialized to ``False`` when ``DistributedModel`` is first
-     created. It becomes ``True`` during the first call to the
-     ``smp.step``-decorated function. Once the model is partitioned, the
-     local parameters or local ``state_dict`` can be fetched using the
-     following methods.
-
-   **Methods**
-
-   .. function:: backward(tensors, grad_tensors)
-      :noindex:
-
-      Triggers a distributed backward pass across model partitions. Example
-      usage is provided in the previous section. The API is very similar
-      to https://pytorch.org/docs/stable/autograd.html#torch.autograd.backward.
-      ``retain_grad`` and ``create_graph`` flags are not supported.
-
-   .. function:: local_buffers( )
-      :noindex:
-
-      Returns an iterator over buffers for the modules in
-      the partitioned model that have been assigned to the current process.
-
-   .. function:: local_named_buffers( )
-      :noindex:
-
-      Returns an iterator over buffers for the
-      modules in the partitioned model that have been assigned to the current
-      process. This yields both the name of the buffer as well as the buffer
-      itself.
-
-   .. function:: local_parameters( )
-      :noindex:
-
-      Returns an iterator over parameters for the
-      modules in the partitioned model that have been assigned to the current
-      process.
-
-   .. function:: local_named_parameters( )
-      :noindex:
-
-      Returns an iterator over parameters for
-      the modules in the partitioned model that have been assigned to the
-      current process. This yields both the name of the parameter as well as
-      the parameter itself.
-
-   .. function:: local_modules( )
-      :noindex:
-
-      Returns an iterator over the modules in the
-      partitioned model that have been assigned to the current process.
-
-   .. function:: local_named_modules( )
-      :noindex:
-
-      Returns an iterator over the modules in the
-      partitioned model that have been assigned to the current process. This
-      yields both the name of the module as well as the module itself.
-
-   .. function:: local_state_dict( )
-      :noindex:
-
-      Returns the ``state_dict`` that contains local
-      parameters that belong to the current \ ``mp_rank``. This ``state_dict``
-      contains a key \ ``_smp_is_partial`` to indicate this is a
-      partial \ ``state_dict``, which indicates whether the
-      ``state_dict`` contains elements corresponding to only the current
-      partition, or to the entire model.
-
-   .. function:: state_dict( )
-      :noindex:
-
-      Returns the ``state_dict`` that contains parameters
-      for the entire model. It first collects the \ ``local_state_dict`` and
-      gathers and merges the \ ``local_state_dict`` from all ``mp_rank``\ s to
-      create a full ``state_dict``. Please note that this needs to be called on all ranks with
-      ``dp_rank()==0`` to ensure the gather happens properly.
-      If it is not called on all such ranks, it can hang.
-
-   .. 
function:: load_state_dict( ) - :noindex: - - Same as the ``torch.module.load_state_dict()`` , - except: It first gathers and merges the ``state_dict``\ s across - ``mp_rank``\ s, if they are partial. The actual loading happens after the - model partition so that each rank knows its local parameters. - - .. function:: register_post_partition_hook(hook) - :noindex: - - Registers a callable ``hook`` to - be executed after the model is partitioned. This is useful in situations - where an operation needs to be executed after the model partition during - the first call to ``smp.step``, but before the actual execution of the - first forward pass. Returns a ``RemovableHandle`` object ``handle``, - which can be used to remove the hook by calling ``handle.remove()``. - - .. function:: cpu( ) - :noindex: - - Allgathers parameters and buffers across all ``mp_rank``\ s and moves them - to the CPU. - - .. function:: join( ) - :noindex: - - **Available for PyTorch 1.7.1 only** - - A context manager to be used in conjunction with an instance of - ``smp.DistributedModel`` to be able to train with uneven inputs across - participating processes. This is only supported when ``ddp=True`` for - ``smp.DistributedModel``. This will use the join with the wrapped - ``DistributedDataParallel`` instance. For more information, see: - `join `__ - in the PyTorch documentation. - - -.. class:: smp.DistributedOptimizer - :noindex: - - **Parameters** - - ``optimizer`` - - An optimizer wrapper for saving/loading optimizer states. This wrapper - returns ``optimizer`` with the following methods overridden: - - .. function:: state_dict( ) - :noindex: - - Returns the ``state_dict`` that contains optimizer state for the entire model. - It first collects the ``local_state_dict`` and gathers and merges - the ``local_state_dict`` from all ``mp_rank``s to create a full - ``state_dict``. - - .. function:: load_state_dict( ) - :noindex: - - Same as the ``torch.optimizer.load_state_dict()`` , except: - - - It first gathers and merges the local ``state_dict``\ s if they are - partial. - - The actual loading happens after the model partition so that each - rank knows its local parameters. - - .. function:: local_state_dict( ) - :noindex: - - Returns the ``state_dict`` that contains the - local optimizer state that belongs to the current \ ``mp_rank``. This - ``state_dict`` contains a key \ ``_smp_is_partial`` to indicate this is - a partial \ ``state_dict``, which indicates whether the - ``state_dict`` contains elements corresponding to only the current - partition, or to the entire model. - - ​ -.. function:: smp.partition(index) - :noindex: - - **Inputs** - - - ``index`` (int) - The index of the partition. - - A context manager which places all modules defined inside into the - partition with ID ``index``.  The ``index`` argument must be less than - the number of partitions. - - Use ``smp.partition`` to implement manual partitioning. - If ``"auto_partition"`` is ``True``, then the - ``smp.partition`` contexts are ignored. Any module that is not placed in - any ``smp.partition`` context is placed in the - ``default_partition`` defined through the SageMaker Python SDK. - - When ``smp.partition`` contexts are nested, the innermost context - overrides the rest (see the following example). In PyTorch, manual - partitioning should be done inside the module \ ``__init__``, and the - partition assignment applies to the modules that are *created* inside - the ``smp.partition`` context. - - Example: - - .. 
code:: python - - class Model(torch.nn.Module): -     def __init__(self): -         with smp.partition(1): -             self.child0 = Child0()            # child0 on partition 1 -             with smp.partition(2): -                 self.child1 = Child1()        # child1 on partition 2 -             self.child2 = Child2()            # child2 on partition 1 -         self.child3 = Child3()                # child3 on default_partition - -.. function:: smp.get_world_process_group( ) - :noindex: - - Returns a ``torch.distributed`` ``ProcessGroup`` that consists of all - processes, which can be used with the ``torch.distributed`` API. - Requires ``"ddp": True`` in SageMaker Python SDK parameters. - -.. function:: smp.get_mp_process_group( ) - :noindex: - - Returns a ``torch.distributed`` ``ProcessGroup`` that consists of the - processes in the ``MP_GROUP`` which contains the current process, which - can be used with the \ ``torch.distributed`` API. Requires - ``"ddp": True`` in SageMaker Python SDK parameters. - -.. function:: smp.get_dp_process_group( ) - :noindex: - - Returns a ``torch.distributed`` ``ProcessGroup`` that consists of the - processes in the ``DP_GROUP`` which contains the current process, which - can be used with the \ ``torch.distributed`` API. Requires - ``"ddp": True`` in SageMaker Python SDK parameters. - -.. function:: smp.is_initialized( ) - :noindex: - - Returns ``True`` if ``smp.init`` has already been called for the - process, and ``False`` otherwise. - -.. function::smp.is_tracing( ) - :noindex: - - Returns ``True`` if the current process is running the tracing step, and - ``False`` otherwise. - -.. data:: smp.nn.FusedLayerNorm - :noindex: - - `Apex Fused Layer Norm `__ is currently not - supported by the library. ``smp.nn.FusedLayerNorm`` replaces ``apex`` - ``FusedLayerNorm`` and provides the same functionality. This requires - ``apex`` to be installed on the system. - -.. data:: smp.optimizers.FusedNovoGrad - :noindex: - - `Fused Novo Grad optimizer `__ is - currently not supported by the library. ``smp.optimizers.FusedNovoGrad`` replaces ``apex`` ``FusedNovoGrad`` - optimizer and provides the same functionality. This requires ``apex`` to - be installed on the system. - -.. data:: smp.optimizers.FusedLamb - :noindex: - - `FusedLamb optimizer `__ - currently doesn’t work with the library. ``smp.optimizers.FusedLamb`` replaces - ``apex`` ``FusedLamb`` optimizer and provides the same functionality. - This requires ``apex`` to be installed on the system. - -.. data:: smp.amp.GradScaler - :noindex: - - `Torch AMP Gradscaler `__ - currently doesn’t work with the library. ``smp.amp.GradScaler`` replaces - ``torch.amp.GradScaler`` and provides the same functionality. - -.. _pytorch_saving_loading: - :noindex: - -APIs for Saving and Loading -^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -.. function:: smp.save( ) - :noindex: - - Saves an object. This operation is similar to ``torch.save()``, except - it has an additional keyword argument, ``partial``, and accepts only - string type for the argument ``f`` (file). If ``partial=True``, each - ``mp_rank`` saves a separate checkpoint file and the library adds an ``mp_rank`` - index to your saved file. - - **Parameters** - - - ``obj`` (dict): A saved object. - - ``f`` (str): A string containing a file name. - - ``partial`` (bool, default= ``True``):  When set to ``True``, each - ``mp_rank`` saves a separate checkpoint file and the library adds an - ``mp_rank`` index to the saved file. 
-     If you want to be able to load
-     and further train a model that you save with ``smp.save()``, you must
-     set ``partial=True``.
-   - ``pickle_module`` (picklemodule, default = module ``"pickle"`` from ``"/opt/conda/lib/python3.6/pickle.py"``):
-     A module used for pickling metadata and objects.
-   - ``pickle_protocol`` (int, default=2): Can be specified to
-     override the default protocol.
-
-.. function:: smp.load( )
-   :noindex:
-
-   Loads an object saved with ``smp.save()`` from a file.
-
-   Similar to `torch.load() `__,
-   except it has an additional keyword argument, ``partial``, and accepts
-   only string type for the argument ``f`` (file). If \ ``partial=True``,
-   then each ``mp_rank`` loads a separate checkpoint file.
-
-   **Parameters**
-
-   - ``f`` (string): A string containing a file name.
-   - ``map_location`` (function): A function
-     `torch.device `__,
-     a string, or a dict specifying how to remap storage locations.
-   - ``pickle_module`` (pickle module): A module used for unpickling
-     metadata and objects (has to match the \ ``pickle_module``\ used to
-     serialize file).
-   - ``pickle_load_args`` (Python 3 only): Optional keyword arguments
-     passed to ``pickle_module.load()`` and ``pickle_module.Unpickler()``.
-   - ``partial`` (bool, default= ``True``): When set to ``True``, each
-     ``mp_rank`` loads the checkpoint corresponding to the ``mp_rank``.
-     Should be used when loading a model trained with the library.
-
-.. _pytorch_saving_loading_instructions:
-   :noindex:
-
-General Instruction For Saving and Loading
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-The library can save partial or full checkpoints.
-
-- For partial checkpoints, each ``mp_rank`` saves its own checkpoint
-  file with only the parameters that belong to that rank.
-- For full checkpoints, the library saves a single checkpoint that contains
-  entire model parameters.
-
-When **saving** using ``smp.save()``, each rank only holds its own
-parameters. If you want to save the full model, there will be some
-communication between the ranks to create the full model. If you save
-checkpoints often, you should save partial checkpoints for best
-performance.
-
-When **loading** using ``smp.load()``, the library can load either partial
-or full checkpoints, or full checkpoints saved by a non-model-parallel model. If you
-want to resume training with a non-model-parallel model or do inference, you need
-a full checkpoint.
-
-The following is an example of how you can save and load a checkpoint:
-
-.. code:: python
-
-   # Original model and optimizer
-   model = MyModel(...)
-   optimizer = MyOpt(...)
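-   # NOTE: MyModel and MyOpt above are hypothetical placeholders for a
-   # user-defined torch.nn.Module subclass and a torch.optim optimizer;
-   # they are not part of the library API.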
- - # model parallel wrapper - model = smp.DistributedModel(model) - optimizer = smp.DistributedOptimizer(optimizer) - - # To save, always save on dp_rank 0 to avoid data racing - if partial: -     # To save the partial model on each mp rank -     # the library will create `checkpoint.pt_{mprank}` for each mp rank -     if save_partial_model: -         if smp.dp_rank() == 0: -             model_dict = model.local_state_dict() # save the partial model -             opt_dict = optimizer.local_state_dict() # save the partial optimizer state -             smp.save( -                 {"model_state_dict": model_dict, "optimizer_state_dict": opt_dict}, -                 f"/checkpoint.pt", -                 partial=True, -             ) - -     # To save the full model -     if save_full_model: -         if smp.dp_rank() == 0: -             model_dict = model.state_dict() # save the full model -             opt_dict = optimizer.state_dict() # save the full optimizer state -             smp.save( -                 {"model_state_dict": model_dict, "optimizer_state_dict": opt_dict}, -                 "/checkpoint.pt", -                 partial=False, -             ) - - # To load, load on all ranks. - # The only difference for partial/full loading is the partial flag in smp.load - # Load partial checkpoint - if partial_checkpoint: -    checkpoint = smp.load("/checkpoint.pt", partial=True) -    model.load_state_dict(checkpoint["model_state_dict"]) -    optimizer.load_state_dict(checkpoint["optimizer_state_dict"]) - # Load full checkpoint - if full_checkpoint: -    checkpoint = smp.load("/checkpoint.pt", partial=False) -    model.load_state_dict(checkpoint["model_state_dict"]) -    optimizer.load_state_dict(checkpoint["optimizer_state_dict"]) diff --git a/doc/api/training/smp_versions/v1.2.0/smd_model_parallel_tensorflow.rst b/doc/api/training/smp_versions/v1.2.0/smd_model_parallel_tensorflow.rst deleted file mode 100644 index e47d313a4c..0000000000 --- a/doc/api/training/smp_versions/v1.2.0/smd_model_parallel_tensorflow.rst +++ /dev/null @@ -1,164 +0,0 @@ -TensorFlow API -============== - -**Supported version: 2.4.1, 2.3.1** - -**Important**: This API document assumes you use the following import statement in your training scripts. - -.. code:: python - - import smdistributed.modelparallel.tensorflow as smp - -.. tip:: - - Refer to - `Modify a TensorFlow Training Script - `_ - to learn how to use the following API in your TensorFlow training script. - -.. class:: smp.DistributedModel - :noindex: - - A sub-class of the Keras \ ``Model`` class, which defines the model to - be partitioned. Model definition is done by sub-classing - ``smp.DistributedModel`` class, and implementing the ``call()`` method, - in the same way as the Keras model sub-classing API. Any operation that - is part of the \ ``smp.DistributedModel.call()`` method is subject to - partitioning, meaning that every operation placed inside executes in - exactly one of the devices (the operations outside run on all devices). - - - Similar to the regular Keras API, the forward pass is done by directly - calling the model object on the input tensors. For example: - - .. code:: python - - predictions = model(inputs)   # model is a smp.DistributedModel object - - However, ``model()`` calls can only be made inside a - ``smp.step``-decorated function. - - The outputs from a ``smp.DistributedModel`` are available in all ranks, - regardless of which rank computed the last operation. - - **Methods:** - - .. 
function:: save_model(save_path="/opt/ml/model") - :noindex: - - **Inputs** - - ``save_path`` (``string``): A path to save an unpartitioned model with latest training weights. - - Saves the entire, - unpartitioned model with the latest trained weights to ``save_path`` in - TensorFlow ``SavedModel`` format. Defaults to ``"/opt/ml/model"``, which - SageMaker monitors to upload the model artifacts to Amazon S3. - -.. function:: smp.partition(index) - :noindex: - - **Inputs** - - - ``index`` (``int``): The index of the partition. - - A context manager which places all operations defined inside into the - partition whose ID is equal to ``index``. When - ``smp.partition`` contexts are nested, the innermost context overrides - the rest. The ``index`` argument must be smaller than the number of - partitions. - - ``smp.partition`` is used in the manual partitioning API; - if \ ``"auto_partition"`` parameter is set to ``True`` while launching - training, then ``smp.partition`` contexts are ignored. Any operation - that is not placed in any ``smp.partition`` context is placed in the - ``default_partition``, as shown in the following example: - - .. code:: python - - # auto_partition: False - # default_partition: 0 - smp.init() - [...] - x = tf.constant(1.2)                     # placed in partition 0 - with smp.partition(1): -     y = tf.add(x, tf.constant(2.3))      # placed in partition 1 -     with smp.partition(3): -         z = tf.reduce_sum(y)             # placed in partition 3 - - ​ - -.. class:: smp.CheckpointManager - :noindex: - - A subclass of TensorFlow - `CheckpointManager `__, - which is used to manage checkpoints. The usage is similar to TensorFlow - ``CheckpointManager``. - - The following returns a ``CheckpointManager`` object. - - .. code:: python - - smp.CheckpointManager(checkpoint, -                       directory="/opt/ml/checkpoints", -                       max_to_keep=None, -                       checkpoint_name="ckpt") - - - **Important:** ``smp.CheckpointManager.restore()`` must be called after - the first training step. This is because the first call of the - ``smp.step`` function constructs and partitions the model, which must - take place before the checkpoint restore. Calling it before the first - ``smp.step`` call might result in hangs or unexpected behavior. - - **Parameters** - - - ``checkpoint``: A `tf.train.Checkpoint - `__ instance - that represents a model checkpoint. - - - ``directory``: (``str``) The path to a directory in which to write - checkpoints. A file named "checkpoint" is also written to this - directory (in a human-readable text format) which contains the state - of the ``CheckpointManager``. Defaults to - ``"/opt/ml/checkpoints"``, which is the directory that SageMaker - monitors for uploading the checkpoints to Amazon S3. - - ``max_to_keep`` (``int``): The number of checkpoints to keep. If - ``None``, all checkpoints are kept. - - ``checkpoint_name`` (``str``): Custom name for the checkpoint file. - Defaults to ``"ckpt"``. - - - **Methods:** - - .. function:: save( ) - :noindex: - - Saves a new checkpoint in the specified directory. Internally uses ``tf.train.CheckpointManager.save()``. - - .. function:: restore( ) - :noindex: - - Restores the latest checkpoint in the specified directory. - Internally uses ``tf.train.CheckpointManager.restore()``. - - - **Examples:** - - .. 
code:: python - - checkpoint = tf.train.Checkpoint(optimizer=optimizer, model=model) - ckpt_manager = smp.CheckpointManager(checkpoint, max_to_keep=5)  # use /opt/ml/checkpoints - - for inputs in train_ds: -     loss = train_step(inputs) -     # [...] -     ckpt_manager.save()  # save a new checkpoint in /opt/ml/checkpoints - - .. code:: python - - for step, inputs in enumerate(train_ds): -     if step == 1:                    # NOTE: restore occurs on the second step -         ckpt_manager.restore() -     loss = train_step(inputs) - diff --git a/doc/api/training/smp_versions/v1.3.0/add_smd_version.sh b/doc/api/training/smp_versions/v1.3.0/add_smd_version.sh deleted file mode 100755 index 92d99ca43c..0000000000 --- a/doc/api/training/smp_versions/v1.3.0/add_smd_version.sh +++ /dev/null @@ -1,10 +0,0 @@ -#!/usr/bin/env python -# add_no_index2.py -import fileinput -import sys - -for line in fileinput.input(inplace=True): - if '.. class::' in line or '.. function::' in line or '.. data::' in line or '.. _' in line: - sys.stdout.write(line + ' :noindex:\n') - else: - sys.stdout.write(line) diff --git a/doc/api/training/smp_versions/v1.3.0/smd_model_parallel_common_api.rst b/doc/api/training/smp_versions/v1.3.0/smd_model_parallel_common_api.rst deleted file mode 100644 index 625a7fcbf1..0000000000 --- a/doc/api/training/smp_versions/v1.3.0/smd_model_parallel_common_api.rst +++ /dev/null @@ -1,488 +0,0 @@ -.. admonition:: Contents - - - :ref:`communication_api` - - :ref:`mpi_basics` - -Common API -========== - -The following SageMaker distribute model parallel APIs are common across all frameworks. - -**Important**: This API document assumes you use the following import statement in your training scripts. - -**TensorFlow** - -.. code:: python - - import smdistributed.modelparallel.tensorflow as smp - -**PyTorch** - -.. code:: python - - import smdistributed.modelparallel.torch as smp - - -.. function:: smp.init( ) - :noindex: - - Initialize the library. Must be called at the beginning of training script. - -.. function:: @smp.step(non_split_inputs, input_split_axes, [*args, **kwargs]) - :noindex: - - A decorator that must be placed over a function that represents a single - forward and backward pass (for training use cases), or a single forward - pass (for evaluation use cases). Any computation that is defined inside - the ``smp.step``-decorated function is executed in a pipelined manner. - - By default, every tensor input to the function is split across its batch - dimension into a number of microbatches specified while launching the - training job. This behavior can be customized through the arguments to - ``smp.step``, described below. The library then orchestrates the execution of - each microbatch across all partitions, based on the chosen pipeline - type. - - In a typical use case, forward pass and back-propagation are executed - inside an \ ``smp.step``-decorated function and gradients, loss, and - other relevant metrics (such as accuracy, etc.) are returned from - ``smp.step``-decorated function. - - Any gradient post-processing operation, such as gradient clipping and - allreduce, as well as ``optimizer.apply_gradients`` calls (for TF) or - ``optimizer.step`` (for PT) should be applied on the gradients returned - from the ``smp.step`` function, and not inside the ``smp.step`` - function. 
This is because every operation inside ``smp.step`` is - executed once per microbatch, so having these operations inside - ``smp.step`` can either be inefficient (in the case of allreduce), or - lead to wrong results (in the case of ``apply_gradients`` / - ``optimizer.step``). - - If the objects returned from the ``smp.step``-decorated function contain - ``tf.Tensor``\ s / ``torch.Tensor``\ s, they are converted to - ``StepOutput`` objects. A ``StepOutput`` object encapsulates all - versions of the tensor across different microbatches - (see ``StepOutput`` entry for more information). - - The argument to ``smp.step`` decorated function should either be a tensor - or an instance of list, tuple, dict or set for it to be split across - microbatches. If your object doesn't fall into this category, you can make - the library split your object, by implementing ``smp_slice`` method. - - Below is an example of how to use it with PyTorch. - - .. code:: python - - class CustomType: - def __init__(self, tensor): - self.data = tensor - - # The library will call this to invoke slicing on the object passing in total microbatches (num_mb) - # and the current microbatch index (mb). - def smp_slice(self, num_mb, mb, axis): - dim_size = list(self.data.size())[axis] - - split_size = dim_size // num_mb - sliced_tensor = self.data.narrow(axis, mb * split_size, split_size) - return CustomType(sliced_tensor, self.other) - - custom_obj = CustomType(torch.ones(4,)) - - @smp.step() - def step(custom_obj): - loss = model(custom_obj) - model.backward(loss) - return loss - - - **Important:** ``smp.step`` splits the batch into microbatches, and - executes everything inside the decorated function once per microbatch. - This might affect the behavior of batch normalization, any operation - that explicitly uses the batch size information, or any other Python - code that is expected to run once. - - **TensorFlow-specific behavior** - - ``smp.step`` is a wrapper that - inherits from and extends the behavior of ``tf.function``, and as such, - all the caveats that apply to the use of ``tf.function``\ s also apply - to ``smp.step``. In particular, any operation that is inside - ``smp.step`` executes in graph mode, and not eager mode. - - In the first call, ``smp.step`` performs tracing of the wrapped function every time - one of the tensor arguments changes their shape or dtype, or for every - new value of a Python argument, if there is one. Tracing is expensive, - so such scenarios should be avoided as much as possible or, - alternatively, an ``input_signature`` argument must be provided. For - more information on the usage of ``tf.function``, refer to the - TensorFlow documentation: - - - https://www.tensorflow.org/api_docs/python/tf/function\ - - https://www.tensorflow.org/guide/function\ - - Each ``smp.step`` decorated function must have a return value that depends on the - output of ``smp.DistributedModel``. - - **Common parameters** - - - ``non_split_inputs`` (``list``): The list of arguments to the decorated function - that should not be split along the batch dimension. Should be used - for all input tensors that do not have a batch dimension. Should be a - list of argument names as ``str``, as they appear in the signature of - the ``smp.step``-decorated function. By default it is considered an - empty list. - - - ``input_split_axes`` (``dict``): A dict that maps the argument name to its batch - axis. 
The keys should be the argument names as ``str``, as they - appear in the signature of the ``smp.step``-decorated function.  By - default all batch axes are assumed to be the 0-axis. - - **TensorFlow-only parameters** - - - All arguments of ``tf.function``. Note: - The \ ``experimental_compile`` argument of ``tf.function`` may not - work as expected with ``smp.step``, since it interferes with - pipelining and model partitioning. To enable XLA with the library, you can - instead use \ ``tf.config.optimizer.set_jit(True)``. - - **PyTorch-only parameters** - - - ``detach_outputs`` (``bool``) : If ``True``, calls ``torch.Tensor.detach()`` on - all returned ``torch.Tensor`` outputs. Setting it to ``False`` - increases memory consumption, unless ``detach()`` is manually called - on the returned tensors, because the model graph is not cleared from - memory after the training step. Set to \ ``True`` by default. - - **Returns** - - - The same object(s) returned from the decorated function. All - returned \ ``tf.Tensor``, \ ``tf.Variable``  objects (for TF) or - ``torch.Tensor`` objects (for PT) are wrapped inside - a \ ``StepOutput`` object, even when they are inside a Python - ``list``, ``tuple``, or ``dict``. - - - -.. class:: StepOutput - :noindex: - - - A class that encapsulates all versions of a ``tf.Tensor`` - or \ ``torch.Tensor`` across all microbatches. - - When a particular ``tf.Tensor`` or ``torch.Tensor`` is computed inside - ``smp.step``, different versions of the tensor are computed for each - microbatch. - - When this tensor is returned from ``smp.step`` and is accessed outside - of the decorated function, it appears as a ``StepOutput`` object, which - contains all such versions. For example, - - - In the case of Tensorflow, the gradient for a particular - ``tf.Variable`` is computed on each microbatch individually, and if - this gradient is returned from ``smp.step``, all gradients for this - ``tf.Variable`` become part of the same ``StepOutput`` object. The - ``StepOutput`` class offers the following API for commonly-used - post-processing operations on such tensors. - - In the case of PyTorch, the loss for each microbatch is computed - individually and all the ``torch.Tensor``\ s that represent the loss - for different microbatches become part of same ``StepOutput`` object, - if loss is returned from the ``smp.step`` function. - - - The ``StepOutput`` class offers the following API for commonly-used - post-processing operations on tensors. - - .. data:: StepOutput.outputs - :noindex: - - Returns a list of the underlying tensors, indexed by microbatch. - - .. function:: StepOutput.reduce_mean( ) - :noindex: - - Returns a ``tf.Tensor``, ``torch.Tensor`` that averages the constituent ``tf.Tensor`` s - ``torch.Tensor`` s. This is commonly used for averaging loss and gradients across microbatches. - - .. function:: StepOutput.reduce_sum( ) - :noindex: - - Returns a ``tf.Tensor`` / - ``torch.Tensor`` that sums the constituent - ``tf.Tensor``\ s/\ ``torch.Tensor``\ s. - - .. function:: StepOutput.concat( ) - :noindex: - - Returns a - ``tf.Tensor``/``torch.Tensor`` that concatenates tensors along the - batch dimension using ``tf.concat`` / ``torch.cat``. - - .. function:: StepOutput.stack( ) - :noindex: - - Applies ``tf.stack`` / ``torch.stack`` - operation to the list of constituent ``tf.Tensor``\ s / - ``torch.Tensor``\ s. - - **TensorFlow-only methods** - - .. 
function:: StepOutput.merge( ) - :noindex: - - Returns a ``tf.Tensor`` that - concatenates the constituent ``tf.Tensor``\ s along the batch - dimension. This is commonly used for merging the model predictions - across microbatches. - - .. function:: StepOutput.accumulate(method="variable", var=None) - :noindex: - - Functionally the same as ``StepOutput.reduce_mean()``. However, it is - more memory-efficient, especially for large numbers of microbatches, - since it does not wait for all constituent \ ``tf.Tensor``\ s to be - ready to start averaging them, thereby saving memory. - - In some cases (XLA for example) ``StepOutput.reduce_mean()`` might end - up being more memory-efficient than ``StepOutput.accumulate()``. - - **Parameters** - - - ``method`` (``"add_n"`` or ``"accumulate_n"`` or ``"variable"``): - If ``"add_n"`` or ``"accumulate_n"``, the library uses - ``tf.add_n`` and ``tf.accumulate_n``, respectively, to implement - accumulation. If ``"variable"``, the library uses an internal ``tf.Variable`` - into which to accumulate the tensors. Default is \ ``"variable"``. - Note: Memory usage behavior of these choices can depend on the model - and implementation. - - - ``var``: A ``tf.Variable`` into which, if provided, the library uses to - accumulate the tensors. If \ ``None``, the library internally creates a - variable. If ``method`` is not ``"variable"``, this argument is - ignored. - -.. _mpi_basics: - :noindex: - -MPI Basics -^^^^^^^^^^ - -The library exposes the following basic MPI primitives to its Python API: - -- ``smp.rank()``: The rank of the current process. -- ``smp.size()``: The total number of processes. -- ``smp.mp_rank()``: The rank of the process among the processes that - hold the current model replica. -- ``smp.dp_rank()``: The rank of the process among the processes that - hold different replicas of the same model partition. -- ``smp.dp_size()``: The total number of model replicas. -- ``smp.local_rank()``: The rank among the processes on the current - instance. -- ``smp.local_size()``: The total number of processes on the current - instance. -- ``smp.get_mp_group()``: The list of ranks over which the current - model replica is partitioned. -- ``smp.get_dp_group()``: The list of ranks that hold different - replicas of the same model partition. - - .. _communication_api: - :noindex: - -Communication API -^^^^^^^^^^^^^^^^^ - -The library provides a few communication primitives which can be helpful while -developing the training script. These primitives use the following -``enum`` s as arguments to specify which processes the communication -should involve. -​ - -**Helper structures** - -.. data:: smp.CommGroup - :noindex: - - An ``enum`` that takes the values - ``CommGroup.WORLD``, ``CommGroup.MP_GROUP``, and ``CommGroup.DP_GROUP``. - These values can also be accessed as ``smp.WORLD``, ``smp.MP_GROUP``, - and ``smp.DP_GROUP`` respectively. - - - ``CommGroup.WORLD``: Represents the entire group of processes used in - training - - ``CommGroup.MP_GROUP``: Represents the group of processes that hold - the same model replica as the current process. The processes in a - single ``MP_GROUP`` collectively store an entire replica of the - model. - - ``CommGroup.DP_GROUP``: Represents the group of processes that hold - the same model partition as the current process. The processes in a - single ``DP_GROUP`` perform data parallelism/allreduce among - themselves. - -.. 
data:: smp.RankType - :noindex: - - An ``enum`` that takes the values - ``RankType.WORLD_RANK``, ``RankType.MP_RANK``, and ``RankType.DP_RANK``. - - - ``RankType.WORLD_RANK``: The associated rank is to be interpreted as - the rank of the process across all processes used in training. - - ``RankType.MP_RANK``: The associated rank is to be interpreted as the - rank of the process within the ``MP_GROUP``. - - ``RankType.DP_RANK``: The associated rank is to be interpreted as the - rank of the process within the ``DP_GROUP``. - - -**Communication primitives:** - -.. function:: smp.broadcast(obj, group) - :noindex: - - Sends the object to all processes in the - group. The receiving process must call ``smp.recv_from`` to receive the - sent object. - - **Inputs** - - - ``obj``: An arbitrary picklable Python object that will be broadcast. - - - ``group``: A ``CommGroup`` argument that represents to which group of - processes the object will be sent. - - **Notes** - - - When you use ``broadcast`` on the sender process, there needs - to be an accompanying ``smp.recv_from()`` call on the receiver - processes. - - - This is a synchronous call; the ``broadcast`` statement - returns only after all ranks participating in the call have made a - matching ``recv_from`` call. - - **Example** - - .. code:: python - - if smp.rank() == 0: -     smp.broadcast(something, group=smp.CommGroup.WORLD) - else: -     smp.recv_from(0, rank_type=smp.RankType.WORLD_RANK) - -.. function:: smp.send(obj, dest_rank, rank_type) - :noindex: - - Sends the object ``obj`` to - ``dest_rank``, which is of a type specified by ``rank_type``. - - **Inputs** - - - ``obj``: An arbitrary picklable Python object that will be sent. - - - ``dest_rank`` (``int``): An integer denoting the rank of the receiving process. - - - ``rank_type`` (``enum``): A ``smp.RankType`` ``enum`` that determines how - ``dest_rank`` is to be interpreted. For example if ``dest_rank`` is 1 - and ``rank_type`` is ``MP_RANK``, then ``obj`` is sent to process - with ``mp_rank`` 1 in the ``MP_GROUP`` which contains the current - process. - - **Notes** - - - Note: \ This is a synchronous call; the ``send`` statement returns - only after the destination rank has made a matching - ``recv_from`` call. - -.. function:: smp.recv_from(src_rank, rank_type) - :noindex: - - Receive an object from a peer process. Can be used with a matching - ``smp.send`` or a ``smp.broadcast`` call. - - **Inputs** - - - ``src_rank`` (``int``): An integer denoting rank of the sending process. - - - ``rank_type`` (``enum``): A ``smp.RankType`` ``enum`` that determines how - ``dest_rank`` is to be interpreted. For example if ``src_rank`` is 1 - and ``rank_type`` is ``MP_RANK``, then the object is received from - the process with ``mp_rank`` 1 in the ``MP_GROUP`` which contains the - current process. - - **Returns** - - Returns the python object that is sent by the peer process. - - **Notes** - - - Note: This is a synchronous call; the ``recv_from`` statement returns - only after the source rank has made a matching ``send`` or - ``broadcast`` call, and the object is received. - -.. function:: smp.allgather(obj, group) - :noindex: - - A collective call that gathers all the - submitted objects across all ranks in the specified ``group``. Returns a - list whose ``i``\ th index contains the object submitted by the - ``i``\ th rank in ``group``. - - **Inputs** - - - ``obj``: An arbitrary picklable Python object that will be - allgathered. 
- - - ``group`` : A ``CommGroup`` argument that represents which group of - processes participate in ``allgather``. - - **Notes** - - - Note: This is a synchronous call; the ``allgather`` statement returns - only after all ranks participating in the call have made a matching - ``allgather`` call, and all the objects are received at the current - rank. - - **Examples** - - .. code:: python - - # assuming mp_size() == 2 - - if smp.mp_rank() == 0: -     out = smp.allgather(obj1, smp.CommGroup.MP_GROUP)  # returns [obj1, obj2] - else: -     out = smp.allgather(obj2, smp.CommGroup.MP_GROUP)  # returns [obj1, obj2] - -.. function:: smp.barrier(group=smp.WORLD) - :noindex: - - A statement that hangs until all - processes in the specified group reach the barrier statement, similar to - ``MPI_Barrier()``. - - **Inputs** - - - ``group``: An ``smp.CommGroup`` ``enum`` that specifies the group of - processes participating in the barrier call. Defaults to - ``smp.WORLD``. - - **Examples** - - - Assume there are 8 processes and 2 model partitions, and - therefore 4 \ ``mp_group``\ s, and 2 ``dp_group``\ s. If - the \ ``barrier`` call is passed the value ``smp.MP_GROUP`` for its - group argument, then each process only waits until the other process - of its own ``mp_group`` reaches that point. It does not wait for - processes outside that ``mp_group``. - -.. function:: smp.dp_barrier() - :noindex: - - Same as passing ``smp.DP_GROUP``\ to ``smp.barrier()``. - Waits for the processes in the same \ ``dp_group`` as - the current process to reach the same point in execution. - -.. function:: smp.mp_barrier() - :noindex: - - Same as passing ``smp.MP_GROUP`` to - ``smp.barrier()``. Waits for the processes in the same ``mp_group`` as - the current process to reach the same point in execution. diff --git a/doc/api/training/smp_versions/v1.3.0/smd_model_parallel_pytorch.rst b/doc/api/training/smp_versions/v1.3.0/smd_model_parallel_pytorch.rst deleted file mode 100644 index d2fcb95954..0000000000 --- a/doc/api/training/smp_versions/v1.3.0/smd_model_parallel_pytorch.rst +++ /dev/null @@ -1,572 +0,0 @@ -.. admonition:: Contents - - - :ref:`pytorch_saving_loading` - - :ref:`pytorch_saving_loading_instructions` - -PyTorch API -=========== - -**Supported versions: 1.7.1, 1.8.1** - -This API document assumes you use the following import statements in your training scripts. - -.. code:: python - - import smdistributed.modelparallel.torch as smp - - -.. tip:: - - Refer to - `Modify a PyTorch Training Script - `_ - to learn how to use the following API in your PyTorch training script. - -.. class:: smp.DistributedModel - :noindex: - - A sub-class of ``torch.nn.Module`` which specifies the model to be - partitioned. Accepts a ``torch.nn.Module`` object ``module`` which is - the model to be partitioned. The returned ``DistributedModel`` object - internally manages model parallelism and data parallelism. Only one - model in the training script can be wrapped with - ``smp.DistributedModel``. - - **Example:** - - .. code:: python - - model = smp.DistributedModel(model) - - **Important**: The ``__call__`` and  ``backward`` method calls on the - ``smp.DistributedModel`` object (in the following example, the object - is \ ``model``) can only be made inside a ``smp.step``-decorated - function. - - - Since ``DistributedModel``  is a ``torch.nn.Module``, a forward pass can - be performed by calling the \ ``DistributedModel`` object on the input - tensors. - - .. 
code:: python - - predictions = model(inputs)   # model is a smp.DistributedModel object - - For a backward pass, one needs to call the backward function on - the \ ``DistributedModel`` object, with tensors and gradients as - arguments, replacing the PyTorch operations \ ``torch.Tensor.backward`` - or ``torch.autograd.backward``. - - - The API for ``model.backward`` is very similar to - ``torch.autograd.backward``. For example, the following - ``backward`` calls: - - .. code:: python - - torch.autograd.backward(loss) or loss.backward() - - should be replaced with: - - .. code:: python - - model.backward(loss) # loss is a tensor with only one element as its data - - Similarly, for non-scalar tensors, replace the following - ``backward`` call containing incoming gradient arguments: - - .. code:: python - - torch.autograd.backward(outputs, out_grads) - - with the following line: - - .. code:: python - - model.backward(outputs, out_grads) - - In these examples, all ``__call__``  and ``backward`` method calls on - the model objects (``model(inputs)`` and ``model.backward(loss)``) must be made inside - a ``smp.step``-decorated function. - - **Using DDP** - - If DDP is enabled, do not not place a PyTorch - ``DistributedDataParallel`` wrapper around the ``DistributedModel`` because - the ``DistributedModel`` wrapper will also handle data parallelism. - - Unlike the original DDP wrapper, when you use ``DistributedModel``, - model parameters and buffers are not immediately broadcast across - processes when the wrapper is called. Instead, the broadcast is deferred to the first call of the - ``smp.step``-decorated function when the partition is done. - - **Parameters** - - - ``module`` (``torch.nn.Module``): Module to be distributed (data parallelism and model parallelism). - - - ``trace_device`` (``"cpu"`` or ``"gpu"``) (default: ``"gpu"``) - Whether to perform the tracing step on the GPU or CPU. The tracing step gathers - information on the order of execution of modules, the shapes of - intermediate outputs, and execution times, to be used by the - partitioning algorithm. If ``trace_device`` is set to GPU, accurate - module execution times can be gathered during tracing for potentially - improved partitioning decision. However, if the model is too large to - fit in a single GPU, then ``trace_device`` should be set to CPU. - - - ``trace_execution_times`` (``bool``) (default: ``False``): If ``True``, - the library profiles the execution time of each module during tracing, and uses - it in the partitioning decision. This improves the partitioning - decision, but it might make the tracing slower. It may also introduce - some degree of non-determinism in partitioning results, because of the - inherent randomness in module execution times. Must be ``False`` if - ``trace_device`` is ``"cpu"``. - - - ``overlapping_allreduce`` (``bool``) (default: ``True``): This is only - applicable for hybrid data parallelism/model parallelism use cases (when - ``ddp`` is set to ``True`` while launching training). The library uses this flag - to decide whether to do overlapping allreduce whenever a parameter - gradients are ready. This leads to overlapping of communication and - computation and can improve performance. If this is set to ``False`` , - allreduce is performed at the end of the step. - - - ``backward_passes_per_step`` (``int``) (default: 1): This is only - applicable for hybrid data parallelism/model parallelism use cases (when - ``ddp`` is set to ``True`` in config). 
This parameter indicates the - number of backward passes to perform before calling allreduce on DDP. - This allows accumulating updates over multiple mini-batches before - reducing and applying them. - - - ``average_grads_across_microbatches`` (``bool``) (default: ``True``): - Whether or not the computed gradients should be averaged across - microbatches. If ``False``, the computed gradients will be summed across - microbatches, but not divided by the number of microbatches. In typical - use case where the computed loss is averaged over the mini-batch, this - should be left as ``True``. If you use a loss function that only sums - the per-sample loss across the batch (and not divide by the batch size), - then this must be set to ``False`` for correctness. - - - ``bucket_cap_mb`` (default: 25): \ ``DistributedDataParallel`` buckets - parameters into multiple buckets so that gradient reduction of each - bucket can potentially overlap with backward - computation. \ ``bucket_cap_mb``\ controls the bucket size in MegaBytes - (MB). - - - ``trace_memory_usage`` (default: False): When set to True, the library attempts - to measure memory usage per module during tracing. If this is disabled, - memory usage will be estimated through the sizes of tensors returned from - the module. - - - ``broadcast_buffers`` (default: True): Flag to be used with ``ddp=True``. - This parameter is forwarded to the underlying ``DistributedDataParallel`` wrapper. - Please see: `broadcast_buffer `__. - - - ``gradient_as_bucket_view`` (default: False): To be - used with ``ddp=True``. This parameter is forwarded to the underlying - ``DistributedDataParallel`` wrapper. Please see `gradient_as_bucket_view `__. - - **Properties** - - - ``partitioned``: Is ``True`` if the model is partitioned, ``False`` - otherwise. Initialized to ``False`` when ``DistributedModel`` is first - created. It becomes be ``True`` during the first call - to ``smp.step``-decorated function. Once the model is partitioned, the - local parameters or local ``state_dict`` can be fetched using the - following methods. - - **Methods** - - .. function:: backward(tensors, grad_tensors) - :noindex: - - Triggers a distributed backward - pass across model partitions. Example usage provided in the previous - section. The API is very similar - to https://pytorch.org/docs/stable/autograd.html#torch.autograd.backward. - ``retain_grad`` and ``create_graph``  flags are not supported. - - .. function:: local_buffers( ) - :noindex: - - Returns an iterator over buffers for the modules in - the partitioned model that have been assigned to the current process. - - .. function:: local_named_buffers( ) - :noindex: - - Returns an iterator over buffers for the - modules in the partitioned model that have been assigned to the current - process. This yields both the name of the buffer as well as the buffer - itself. - - .. function:: local_parameters( ) - :noindex: - - Returns an iterator over parameters for the - modules in the partitioned model that have been assigned to the current - process. - - .. function:: local_named_parameters( ) - :noindex: - - Returns an iterator over parameters for - the modules in the partitioned model that have been assigned to the - current process. This yields both the name of the parameter as well as - the parameter itself. - - .. function:: local_modules( ) - :noindex: - - Returns an iterator over the modules in the - partitioned model that have been assigned to the current process. - - .. 
function:: local_named_modules( ) - :noindex: - - Returns an iterator over the modules in the - partitioned model that have been assigned to the current process. This - yields both the name of the module as well as the module itself. - - .. function:: local_state_dict( ) - :noindex: - - Returns the ``state_dict`` that contains local - parameters that belong to the current \ ``mp_rank``. This ``state_dict`` - contains a key \ ``_smp_is_partial`` to indicate this is a - partial \ ``state_dict``, which indicates whether the - ``state_dict`` contains elements corresponding to only the current - partition, or to the entire model. - - .. function:: state_dict( ) - :noindex: - - Returns the ``state_dict`` that contains parameters - for the entire model. It first collects the \ ``local_state_dict``  and - gathers and merges the \ ``local_state_dict`` from all ``mp_rank``\ s to - create a full ``state_dict``. Please note that this needs to be called on all ranks with - ``dp_rank()==0`` to ensure the gather happens properly. - If it is only called on all such ranks, it can hang. - - .. function:: load_state_dict( ) - :noindex: - - Same as the ``torch.module.load_state_dict()`` , - except: It first gathers and merges the ``state_dict``\ s across - ``mp_rank``\ s, if they are partial. The actual loading happens after the - model partition so that each rank knows its local parameters. - - .. function:: register_post_partition_hook(hook) - :noindex: - - Registers a callable ``hook`` to - be executed after the model is partitioned. This is useful in situations - where an operation needs to be executed after the model partition during - the first call to ``smp.step``, but before the actual execution of the - first forward pass. Returns a ``RemovableHandle`` object ``handle``, - which can be used to remove the hook by calling ``handle.remove()``. - - .. function:: cpu( ) - :noindex: - - Allgathers parameters and buffers across all ``mp_rank``\ s and moves them - to the CPU. - - .. function:: join( ) - :noindex: - - A context manager to be used in conjunction with an instance of - ``smp.DistributedModel`` to be able to train with uneven inputs across - participating processes. This is only supported when ``ddp=True``. This will use the join with the wrapped - ``DistributedDataParallel`` instance. For more information, see: - `join `__ - in the PyTorch documentation. - - .. function:: register_comm_hook( state, callable ) - :noindex: - - **Available for PyTorch 1.8.1 only** - Registers a communication hook which is an enhancement that provides - a flexible hook ``callable`` to users where they can specify how - gradients are aggregated across multiple workers. This method will be called on the wrapped ``DistributedDataParallel`` instance. - - Please note that when you register a comm hook you have full control of how the gradients are processed. - When using only data parallelism with Torch DDP you are expected to average grads across data parallel replicas within the hook. - Similarly, when using DistributedModel you have to averaging grads across data parallel replicas within the hook. - In addition to that, you also have to average grads across microbatches within the hook unless you explicitly desire to not average based on your loss function. - See ``average_grads_across_microbatches`` for more information about averaging grads across microbatches. - - This is only supported when ``ddp=True`` and ``overlapping_allreduce=True`` (default). 
- For more information, see: - `register_comm_hook `__ - in the PyTorch documentation. - - - -.. class:: smp.DistributedOptimizer - :noindex: - - **Parameters** - - ``optimizer`` - - An optimizer wrapper for saving/loading optimizer states. This wrapper - returns ``optimizer`` with the following methods overridden: - - .. function:: state_dict( ) - :noindex: - - Returns the ``state_dict`` that contains optimizer state for the entire model. - It first collects the ``local_state_dict`` and gathers and merges - the ``local_state_dict`` from all ``mp_rank``s to create a full - ``state_dict``. - - .. function:: load_state_dict( ) - :noindex: - - Same as the ``torch.optimizer.load_state_dict()`` , except: - - - It first gathers and merges the local ``state_dict``\ s if they are - partial. - - The actual loading happens after the model partition so that each - rank knows its local parameters. - - .. function:: local_state_dict( ) - :noindex: - - Returns the ``state_dict`` that contains the - local optimizer state that belongs to the current \ ``mp_rank``. This - ``state_dict`` contains a key \ ``_smp_is_partial`` to indicate this is - a partial \ ``state_dict``, which indicates whether the - ``state_dict`` contains elements corresponding to only the current - partition, or to the entire model. - - ​ -.. function:: smp.partition(index) - :noindex: - - **Inputs** - - - ``index`` (int) - The index of the partition. - - A context manager which places all modules defined inside into the - partition with ID ``index``.  The ``index`` argument must be less than - the number of partitions. - - Use ``smp.partition`` to implement manual partitioning. - If ``"auto_partition"`` is ``True``, then the - ``smp.partition`` contexts are ignored. Any module that is not placed in - any ``smp.partition`` context is placed in the - ``default_partition`` defined through the SageMaker Python SDK. - - When ``smp.partition`` contexts are nested, the innermost context - overrides the rest (see the following example). In PyTorch, manual - partitioning should be done inside the module \ ``__init__``, and the - partition assignment applies to the modules that are *created* inside - the ``smp.partition`` context. - - Example: - - .. code:: python - - class Model(torch.nn.Module): -     def __init__(self): -         with smp.partition(1): -             self.child0 = Child0()            # child0 on partition 1 -             with smp.partition(2): -                 self.child1 = Child1()        # child1 on partition 2 -             self.child2 = Child2()            # child2 on partition 1 -         self.child3 = Child3()                # child3 on default_partition - -.. function:: smp.get_world_process_group( ) - :noindex: - - Returns a ``torch.distributed`` ``ProcessGroup`` that consists of all - processes, which can be used with the ``torch.distributed`` API. - Requires ``"ddp": True`` in SageMaker Python SDK parameters. - -.. function:: smp.get_mp_process_group( ) - :noindex: - - Returns a ``torch.distributed`` ``ProcessGroup`` that consists of the - processes in the ``MP_GROUP`` which contains the current process, which - can be used with the \ ``torch.distributed`` API. Requires - ``"ddp": True`` in SageMaker Python SDK parameters. - -.. function:: smp.get_dp_process_group( ) - :noindex: - - Returns a ``torch.distributed`` ``ProcessGroup`` that consists of the - processes in the ``DP_GROUP`` which contains the current process, which - can be used with the \ ``torch.distributed`` API. 
Requires - ``"ddp": True`` in SageMaker Python SDK parameters. - -.. function:: smp.is_initialized( ) - :noindex: - - Returns ``True`` if ``smp.init`` has already been called for the - process, and ``False`` otherwise. - -.. function::smp.is_tracing( ) - :noindex: - - Returns ``True`` if the current process is running the tracing step, and - ``False`` otherwise. - -.. data:: smp.nn.FusedLayerNorm - :noindex: - - `Apex Fused Layer Norm `__ is currently not - supported by the library. ``smp.nn.FusedLayerNorm`` replaces ``apex`` - ``FusedLayerNorm`` and provides the same functionality. This requires - ``apex`` to be installed on the system. - -.. data:: smp.optimizers.FusedNovoGrad - :noindex: - - - `Fused Novo Grad optimizer `__ is - currently not supported by the library. ``smp.optimizers.FusedNovoGrad`` replaces ``apex`` ``FusedNovoGrad`` - optimizer and provides the same functionality. This requires ``apex`` to - be installed on the system. - -.. data:: smp.optimizers.FusedLamb - :noindex: - - - `FusedLamb optimizer `__ - currently doesn’t work with the library. ``smp.optimizers.FusedLamb`` replaces - ``apex`` ``FusedLamb`` optimizer and provides the same functionality. - This requires ``apex`` to be installed on the system. - -.. data:: smp.amp.GradScaler - :noindex: - - `Torch AMP Gradscaler `__ - currently doesn’t work with the library. ``smp.amp.GradScaler`` replaces - ``torch.amp.GradScaler`` and provides the same functionality. - -.. _pytorch_saving_loading: - :noindex: - -APIs for Saving and Loading -^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -.. function:: smp.save( ) - :noindex: - - Saves an object. This operation is similar to ``torch.save()``, except - it has an additional keyword argument, ``partial``, and accepts only - string type for the argument ``f`` (file). If ``partial=True``, each - ``mp_rank`` saves a separate checkpoint file and the library adds an ``mp_rank`` - index to your saved file. - - **Parameters** - - - ``obj`` (dict): A saved object. - - ``f`` (str): A string containing a file name. - - ``partial`` (bool, default= ``True``):  When set to ``True``, each - ``mp_rank`` saves a separate checkpoint file and the library adds an - ``mp_rank`` index to the saved file. If you want to be able to load - and further train a model that you save with ``smp.save()``, you must - set ``partial=True``. - - ``pickle_module`` (picklemodule, default = module ``"pickle"`` from ``"/opt/conda/lib/python3.6/pickle.py"``): - A module used for pickling metadata and objects. - - ``pickle_protocol``  (int, default=2): Can be specified to - override the defaultprotocol. - -.. function:: smp.load( ) - :noindex: - - Loads an object saved with ``smp.save()`` from a file. - - Similar to, `torch.load() `__, - except it has an additional keyword argument, ``partial``, and accepts - only string type for the argument ``f`` (file). If \ ``partial=True``, - then each ``mp_rank`` loads a separate checkpoint file. - - **Parameters** - - - ``f`` (string): A string containing a file name. - - ``map_location`` (function): A function - `torch.device `__, - a string, or a dict specifying how to remap storage locations. - - ``pickle_module`` (pickle module): A module used for unpickling - metadata and objects (has to match the \ ``pickle_module``\ used to - serialize file). - - ``pickle_load_args`` (Python 3 only): Optional keyword arguments - passed to ``pickle_module.load()`` and ``pickle_module.Unpickler()``. 
- - ``partial`` (bool, default= ``True``): When set to ``True``, each - ``mp_rank`` loads the checkpoint corresponding to the ``mp_rank``. - Should be used when loading a model trained with the library. - -.. _pytorch_saving_loading_instructions: - :noindex: - -General Instruction For Saving and Loading -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -The library can save partial or full checkpoints. - -- For partial checkpoints, each ``mp_rank`` saves its own checkpoint - file with only the parameters that belong to that rank. -- For full checkpoints, the library saves a single checkpoint that contains - entire model parameters. - -When **saving** using ``smp.save()``, each rank only holds its own -parameters. If you want to save the full model, there will be some -communication between the ranks to create the full model. If you save -checkpoints often, you should save partial checkpoints for best -performance. - -When **loading** using ``smp.load()``, the library can load either partial or | -full checkpoints or full checkpoints saved by a non-model-parallel model. If you -want to resume training with a non-model-parallel model or do inference, you need -a full checkpoint. - -The following is an example of how you can save and load a checkpoint: - -.. code:: python - - # Original model and optimizer - model = MyModel(...) - optimizer = MyOpt(...) - - # model parallel wrapper - model = smp.DistributedModel(model) - optimizer = smp.DistributedOptimizer(optimizer) - - # To save, always save on dp_rank 0 to avoid data racing - if partial: -     # To save the partial model on each mp rank -     # the library will create `checkpoint.pt_{mprank}` for each mp rank -     if save_partial_model: -         if smp.dp_rank() == 0: -             model_dict = model.local_state_dict() # save the partial model -             opt_dict = optimizer.local_state_dict() # save the partial optimizer state -             smp.save( -                 {"model_state_dict": model_dict, "optimizer_state_dict": opt_dict}, -                 f"/checkpoint.pt", -                 partial=True, -             ) - -     # To save the full model -     if save_full_model: -         if smp.dp_rank() == 0: -             model_dict = model.state_dict() # save the full model -             opt_dict = optimizer.state_dict() # save the full optimizer state -             smp.save( -                 {"model_state_dict": model_dict, "optimizer_state_dict": opt_dict}, -                 "/checkpoint.pt", -                 partial=False, -             ) - - # To load, load on all ranks. 
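-   # (Unlike saving, loading is not restricted to dp_rank 0: with partial=True,
-   #  each mp_rank reads the checkpoint file that matches its own partition.)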
- # The only difference for partial/full loading is the partial flag in smp.load - # Load partial checkpoint - if partial_checkpoint: -    checkpoint = smp.load("/checkpoint.pt", partial=True) -    model.load_state_dict(checkpoint["model_state_dict"]) -    optimizer.load_state_dict(checkpoint["optimizer_state_dict"]) - # Load full checkpoint - if full_checkpoint: -    checkpoint = smp.load("/checkpoint.pt", partial=False) -    model.load_state_dict(checkpoint["model_state_dict"]) -    optimizer.load_state_dict(checkpoint["optimizer_state_dict"]) diff --git a/doc/api/training/smp_versions/v1.3.0/smd_model_parallel_tensorflow.rst b/doc/api/training/smp_versions/v1.3.0/smd_model_parallel_tensorflow.rst deleted file mode 100644 index 8dc0b56b1f..0000000000 --- a/doc/api/training/smp_versions/v1.3.0/smd_model_parallel_tensorflow.rst +++ /dev/null @@ -1,172 +0,0 @@ -TensorFlow API -============== - -**Supported version: 2.3.1, 2.4.1** - -**Important**: This API document assumes you use the following import statement in your training scripts. - -.. code:: python - - import smdistributed.modelparallel.tensorflow as smp - -.. tip:: - - Refer to - `Modify a TensorFlow Training Script - `_ - to learn how to use the following API in your TensorFlow training script. - -.. class:: smp.DistributedModel - :noindex: - - A sub-class of the Keras \ ``Model`` class, which defines the model to - be partitioned. Model definition is done by sub-classing - ``smp.DistributedModel`` class, and implementing the ``call()`` method, - in the same way as the Keras model sub-classing API. Any operation that - is part of the \ ``smp.DistributedModel.call()`` method is subject to - partitioning, meaning that every operation placed inside executes in - exactly one of the devices (the operations outside run on all devices). - - - Similar to the regular Keras API, the forward pass is done by directly - calling the model object on the input tensors. For example: - - .. code:: python - - predictions = model(inputs)   # model is a smp.DistributedModel object - - However, ``model()`` calls can only be made inside a - ``smp.step``-decorated function. - - The outputs from a ``smp.DistributedModel`` are available in all ranks, - regardless of which rank computed the last operation. - - **Methods:** - - .. function:: save_model(save_path="/opt/ml/model") - :noindex: - - **Inputs** - - ``save_path`` (``string``): A path to save an unpartitioned model with latest training weights. - - Saves the entire, - unpartitioned model with the latest trained weights to ``save_path`` in - TensorFlow ``SavedModel`` format. Defaults to ``"/opt/ml/model"``, which - SageMaker monitors to upload the model artifacts to Amazon S3. - -.. function:: smp.partition(index) - :noindex: - - **Inputs** - - - ``index`` (``int``): The index of the partition. - - A context manager which places all operations defined inside into the - partition whose ID is equal to ``index``. When - ``smp.partition`` contexts are nested, the innermost context overrides - the rest. The ``index`` argument must be smaller than the number of - partitions. - - ``smp.partition`` is used in the manual partitioning API; - if \ ``"auto_partition"`` parameter is set to ``True`` while launching - training, then ``smp.partition`` contexts are ignored. Any operation - that is not placed in any ``smp.partition`` context is placed in the - ``default_partition``, as shown in the following example: - - .. code:: python - - # auto_partition: False - # default_partition: 0 - smp.init() - [...] 
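-   # ops created below go to the innermost enclosing smp.partition context,
-   # or to default_partition when there is none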
- x = tf.constant(1.2)                     # placed in partition 0 - with smp.partition(1): -     y = tf.add(x, tf.constant(2.3))      # placed in partition 1 -     with smp.partition(3): -         z = tf.reduce_sum(y)             # placed in partition 3 - - -.. function:: register_post_partition_hook(hook) - :noindex: - - Registers a callable ``hook`` to - be executed after the model is partitioned. This is useful in situations - where an operation needs to be executed after the model partition during - the first call to ``smp.step``, but before the actual execution of the - first forward pass. - - .. code:: python - - @smp.register_post_partition_hook - def test_eager(): - # All statements here will be executed right after partition but before the first forward pass - tf.print("Entered hook through eager context") - -.. class:: smp.CheckpointManager - :noindex: - - - A subclass of TensorFlow - `CheckpointManager `__, - which is used to manage checkpoints. The usage is similar to TensorFlow - ``CheckpointManager``. - - The following returns a ``CheckpointManager`` object. - - .. code:: python - - smp.CheckpointManager(checkpoint, -                       directory="/opt/ml/checkpoints", -                       max_to_keep=None, -                       checkpoint_name="ckpt") - - **Parameters** - - - ``checkpoint``: A `tf.train.Checkpoint - `__ instance - that represents a model checkpoint. - - - ``directory``: (``str``) The path to a directory in which to write - checkpoints. A file named "checkpoint" is also written to this - directory (in a human-readable text format) which contains the state - of the ``CheckpointManager``. Defaults to - ``"/opt/ml/checkpoints"``, which is the directory that SageMaker - monitors for uploading the checkpoints to Amazon S3. - - ``max_to_keep`` (``int``): The number of checkpoints to keep. If - ``None``, all checkpoints are kept. - - ``checkpoint_name`` (``str``): Custom name for the checkpoint file. - Defaults to ``"ckpt"``. - - - **Methods:** - - .. function:: save( ) - :noindex: - - Saves a new checkpoint in the specified directory. Internally uses ``tf.train.CheckpointManager.save()``. - - .. function:: restore( ) - :noindex: - - Restores the latest checkpoint in the specified directory. - Internally uses ``tf.train.CheckpointManager.restore()``. - - - **Examples:** - - .. code:: python - - checkpoint = tf.train.Checkpoint(optimizer=optimizer, model=model) - ckpt_manager = smp.CheckpointManager(checkpoint, max_to_keep=5)  # use /opt/ml/checkpoints - - for inputs in train_ds: -     loss = train_step(inputs) -     # [...] -     ckpt_manager.save()  # save a new checkpoint in /opt/ml/checkpoints - - .. code:: python - - for step, inputs in enumerate(train_ds): -     if step == 0: -         ckpt_manager.restore() -     loss = train_step(inputs) diff --git a/doc/api/training/smp_versions/v1.4.0/smd_model_parallel_common_api.rst b/doc/api/training/smp_versions/v1.4.0/smd_model_parallel_common_api.rst deleted file mode 100644 index 625a7fcbf1..0000000000 --- a/doc/api/training/smp_versions/v1.4.0/smd_model_parallel_common_api.rst +++ /dev/null @@ -1,488 +0,0 @@ -.. admonition:: Contents - - - :ref:`communication_api` - - :ref:`mpi_basics` - -Common API -========== - -The following SageMaker distribute model parallel APIs are common across all frameworks. - -**Important**: This API document assumes you use the following import statement in your training scripts. - -**TensorFlow** - -.. 
code:: python - - import smdistributed.modelparallel.tensorflow as smp - -**PyTorch** - -.. code:: python - - import smdistributed.modelparallel.torch as smp - - -.. function:: smp.init( ) - :noindex: - - Initialize the library. Must be called at the beginning of training script. - -.. function:: @smp.step(non_split_inputs, input_split_axes, [*args, **kwargs]) - :noindex: - - A decorator that must be placed over a function that represents a single - forward and backward pass (for training use cases), or a single forward - pass (for evaluation use cases). Any computation that is defined inside - the ``smp.step``-decorated function is executed in a pipelined manner. - - By default, every tensor input to the function is split across its batch - dimension into a number of microbatches specified while launching the - training job. This behavior can be customized through the arguments to - ``smp.step``, described below. The library then orchestrates the execution of - each microbatch across all partitions, based on the chosen pipeline - type. - - In a typical use case, forward pass and back-propagation are executed - inside an \ ``smp.step``-decorated function and gradients, loss, and - other relevant metrics (such as accuracy, etc.) are returned from - ``smp.step``-decorated function. - - Any gradient post-processing operation, such as gradient clipping and - allreduce, as well as ``optimizer.apply_gradients`` calls (for TF) or - ``optimizer.step`` (for PT) should be applied on the gradients returned - from the ``smp.step`` function, and not inside the ``smp.step`` - function. This is because every operation inside ``smp.step`` is - executed once per microbatch, so having these operations inside - ``smp.step`` can either be inefficient (in the case of allreduce), or - lead to wrong results (in the case of ``apply_gradients`` / - ``optimizer.step``). - - If the objects returned from the ``smp.step``-decorated function contain - ``tf.Tensor``\ s / ``torch.Tensor``\ s, they are converted to - ``StepOutput`` objects. A ``StepOutput`` object encapsulates all - versions of the tensor across different microbatches - (see ``StepOutput`` entry for more information). - - The argument to ``smp.step`` decorated function should either be a tensor - or an instance of list, tuple, dict or set for it to be split across - microbatches. If your object doesn't fall into this category, you can make - the library split your object, by implementing ``smp_slice`` method. - - Below is an example of how to use it with PyTorch. - - .. code:: python - - class CustomType: - def __init__(self, tensor): - self.data = tensor - - # The library will call this to invoke slicing on the object passing in total microbatches (num_mb) - # and the current microbatch index (mb). - def smp_slice(self, num_mb, mb, axis): - dim_size = list(self.data.size())[axis] - - split_size = dim_size // num_mb - sliced_tensor = self.data.narrow(axis, mb * split_size, split_size) - return CustomType(sliced_tensor, self.other) - - custom_obj = CustomType(torch.ones(4,)) - - @smp.step() - def step(custom_obj): - loss = model(custom_obj) - model.backward(loss) - return loss - - - **Important:** ``smp.step`` splits the batch into microbatches, and - executes everything inside the decorated function once per microbatch. - This might affect the behavior of batch normalization, any operation - that explicitly uses the batch size information, or any other Python - code that is expected to run once. 
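-
-   The following is a minimal sketch of this pattern in PyTorch. The model,
-   loss function, optimizer, and data loader names are illustrative
-   assumptions (the model and optimizer are presumed to be wrapped with
-   ``smp.DistributedModel`` and ``smp.DistributedOptimizer``); the point is
-   that the forward and backward passes run inside the ``smp.step``-decorated
-   function, while loss averaging across microbatches and the optimizer
-   update happen outside of it.
-
-   .. code:: python
-
-      @smp.step()
-      def train_step(inputs, targets):
-          outputs = model(inputs)          # model is an smp.DistributedModel
-          loss = loss_fn(outputs, targets)
-          model.backward(loss)             # replaces loss.backward()
-          return loss
-
-      for inputs, targets in train_loader:
-          optimizer.zero_grad()
-          step_loss = train_step(inputs, targets)   # returns a StepOutput
-          loss = step_loss.reduce_mean()            # average across microbatches
-          optimizer.step()                          # apply gradients outside smp.step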
- - **TensorFlow-specific behavior** - - ``smp.step`` is a wrapper that - inherits from and extends the behavior of ``tf.function``, and as such, - all the caveats that apply to the use of ``tf.function``\ s also apply - to ``smp.step``. In particular, any operation that is inside - ``smp.step`` executes in graph mode, and not eager mode. - - In the first call, ``smp.step`` performs tracing of the wrapped function every time - one of the tensor arguments changes their shape or dtype, or for every - new value of a Python argument, if there is one. Tracing is expensive, - so such scenarios should be avoided as much as possible or, - alternatively, an ``input_signature`` argument must be provided. For - more information on the usage of ``tf.function``, refer to the - TensorFlow documentation: - - - https://www.tensorflow.org/api_docs/python/tf/function\ - - https://www.tensorflow.org/guide/function\ - - Each ``smp.step`` decorated function must have a return value that depends on the - output of ``smp.DistributedModel``. - - **Common parameters** - - - ``non_split_inputs`` (``list``): The list of arguments to the decorated function - that should not be split along the batch dimension. Should be used - for all input tensors that do not have a batch dimension. Should be a - list of argument names as ``str``, as they appear in the signature of - the ``smp.step``-decorated function. By default it is considered an - empty list. - - - ``input_split_axes`` (``dict``): A dict that maps the argument name to its batch - axis. The keys should be the argument names as ``str``, as they - appear in the signature of the ``smp.step``-decorated function.  By - default all batch axes are assumed to be the 0-axis. - - **TensorFlow-only parameters** - - - All arguments of ``tf.function``. Note: - The \ ``experimental_compile`` argument of ``tf.function`` may not - work as expected with ``smp.step``, since it interferes with - pipelining and model partitioning. To enable XLA with the library, you can - instead use \ ``tf.config.optimizer.set_jit(True)``. - - **PyTorch-only parameters** - - - ``detach_outputs`` (``bool``) : If ``True``, calls ``torch.Tensor.detach()`` on - all returned ``torch.Tensor`` outputs. Setting it to ``False`` - increases memory consumption, unless ``detach()`` is manually called - on the returned tensors, because the model graph is not cleared from - memory after the training step. Set to \ ``True`` by default. - - **Returns** - - - The same object(s) returned from the decorated function. All - returned \ ``tf.Tensor``, \ ``tf.Variable``  objects (for TF) or - ``torch.Tensor`` objects (for PT) are wrapped inside - a \ ``StepOutput`` object, even when they are inside a Python - ``list``, ``tuple``, or ``dict``. - - - -.. class:: StepOutput - :noindex: - - - A class that encapsulates all versions of a ``tf.Tensor`` - or \ ``torch.Tensor`` across all microbatches. - - When a particular ``tf.Tensor`` or ``torch.Tensor`` is computed inside - ``smp.step``, different versions of the tensor are computed for each - microbatch. - - When this tensor is returned from ``smp.step`` and is accessed outside - of the decorated function, it appears as a ``StepOutput`` object, which - contains all such versions. For example, - - - In the case of Tensorflow, the gradient for a particular - ``tf.Variable`` is computed on each microbatch individually, and if - this gradient is returned from ``smp.step``, all gradients for this - ``tf.Variable`` become part of the same ``StepOutput`` object. 
The - ``StepOutput`` class offers the following API for commonly-used - post-processing operations on such tensors. - - In the case of PyTorch, the loss for each microbatch is computed - individually and all the ``torch.Tensor``\ s that represent the loss - for different microbatches become part of same ``StepOutput`` object, - if loss is returned from the ``smp.step`` function. - - - The ``StepOutput`` class offers the following API for commonly-used - post-processing operations on tensors. - - .. data:: StepOutput.outputs - :noindex: - - Returns a list of the underlying tensors, indexed by microbatch. - - .. function:: StepOutput.reduce_mean( ) - :noindex: - - Returns a ``tf.Tensor``, ``torch.Tensor`` that averages the constituent ``tf.Tensor`` s - ``torch.Tensor`` s. This is commonly used for averaging loss and gradients across microbatches. - - .. function:: StepOutput.reduce_sum( ) - :noindex: - - Returns a ``tf.Tensor`` / - ``torch.Tensor`` that sums the constituent - ``tf.Tensor``\ s/\ ``torch.Tensor``\ s. - - .. function:: StepOutput.concat( ) - :noindex: - - Returns a - ``tf.Tensor``/``torch.Tensor`` that concatenates tensors along the - batch dimension using ``tf.concat`` / ``torch.cat``. - - .. function:: StepOutput.stack( ) - :noindex: - - Applies ``tf.stack`` / ``torch.stack`` - operation to the list of constituent ``tf.Tensor``\ s / - ``torch.Tensor``\ s. - - **TensorFlow-only methods** - - .. function:: StepOutput.merge( ) - :noindex: - - Returns a ``tf.Tensor`` that - concatenates the constituent ``tf.Tensor``\ s along the batch - dimension. This is commonly used for merging the model predictions - across microbatches. - - .. function:: StepOutput.accumulate(method="variable", var=None) - :noindex: - - Functionally the same as ``StepOutput.reduce_mean()``. However, it is - more memory-efficient, especially for large numbers of microbatches, - since it does not wait for all constituent \ ``tf.Tensor``\ s to be - ready to start averaging them, thereby saving memory. - - In some cases (XLA for example) ``StepOutput.reduce_mean()`` might end - up being more memory-efficient than ``StepOutput.accumulate()``. - - **Parameters** - - - ``method`` (``"add_n"`` or ``"accumulate_n"`` or ``"variable"``): - If ``"add_n"`` or ``"accumulate_n"``, the library uses - ``tf.add_n`` and ``tf.accumulate_n``, respectively, to implement - accumulation. If ``"variable"``, the library uses an internal ``tf.Variable`` - into which to accumulate the tensors. Default is \ ``"variable"``. - Note: Memory usage behavior of these choices can depend on the model - and implementation. - - - ``var``: A ``tf.Variable`` into which, if provided, the library uses to - accumulate the tensors. If \ ``None``, the library internally creates a - variable. If ``method`` is not ``"variable"``, this argument is - ignored. - -.. _mpi_basics: - :noindex: - -MPI Basics -^^^^^^^^^^ - -The library exposes the following basic MPI primitives to its Python API: - -- ``smp.rank()``: The rank of the current process. -- ``smp.size()``: The total number of processes. -- ``smp.mp_rank()``: The rank of the process among the processes that - hold the current model replica. -- ``smp.dp_rank()``: The rank of the process among the processes that - hold different replicas of the same model partition. -- ``smp.dp_size()``: The total number of model replicas. -- ``smp.local_rank()``: The rank among the processes on the current - instance. -- ``smp.local_size()``: The total number of processes on the current - instance. 
-- ``smp.get_mp_group()``: The list of ranks over which the current - model replica is partitioned. -- ``smp.get_dp_group()``: The list of ranks that hold different - replicas of the same model partition. - - .. _communication_api: - :noindex: - -Communication API -^^^^^^^^^^^^^^^^^ - -The library provides a few communication primitives which can be helpful while -developing the training script. These primitives use the following -``enum`` s as arguments to specify which processes the communication -should involve. -​ - -**Helper structures** - -.. data:: smp.CommGroup - :noindex: - - An ``enum`` that takes the values - ``CommGroup.WORLD``, ``CommGroup.MP_GROUP``, and ``CommGroup.DP_GROUP``. - These values can also be accessed as ``smp.WORLD``, ``smp.MP_GROUP``, - and ``smp.DP_GROUP`` respectively. - - - ``CommGroup.WORLD``: Represents the entire group of processes used in - training - - ``CommGroup.MP_GROUP``: Represents the group of processes that hold - the same model replica as the current process. The processes in a - single ``MP_GROUP`` collectively store an entire replica of the - model. - - ``CommGroup.DP_GROUP``: Represents the group of processes that hold - the same model partition as the current process. The processes in a - single ``DP_GROUP`` perform data parallelism/allreduce among - themselves. - -.. data:: smp.RankType - :noindex: - - An ``enum`` that takes the values - ``RankType.WORLD_RANK``, ``RankType.MP_RANK``, and ``RankType.DP_RANK``. - - - ``RankType.WORLD_RANK``: The associated rank is to be interpreted as - the rank of the process across all processes used in training. - - ``RankType.MP_RANK``: The associated rank is to be interpreted as the - rank of the process within the ``MP_GROUP``. - - ``RankType.DP_RANK``: The associated rank is to be interpreted as the - rank of the process within the ``DP_GROUP``. - - -**Communication primitives:** - -.. function:: smp.broadcast(obj, group) - :noindex: - - Sends the object to all processes in the - group. The receiving process must call ``smp.recv_from`` to receive the - sent object. - - **Inputs** - - - ``obj``: An arbitrary picklable Python object that will be broadcast. - - - ``group``: A ``CommGroup`` argument that represents to which group of - processes the object will be sent. - - **Notes** - - - When you use ``broadcast`` on the sender process, there needs - to be an accompanying ``smp.recv_from()`` call on the receiver - processes. - - - This is a synchronous call; the ``broadcast`` statement - returns only after all ranks participating in the call have made a - matching ``recv_from`` call. - - **Example** - - .. code:: python - - if smp.rank() == 0: -     smp.broadcast(something, group=smp.CommGroup.WORLD) - else: -     smp.recv_from(0, rank_type=smp.RankType.WORLD_RANK) - -.. function:: smp.send(obj, dest_rank, rank_type) - :noindex: - - Sends the object ``obj`` to - ``dest_rank``, which is of a type specified by ``rank_type``. - - **Inputs** - - - ``obj``: An arbitrary picklable Python object that will be sent. - - - ``dest_rank`` (``int``): An integer denoting the rank of the receiving process. - - - ``rank_type`` (``enum``): A ``smp.RankType`` ``enum`` that determines how - ``dest_rank`` is to be interpreted. For example if ``dest_rank`` is 1 - and ``rank_type`` is ``MP_RANK``, then ``obj`` is sent to process - with ``mp_rank`` 1 in the ``MP_GROUP`` which contains the current - process. 
- - **Notes** - - - Note: \ This is a synchronous call; the ``send`` statement returns - only after the destination rank has made a matching - ``recv_from`` call. - -.. function:: smp.recv_from(src_rank, rank_type) - :noindex: - - Receive an object from a peer process. Can be used with a matching - ``smp.send`` or a ``smp.broadcast`` call. - - **Inputs** - - - ``src_rank`` (``int``): An integer denoting rank of the sending process. - - - ``rank_type`` (``enum``): A ``smp.RankType`` ``enum`` that determines how - ``dest_rank`` is to be interpreted. For example if ``src_rank`` is 1 - and ``rank_type`` is ``MP_RANK``, then the object is received from - the process with ``mp_rank`` 1 in the ``MP_GROUP`` which contains the - current process. - - **Returns** - - Returns the python object that is sent by the peer process. - - **Notes** - - - Note: This is a synchronous call; the ``recv_from`` statement returns - only after the source rank has made a matching ``send`` or - ``broadcast`` call, and the object is received. - -.. function:: smp.allgather(obj, group) - :noindex: - - A collective call that gathers all the - submitted objects across all ranks in the specified ``group``. Returns a - list whose ``i``\ th index contains the object submitted by the - ``i``\ th rank in ``group``. - - **Inputs** - - - ``obj``: An arbitrary picklable Python object that will be - allgathered. - - - ``group`` : A ``CommGroup`` argument that represents which group of - processes participate in ``allgather``. - - **Notes** - - - Note: This is a synchronous call; the ``allgather`` statement returns - only after all ranks participating in the call have made a matching - ``allgather`` call, and all the objects are received at the current - rank. - - **Examples** - - .. code:: python - - # assuming mp_size() == 2 - - if smp.mp_rank() == 0: -     out = smp.allgather(obj1, smp.CommGroup.MP_GROUP)  # returns [obj1, obj2] - else: -     out = smp.allgather(obj2, smp.CommGroup.MP_GROUP)  # returns [obj1, obj2] - -.. function:: smp.barrier(group=smp.WORLD) - :noindex: - - A statement that hangs until all - processes in the specified group reach the barrier statement, similar to - ``MPI_Barrier()``. - - **Inputs** - - - ``group``: An ``smp.CommGroup`` ``enum`` that specifies the group of - processes participating in the barrier call. Defaults to - ``smp.WORLD``. - - **Examples** - - - Assume there are 8 processes and 2 model partitions, and - therefore 4 \ ``mp_group``\ s, and 2 ``dp_group``\ s. If - the \ ``barrier`` call is passed the value ``smp.MP_GROUP`` for its - group argument, then each process only waits until the other process - of its own ``mp_group`` reaches that point. It does not wait for - processes outside that ``mp_group``. - -.. function:: smp.dp_barrier() - :noindex: - - Same as passing ``smp.DP_GROUP``\ to ``smp.barrier()``. - Waits for the processes in the same \ ``dp_group`` as - the current process to reach the same point in execution. - -.. function:: smp.mp_barrier() - :noindex: - - Same as passing ``smp.MP_GROUP`` to - ``smp.barrier()``. Waits for the processes in the same ``mp_group`` as - the current process to reach the same point in execution. diff --git a/doc/api/training/smp_versions/v1.4.0/smd_model_parallel_pytorch.rst b/doc/api/training/smp_versions/v1.4.0/smd_model_parallel_pytorch.rst deleted file mode 100644 index d2fcb95954..0000000000 --- a/doc/api/training/smp_versions/v1.4.0/smd_model_parallel_pytorch.rst +++ /dev/null @@ -1,572 +0,0 @@ -.. 
admonition:: Contents - - - :ref:`pytorch_saving_loading` - - :ref:`pytorch_saving_loading_instructions` - -PyTorch API -=========== - -**Supported versions: 1.7.1, 1.8.1** - -This API document assumes you use the following import statements in your training scripts. - -.. code:: python - - import smdistributed.modelparallel.torch as smp - - -.. tip:: - - Refer to - `Modify a PyTorch Training Script - `_ - to learn how to use the following API in your PyTorch training script. - -.. class:: smp.DistributedModel - :noindex: - - A sub-class of ``torch.nn.Module`` which specifies the model to be - partitioned. Accepts a ``torch.nn.Module`` object ``module`` which is - the model to be partitioned. The returned ``DistributedModel`` object - internally manages model parallelism and data parallelism. Only one - model in the training script can be wrapped with - ``smp.DistributedModel``. - - **Example:** - - .. code:: python - - model = smp.DistributedModel(model) - - **Important**: The ``__call__`` and  ``backward`` method calls on the - ``smp.DistributedModel`` object (in the following example, the object - is \ ``model``) can only be made inside a ``smp.step``-decorated - function. - - - Since ``DistributedModel``  is a ``torch.nn.Module``, a forward pass can - be performed by calling the \ ``DistributedModel`` object on the input - tensors. - - .. code:: python - - predictions = model(inputs)   # model is a smp.DistributedModel object - - For a backward pass, one needs to call the backward function on - the \ ``DistributedModel`` object, with tensors and gradients as - arguments, replacing the PyTorch operations \ ``torch.Tensor.backward`` - or ``torch.autograd.backward``. - - - The API for ``model.backward`` is very similar to - ``torch.autograd.backward``. For example, the following - ``backward`` calls: - - .. code:: python - - torch.autograd.backward(loss) or loss.backward() - - should be replaced with: - - .. code:: python - - model.backward(loss) # loss is a tensor with only one element as its data - - Similarly, for non-scalar tensors, replace the following - ``backward`` call containing incoming gradient arguments: - - .. code:: python - - torch.autograd.backward(outputs, out_grads) - - with the following line: - - .. code:: python - - model.backward(outputs, out_grads) - - In these examples, all ``__call__``  and ``backward`` method calls on - the model objects (``model(inputs)`` and ``model.backward(loss)``) must be made inside - a ``smp.step``-decorated function. - - **Using DDP** - - If DDP is enabled, do not not place a PyTorch - ``DistributedDataParallel`` wrapper around the ``DistributedModel`` because - the ``DistributedModel`` wrapper will also handle data parallelism. - - Unlike the original DDP wrapper, when you use ``DistributedModel``, - model parameters and buffers are not immediately broadcast across - processes when the wrapper is called. Instead, the broadcast is deferred to the first call of the - ``smp.step``-decorated function when the partition is done. - - **Parameters** - - - ``module`` (``torch.nn.Module``): Module to be distributed (data parallelism and model parallelism). - - - ``trace_device`` (``"cpu"`` or ``"gpu"``) (default: ``"gpu"``) - Whether to perform the tracing step on the GPU or CPU. The tracing step gathers - information on the order of execution of modules, the shapes of - intermediate outputs, and execution times, to be used by the - partitioning algorithm. 
If ``trace_device`` is set to GPU, accurate - module execution times can be gathered during tracing for a potentially - improved partitioning decision. However, if the model is too large to - fit in a single GPU, then ``trace_device`` should be set to CPU. - - - ``trace_execution_times`` (``bool``) (default: ``False``): If ``True``, - the library profiles the execution time of each module during tracing, and uses - it in the partitioning decision. This improves the partitioning - decision, but it might make the tracing slower. It may also introduce - some degree of non-determinism in partitioning results, because of the - inherent randomness in module execution times. Must be ``False`` if - ``trace_device`` is ``"cpu"``. - - - ``overlapping_allreduce`` (``bool``) (default: ``True``): This is only - applicable for hybrid data parallelism/model parallelism use cases (when - ``ddp`` is set to ``True`` while launching training). The library uses this flag - to decide whether to do overlapping allreduce whenever parameter - gradients are ready. This leads to overlapping of communication and - computation and can improve performance. If this is set to ``False``, - allreduce is performed at the end of the step. - - - ``backward_passes_per_step`` (``int``) (default: 1): This is only - applicable for hybrid data parallelism/model parallelism use cases (when - ``ddp`` is set to ``True`` in config). This parameter indicates the - number of backward passes to perform before calling allreduce on DDP. - This allows accumulating updates over multiple mini-batches before - reducing and applying them. - - - ``average_grads_across_microbatches`` (``bool``) (default: ``True``): - Whether or not the computed gradients should be averaged across - microbatches. If ``False``, the computed gradients will be summed across - microbatches, but not divided by the number of microbatches. In the typical - use case where the computed loss is averaged over the mini-batch, this - should be left as ``True``. If you use a loss function that only sums - the per-sample loss across the batch (and does not divide by the batch size), - then this must be set to ``False`` for correctness. - - - ``bucket_cap_mb`` (default: 25): ``DistributedDataParallel`` buckets - parameters into multiple buckets so that gradient reduction of each - bucket can potentially overlap with backward - computation. ``bucket_cap_mb`` controls the bucket size in MegaBytes - (MB). - - - ``trace_memory_usage`` (default: False): When set to True, the library attempts - to measure memory usage per module during tracing. If this is disabled, - memory usage will be estimated through the sizes of tensors returned from - the module. - - - ``broadcast_buffers`` (default: True): Flag to be used with ``ddp=True``. - This parameter is forwarded to the underlying ``DistributedDataParallel`` wrapper. - Please see: `broadcast_buffer `__. - - - ``gradient_as_bucket_view`` (default: False): To be - used with ``ddp=True``. This parameter is forwarded to the underlying - ``DistributedDataParallel`` wrapper. Please see `gradient_as_bucket_view `__. - - **Properties** - - - ``partitioned``: Is ``True`` if the model is partitioned, ``False`` - otherwise. Initialized to ``False`` when ``DistributedModel`` is first - created. It becomes ``True`` during the first call - to the ``smp.step``-decorated function. Once the model is partitioned, the - local parameters or local ``state_dict`` can be fetched using the - following methods. - - **Methods** - - .. 
function:: backward(tensors, grad_tensors) - :noindex: - - Triggers a distributed backward - pass across model partitions. Example usage provided in the previous - section. The API is very similar - to https://pytorch.org/docs/stable/autograd.html#torch.autograd.backward. - ``retain_grad`` and ``create_graph``  flags are not supported. - - .. function:: local_buffers( ) - :noindex: - - Returns an iterator over buffers for the modules in - the partitioned model that have been assigned to the current process. - - .. function:: local_named_buffers( ) - :noindex: - - Returns an iterator over buffers for the - modules in the partitioned model that have been assigned to the current - process. This yields both the name of the buffer as well as the buffer - itself. - - .. function:: local_parameters( ) - :noindex: - - Returns an iterator over parameters for the - modules in the partitioned model that have been assigned to the current - process. - - .. function:: local_named_parameters( ) - :noindex: - - Returns an iterator over parameters for - the modules in the partitioned model that have been assigned to the - current process. This yields both the name of the parameter as well as - the parameter itself. - - .. function:: local_modules( ) - :noindex: - - Returns an iterator over the modules in the - partitioned model that have been assigned to the current process. - - .. function:: local_named_modules( ) - :noindex: - - Returns an iterator over the modules in the - partitioned model that have been assigned to the current process. This - yields both the name of the module as well as the module itself. - - .. function:: local_state_dict( ) - :noindex: - - Returns the ``state_dict`` that contains local - parameters that belong to the current \ ``mp_rank``. This ``state_dict`` - contains a key \ ``_smp_is_partial`` to indicate this is a - partial \ ``state_dict``, which indicates whether the - ``state_dict`` contains elements corresponding to only the current - partition, or to the entire model. - - .. function:: state_dict( ) - :noindex: - - Returns the ``state_dict`` that contains parameters - for the entire model. It first collects the \ ``local_state_dict``  and - gathers and merges the \ ``local_state_dict`` from all ``mp_rank``\ s to - create a full ``state_dict``. Please note that this needs to be called on all ranks with - ``dp_rank()==0`` to ensure the gather happens properly. - If it is only called on all such ranks, it can hang. - - .. function:: load_state_dict( ) - :noindex: - - Same as the ``torch.module.load_state_dict()`` , - except: It first gathers and merges the ``state_dict``\ s across - ``mp_rank``\ s, if they are partial. The actual loading happens after the - model partition so that each rank knows its local parameters. - - .. function:: register_post_partition_hook(hook) - :noindex: - - Registers a callable ``hook`` to - be executed after the model is partitioned. This is useful in situations - where an operation needs to be executed after the model partition during - the first call to ``smp.step``, but before the actual execution of the - first forward pass. Returns a ``RemovableHandle`` object ``handle``, - which can be used to remove the hook by calling ``handle.remove()``. - - .. function:: cpu( ) - :noindex: - - Allgathers parameters and buffers across all ``mp_rank``\ s and moves them - to the CPU. - - .. 
function:: join( ) - :noindex: - - A context manager to be used in conjunction with an instance of - ``smp.DistributedModel`` to be able to train with uneven inputs across - participating processes. This is only supported when ``ddp=True``. This will use the join with the wrapped - ``DistributedDataParallel`` instance. For more information, see: - `join `__ - in the PyTorch documentation. - - .. function:: register_comm_hook( state, callable ) - :noindex: - - **Available for PyTorch 1.8.1 only** - Registers a communication hook which is an enhancement that provides - a flexible hook ``callable`` to users where they can specify how - gradients are aggregated across multiple workers. This method will be called on the wrapped ``DistributedDataParallel`` instance. - - Please note that when you register a comm hook you have full control of how the gradients are processed. - When using only data parallelism with Torch DDP you are expected to average grads across data parallel replicas within the hook. - Similarly, when using DistributedModel you have to averaging grads across data parallel replicas within the hook. - In addition to that, you also have to average grads across microbatches within the hook unless you explicitly desire to not average based on your loss function. - See ``average_grads_across_microbatches`` for more information about averaging grads across microbatches. - - This is only supported when ``ddp=True`` and ``overlapping_allreduce=True`` (default). - For more information, see: - `register_comm_hook `__ - in the PyTorch documentation. - - - -.. class:: smp.DistributedOptimizer - :noindex: - - **Parameters** - - ``optimizer`` - - An optimizer wrapper for saving/loading optimizer states. This wrapper - returns ``optimizer`` with the following methods overridden: - - .. function:: state_dict( ) - :noindex: - - Returns the ``state_dict`` that contains optimizer state for the entire model. - It first collects the ``local_state_dict`` and gathers and merges - the ``local_state_dict`` from all ``mp_rank``s to create a full - ``state_dict``. - - .. function:: load_state_dict( ) - :noindex: - - Same as the ``torch.optimizer.load_state_dict()`` , except: - - - It first gathers and merges the local ``state_dict``\ s if they are - partial. - - The actual loading happens after the model partition so that each - rank knows its local parameters. - - .. function:: local_state_dict( ) - :noindex: - - Returns the ``state_dict`` that contains the - local optimizer state that belongs to the current \ ``mp_rank``. This - ``state_dict`` contains a key \ ``_smp_is_partial`` to indicate this is - a partial \ ``state_dict``, which indicates whether the - ``state_dict`` contains elements corresponding to only the current - partition, or to the entire model. - - ​ -.. function:: smp.partition(index) - :noindex: - - **Inputs** - - - ``index`` (int) - The index of the partition. - - A context manager which places all modules defined inside into the - partition with ID ``index``.  The ``index`` argument must be less than - the number of partitions. - - Use ``smp.partition`` to implement manual partitioning. - If ``"auto_partition"`` is ``True``, then the - ``smp.partition`` contexts are ignored. Any module that is not placed in - any ``smp.partition`` context is placed in the - ``default_partition`` defined through the SageMaker Python SDK. - - When ``smp.partition`` contexts are nested, the innermost context - overrides the rest (see the following example). 
In PyTorch, manual - partitioning should be done inside the module \ ``__init__``, and the - partition assignment applies to the modules that are *created* inside - the ``smp.partition`` context. - - Example: - - .. code:: python - - class Model(torch.nn.Module): -     def __init__(self): -         with smp.partition(1): -             self.child0 = Child0()            # child0 on partition 1 -             with smp.partition(2): -                 self.child1 = Child1()        # child1 on partition 2 -             self.child2 = Child2()            # child2 on partition 1 -         self.child3 = Child3()                # child3 on default_partition - -.. function:: smp.get_world_process_group( ) - :noindex: - - Returns a ``torch.distributed`` ``ProcessGroup`` that consists of all - processes, which can be used with the ``torch.distributed`` API. - Requires ``"ddp": True`` in SageMaker Python SDK parameters. - -.. function:: smp.get_mp_process_group( ) - :noindex: - - Returns a ``torch.distributed`` ``ProcessGroup`` that consists of the - processes in the ``MP_GROUP`` which contains the current process, which - can be used with the \ ``torch.distributed`` API. Requires - ``"ddp": True`` in SageMaker Python SDK parameters. - -.. function:: smp.get_dp_process_group( ) - :noindex: - - Returns a ``torch.distributed`` ``ProcessGroup`` that consists of the - processes in the ``DP_GROUP`` which contains the current process, which - can be used with the \ ``torch.distributed`` API. Requires - ``"ddp": True`` in SageMaker Python SDK parameters. - -.. function:: smp.is_initialized( ) - :noindex: - - Returns ``True`` if ``smp.init`` has already been called for the - process, and ``False`` otherwise. - -.. function::smp.is_tracing( ) - :noindex: - - Returns ``True`` if the current process is running the tracing step, and - ``False`` otherwise. - -.. data:: smp.nn.FusedLayerNorm - :noindex: - - `Apex Fused Layer Norm `__ is currently not - supported by the library. ``smp.nn.FusedLayerNorm`` replaces ``apex`` - ``FusedLayerNorm`` and provides the same functionality. This requires - ``apex`` to be installed on the system. - -.. data:: smp.optimizers.FusedNovoGrad - :noindex: - - - `Fused Novo Grad optimizer `__ is - currently not supported by the library. ``smp.optimizers.FusedNovoGrad`` replaces ``apex`` ``FusedNovoGrad`` - optimizer and provides the same functionality. This requires ``apex`` to - be installed on the system. - -.. data:: smp.optimizers.FusedLamb - :noindex: - - - `FusedLamb optimizer `__ - currently doesn’t work with the library. ``smp.optimizers.FusedLamb`` replaces - ``apex`` ``FusedLamb`` optimizer and provides the same functionality. - This requires ``apex`` to be installed on the system. - -.. data:: smp.amp.GradScaler - :noindex: - - `Torch AMP Gradscaler `__ - currently doesn’t work with the library. ``smp.amp.GradScaler`` replaces - ``torch.amp.GradScaler`` and provides the same functionality. - -.. _pytorch_saving_loading: - :noindex: - -APIs for Saving and Loading -^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -.. function:: smp.save( ) - :noindex: - - Saves an object. This operation is similar to ``torch.save()``, except - it has an additional keyword argument, ``partial``, and accepts only - string type for the argument ``f`` (file). If ``partial=True``, each - ``mp_rank`` saves a separate checkpoint file and the library adds an ``mp_rank`` - index to your saved file. - - **Parameters** - - - ``obj`` (dict): A saved object. - - ``f`` (str): A string containing a file name. 
- - ``partial`` (bool, default= ``True``):  When set to ``True``, each - ``mp_rank`` saves a separate checkpoint file and the library adds an - ``mp_rank`` index to the saved file. If you want to be able to load - and further train a model that you save with ``smp.save()``, you must - set ``partial=True``. - - ``pickle_module`` (pickle module, default = module ``"pickle"`` from ``"/opt/conda/lib/python3.6/pickle.py"``): - A module used for pickling metadata and objects. - - ``pickle_protocol``  (int, default=2): Can be specified to - override the default protocol. - -.. function:: smp.load( ) - :noindex: - - Loads an object saved with ``smp.save()`` from a file. - - Similar to `torch.load() `__, - except it has an additional keyword argument, ``partial``, and accepts - only string type for the argument ``f`` (file). If \ ``partial=True``, - then each ``mp_rank`` loads a separate checkpoint file. - - **Parameters** - - - ``f`` (string): A string containing a file name. - - ``map_location`` (function): A function - `torch.device `__, - a string, or a dict specifying how to remap storage locations. - - ``pickle_module`` (pickle module): A module used for unpickling - metadata and objects (has to match the ``pickle_module`` used to - serialize the file). - - ``pickle_load_args`` (Python 3 only): Optional keyword arguments - passed to ``pickle_module.load()`` and ``pickle_module.Unpickler()``. - - ``partial`` (bool, default= ``True``): When set to ``True``, each - ``mp_rank`` loads the checkpoint corresponding to the ``mp_rank``. - Should be used when loading a model trained with the library. - -.. _pytorch_saving_loading_instructions: - :noindex: - -General Instruction For Saving and Loading -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -The library can save partial or full checkpoints. - -- For partial checkpoints, each ``mp_rank`` saves its own checkpoint - file with only the parameters that belong to that rank. -- For full checkpoints, the library saves a single checkpoint that contains - the entire model parameters. - -When **saving** using ``smp.save()``, each rank only holds its own -parameters. If you want to save the full model, there will be some -communication between the ranks to create the full model. If you save -checkpoints often, you should save partial checkpoints for best -performance. - -When **loading** using ``smp.load()``, the library can load either partial or -full checkpoints, or full checkpoints saved by a non-model-parallel model. If you -want to resume training with a non-model-parallel model or do inference, you need -a full checkpoint. - -The following is an example of how you can save and load a checkpoint: - -.. code:: python - - # Original model and optimizer - model = MyModel(...) - optimizer = MyOpt(...) 
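-
- # (MyModel and MyOpt are placeholders for your own model and optimizer classes;
- #  the smp wrappers applied below provide the state_dict/local_state_dict methods
- #  used for checkpointing in the rest of this example.)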
- - # model parallel wrapper - model = smp.DistributedModel(model) - optimizer = smp.DistributedOptimizer(optimizer) - - # To save, always save on dp_rank 0 to avoid data racing - if partial: -     # To save the partial model on each mp rank -     # the library will create `checkpoint.pt_{mprank}` for each mp rank -     if save_partial_model: -         if smp.dp_rank() == 0: -             model_dict = model.local_state_dict() # save the partial model -             opt_dict = optimizer.local_state_dict() # save the partial optimizer state -             smp.save( -                 {"model_state_dict": model_dict, "optimizer_state_dict": opt_dict}, -                 f"/checkpoint.pt", -                 partial=True, -             ) - -     # To save the full model -     if save_full_model: -         if smp.dp_rank() == 0: -             model_dict = model.state_dict() # save the full model -             opt_dict = optimizer.state_dict() # save the full optimizer state -             smp.save( -                 {"model_state_dict": model_dict, "optimizer_state_dict": opt_dict}, -                 "/checkpoint.pt", -                 partial=False, -             ) - - # To load, load on all ranks. - # The only difference for partial/full loading is the partial flag in smp.load - # Load partial checkpoint - if partial_checkpoint: -    checkpoint = smp.load("/checkpoint.pt", partial=True) -    model.load_state_dict(checkpoint["model_state_dict"]) -    optimizer.load_state_dict(checkpoint["optimizer_state_dict"]) - # Load full checkpoint - if full_checkpoint: -    checkpoint = smp.load("/checkpoint.pt", partial=False) -    model.load_state_dict(checkpoint["model_state_dict"]) -    optimizer.load_state_dict(checkpoint["optimizer_state_dict"]) diff --git a/doc/api/training/smp_versions/v1.4.0/smd_model_parallel_tensorflow.rst b/doc/api/training/smp_versions/v1.4.0/smd_model_parallel_tensorflow.rst deleted file mode 100644 index 131fc327ac..0000000000 --- a/doc/api/training/smp_versions/v1.4.0/smd_model_parallel_tensorflow.rst +++ /dev/null @@ -1,172 +0,0 @@ -TensorFlow API -============== - -**Supported version: 2.3.1, 2.4.1, 2.5.0** - -**Important**: This API document assumes you use the following import statement in your training scripts. - -.. code:: python - - import smdistributed.modelparallel.tensorflow as smp - -.. tip:: - - Refer to - `Modify a TensorFlow Training Script - `_ - to learn how to use the following API in your TensorFlow training script. - -.. class:: smp.DistributedModel - :noindex: - - A sub-class of the Keras \ ``Model`` class, which defines the model to - be partitioned. Model definition is done by sub-classing - ``smp.DistributedModel`` class, and implementing the ``call()`` method, - in the same way as the Keras model sub-classing API. Any operation that - is part of the \ ``smp.DistributedModel.call()`` method is subject to - partitioning, meaning that every operation placed inside executes in - exactly one of the devices (the operations outside run on all devices). - - - Similar to the regular Keras API, the forward pass is done by directly - calling the model object on the input tensors. For example: - - .. code:: python - - predictions = model(inputs)   # model is a smp.DistributedModel object - - However, ``model()`` calls can only be made inside a - ``smp.step``-decorated function. - - The outputs from a ``smp.DistributedModel`` are available in all ranks, - regardless of which rank computed the last operation. - - **Methods:** - - .. 
function:: save_model(save_path="/opt/ml/model") - :noindex: - - **Inputs** - - ``save_path`` (``string``): A path to save an unpartitioned model with latest training weights. - - Saves the entire, - unpartitioned model with the latest trained weights to ``save_path`` in - TensorFlow ``SavedModel`` format. Defaults to ``"/opt/ml/model"``, which - SageMaker monitors to upload the model artifacts to Amazon S3. - -.. function:: smp.partition(index) - :noindex: - - **Inputs** - - - ``index`` (``int``): The index of the partition. - - A context manager which places all operations defined inside into the - partition whose ID is equal to ``index``. When - ``smp.partition`` contexts are nested, the innermost context overrides - the rest. The ``index`` argument must be smaller than the number of - partitions. - - ``smp.partition`` is used in the manual partitioning API; - if \ ``"auto_partition"`` parameter is set to ``True`` while launching - training, then ``smp.partition`` contexts are ignored. Any operation - that is not placed in any ``smp.partition`` context is placed in the - ``default_partition``, as shown in the following example: - - .. code:: python - - # auto_partition: False - # default_partition: 0 - smp.init() - [...] - x = tf.constant(1.2)                     # placed in partition 0 - with smp.partition(1): -     y = tf.add(x, tf.constant(2.3))      # placed in partition 1 -     with smp.partition(3): -         z = tf.reduce_sum(y)             # placed in partition 3 - - -.. function:: register_post_partition_hook(hook) - :noindex: - - Registers a callable ``hook`` to - be executed after the model is partitioned. This is useful in situations - where an operation needs to be executed after the model partition during - the first call to ``smp.step``, but before the actual execution of the - first forward pass. - - .. code:: python - - @smp.register_post_partition_hook - def test_eager(): - # All statements here will be executed right after partition but before the first forward pass - tf.print("Entered hook through eager context") - -.. class:: smp.CheckpointManager - :noindex: - - - A subclass of TensorFlow - `CheckpointManager `__, - which is used to manage checkpoints. The usage is similar to TensorFlow - ``CheckpointManager``. - - The following returns a ``CheckpointManager`` object. - - .. code:: python - - smp.CheckpointManager(checkpoint, -                       directory="/opt/ml/checkpoints", -                       max_to_keep=None, -                       checkpoint_name="ckpt") - - **Parameters** - - - ``checkpoint``: A `tf.train.Checkpoint - `__ instance - that represents a model checkpoint. - - - ``directory``: (``str``) The path to a directory in which to write - checkpoints. A file named "checkpoint" is also written to this - directory (in a human-readable text format) which contains the state - of the ``CheckpointManager``. Defaults to - ``"/opt/ml/checkpoints"``, which is the directory that SageMaker - monitors for uploading the checkpoints to Amazon S3. - - ``max_to_keep`` (``int``): The number of checkpoints to keep. If - ``None``, all checkpoints are kept. - - ``checkpoint_name`` (``str``): Custom name for the checkpoint file. - Defaults to ``"ckpt"``. - - - **Methods:** - - .. function:: save( ) - :noindex: - - Saves a new checkpoint in the specified directory. Internally uses ``tf.train.CheckpointManager.save()``. - - .. function:: restore( ) - :noindex: - - Restores the latest checkpoint in the specified directory. 
- Internally uses ``tf.train.CheckpointManager.restore()``. - - - **Examples:** - - .. code:: python - - checkpoint = tf.train.Checkpoint(optimizer=optimizer, model=model) - ckpt_manager = smp.CheckpointManager(checkpoint, max_to_keep=5)  # use /opt/ml/checkpoints - - for inputs in train_ds: -     loss = train_step(inputs) -     # [...] -     ckpt_manager.save()  # save a new checkpoint in /opt/ml/checkpoints - - .. code:: python - - for step, inputs in enumerate(train_ds): -     if step == 0: -         ckpt_manager.restore() -     loss = train_step(inputs) diff --git a/doc/api/training/smp_versions/v1.5.0/smd_model_parallel_common_api.rst b/doc/api/training/smp_versions/v1.5.0/smd_model_parallel_common_api.rst deleted file mode 100644 index 625a7fcbf1..0000000000 --- a/doc/api/training/smp_versions/v1.5.0/smd_model_parallel_common_api.rst +++ /dev/null @@ -1,488 +0,0 @@ -.. admonition:: Contents - - - :ref:`communication_api` - - :ref:`mpi_basics` - -Common API -========== - -The following SageMaker distribute model parallel APIs are common across all frameworks. - -**Important**: This API document assumes you use the following import statement in your training scripts. - -**TensorFlow** - -.. code:: python - - import smdistributed.modelparallel.tensorflow as smp - -**PyTorch** - -.. code:: python - - import smdistributed.modelparallel.torch as smp - - -.. function:: smp.init( ) - :noindex: - - Initialize the library. Must be called at the beginning of training script. - -.. function:: @smp.step(non_split_inputs, input_split_axes, [*args, **kwargs]) - :noindex: - - A decorator that must be placed over a function that represents a single - forward and backward pass (for training use cases), or a single forward - pass (for evaluation use cases). Any computation that is defined inside - the ``smp.step``-decorated function is executed in a pipelined manner. - - By default, every tensor input to the function is split across its batch - dimension into a number of microbatches specified while launching the - training job. This behavior can be customized through the arguments to - ``smp.step``, described below. The library then orchestrates the execution of - each microbatch across all partitions, based on the chosen pipeline - type. - - In a typical use case, forward pass and back-propagation are executed - inside an \ ``smp.step``-decorated function and gradients, loss, and - other relevant metrics (such as accuracy, etc.) are returned from - ``smp.step``-decorated function. - - Any gradient post-processing operation, such as gradient clipping and - allreduce, as well as ``optimizer.apply_gradients`` calls (for TF) or - ``optimizer.step`` (for PT) should be applied on the gradients returned - from the ``smp.step`` function, and not inside the ``smp.step`` - function. This is because every operation inside ``smp.step`` is - executed once per microbatch, so having these operations inside - ``smp.step`` can either be inefficient (in the case of allreduce), or - lead to wrong results (in the case of ``apply_gradients`` / - ``optimizer.step``). - - If the objects returned from the ``smp.step``-decorated function contain - ``tf.Tensor``\ s / ``torch.Tensor``\ s, they are converted to - ``StepOutput`` objects. A ``StepOutput`` object encapsulates all - versions of the tensor across different microbatches - (see ``StepOutput`` entry for more information). 
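-
-   For illustration, the following is a minimal sketch of a typical PyTorch
-   training step built around ``smp.step``. The names ``loss_fn``,
-   ``optimizer``, and ``data_loader`` are placeholders rather than anything
-   provided by the library, and ``model`` is assumed to already be wrapped
-   with ``smp.DistributedModel``. The forward and backward passes run inside
-   the decorated function, while the loss averaging (through
-   ``StepOutput.reduce_mean()``, described below) and the optimizer update
-   happen outside of it.
-
-   .. code:: python
-
-      @smp.step()
-      def train_step(model, inputs, labels):
-          predictions = model(inputs)           # forward pass, executed once per microbatch
-          loss = loss_fn(predictions, labels)   # placeholder loss function
-          model.backward(loss)                  # replaces loss.backward()
-          return loss
-
-      for inputs, labels in data_loader:        # placeholder data loader
-          optimizer.zero_grad()
-          loss_mb = train_step(model, inputs, labels)   # a StepOutput object
-          loss = loss_mb.reduce_mean()          # average the loss across microbatches
-          optimizer.step()                      # apply gradients outside of smp.step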
- - The argument to ``smp.step`` decorated function should either be a tensor - or an instance of list, tuple, dict or set for it to be split across - microbatches. If your object doesn't fall into this category, you can make - the library split your object, by implementing ``smp_slice`` method. - - Below is an example of how to use it with PyTorch. - - .. code:: python - - class CustomType: - def __init__(self, tensor): - self.data = tensor - - # The library will call this to invoke slicing on the object passing in total microbatches (num_mb) - # and the current microbatch index (mb). - def smp_slice(self, num_mb, mb, axis): - dim_size = list(self.data.size())[axis] - - split_size = dim_size // num_mb - sliced_tensor = self.data.narrow(axis, mb * split_size, split_size) - return CustomType(sliced_tensor, self.other) - - custom_obj = CustomType(torch.ones(4,)) - - @smp.step() - def step(custom_obj): - loss = model(custom_obj) - model.backward(loss) - return loss - - - **Important:** ``smp.step`` splits the batch into microbatches, and - executes everything inside the decorated function once per microbatch. - This might affect the behavior of batch normalization, any operation - that explicitly uses the batch size information, or any other Python - code that is expected to run once. - - **TensorFlow-specific behavior** - - ``smp.step`` is a wrapper that - inherits from and extends the behavior of ``tf.function``, and as such, - all the caveats that apply to the use of ``tf.function``\ s also apply - to ``smp.step``. In particular, any operation that is inside - ``smp.step`` executes in graph mode, and not eager mode. - - In the first call, ``smp.step`` performs tracing of the wrapped function every time - one of the tensor arguments changes their shape or dtype, or for every - new value of a Python argument, if there is one. Tracing is expensive, - so such scenarios should be avoided as much as possible or, - alternatively, an ``input_signature`` argument must be provided. For - more information on the usage of ``tf.function``, refer to the - TensorFlow documentation: - - - https://www.tensorflow.org/api_docs/python/tf/function\ - - https://www.tensorflow.org/guide/function\ - - Each ``smp.step`` decorated function must have a return value that depends on the - output of ``smp.DistributedModel``. - - **Common parameters** - - - ``non_split_inputs`` (``list``): The list of arguments to the decorated function - that should not be split along the batch dimension. Should be used - for all input tensors that do not have a batch dimension. Should be a - list of argument names as ``str``, as they appear in the signature of - the ``smp.step``-decorated function. By default it is considered an - empty list. - - - ``input_split_axes`` (``dict``): A dict that maps the argument name to its batch - axis. The keys should be the argument names as ``str``, as they - appear in the signature of the ``smp.step``-decorated function.  By - default all batch axes are assumed to be the 0-axis. - - **TensorFlow-only parameters** - - - All arguments of ``tf.function``. Note: - The \ ``experimental_compile`` argument of ``tf.function`` may not - work as expected with ``smp.step``, since it interferes with - pipelining and model partitioning. To enable XLA with the library, you can - instead use \ ``tf.config.optimizer.set_jit(True)``. - - **PyTorch-only parameters** - - - ``detach_outputs`` (``bool``) : If ``True``, calls ``torch.Tensor.detach()`` on - all returned ``torch.Tensor`` outputs. 
Setting it to ``False`` - increases memory consumption, unless ``detach()`` is manually called - on the returned tensors, because the model graph is not cleared from - memory after the training step. Set to \ ``True`` by default. - - **Returns** - - - The same object(s) returned from the decorated function. All - returned \ ``tf.Tensor``, \ ``tf.Variable``  objects (for TF) or - ``torch.Tensor`` objects (for PT) are wrapped inside - a \ ``StepOutput`` object, even when they are inside a Python - ``list``, ``tuple``, or ``dict``. - - - -.. class:: StepOutput - :noindex: - - - A class that encapsulates all versions of a ``tf.Tensor`` - or \ ``torch.Tensor`` across all microbatches. - - When a particular ``tf.Tensor`` or ``torch.Tensor`` is computed inside - ``smp.step``, different versions of the tensor are computed for each - microbatch. - - When this tensor is returned from ``smp.step`` and is accessed outside - of the decorated function, it appears as a ``StepOutput`` object, which - contains all such versions. For example, - - - In the case of Tensorflow, the gradient for a particular - ``tf.Variable`` is computed on each microbatch individually, and if - this gradient is returned from ``smp.step``, all gradients for this - ``tf.Variable`` become part of the same ``StepOutput`` object. The - ``StepOutput`` class offers the following API for commonly-used - post-processing operations on such tensors. - - In the case of PyTorch, the loss for each microbatch is computed - individually and all the ``torch.Tensor``\ s that represent the loss - for different microbatches become part of same ``StepOutput`` object, - if loss is returned from the ``smp.step`` function. - - - The ``StepOutput`` class offers the following API for commonly-used - post-processing operations on tensors. - - .. data:: StepOutput.outputs - :noindex: - - Returns a list of the underlying tensors, indexed by microbatch. - - .. function:: StepOutput.reduce_mean( ) - :noindex: - - Returns a ``tf.Tensor``, ``torch.Tensor`` that averages the constituent ``tf.Tensor`` s - ``torch.Tensor`` s. This is commonly used for averaging loss and gradients across microbatches. - - .. function:: StepOutput.reduce_sum( ) - :noindex: - - Returns a ``tf.Tensor`` / - ``torch.Tensor`` that sums the constituent - ``tf.Tensor``\ s/\ ``torch.Tensor``\ s. - - .. function:: StepOutput.concat( ) - :noindex: - - Returns a - ``tf.Tensor``/``torch.Tensor`` that concatenates tensors along the - batch dimension using ``tf.concat`` / ``torch.cat``. - - .. function:: StepOutput.stack( ) - :noindex: - - Applies ``tf.stack`` / ``torch.stack`` - operation to the list of constituent ``tf.Tensor``\ s / - ``torch.Tensor``\ s. - - **TensorFlow-only methods** - - .. function:: StepOutput.merge( ) - :noindex: - - Returns a ``tf.Tensor`` that - concatenates the constituent ``tf.Tensor``\ s along the batch - dimension. This is commonly used for merging the model predictions - across microbatches. - - .. function:: StepOutput.accumulate(method="variable", var=None) - :noindex: - - Functionally the same as ``StepOutput.reduce_mean()``. However, it is - more memory-efficient, especially for large numbers of microbatches, - since it does not wait for all constituent \ ``tf.Tensor``\ s to be - ready to start averaging them, thereby saving memory. - - In some cases (XLA for example) ``StepOutput.reduce_mean()`` might end - up being more memory-efficient than ``StepOutput.accumulate()``. 
- - **Parameters** - - - ``method`` (``"add_n"`` or ``"accumulate_n"`` or ``"variable"``): - If ``"add_n"`` or ``"accumulate_n"``, the library uses - ``tf.add_n`` and ``tf.accumulate_n``, respectively, to implement - accumulation. If ``"variable"``, the library uses an internal ``tf.Variable`` - into which to accumulate the tensors. Default is \ ``"variable"``. - Note: Memory usage behavior of these choices can depend on the model - and implementation. - - - ``var``: A ``tf.Variable`` into which, if provided, the library uses to - accumulate the tensors. If \ ``None``, the library internally creates a - variable. If ``method`` is not ``"variable"``, this argument is - ignored. - -.. _mpi_basics: - :noindex: - -MPI Basics -^^^^^^^^^^ - -The library exposes the following basic MPI primitives to its Python API: - -- ``smp.rank()``: The rank of the current process. -- ``smp.size()``: The total number of processes. -- ``smp.mp_rank()``: The rank of the process among the processes that - hold the current model replica. -- ``smp.dp_rank()``: The rank of the process among the processes that - hold different replicas of the same model partition. -- ``smp.dp_size()``: The total number of model replicas. -- ``smp.local_rank()``: The rank among the processes on the current - instance. -- ``smp.local_size()``: The total number of processes on the current - instance. -- ``smp.get_mp_group()``: The list of ranks over which the current - model replica is partitioned. -- ``smp.get_dp_group()``: The list of ranks that hold different - replicas of the same model partition. - - .. _communication_api: - :noindex: - -Communication API -^^^^^^^^^^^^^^^^^ - -The library provides a few communication primitives which can be helpful while -developing the training script. These primitives use the following -``enum`` s as arguments to specify which processes the communication -should involve. -​ - -**Helper structures** - -.. data:: smp.CommGroup - :noindex: - - An ``enum`` that takes the values - ``CommGroup.WORLD``, ``CommGroup.MP_GROUP``, and ``CommGroup.DP_GROUP``. - These values can also be accessed as ``smp.WORLD``, ``smp.MP_GROUP``, - and ``smp.DP_GROUP`` respectively. - - - ``CommGroup.WORLD``: Represents the entire group of processes used in - training - - ``CommGroup.MP_GROUP``: Represents the group of processes that hold - the same model replica as the current process. The processes in a - single ``MP_GROUP`` collectively store an entire replica of the - model. - - ``CommGroup.DP_GROUP``: Represents the group of processes that hold - the same model partition as the current process. The processes in a - single ``DP_GROUP`` perform data parallelism/allreduce among - themselves. - -.. data:: smp.RankType - :noindex: - - An ``enum`` that takes the values - ``RankType.WORLD_RANK``, ``RankType.MP_RANK``, and ``RankType.DP_RANK``. - - - ``RankType.WORLD_RANK``: The associated rank is to be interpreted as - the rank of the process across all processes used in training. - - ``RankType.MP_RANK``: The associated rank is to be interpreted as the - rank of the process within the ``MP_GROUP``. - - ``RankType.DP_RANK``: The associated rank is to be interpreted as the - rank of the process within the ``DP_GROUP``. - - -**Communication primitives:** - -.. function:: smp.broadcast(obj, group) - :noindex: - - Sends the object to all processes in the - group. The receiving process must call ``smp.recv_from`` to receive the - sent object. - - **Inputs** - - - ``obj``: An arbitrary picklable Python object that will be broadcast. 
- - - ``group``: A ``CommGroup`` argument that represents to which group of - processes the object will be sent. - - **Notes** - - - When you use ``broadcast`` on the sender process, there needs - to be an accompanying ``smp.recv_from()`` call on the receiver - processes. - - - This is a synchronous call; the ``broadcast`` statement - returns only after all ranks participating in the call have made a - matching ``recv_from`` call. - - **Example** - - .. code:: python - - if smp.rank() == 0: -     smp.broadcast(something, group=smp.CommGroup.WORLD) - else: -     smp.recv_from(0, rank_type=smp.RankType.WORLD_RANK) - -.. function:: smp.send(obj, dest_rank, rank_type) - :noindex: - - Sends the object ``obj`` to - ``dest_rank``, which is of a type specified by ``rank_type``. - - **Inputs** - - - ``obj``: An arbitrary picklable Python object that will be sent. - - - ``dest_rank`` (``int``): An integer denoting the rank of the receiving process. - - - ``rank_type`` (``enum``): A ``smp.RankType`` ``enum`` that determines how - ``dest_rank`` is to be interpreted. For example if ``dest_rank`` is 1 - and ``rank_type`` is ``MP_RANK``, then ``obj`` is sent to process - with ``mp_rank`` 1 in the ``MP_GROUP`` which contains the current - process. - - **Notes** - - - Note: \ This is a synchronous call; the ``send`` statement returns - only after the destination rank has made a matching - ``recv_from`` call. - -.. function:: smp.recv_from(src_rank, rank_type) - :noindex: - - Receive an object from a peer process. Can be used with a matching - ``smp.send`` or a ``smp.broadcast`` call. - - **Inputs** - - - ``src_rank`` (``int``): An integer denoting rank of the sending process. - - - ``rank_type`` (``enum``): A ``smp.RankType`` ``enum`` that determines how - ``dest_rank`` is to be interpreted. For example if ``src_rank`` is 1 - and ``rank_type`` is ``MP_RANK``, then the object is received from - the process with ``mp_rank`` 1 in the ``MP_GROUP`` which contains the - current process. - - **Returns** - - Returns the python object that is sent by the peer process. - - **Notes** - - - Note: This is a synchronous call; the ``recv_from`` statement returns - only after the source rank has made a matching ``send`` or - ``broadcast`` call, and the object is received. - -.. function:: smp.allgather(obj, group) - :noindex: - - A collective call that gathers all the - submitted objects across all ranks in the specified ``group``. Returns a - list whose ``i``\ th index contains the object submitted by the - ``i``\ th rank in ``group``. - - **Inputs** - - - ``obj``: An arbitrary picklable Python object that will be - allgathered. - - - ``group`` : A ``CommGroup`` argument that represents which group of - processes participate in ``allgather``. - - **Notes** - - - Note: This is a synchronous call; the ``allgather`` statement returns - only after all ranks participating in the call have made a matching - ``allgather`` call, and all the objects are received at the current - rank. - - **Examples** - - .. code:: python - - # assuming mp_size() == 2 - - if smp.mp_rank() == 0: -     out = smp.allgather(obj1, smp.CommGroup.MP_GROUP)  # returns [obj1, obj2] - else: -     out = smp.allgather(obj2, smp.CommGroup.MP_GROUP)  # returns [obj1, obj2] - -.. function:: smp.barrier(group=smp.WORLD) - :noindex: - - A statement that hangs until all - processes in the specified group reach the barrier statement, similar to - ``MPI_Barrier()``. 
- - **Inputs** - - - ``group``: An ``smp.CommGroup`` ``enum`` that specifies the group of - processes participating in the barrier call. Defaults to - ``smp.WORLD``. - - **Examples** - - - Assume there are 8 processes and 2 model partitions, and - therefore 4 \ ``mp_group``\ s, and 2 ``dp_group``\ s. If - the \ ``barrier`` call is passed the value ``smp.MP_GROUP`` for its - group argument, then each process only waits until the other process - of its own ``mp_group`` reaches that point. It does not wait for - processes outside that ``mp_group``. - -.. function:: smp.dp_barrier() - :noindex: - - Same as passing ``smp.DP_GROUP``\ to ``smp.barrier()``. - Waits for the processes in the same \ ``dp_group`` as - the current process to reach the same point in execution. - -.. function:: smp.mp_barrier() - :noindex: - - Same as passing ``smp.MP_GROUP`` to - ``smp.barrier()``. Waits for the processes in the same ``mp_group`` as - the current process to reach the same point in execution. diff --git a/doc/api/training/smp_versions/v1.5.0/smd_model_parallel_pytorch.rst b/doc/api/training/smp_versions/v1.5.0/smd_model_parallel_pytorch.rst deleted file mode 100644 index d2fcb95954..0000000000 --- a/doc/api/training/smp_versions/v1.5.0/smd_model_parallel_pytorch.rst +++ /dev/null @@ -1,572 +0,0 @@ -.. admonition:: Contents - - - :ref:`pytorch_saving_loading` - - :ref:`pytorch_saving_loading_instructions` - -PyTorch API -=========== - -**Supported versions: 1.7.1, 1.8.1** - -This API document assumes you use the following import statements in your training scripts. - -.. code:: python - - import smdistributed.modelparallel.torch as smp - - -.. tip:: - - Refer to - `Modify a PyTorch Training Script - `_ - to learn how to use the following API in your PyTorch training script. - -.. class:: smp.DistributedModel - :noindex: - - A sub-class of ``torch.nn.Module`` which specifies the model to be - partitioned. Accepts a ``torch.nn.Module`` object ``module`` which is - the model to be partitioned. The returned ``DistributedModel`` object - internally manages model parallelism and data parallelism. Only one - model in the training script can be wrapped with - ``smp.DistributedModel``. - - **Example:** - - .. code:: python - - model = smp.DistributedModel(model) - - **Important**: The ``__call__`` and  ``backward`` method calls on the - ``smp.DistributedModel`` object (in the following example, the object - is \ ``model``) can only be made inside a ``smp.step``-decorated - function. - - - Since ``DistributedModel``  is a ``torch.nn.Module``, a forward pass can - be performed by calling the \ ``DistributedModel`` object on the input - tensors. - - .. code:: python - - predictions = model(inputs)   # model is a smp.DistributedModel object - - For a backward pass, one needs to call the backward function on - the \ ``DistributedModel`` object, with tensors and gradients as - arguments, replacing the PyTorch operations \ ``torch.Tensor.backward`` - or ``torch.autograd.backward``. - - - The API for ``model.backward`` is very similar to - ``torch.autograd.backward``. For example, the following - ``backward`` calls: - - .. code:: python - - torch.autograd.backward(loss) or loss.backward() - - should be replaced with: - - .. code:: python - - model.backward(loss) # loss is a tensor with only one element as its data - - Similarly, for non-scalar tensors, replace the following - ``backward`` call containing incoming gradient arguments: - - .. 
code:: python - - torch.autograd.backward(outputs, out_grads) - - with the following line: - - .. code:: python - - model.backward(outputs, out_grads) - - In these examples, all ``__call__``  and ``backward`` method calls on - the model objects (``model(inputs)`` and ``model.backward(loss)``) must be made inside - a ``smp.step``-decorated function. - - **Using DDP** - - If DDP is enabled, do not not place a PyTorch - ``DistributedDataParallel`` wrapper around the ``DistributedModel`` because - the ``DistributedModel`` wrapper will also handle data parallelism. - - Unlike the original DDP wrapper, when you use ``DistributedModel``, - model parameters and buffers are not immediately broadcast across - processes when the wrapper is called. Instead, the broadcast is deferred to the first call of the - ``smp.step``-decorated function when the partition is done. - - **Parameters** - - - ``module`` (``torch.nn.Module``): Module to be distributed (data parallelism and model parallelism). - - - ``trace_device`` (``"cpu"`` or ``"gpu"``) (default: ``"gpu"``) - Whether to perform the tracing step on the GPU or CPU. The tracing step gathers - information on the order of execution of modules, the shapes of - intermediate outputs, and execution times, to be used by the - partitioning algorithm. If ``trace_device`` is set to GPU, accurate - module execution times can be gathered during tracing for potentially - improved partitioning decision. However, if the model is too large to - fit in a single GPU, then ``trace_device`` should be set to CPU. - - - ``trace_execution_times`` (``bool``) (default: ``False``): If ``True``, - the library profiles the execution time of each module during tracing, and uses - it in the partitioning decision. This improves the partitioning - decision, but it might make the tracing slower. It may also introduce - some degree of non-determinism in partitioning results, because of the - inherent randomness in module execution times. Must be ``False`` if - ``trace_device`` is ``"cpu"``. - - - ``overlapping_allreduce`` (``bool``) (default: ``True``): This is only - applicable for hybrid data parallelism/model parallelism use cases (when - ``ddp`` is set to ``True`` while launching training). The library uses this flag - to decide whether to do overlapping allreduce whenever a parameter - gradients are ready. This leads to overlapping of communication and - computation and can improve performance. If this is set to ``False`` , - allreduce is performed at the end of the step. - - - ``backward_passes_per_step`` (``int``) (default: 1): This is only - applicable for hybrid data parallelism/model parallelism use cases (when - ``ddp`` is set to ``True`` in config). This parameter indicates the - number of backward passes to perform before calling allreduce on DDP. - This allows accumulating updates over multiple mini-batches before - reducing and applying them. - - - ``average_grads_across_microbatches`` (``bool``) (default: ``True``): - Whether or not the computed gradients should be averaged across - microbatches. If ``False``, the computed gradients will be summed across - microbatches, but not divided by the number of microbatches. In typical - use case where the computed loss is averaged over the mini-batch, this - should be left as ``True``. If you use a loss function that only sums - the per-sample loss across the batch (and not divide by the batch size), - then this must be set to ``False`` for correctness. 
- - - ``bucket_cap_mb`` (default: 25): \ ``DistributedDataParallel`` buckets - parameters into multiple buckets so that gradient reduction of each - bucket can potentially overlap with backward - computation. \ ``bucket_cap_mb``\ controls the bucket size in MegaBytes - (MB). - - - ``trace_memory_usage`` (default: False): When set to True, the library attempts - to measure memory usage per module during tracing. If this is disabled, - memory usage will be estimated through the sizes of tensors returned from - the module. - - - ``broadcast_buffers`` (default: True): Flag to be used with ``ddp=True``. - This parameter is forwarded to the underlying ``DistributedDataParallel`` wrapper. - Please see: `broadcast_buffer `__. - - - ``gradient_as_bucket_view`` (default: False): To be - used with ``ddp=True``. This parameter is forwarded to the underlying - ``DistributedDataParallel`` wrapper. Please see `gradient_as_bucket_view `__. - - **Properties** - - - ``partitioned``: Is ``True`` if the model is partitioned, ``False`` - otherwise. Initialized to ``False`` when ``DistributedModel`` is first - created. It becomes be ``True`` during the first call - to ``smp.step``-decorated function. Once the model is partitioned, the - local parameters or local ``state_dict`` can be fetched using the - following methods. - - **Methods** - - .. function:: backward(tensors, grad_tensors) - :noindex: - - Triggers a distributed backward - pass across model partitions. Example usage provided in the previous - section. The API is very similar - to https://pytorch.org/docs/stable/autograd.html#torch.autograd.backward. - ``retain_grad`` and ``create_graph``  flags are not supported. - - .. function:: local_buffers( ) - :noindex: - - Returns an iterator over buffers for the modules in - the partitioned model that have been assigned to the current process. - - .. function:: local_named_buffers( ) - :noindex: - - Returns an iterator over buffers for the - modules in the partitioned model that have been assigned to the current - process. This yields both the name of the buffer as well as the buffer - itself. - - .. function:: local_parameters( ) - :noindex: - - Returns an iterator over parameters for the - modules in the partitioned model that have been assigned to the current - process. - - .. function:: local_named_parameters( ) - :noindex: - - Returns an iterator over parameters for - the modules in the partitioned model that have been assigned to the - current process. This yields both the name of the parameter as well as - the parameter itself. - - .. function:: local_modules( ) - :noindex: - - Returns an iterator over the modules in the - partitioned model that have been assigned to the current process. - - .. function:: local_named_modules( ) - :noindex: - - Returns an iterator over the modules in the - partitioned model that have been assigned to the current process. This - yields both the name of the module as well as the module itself. - - .. function:: local_state_dict( ) - :noindex: - - Returns the ``state_dict`` that contains local - parameters that belong to the current \ ``mp_rank``. This ``state_dict`` - contains a key \ ``_smp_is_partial`` to indicate this is a - partial \ ``state_dict``, which indicates whether the - ``state_dict`` contains elements corresponding to only the current - partition, or to the entire model. - - .. function:: state_dict( ) - :noindex: - - Returns the ``state_dict`` that contains parameters - for the entire model. 
It first collects the ``local_state_dict`` and
-      gathers and merges the ``local_state_dict`` from all ``mp_rank``\ s to
-      create a full ``state_dict``. Please note that this needs to be called on all ranks with
-      ``dp_rank()==0`` to ensure the gather happens properly.
-      If it is not called on all such ranks, it can hang.
-
-   .. function:: load_state_dict( )
-      :noindex:
-
-      Same as the ``torch.module.load_state_dict()``,
-      except: It first gathers and merges the ``state_dict``\ s across
-      ``mp_rank``\ s, if they are partial. The actual loading happens after the
-      model partition so that each rank knows its local parameters.
-
-   .. function:: register_post_partition_hook(hook)
-      :noindex:
-
-      Registers a callable ``hook`` to
-      be executed after the model is partitioned. This is useful in situations
-      where an operation needs to be executed after the model partition during
-      the first call to ``smp.step``, but before the actual execution of the
-      first forward pass. Returns a ``RemovableHandle`` object ``handle``,
-      which can be used to remove the hook by calling ``handle.remove()``.
-
-   .. function:: cpu( )
-      :noindex:
-
-      Allgathers parameters and buffers across all ``mp_rank``\ s and moves them
-      to the CPU.
-
-   .. function:: join( )
-      :noindex:
-
-      A context manager to be used in conjunction with an instance of
-      ``smp.DistributedModel`` to be able to train with uneven inputs across
-      participating processes. This is only supported when ``ddp=True``. This will use the join with the wrapped
-      ``DistributedDataParallel`` instance. For more information, see:
-      `join `__
-      in the PyTorch documentation.
-
-   .. function:: register_comm_hook( state, callable )
-      :noindex:
-
-      **Available for PyTorch 1.8.1 only**
-      Registers a communication hook, which is an enhancement that provides
-      a flexible hook ``callable`` to users where they can specify how
-      gradients are aggregated across multiple workers. This method will be called on the wrapped ``DistributedDataParallel`` instance.
-
-      Please note that when you register a comm hook you have full control of how the gradients are processed.
-      When using only data parallelism with Torch DDP you are expected to average grads across data parallel replicas within the hook.
-      Similarly, when using ``DistributedModel`` you have to average grads across data parallel replicas within the hook.
-      In addition, you also have to average grads across microbatches within the hook, unless, based on your loss function, you explicitly do not want to average them.
-      See ``average_grads_across_microbatches`` for more information about averaging grads across microbatches.
-
-      This is only supported when ``ddp=True`` and ``overlapping_allreduce=True`` (default).
-      For more information, see:
-      `register_comm_hook `__
-      in the PyTorch documentation.
-
-
-
-.. class:: smp.DistributedOptimizer
-   :noindex:
-
-   **Parameters**
-
-   - ``optimizer``
-
-   An optimizer wrapper for saving/loading optimizer states. This wrapper
-   returns ``optimizer`` with the following methods overridden:
-
-   .. function:: state_dict( )
-      :noindex:
-
-      Returns the ``state_dict`` that contains optimizer state for the entire model.
-      It first collects the ``local_state_dict`` and gathers and merges
-      the ``local_state_dict`` from all ``mp_rank``\ s to create a full
-      ``state_dict``.
-
-   .. function:: load_state_dict( )
-      :noindex:
-
-      Same as the ``torch.optimizer.load_state_dict()``, except:
-
-      - It first gathers and merges the local ``state_dict``\ s if they are
-        partial.
- - The actual loading happens after the model partition so that each - rank knows its local parameters. - - .. function:: local_state_dict( ) - :noindex: - - Returns the ``state_dict`` that contains the - local optimizer state that belongs to the current \ ``mp_rank``. This - ``state_dict`` contains a key \ ``_smp_is_partial`` to indicate this is - a partial \ ``state_dict``, which indicates whether the - ``state_dict`` contains elements corresponding to only the current - partition, or to the entire model. - - ​ -.. function:: smp.partition(index) - :noindex: - - **Inputs** - - - ``index`` (int) - The index of the partition. - - A context manager which places all modules defined inside into the - partition with ID ``index``.  The ``index`` argument must be less than - the number of partitions. - - Use ``smp.partition`` to implement manual partitioning. - If ``"auto_partition"`` is ``True``, then the - ``smp.partition`` contexts are ignored. Any module that is not placed in - any ``smp.partition`` context is placed in the - ``default_partition`` defined through the SageMaker Python SDK. - - When ``smp.partition`` contexts are nested, the innermost context - overrides the rest (see the following example). In PyTorch, manual - partitioning should be done inside the module \ ``__init__``, and the - partition assignment applies to the modules that are *created* inside - the ``smp.partition`` context. - - Example: - - .. code:: python - - class Model(torch.nn.Module): -     def __init__(self): -         with smp.partition(1): -             self.child0 = Child0()            # child0 on partition 1 -             with smp.partition(2): -                 self.child1 = Child1()        # child1 on partition 2 -             self.child2 = Child2()            # child2 on partition 1 -         self.child3 = Child3()                # child3 on default_partition - -.. function:: smp.get_world_process_group( ) - :noindex: - - Returns a ``torch.distributed`` ``ProcessGroup`` that consists of all - processes, which can be used with the ``torch.distributed`` API. - Requires ``"ddp": True`` in SageMaker Python SDK parameters. - -.. function:: smp.get_mp_process_group( ) - :noindex: - - Returns a ``torch.distributed`` ``ProcessGroup`` that consists of the - processes in the ``MP_GROUP`` which contains the current process, which - can be used with the \ ``torch.distributed`` API. Requires - ``"ddp": True`` in SageMaker Python SDK parameters. - -.. function:: smp.get_dp_process_group( ) - :noindex: - - Returns a ``torch.distributed`` ``ProcessGroup`` that consists of the - processes in the ``DP_GROUP`` which contains the current process, which - can be used with the \ ``torch.distributed`` API. Requires - ``"ddp": True`` in SageMaker Python SDK parameters. - -.. function:: smp.is_initialized( ) - :noindex: - - Returns ``True`` if ``smp.init`` has already been called for the - process, and ``False`` otherwise. - -.. function::smp.is_tracing( ) - :noindex: - - Returns ``True`` if the current process is running the tracing step, and - ``False`` otherwise. - -.. data:: smp.nn.FusedLayerNorm - :noindex: - - `Apex Fused Layer Norm `__ is currently not - supported by the library. ``smp.nn.FusedLayerNorm`` replaces ``apex`` - ``FusedLayerNorm`` and provides the same functionality. This requires - ``apex`` to be installed on the system. - -.. data:: smp.optimizers.FusedNovoGrad - :noindex: - - - `Fused Novo Grad optimizer `__ is - currently not supported by the library. 
``smp.optimizers.FusedNovoGrad`` replaces ``apex`` ``FusedNovoGrad`` - optimizer and provides the same functionality. This requires ``apex`` to - be installed on the system. - -.. data:: smp.optimizers.FusedLamb - :noindex: - - - `FusedLamb optimizer `__ - currently doesn’t work with the library. ``smp.optimizers.FusedLamb`` replaces - ``apex`` ``FusedLamb`` optimizer and provides the same functionality. - This requires ``apex`` to be installed on the system. - -.. data:: smp.amp.GradScaler - :noindex: - - `Torch AMP Gradscaler `__ - currently doesn’t work with the library. ``smp.amp.GradScaler`` replaces - ``torch.amp.GradScaler`` and provides the same functionality. - -.. _pytorch_saving_loading: - :noindex: - -APIs for Saving and Loading -^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -.. function:: smp.save( ) - :noindex: - - Saves an object. This operation is similar to ``torch.save()``, except - it has an additional keyword argument, ``partial``, and accepts only - string type for the argument ``f`` (file). If ``partial=True``, each - ``mp_rank`` saves a separate checkpoint file and the library adds an ``mp_rank`` - index to your saved file. - - **Parameters** - - - ``obj`` (dict): A saved object. - - ``f`` (str): A string containing a file name. - - ``partial`` (bool, default= ``True``):  When set to ``True``, each - ``mp_rank`` saves a separate checkpoint file and the library adds an - ``mp_rank`` index to the saved file. If you want to be able to load - and further train a model that you save with ``smp.save()``, you must - set ``partial=True``. - - ``pickle_module`` (picklemodule, default = module ``"pickle"`` from ``"/opt/conda/lib/python3.6/pickle.py"``): - A module used for pickling metadata and objects. - - ``pickle_protocol``  (int, default=2): Can be specified to - override the defaultprotocol. - -.. function:: smp.load( ) - :noindex: - - Loads an object saved with ``smp.save()`` from a file. - - Similar to, `torch.load() `__, - except it has an additional keyword argument, ``partial``, and accepts - only string type for the argument ``f`` (file). If \ ``partial=True``, - then each ``mp_rank`` loads a separate checkpoint file. - - **Parameters** - - - ``f`` (string): A string containing a file name. - - ``map_location`` (function): A function - `torch.device `__, - a string, or a dict specifying how to remap storage locations. - - ``pickle_module`` (pickle module): A module used for unpickling - metadata and objects (has to match the \ ``pickle_module``\ used to - serialize file). - - ``pickle_load_args`` (Python 3 only): Optional keyword arguments - passed to ``pickle_module.load()`` and ``pickle_module.Unpickler()``. - - ``partial`` (bool, default= ``True``): When set to ``True``, each - ``mp_rank`` loads the checkpoint corresponding to the ``mp_rank``. - Should be used when loading a model trained with the library. - -.. _pytorch_saving_loading_instructions: - :noindex: - -General Instruction For Saving and Loading -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -The library can save partial or full checkpoints. - -- For partial checkpoints, each ``mp_rank`` saves its own checkpoint - file with only the parameters that belong to that rank. -- For full checkpoints, the library saves a single checkpoint that contains - entire model parameters. - -When **saving** using ``smp.save()``, each rank only holds its own -parameters. If you want to save the full model, there will be some -communication between the ranks to create the full model. 
If you save -checkpoints often, you should save partial checkpoints for best -performance. - -When **loading** using ``smp.load()``, the library can load either partial or | -full checkpoints or full checkpoints saved by a non-model-parallel model. If you -want to resume training with a non-model-parallel model or do inference, you need -a full checkpoint. - -The following is an example of how you can save and load a checkpoint: - -.. code:: python - - # Original model and optimizer - model = MyModel(...) - optimizer = MyOpt(...) - - # model parallel wrapper - model = smp.DistributedModel(model) - optimizer = smp.DistributedOptimizer(optimizer) - - # To save, always save on dp_rank 0 to avoid data racing - if partial: -     # To save the partial model on each mp rank -     # the library will create `checkpoint.pt_{mprank}` for each mp rank -     if save_partial_model: -         if smp.dp_rank() == 0: -             model_dict = model.local_state_dict() # save the partial model -             opt_dict = optimizer.local_state_dict() # save the partial optimizer state -             smp.save( -                 {"model_state_dict": model_dict, "optimizer_state_dict": opt_dict}, -                 f"/checkpoint.pt", -                 partial=True, -             ) - -     # To save the full model -     if save_full_model: -         if smp.dp_rank() == 0: -             model_dict = model.state_dict() # save the full model -             opt_dict = optimizer.state_dict() # save the full optimizer state -             smp.save( -                 {"model_state_dict": model_dict, "optimizer_state_dict": opt_dict}, -                 "/checkpoint.pt", -                 partial=False, -             ) - - # To load, load on all ranks. - # The only difference for partial/full loading is the partial flag in smp.load - # Load partial checkpoint - if partial_checkpoint: -    checkpoint = smp.load("/checkpoint.pt", partial=True) -    model.load_state_dict(checkpoint["model_state_dict"]) -    optimizer.load_state_dict(checkpoint["optimizer_state_dict"]) - # Load full checkpoint - if full_checkpoint: -    checkpoint = smp.load("/checkpoint.pt", partial=False) -    model.load_state_dict(checkpoint["model_state_dict"]) -    optimizer.load_state_dict(checkpoint["optimizer_state_dict"]) diff --git a/doc/api/training/smp_versions/v1.5.0/smd_model_parallel_tensorflow.rst b/doc/api/training/smp_versions/v1.5.0/smd_model_parallel_tensorflow.rst deleted file mode 100644 index 131fc327ac..0000000000 --- a/doc/api/training/smp_versions/v1.5.0/smd_model_parallel_tensorflow.rst +++ /dev/null @@ -1,172 +0,0 @@ -TensorFlow API -============== - -**Supported version: 2.3.1, 2.4.1, 2.5.0** - -**Important**: This API document assumes you use the following import statement in your training scripts. - -.. code:: python - - import smdistributed.modelparallel.tensorflow as smp - -.. tip:: - - Refer to - `Modify a TensorFlow Training Script - `_ - to learn how to use the following API in your TensorFlow training script. - -.. class:: smp.DistributedModel - :noindex: - - A sub-class of the Keras \ ``Model`` class, which defines the model to - be partitioned. Model definition is done by sub-classing - ``smp.DistributedModel`` class, and implementing the ``call()`` method, - in the same way as the Keras model sub-classing API. 
Any operation that - is part of the \ ``smp.DistributedModel.call()`` method is subject to - partitioning, meaning that every operation placed inside executes in - exactly one of the devices (the operations outside run on all devices). - - - Similar to the regular Keras API, the forward pass is done by directly - calling the model object on the input tensors. For example: - - .. code:: python - - predictions = model(inputs)   # model is a smp.DistributedModel object - - However, ``model()`` calls can only be made inside a - ``smp.step``-decorated function. - - The outputs from a ``smp.DistributedModel`` are available in all ranks, - regardless of which rank computed the last operation. - - **Methods:** - - .. function:: save_model(save_path="/opt/ml/model") - :noindex: - - **Inputs** - - ``save_path`` (``string``): A path to save an unpartitioned model with latest training weights. - - Saves the entire, - unpartitioned model with the latest trained weights to ``save_path`` in - TensorFlow ``SavedModel`` format. Defaults to ``"/opt/ml/model"``, which - SageMaker monitors to upload the model artifacts to Amazon S3. - -.. function:: smp.partition(index) - :noindex: - - **Inputs** - - - ``index`` (``int``): The index of the partition. - - A context manager which places all operations defined inside into the - partition whose ID is equal to ``index``. When - ``smp.partition`` contexts are nested, the innermost context overrides - the rest. The ``index`` argument must be smaller than the number of - partitions. - - ``smp.partition`` is used in the manual partitioning API; - if \ ``"auto_partition"`` parameter is set to ``True`` while launching - training, then ``smp.partition`` contexts are ignored. Any operation - that is not placed in any ``smp.partition`` context is placed in the - ``default_partition``, as shown in the following example: - - .. code:: python - - # auto_partition: False - # default_partition: 0 - smp.init() - [...] - x = tf.constant(1.2)                     # placed in partition 0 - with smp.partition(1): -     y = tf.add(x, tf.constant(2.3))      # placed in partition 1 -     with smp.partition(3): -         z = tf.reduce_sum(y)             # placed in partition 3 - - -.. function:: register_post_partition_hook(hook) - :noindex: - - Registers a callable ``hook`` to - be executed after the model is partitioned. This is useful in situations - where an operation needs to be executed after the model partition during - the first call to ``smp.step``, but before the actual execution of the - first forward pass. - - .. code:: python - - @smp.register_post_partition_hook - def test_eager(): - # All statements here will be executed right after partition but before the first forward pass - tf.print("Entered hook through eager context") - -.. class:: smp.CheckpointManager - :noindex: - - - A subclass of TensorFlow - `CheckpointManager `__, - which is used to manage checkpoints. The usage is similar to TensorFlow - ``CheckpointManager``. - - The following returns a ``CheckpointManager`` object. - - .. code:: python - - smp.CheckpointManager(checkpoint, -                       directory="/opt/ml/checkpoints", -                       max_to_keep=None, -                       checkpoint_name="ckpt") - - **Parameters** - - - ``checkpoint``: A `tf.train.Checkpoint - `__ instance - that represents a model checkpoint. - - - ``directory``: (``str``) The path to a directory in which to write - checkpoints. 
A file named "checkpoint" is also written to this - directory (in a human-readable text format) which contains the state - of the ``CheckpointManager``. Defaults to - ``"/opt/ml/checkpoints"``, which is the directory that SageMaker - monitors for uploading the checkpoints to Amazon S3. - - ``max_to_keep`` (``int``): The number of checkpoints to keep. If - ``None``, all checkpoints are kept. - - ``checkpoint_name`` (``str``): Custom name for the checkpoint file. - Defaults to ``"ckpt"``. - - - **Methods:** - - .. function:: save( ) - :noindex: - - Saves a new checkpoint in the specified directory. Internally uses ``tf.train.CheckpointManager.save()``. - - .. function:: restore( ) - :noindex: - - Restores the latest checkpoint in the specified directory. - Internally uses ``tf.train.CheckpointManager.restore()``. - - - **Examples:** - - .. code:: python - - checkpoint = tf.train.Checkpoint(optimizer=optimizer, model=model) - ckpt_manager = smp.CheckpointManager(checkpoint, max_to_keep=5)  # use /opt/ml/checkpoints - - for inputs in train_ds: -     loss = train_step(inputs) -     # [...] -     ckpt_manager.save()  # save a new checkpoint in /opt/ml/checkpoints - - .. code:: python - - for step, inputs in enumerate(train_ds): -     if step == 0: -         ckpt_manager.restore() -     loss = train_step(inputs) diff --git a/doc/api/training/smp_versions/v1.6.0/smd_model_parallel_common_api.rst b/doc/api/training/smp_versions/v1.6.0/smd_model_parallel_common_api.rst deleted file mode 100644 index b4713b2707..0000000000 --- a/doc/api/training/smp_versions/v1.6.0/smd_model_parallel_common_api.rst +++ /dev/null @@ -1,538 +0,0 @@ -Common API -========== - -The following SageMaker distribute model parallel APIs are common across all frameworks. - -.. contents:: Table of Contents - :depth: 3 - :local: - -The Library's Core APIs ------------------------ - -This API document assumes you use the following import statement in your training scripts. - -**TensorFlow** - -.. code:: python - - import smdistributed.modelparallel.tensorflow as smp - -**PyTorch** - -.. code:: python - - import smdistributed.modelparallel.torch as smp - - -.. function:: smp.init( ) - :noindex: - - Initialize the library. Must be called at the beginning of training script. - -.. function:: @smp.step(non_split_inputs, input_split_axes, [*args, **kwargs]) - :noindex: - - A decorator that must be placed over a function that represents a single - forward and backward pass (for training use cases), or a single forward - pass (for evaluation use cases). Any computation that is defined inside - the ``smp.step``-decorated function is executed in a pipelined manner. - - By default, every tensor input to the function is split across its batch - dimension into a number of microbatches specified while launching the - training job. This behavior can be customized through the arguments to - ``smp.step``, described below. The library then orchestrates the execution of - each microbatch across all partitions, based on the chosen pipeline - type. - - In a typical use case, forward pass and back-propagation are executed - inside an \ ``smp.step``-decorated function and gradients, loss, and - other relevant metrics (such as accuracy, etc.) are returned from - ``smp.step``-decorated function. 
- - Any gradient post-processing operation, such as gradient clipping and - allreduce, as well as ``optimizer.apply_gradients`` calls (for TF) or - ``optimizer.step`` (for PT) should be applied on the gradients returned - from the ``smp.step`` function, and not inside the ``smp.step`` - function. This is because every operation inside ``smp.step`` is - executed once per microbatch, so having these operations inside - ``smp.step`` can either be inefficient (in the case of allreduce), or - lead to wrong results (in the case of ``apply_gradients`` / - ``optimizer.step``). - - If the objects returned from the ``smp.step``-decorated function contain - ``tf.Tensor``\ s / ``torch.Tensor``\ s, they are converted to - ``StepOutput`` objects. A ``StepOutput`` object encapsulates all - versions of the tensor across different microbatches - (see ``StepOutput`` entry for more information). - - The argument to ``smp.step`` decorated function should either be a tensor - or an instance of list, tuple, dict or set for it to be split across - microbatches. If your object doesn't fall into this category, you can make - the library split your object, by implementing ``smp_slice`` method. - - Below is an example of how to use it with PyTorch. - - .. code:: python - - class CustomType: - def __init__(self, tensor): - self.data = tensor - - # The library will call this to invoke slicing on the object passing in total microbatches (num_mb) - # and the current microbatch index (mb). - def smp_slice(self, num_mb, mb, axis): - dim_size = list(self.data.size())[axis] - - split_size = dim_size // num_mb - sliced_tensor = self.data.narrow(axis, mb * split_size, split_size) - return CustomType(sliced_tensor, self.other) - - custom_obj = CustomType(torch.ones(4,)) - - @smp.step() - def step(custom_obj): - loss = model(custom_obj) - model.backward(loss) - return loss - - - **Important:** ``smp.step`` splits the batch into microbatches, and - executes everything inside the decorated function once per microbatch. - This might affect the behavior of batch normalization, any operation - that explicitly uses the batch size information, or any other Python - code that is expected to run once. - - **TensorFlow-specific behavior** - - ``smp.step`` is a wrapper that - inherits from and extends the behavior of ``tf.function``, and as such, - all the caveats that apply to the use of ``tf.function``\ s also apply - to ``smp.step``. In particular, any operation that is inside - ``smp.step`` executes in graph mode, and not eager mode. - - In the first call, ``smp.step`` performs tracing of the wrapped function every time - one of the tensor arguments changes their shape or dtype, or for every - new value of a Python argument, if there is one. Tracing is expensive, - so such scenarios should be avoided as much as possible or, - alternatively, an ``input_signature`` argument must be provided. For - more information on the usage of ``tf.function``, refer to the - TensorFlow documentation: - - - https://www.tensorflow.org/api_docs/python/tf/function\ - - https://www.tensorflow.org/guide/function\ - - Each ``smp.step`` decorated function must have a return value that depends on the - output of ``smp.DistributedModel``. - - **Common parameters** - - - ``non_split_inputs`` (``list``): The list of arguments to the decorated function - that should not be split along the batch dimension. Should be used - for all input tensors that do not have a batch dimension. 
Should be a - list of argument names as ``str``, as they appear in the signature of - the ``smp.step``-decorated function. By default it is considered an - empty list. - - - ``input_split_axes`` (``dict``): A dict that maps the argument name to its batch - axis. The keys should be the argument names as ``str``, as they - appear in the signature of the ``smp.step``-decorated function.  By - default all batch axes are assumed to be the 0-axis. - - **TensorFlow-only parameters** - - - All arguments of ``tf.function``. Note: - The \ ``experimental_compile`` argument of ``tf.function`` may not - work as expected with ``smp.step``, since it interferes with - pipelining and model partitioning. To enable XLA with the library, you can - instead use \ ``tf.config.optimizer.set_jit(True)``. - - **PyTorch-only parameters** - - - ``detach_outputs`` (``bool``) : If ``True``, calls ``torch.Tensor.detach()`` on - all returned ``torch.Tensor`` outputs. Setting it to ``False`` - increases memory consumption, unless ``detach()`` is manually called - on the returned tensors, because the model graph is not cleared from - memory after the training step. Set to \ ``True`` by default. - - **Returns** - - - The same object(s) returned from the decorated function. All - returned \ ``tf.Tensor``, \ ``tf.Variable``  objects (for TF) or - ``torch.Tensor`` objects (for PT) are wrapped inside - a \ ``StepOutput`` object, even when they are inside a Python - ``list``, ``tuple``, or ``dict``. - - - -.. class:: StepOutput - :noindex: - - - A class that encapsulates all versions of a ``tf.Tensor`` - or \ ``torch.Tensor`` across all microbatches. - - When a particular ``tf.Tensor`` or ``torch.Tensor`` is computed inside - ``smp.step``, different versions of the tensor are computed for each - microbatch. - - When this tensor is returned from ``smp.step`` and is accessed outside - of the decorated function, it appears as a ``StepOutput`` object, which - contains all such versions. For example, - - - In the case of Tensorflow, the gradient for a particular - ``tf.Variable`` is computed on each microbatch individually, and if - this gradient is returned from ``smp.step``, all gradients for this - ``tf.Variable`` become part of the same ``StepOutput`` object. The - ``StepOutput`` class offers the following API for commonly-used - post-processing operations on such tensors. - - In the case of PyTorch, the loss for each microbatch is computed - individually and all the ``torch.Tensor``\ s that represent the loss - for different microbatches become part of same ``StepOutput`` object, - if loss is returned from the ``smp.step`` function. - - - The ``StepOutput`` class offers the following API for commonly-used - post-processing operations on tensors. - - .. data:: StepOutput.outputs - :noindex: - - Returns a list of the underlying tensors, indexed by microbatch. - - .. function:: StepOutput.reduce_mean( ) - :noindex: - - Returns a ``tf.Tensor``, ``torch.Tensor`` that averages the constituent ``tf.Tensor`` s - ``torch.Tensor`` s. This is commonly used for averaging loss and gradients across microbatches. - - .. function:: StepOutput.reduce_sum( ) - :noindex: - - Returns a ``tf.Tensor`` / - ``torch.Tensor`` that sums the constituent - ``tf.Tensor``\ s/\ ``torch.Tensor``\ s. - - .. function:: StepOutput.concat( ) - :noindex: - - Returns a - ``tf.Tensor``/``torch.Tensor`` that concatenates tensors along the - batch dimension using ``tf.concat`` / ``torch.cat``. - - .. 
function:: StepOutput.stack( ) - :noindex: - - Applies ``tf.stack`` / ``torch.stack`` - operation to the list of constituent ``tf.Tensor``\ s / - ``torch.Tensor``\ s. - - **TensorFlow-only methods** - - .. function:: StepOutput.merge( ) - :noindex: - - Returns a ``tf.Tensor`` that - concatenates the constituent ``tf.Tensor``\ s along the batch - dimension. This is commonly used for merging the model predictions - across microbatches. - - .. function:: StepOutput.accumulate(method="variable", var=None) - :noindex: - - Functionally the same as ``StepOutput.reduce_mean()``. However, it is - more memory-efficient, especially for large numbers of microbatches, - since it does not wait for all constituent \ ``tf.Tensor``\ s to be - ready to start averaging them, thereby saving memory. - - In some cases (XLA for example) ``StepOutput.reduce_mean()`` might end - up being more memory-efficient than ``StepOutput.accumulate()``. - - **Parameters** - - - ``method`` (``"add_n"`` or ``"accumulate_n"`` or ``"variable"``): - If ``"add_n"`` or ``"accumulate_n"``, the library uses - ``tf.add_n`` and ``tf.accumulate_n``, respectively, to implement - accumulation. If ``"variable"``, the library uses an internal ``tf.Variable`` - into which to accumulate the tensors. Default is \ ``"variable"``. - Note: Memory usage behavior of these choices can depend on the model - and implementation. - - - ``var``: A ``tf.Variable`` into which, if provided, the library uses to - accumulate the tensors. If \ ``None``, the library internally creates a - variable. If ``method`` is not ``"variable"``, this argument is - ignored. - -.. _mpi_basics: - :noindex: - -MPI Basics ----------- - -The library exposes the following basic MPI primitives to its Python API: - -**Global** - -- ``smp.rank()`` : The global rank of the current process. -- ``smp.size()`` : The total number of processes. -- ``smp.get_world_process_group()`` : - ``torch.distributed.ProcessGroup`` that contains all processes. -- ``smp.CommGroup.WORLD``: The communication group corresponding to all processes. -- ``smp.local_rank()``: The rank among the processes on the current instance. -- ``smp.local_size()``: The total number of processes on the current instance. -- ``smp.get_mp_group()``: The list of ranks over which the current model replica is partitioned. -- ``smp.get_dp_group()``: The list of ranks that hold different replicas of the same model partition. - -**Tensor Parallelism** - -- ``smp.tp_rank()`` : The rank of the process within its - tensor-parallelism group. -- ``smp.tp_size()`` : The size of the tensor-parallelism group. -- ``smp.get_tp_process_group()`` : Equivalent to - ``torch.distributed.ProcessGroup`` that contains the processes in the - current tensor-parallelism group. -- ``smp.CommGroup.TP_GROUP`` : The communication group corresponding to - the current tensor parallelism group. - -**Pipeline Parallelism** - -- ``smp.pp_rank()`` : The rank of the process within its - pipeline-parallelism group. -- ``smp.pp_size()`` : The size of the pipeline-parallelism group. -- ``smp.get_pp_process_group()`` : ``torch.distributed.ProcessGroup`` - that contains the processes in the current pipeline-parallelism group. -- ``smp.CommGroup.PP_GROUP`` : The communication group corresponding to - the current pipeline parallelism group. - -**Reduced-Data Parallelism** - -- ``smp.rdp_rank()`` : The rank of the process within its - reduced-data-parallelism group. -- ``smp.rdp_size()`` : The size of the reduced-data-parallelism group. 
-- ``smp.get_rdp_process_group()`` : ``torch.distributed.ProcessGroup`` - that contains the processes in the current reduced data parallelism - group. -- ``smp.CommGroup.RDP_GROUP`` : The communication group corresponding - to the current reduced data parallelism group. - -**Model Parallelism** - -- ``smp.mp_rank()`` : The rank of the process within its model-parallelism - group. -- ``smp.mp_size()`` : The size of the model-parallelism group. -- ``smp.get_mp_process_group()`` : ``torch.distributed.ProcessGroup`` - that contains the processes in the current model-parallelism group. -- ``smp.CommGroup.MP_GROUP`` : The communication group corresponding to - the current model parallelism group. - -**Data Parallelism** - -- ``smp.dp_rank()`` : The rank of the process within its data-parallelism - group. -- ``smp.dp_size()`` : The size of the data-parallelism group. -- ``smp.get_dp_process_group()`` : ``torch.distributed.ProcessGroup`` - that contains the processes in the current data-parallelism group. -- ``smp.CommGroup.DP_GROUP`` : The communication group corresponding to - the current data-parallelism group. - -.. _communication_api: - :noindex: - -Communication API ------------------ - -The library provides a few communication primitives which can be helpful while -developing the training script. These primitives use the following -``enum`` s as arguments to specify which processes the communication -should involve. -​ - -**Helper structures** - -.. data:: smp.CommGroup - :noindex: - - An ``enum`` that takes the values - ``CommGroup.WORLD``, ``CommGroup.MP_GROUP``, and ``CommGroup.DP_GROUP``. - These values can also be accessed as ``smp.WORLD``, ``smp.MP_GROUP``, - and ``smp.DP_GROUP`` respectively. - - - ``CommGroup.WORLD``: Represents the entire group of processes used in - training - - ``CommGroup.MP_GROUP``: Represents the group of processes that hold - the same model replica as the current process. The processes in a - single ``MP_GROUP`` collectively store an entire replica of the - model. - - ``CommGroup.DP_GROUP``: Represents the group of processes that hold - the same model partition as the current process. The processes in a - single ``DP_GROUP`` perform data parallelism/allreduce among - themselves. - -.. data:: smp.RankType - :noindex: - - An ``enum`` that takes the values - ``RankType.WORLD_RANK``, ``RankType.MP_RANK``, and ``RankType.DP_RANK``. - - - ``RankType.WORLD_RANK``: The associated rank is to be interpreted as - the rank of the process across all processes used in training. - - ``RankType.MP_RANK``: The associated rank is to be interpreted as the - rank of the process within the ``MP_GROUP``. - - ``RankType.DP_RANK``: The associated rank is to be interpreted as the - rank of the process within the ``DP_GROUP``. - - -**Communication primitives:** - -.. function:: smp.broadcast(obj, group) - :noindex: - - Sends the object to all processes in the - group. The receiving process must call ``smp.recv_from`` to receive the - sent object. - - **Inputs** - - - ``obj``: An arbitrary picklable Python object that will be broadcast. - - - ``group``: A ``CommGroup`` argument that represents to which group of - processes the object will be sent. - - **Notes** - - - When you use ``broadcast`` on the sender process, there needs - to be an accompanying ``smp.recv_from()`` call on the receiver - processes. - - - This is a synchronous call; the ``broadcast`` statement - returns only after all ranks participating in the call have made a - matching ``recv_from`` call. - - **Example** - - .. 
code:: python - - if smp.rank() == 0: -     smp.broadcast(something, group=smp.CommGroup.WORLD) - else: -     smp.recv_from(0, rank_type=smp.RankType.WORLD_RANK) - -.. function:: smp.send(obj, dest_rank, rank_type) - :noindex: - - Sends the object ``obj`` to - ``dest_rank``, which is of a type specified by ``rank_type``. - - **Inputs** - - - ``obj``: An arbitrary picklable Python object that will be sent. - - - ``dest_rank`` (``int``): An integer denoting the rank of the receiving process. - - - ``rank_type`` (``enum``): A ``smp.RankType`` ``enum`` that determines how - ``dest_rank`` is to be interpreted. For example if ``dest_rank`` is 1 - and ``rank_type`` is ``MP_RANK``, then ``obj`` is sent to process - with ``mp_rank`` 1 in the ``MP_GROUP`` which contains the current - process. - - **Notes** - - - Note: \ This is a synchronous call; the ``send`` statement returns - only after the destination rank has made a matching - ``recv_from`` call. - -.. function:: smp.recv_from(src_rank, rank_type) - :noindex: - - Receive an object from a peer process. Can be used with a matching - ``smp.send`` or a ``smp.broadcast`` call. - - **Inputs** - - - ``src_rank`` (``int``): An integer denoting rank of the sending process. - - - ``rank_type`` (``enum``): A ``smp.RankType`` ``enum`` that determines how - ``dest_rank`` is to be interpreted. For example if ``src_rank`` is 1 - and ``rank_type`` is ``MP_RANK``, then the object is received from - the process with ``mp_rank`` 1 in the ``MP_GROUP`` which contains the - current process. - - **Returns** - - Returns the python object that is sent by the peer process. - - **Notes** - - - Note: This is a synchronous call; the ``recv_from`` statement returns - only after the source rank has made a matching ``send`` or - ``broadcast`` call, and the object is received. - -.. function:: smp.allgather(obj, group) - :noindex: - - A collective call that gathers all the - submitted objects across all ranks in the specified ``group``. Returns a - list whose ``i``\ th index contains the object submitted by the - ``i``\ th rank in ``group``. - - **Inputs** - - - ``obj``: An arbitrary picklable Python object that will be - allgathered. - - - ``group`` : A ``CommGroup`` argument that represents which group of - processes participate in ``allgather``. - - **Notes** - - - Note: This is a synchronous call; the ``allgather`` statement returns - only after all ranks participating in the call have made a matching - ``allgather`` call, and all the objects are received at the current - rank. - - **Examples** - - .. code:: python - - # assuming mp_size() == 2 - - if smp.mp_rank() == 0: -     out = smp.allgather(obj1, smp.CommGroup.MP_GROUP)  # returns [obj1, obj2] - else: -     out = smp.allgather(obj2, smp.CommGroup.MP_GROUP)  # returns [obj1, obj2] - -.. function:: smp.barrier(group=smp.WORLD) - :noindex: - - A statement that hangs until all - processes in the specified group reach the barrier statement, similar to - ``MPI_Barrier()``. - - **Inputs** - - - ``group``: An ``smp.CommGroup`` ``enum`` that specifies the group of - processes participating in the barrier call. Defaults to - ``smp.WORLD``. - - **Examples** - - - Assume there are 8 processes and 2 model partitions, and - therefore 4 \ ``mp_group``\ s, and 2 ``dp_group``\ s. If - the \ ``barrier`` call is passed the value ``smp.MP_GROUP`` for its - group argument, then each process only waits until the other process - of its own ``mp_group`` reaches that point. It does not wait for - processes outside that ``mp_group``. - -.. 
function:: smp.dp_barrier()
-   :noindex:
-
-   Same as passing ``smp.DP_GROUP`` to ``smp.barrier()``.
-   Waits for the processes in the same ``dp_group`` as
-   the current process to reach the same point in execution.
-
-.. function:: smp.mp_barrier()
-   :noindex:
-
-   Same as passing ``smp.MP_GROUP`` to
-   ``smp.barrier()``. Waits for the processes in the same ``mp_group`` as
-   the current process to reach the same point in execution.
diff --git a/doc/api/training/smp_versions/v1.6.0/smd_model_parallel_pytorch.rst b/doc/api/training/smp_versions/v1.6.0/smd_model_parallel_pytorch.rst
deleted file mode 100644
index e549559b6b..0000000000
--- a/doc/api/training/smp_versions/v1.6.0/smd_model_parallel_pytorch.rst
+++ /dev/null
@@ -1,678 +0,0 @@
-PyTorch API
-===========
-
-To use the PyTorch-specific APIs for SageMaker distributed model parallelism,
-you need to add the following import statement at the top of your training script.
-
-.. code:: python
-
-   import smdistributed.modelparallel.torch as smp
-
-
-.. tip::
-
-   Refer to
-   `Modify a PyTorch Training Script
-   `_
-   to learn how to use the following API in your PyTorch training script.
-
-.. class:: smp.DistributedModel
-   :noindex:
-
-   A sub-class of ``torch.nn.Module`` which specifies the model to be
-   partitioned. Accepts a ``torch.nn.Module`` object ``module`` which is
-   the model to be partitioned. The returned ``DistributedModel`` object
-   internally manages model parallelism and data parallelism. Only one
-   model in the training script can be wrapped with
-   ``smp.DistributedModel``.
-
-   **Example:**
-
-   .. code:: python
-
-      model = smp.DistributedModel(model)
-
-   **Important**: The ``__call__`` and ``backward`` method calls on the
-   ``smp.DistributedModel`` object (in the following example, the object
-   is ``model``) can only be made inside a ``smp.step``-decorated
-   function.
-
-   Since ``DistributedModel`` is a ``torch.nn.Module``, a forward pass can
-   be performed by calling the ``DistributedModel`` object on the input
-   tensors.
-
-   .. code:: python
-
-      predictions = model(inputs)   # model is a smp.DistributedModel object
-
-   For a backward pass, one needs to call the backward function on
-   the ``DistributedModel`` object, with tensors and gradients as
-   arguments, replacing the PyTorch operations ``torch.Tensor.backward``
-   or ``torch.autograd.backward``.
-
-   The API for ``model.backward`` is very similar to
-   ``torch.autograd.backward``. For example, the following
-   ``backward`` calls:
-
-   .. code:: python
-
-      torch.autograd.backward(loss) or loss.backward()
-
-   should be replaced with:
-
-   .. code:: python
-
-      model.backward(loss) # loss is a tensor with only one element as its data
-
-   Similarly, for non-scalar tensors, replace the following
-   ``backward`` call containing incoming gradient arguments:
-
-   .. code:: python
-
-      torch.autograd.backward(outputs, out_grads)
-
-   with the following line:
-
-   .. code:: python
-
-      model.backward(outputs, out_grads)
-
-   In these examples, all ``__call__`` and ``backward`` method calls on
-   the model objects (``model(inputs)`` and ``model.backward(loss)``) must be made inside
-   a ``smp.step``-decorated function.
-
-   **Using DDP**
-
-   If DDP is enabled with the SageMaker model parallel library, do not place a PyTorch
-   ``DistributedDataParallel`` wrapper around the ``DistributedModel`` because
-   the ``DistributedModel`` wrapper will also handle data parallelism.
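-
-   As a minimal sketch of the intended pattern (``MyModel`` is a hypothetical
-   ``torch.nn.Module`` used only for illustration):
-
-   .. code:: python
-
-      model = MyModel()
-      # Do NOT additionally wrap with torch.nn.parallel.DistributedDataParallel;
-      # smp.DistributedModel already handles the data-parallel allreduce when ddp=True.
-      model = smp.DistributedModel(model)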
- - Unlike the original DDP wrapper, when you use ``DistributedModel``, - model parameters and buffers are not immediately broadcast across - processes when the wrapper is called. Instead, the broadcast is deferred to the first call of the - ``smp.step``-decorated function when the partition is done. - - **Parameters** - - - ``module`` (``torch.nn.Module``): Module to be distributed (data parallelism and model parallelism). - - - ``trace_device`` (``"cpu"`` or ``"gpu"``) (default: ``"gpu"``) - Whether to perform the tracing step on the GPU or CPU. The tracing step gathers - information on the order of execution of modules, the shapes of - intermediate outputs, and execution times, to be used by the - partitioning algorithm. If ``trace_device`` is set to GPU, accurate - module execution times can be gathered during tracing for potentially - improved partitioning decision. However, if the model is too large to - fit in a single GPU, then ``trace_device`` should be set to CPU. - - - ``trace_execution_times`` (``bool``) (default: ``False``): If ``True``, - the library profiles the execution time of each module during tracing, and uses - it in the partitioning decision. This improves the partitioning - decision, but it might make the tracing slower. It may also introduce - some degree of non-determinism in partitioning results, because of the - inherent randomness in module execution times. Must be ``False`` if - ``trace_device`` is ``"cpu"``. - - - ``overlapping_allreduce`` (``bool``) (default: ``True``): This is only - applicable for hybrid data parallelism/model parallelism use cases (when - ``ddp`` is set to ``True`` while launching training). The library uses this flag - to decide whether to do overlapping allreduce whenever a parameter - gradients are ready. This leads to overlapping of communication and - computation and can improve performance. If this is set to ``False`` , - allreduce is performed at the end of the step. - - - ``backward_passes_per_step`` (``int``) (default: 1): This is only - applicable for hybrid data parallelism/model parallelism use cases (when - ``ddp`` is set to ``True`` in config). This parameter indicates the - number of backward passes to perform before calling allreduce on DDP. - This allows accumulating updates over multiple mini-batches before - reducing and applying them. - - - ``average_grads_across_microbatches`` (``bool``) (default: ``True``): - Whether or not the computed gradients should be averaged across - microbatches. If ``False``, the computed gradients will be summed across - microbatches, but not divided by the number of microbatches. In typical - use case where the computed loss is averaged over the mini-batch, this - should be left as ``True``. If you use a loss function that only sums - the per-sample loss across the batch (and not divide by the batch size), - then this must be set to ``False`` for correctness. - - - ``bucket_cap_mb`` (default: 25): \ ``DistributedDataParallel`` buckets - parameters into multiple buckets so that gradient reduction of each - bucket can potentially overlap with backward - computation. \ ``bucket_cap_mb``\ controls the bucket size in MegaBytes - (MB). - - - ``trace_memory_usage`` (default: False): When set to True, the library attempts - to measure memory usage per module during tracing. If this is disabled, - memory usage will be estimated through the sizes of tensors returned from - the module. - - - ``broadcast_buffers`` (default: True): Flag to be used with ``ddp=True``. 
- This parameter is forwarded to the underlying ``DistributedDataParallel`` wrapper. - Please see: `broadcast_buffer `__. - - - ``gradient_as_bucket_view`` (default: False): To be - used with ``ddp=True``. This parameter is forwarded to the underlying - ``DistributedDataParallel`` wrapper. Please see `gradient_as_bucket_view `__. - - **Properties** - - - ``partitioned``: Is ``True`` if the model is partitioned, ``False`` - otherwise. Initialized to ``False`` when ``DistributedModel`` is first - created. It becomes be ``True`` during the first call - to ``smp.step``-decorated function. Once the model is partitioned, the - local parameters or local ``state_dict`` can be fetched using the - following methods. - - **Methods** - - .. function:: backward(tensors, grad_tensors) - :noindex: - - Triggers a distributed backward - pass across model partitions. Example usage provided in the previous - section. The API is very similar - to https://pytorch.org/docs/stable/autograd.html#torch.autograd.backward. - ``retain_grad`` and ``create_graph``  flags are not supported. - - .. function:: local_buffers( ) - :noindex: - - Returns an iterator over buffers for the modules in - the partitioned model that have been assigned to the current process. - - .. function:: local_named_buffers( ) - :noindex: - - Returns an iterator over buffers for the - modules in the partitioned model that have been assigned to the current - process. This yields both the name of the buffer as well as the buffer - itself. - - .. function:: local_parameters( ) - :noindex: - - Returns an iterator over parameters for the - modules in the partitioned model that have been assigned to the current - process. - - .. function:: local_named_parameters( ) - :noindex: - - Returns an iterator over parameters for - the modules in the partitioned model that have been assigned to the - current process. This yields both the name of the parameter as well as - the parameter itself. - - .. function:: local_modules( ) - :noindex: - - Returns an iterator over the modules in the - partitioned model that have been assigned to the current process. - - .. function:: local_named_modules( ) - :noindex: - - Returns an iterator over the modules in the - partitioned model that have been assigned to the current process. This - yields both the name of the module as well as the module itself. - - .. function:: local_state_dict( ) - :noindex: - - Returns the ``state_dict`` that contains local - parameters that belong to the current \ ``mp_rank``. This ``state_dict`` - contains a key \ ``_smp_is_partial`` to indicate this is a - partial \ ``state_dict``, which indicates whether the - ``state_dict`` contains elements corresponding to only the current - partition, or to the entire model. - - .. function:: state_dict( ) - :noindex: - - Returns the ``state_dict`` that contains parameters - for the entire model. It first collects the \ ``local_state_dict``  and - gathers and merges the \ ``local_state_dict`` from all ``mp_rank``\ s to - create a full ``state_dict``. Please note that this needs to be called on all ranks with - ``dp_rank()==0`` to ensure the gather happens properly. - If it is only called on all such ranks, it can hang. - - .. function:: load_state_dict( ) - :noindex: - - Same as the ``torch.module.load_state_dict()`` , - except: It first gathers and merges the ``state_dict``\ s across - ``mp_rank``\ s, if they are partial. The actual loading happens after the - model partition so that each rank knows its local parameters. - - .. 
function:: register_post_partition_hook(hook) - :noindex: - - Registers a callable ``hook`` to - be executed after the model is partitioned. This is useful in situations - where an operation needs to be executed after the model partition during - the first call to ``smp.step``, but before the actual execution of the - first forward pass. Returns a ``RemovableHandle`` object ``handle``, - which can be used to remove the hook by calling ``handle.remove()``. - - .. function:: cpu( ) - :noindex: - - Allgathers parameters and buffers across all ``mp_rank``\ s and moves them - to the CPU. - - .. function:: join( ) - :noindex: - - A context manager to be used in conjunction with an instance of - ``smp.DistributedModel`` to be able to train with uneven inputs across - participating processes. This is only supported when ``ddp=True``. This will use the join with the wrapped - ``DistributedDataParallel`` instance. For more information, see: - `join `__ - in the PyTorch documentation. - - .. function:: register_comm_hook( state, callable ) - :noindex: - - **Available for PyTorch 1.8.1 only** - Registers a communication hook which is an enhancement that provides - a flexible hook ``callable`` to users where they can specify how - gradients are aggregated across multiple workers. This method will be called on the wrapped ``DistributedDataParallel`` instance. - - Please note that when you register a comm hook you have full control of how the gradients are processed. - When using only data parallelism with Torch DDP you are expected to average grads across data parallel replicas within the hook. - Similarly, when using DistributedModel you have to averaging grads across data parallel replicas within the hook. - In addition to that, you also have to average grads across microbatches within the hook unless you explicitly desire to not average based on your loss function. - See ``average_grads_across_microbatches`` for more information about averaging grads across microbatches. - - This is only supported when ``ddp=True`` and ``overlapping_allreduce=True`` (default). - For more information, see: - `register_comm_hook `__ - in the PyTorch documentation. - - **Behavior of** ``smp.DistributedModel`` **with Tensor Parallelism** - - When a model is wrapped by ``smp.DistributedModel``, the library - immediately traverses the modules of the model object, and replaces the - modules that are supported for tensor parallelism with their distributed - counterparts. This replacement happens in place. If there are no other - references to the original modules in the script, they are - garbage-collected. The module attributes that previously referred to the - original submodules now refer to the distributed versions of those - submodules. - - **Example:** - - .. code:: python - - # register DistributedSubmodule as the distributed version of Submodule - # (note this is a hypothetical example, smp.nn.DistributedSubmodule does not exist) - smp.tp_register_with_module(Submodule, smp.nn.DistributedSubmodule) - - class MyModule(nn.Module): - def __init__(self): - ... - - self.submodule = Submodule() - ... 
- - # enabling tensor parallelism for the entire model - with smp.tensor_parallelism(): - model = MyModule() - - # here model.submodule is still a Submodule object - assert isinstance(model.submodule, Submodule) - - model = smp.DistributedModel(model) - - # now model.submodule is replaced with an equivalent instance - # of smp.nn.DistributedSubmodule - assert isinstance(model.module.submodule, smp.nn.DistributedSubmodule) - - If ``pipeline_parallel_degree`` (equivalently, ``partitions``) is 1, the - placement of model partitions into GPUs and the initial broadcast of - model parameters and buffers across data-parallel ranks take place - immediately. This is because it does not need to wait for the model - partition when ``smp.DistributedModel`` wrapper is called. For other - cases with ``pipeline_parallel_degree`` greater than 1, the broadcast - and device placement will be deferred until the first call of an - ``smp.step``-decorated function happens. This is because the first - ``smp.step``-decorated function call is when the model partitioning - happens if pipeline parallelism is enabled. - - Because of the module replacement during the ``smp.DistributedModel`` - call, any ``load_state_dict`` calls on the model, as well as any direct - access to model parameters, such as during the optimizer creation, - should be done **after** the ``smp.DistributedModel`` call. - - Since the broadcast of the model parameters and buffers happens - immediately during ``smp.DistributedModel`` call when the degree of - pipeline parallelism is 1, using ``@smp.step`` decorators is not - required when tensor parallelism is used by itself (without pipeline - parallelism). - - For more information about the library's tensor parallelism APIs for PyTorch, - see :ref:`smdmp-pytorch-tensor-parallel`. - - **Additional Methods of** ``smp.DistributedModel`` **for Tensor Parallelism** - - The following are the new methods of ``smp.DistributedModel``, in - addition to the ones listed in the - `documentation `__. - - .. function:: distributed_modules() - :noindex: - - - An iterator that runs over the set of distributed - (tensor-parallelized) modules in the model - - .. function:: is_distributed_parameter(param) - :noindex: - - - Returns ``True`` if the given ``nn.Parameter`` is distributed over - tensor-parallel ranks. - - .. function:: is_distributed_buffer(buf) - :noindex: - - - Returns ``True`` if the given buffer is distributed over - tensor-parallel ranks. - - .. function:: is_scaled_batch_parameter(param) - :noindex: - - - Returns ``True`` if the given ``nn.Parameter`` is operates on the - scaled batch (batch over the entire ``TP_GROUP``, and not only the - local batch). - - .. function:: is_scaled_batch_buffer(buf) - :noindex: - - - Returns ``True`` if the parameter corresponding to the given - buffer operates on the scaled batch (batch over the entire - ``TP_GROUP``, and not only the local batch). - - .. function:: default_reducer_named_parameters() - :noindex: - - - Returns an iterator that runs over ``(name, param)`` tuples, for - ``param`` that is allreduced over the ``DP_GROUP``. - - .. function:: scaled_batch_reducer_named_parameters() - :noindex: - - - Returns an iterator that runs over ``(name, param)`` tuples, for - ``param`` that is allreduced over the ``RDP_GROUP``. - - - -.. class:: smp.DistributedOptimizer - :noindex: - - **Parameters** - - ``optimizer`` - - An optimizer wrapper for saving/loading optimizer states. This wrapper - returns ``optimizer`` with the following methods overridden: - - .. 
function:: state_dict( ) - :noindex: - - Returns the ``state_dict`` that contains optimizer state for the entire model. - It first collects the ``local_state_dict`` and gathers and merges - the ``local_state_dict`` from all ``mp_rank``s to create a full - ``state_dict``. - - .. function:: load_state_dict( ) - :noindex: - - Same as the ``torch.optimizer.load_state_dict()`` , except: - - - It first gathers and merges the local ``state_dict``\ s if they are - partial. - - The actual loading happens after the model partition so that each - rank knows its local parameters. - - .. function:: local_state_dict( ) - :noindex: - - Returns the ``state_dict`` that contains the - local optimizer state that belongs to the current \ ``mp_rank``. This - ``state_dict`` contains a key \ ``_smp_is_partial`` to indicate this is - a partial \ ``state_dict``, which indicates whether the - ``state_dict`` contains elements corresponding to only the current - partition, or to the entire model. - - ​ -.. function:: smp.partition(index) - :noindex: - - **Inputs** - - - ``index`` (int) - The index of the partition. - - A context manager which places all modules defined inside into the - partition with ID ``index``.  The ``index`` argument must be less than - the number of partitions. - - Use ``smp.partition`` to implement manual partitioning. - If ``"auto_partition"`` is ``True``, then the - ``smp.partition`` contexts are ignored. Any module that is not placed in - any ``smp.partition`` context is placed in the - ``default_partition`` defined through the SageMaker Python SDK. - - When ``smp.partition`` contexts are nested, the innermost context - overrides the rest (see the following example). In PyTorch, manual - partitioning should be done inside the module \ ``__init__``, and the - partition assignment applies to the modules that are *created* inside - the ``smp.partition`` context. - - Example: - - .. code:: python - - class Model(torch.nn.Module): -     def __init__(self): -         with smp.partition(1): -             self.child0 = Child0()            # child0 on partition 1 -             with smp.partition(2): -                 self.child1 = Child1()        # child1 on partition 2 -             self.child2 = Child2()            # child2 on partition 1 -         self.child3 = Child3()                # child3 on default_partition - -.. function:: smp.get_world_process_group( ) - :noindex: - - Returns a ``torch.distributed`` ``ProcessGroup`` that consists of all - processes, which can be used with the ``torch.distributed`` API. - Requires ``"ddp": True`` in SageMaker Python SDK parameters. - -.. function:: smp.get_mp_process_group( ) - :noindex: - - Returns a ``torch.distributed`` ``ProcessGroup`` that consists of the - processes in the ``MP_GROUP`` which contains the current process, which - can be used with the \ ``torch.distributed`` API. Requires - ``"ddp": True`` in SageMaker Python SDK parameters. - -.. function:: smp.get_dp_process_group( ) - :noindex: - - Returns a ``torch.distributed`` ``ProcessGroup`` that consists of the - processes in the ``DP_GROUP`` which contains the current process, which - can be used with the \ ``torch.distributed`` API. Requires - ``"ddp": True`` in SageMaker Python SDK parameters. - -.. function:: smp.is_initialized( ) - :noindex: - - Returns ``True`` if ``smp.init`` has already been called for the - process, and ``False`` otherwise. - -.. 
function:: smp.is_tracing( ) - :noindex: - - Returns ``True`` if the current process is running the tracing step, and - ``False`` otherwise. - -.. data:: smp.nn.FusedLayerNorm - :noindex: - - `Apex Fused Layer Norm `__ is currently not - supported by the library. ``smp.nn.FusedLayerNorm`` replaces ``apex`` - ``FusedLayerNorm`` and provides the same functionality. This requires - ``apex`` to be installed on the system. - -.. data:: smp.optimizers.FusedNovoGrad - :noindex: - - - `Fused Novo Grad optimizer `__ is - currently not supported by the library. ``smp.optimizers.FusedNovoGrad`` replaces ``apex`` ``FusedNovoGrad`` - optimizer and provides the same functionality. This requires ``apex`` to - be installed on the system. - -.. data:: smp.optimizers.FusedLamb - :noindex: - - - `FusedLamb optimizer `__ - currently doesn’t work with the library. ``smp.optimizers.FusedLamb`` replaces - ``apex`` ``FusedLamb`` optimizer and provides the same functionality. - This requires ``apex`` to be installed on the system. - -.. data:: smp.amp.GradScaler - :noindex: - - `Torch AMP Gradscaler `__ - currently doesn’t work with the library. ``smp.amp.GradScaler`` replaces - ``torch.amp.GradScaler`` and provides the same functionality. - -.. _pytorch_saving_loading: - :noindex: - -APIs for Saving and Loading -^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -.. function:: smp.save( ) - :noindex: - - Saves an object. This operation is similar to ``torch.save()``, except - it has an additional keyword argument, ``partial``, and accepts only - string type for the argument ``f`` (file). If ``partial=True``, each - ``mp_rank`` saves a separate checkpoint file and the library adds an ``mp_rank`` - index to your saved file. - - **Parameters** - - - ``obj`` (dict): The object to save. - - ``f`` (str): A string containing a file name. - - ``partial`` (bool, default= ``True``):  When set to ``True``, each - ``mp_rank`` saves a separate checkpoint file and the library adds an - ``mp_rank`` index to the saved file. If you want to be able to load - and further train a model that you save with ``smp.save()``, you must - set ``partial=True``. - - ``pickle_module`` (picklemodule, default = module ``"pickle"`` from ``"/opt/conda/lib/python3.6/pickle.py"``): - A module used for pickling metadata and objects. - - ``pickle_protocol``  (int, default=2): Can be specified to - override the default protocol. - -.. function:: smp.load( ) - :noindex: - - Loads an object saved with ``smp.save()`` from a file. - - Similar to `torch.load() `__, - except it has an additional keyword argument, ``partial``, and accepts - only string type for the argument ``f`` (file). If \ ``partial=True``, - then each ``mp_rank`` loads a separate checkpoint file. - - **Parameters** - - - ``f`` (string): A string containing a file name. - - ``map_location`` (function): A function, - `torch.device `__, - a string, or a dict specifying how to remap storage locations. - - ``pickle_module`` (pickle module): A module used for unpickling - metadata and objects (has to match the \ ``pickle_module``\ used to - serialize the file). - - ``pickle_load_args`` (Python 3 only): Optional keyword arguments - passed to ``pickle_module.load()`` and ``pickle_module.Unpickler()``. - - ``partial`` (bool, default= ``True``): When set to ``True``, each - ``mp_rank`` loads the checkpoint corresponding to the ``mp_rank``. - Should be used when loading a model trained with the library. - -.. 
_pytorch_saving_loading_instructions: - :noindex: - -General Instruction For Saving and Loading -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -The library can save partial or full checkpoints. - -- For partial checkpoints, each ``mp_rank`` saves its own checkpoint - file with only the parameters that belong to that rank. -- For full checkpoints, the library saves a single checkpoint that contains - entire model parameters. - -When **saving** using ``smp.save()``, each rank only holds its own -parameters. If you want to save the full model, there will be some -communication between the ranks to create the full model. If you save -checkpoints often, you should save partial checkpoints for best -performance. - -When **loading** using ``smp.load()``, the library can load either partial or | -full checkpoints or full checkpoints saved by a non-model-parallel model. If you -want to resume training with a non-model-parallel model or do inference, you need -a full checkpoint. - -The following is an example of how you can save and load a checkpoint: - -.. code:: python - - # Original model and optimizer - model = MyModel(...) - optimizer = MyOpt(...) - - # model parallel wrapper - model = smp.DistributedModel(model) - optimizer = smp.DistributedOptimizer(optimizer) - - # To save, always save on dp_rank 0 to avoid data racing - if partial: -     # To save the partial model on each mp rank -     # the library will create `checkpoint.pt_{mprank}` for each mp rank -     if save_partial_model: -         if smp.dp_rank() == 0: -             model_dict = model.local_state_dict() # save the partial model -             opt_dict = optimizer.local_state_dict() # save the partial optimizer state -             smp.save( -                 {"model_state_dict": model_dict, "optimizer_state_dict": opt_dict}, -                 f"/checkpoint.pt", -                 partial=True, -             ) - -     # To save the full model -     if save_full_model: -         if smp.dp_rank() == 0: -             model_dict = model.state_dict() # save the full model -             opt_dict = optimizer.state_dict() # save the full optimizer state -             smp.save( -                 {"model_state_dict": model_dict, "optimizer_state_dict": opt_dict}, -                 "/checkpoint.pt", -                 partial=False, -             ) - - # To load, load on all ranks. - # The only difference for partial/full loading is the partial flag in smp.load - # Load partial checkpoint - if partial_checkpoint: -    checkpoint = smp.load("/checkpoint.pt", partial=True) -    model.load_state_dict(checkpoint["model_state_dict"]) -    optimizer.load_state_dict(checkpoint["optimizer_state_dict"]) - # Load full checkpoint - if full_checkpoint: -    checkpoint = smp.load("/checkpoint.pt", partial=False) -    model.load_state_dict(checkpoint["model_state_dict"]) -    optimizer.load_state_dict(checkpoint["optimizer_state_dict"]) diff --git a/doc/api/training/smp_versions/v1.6.0/smd_model_parallel_pytorch_tensor_parallel.rst b/doc/api/training/smp_versions/v1.6.0/smd_model_parallel_pytorch_tensor_parallel.rst deleted file mode 100644 index d481d32c15..0000000000 --- a/doc/api/training/smp_versions/v1.6.0/smd_model_parallel_pytorch_tensor_parallel.rst +++ /dev/null @@ -1,855 +0,0 @@ -.. _smdmp-pytorch-tensor-parallel: - :noindex: - -PyTorch API for Tensor Parallelism -================================== - -SageMaker distributed tensor parallelism works by replacing specific submodules -in the model with their distributed implementations. 
The distributed modules -have their parameters and optimizer states partitioned across tensor-parallel -ranks. This is done so that the distributed modules compute the same output as -the original modules would have computed. Since tensor parallelism occurs across data-parallel -ranks, a rank might collect slices of the activations corresponding to the -data shards on other devices that are part of the same tensor parallelism group. - -You can enable or disable tensor parallelism for specific parts of the model. -Within the enabled parts, the replacements with distributed modules will take -place on a best-effort basis for those modules supported for tensor parallelism. -Alternatively, you can directly import and use the library’s distributed -modules in the model definition. - -Some of the supported modules (such as ``smp.nn.Transformer``) are high-level -blocks that contain many operations. Because custom implementations -(as opposed to the built-in PyTorch modules) are typically used for these -high-level blocks, the library offers an API that you can use to register -specific distributed versions with such custom modules (provided that they -are functionally equivalent). This allows the library to automatically replace -the occurrences of such PyTorch modules with their distributed counterparts -provided by the library. -For more information, see the following topics. - -.. contents:: Topics - :depth: 3 - :local: - -.. _registering-tp-modules: - :noindex: - -Registering Tensor Parallelism Distributed Modules --------------------------------------------------- - -Although PyTorch natively provides some of the commonly used (and -tensor-parallelizable) building blocks such as Transformer, users often -use custom implementations for such higher-level modules. To distribute -such modules with tensor parallelism, you need to register the -distributed modules with the custom module implementation in your class, -so that the library knows how to distribute the custom module. When you -register the distributed modules, make sure the custom module that you -use is functionally equivalent to the distributed module. You can verify -this by taking a look at the equivalent reference implementations in the -:ref:`smdmp-tp-appendix`. -These implementations are functionally equivalent to their distributed -versions in the ``smp.nn`` module. - -.. decorator:: @smp.tp_register(dist_module, init_hook=None, forward_hook=None, return_hook=None) - - - A class decorator that registers the ``dist_module`` class with - the module class that it is attached to. The hooks can be used to - adapt to different interfaces used with ``__init__`` and - ``forward`` methods. - - **Arguments:** - - - ``dist_module``: A subclass of ``smp.nn.DistributedModule`` - that implements the distributed version of the module class the - decorator is attached to. Any distributed module class defined - in the ``smp.nn`` module can be used. - - ``init_hook``: A callable that translates the arguments of the - original module ``__init__`` method to an ``(args, kwargs)`` - tuple compatible with the arguments of the corresponding - distributed module ``__init__`` method. Must return a tuple, - whose first element is an iterable representing the positional - arguments, and whose second element is a ``dict`` representing the - keyword arguments. The input signature of the ``init_hook`` - must **exactly** match the signature of the original - ``__init__`` method (including argument order and default - values), except it must exclude ``self``. 
- - ``forward_hook``: A callable that translates the arguments of - the original module ``forward`` method to an ``(args, kwargs)`` - tuple compatible with the arguments of the corresponding - distributed module ``forward`` method. Must return a tuple, - whose first element is an iterable representing the positional - arguments, and second element is a ``dict`` representing the - keyword arguments. The input signature of the ``init_hook`` - must **exactly** match the signature of the original - ``forward`` method (including argument order and default - values), except it must exclude ``self``. - - ``return_hook``: A callable that translates the object returned - from the distributed module to the return object expected of - the original module. - - - **Example:** - - .. code:: python - - init_hook = lambda config: ((), config.to_dict()) - - # register smp.nn.DistributedTransformer - # as the distributed version of MyTransformer - @smp.tp_register(smp.nn.DistributedTransformer, init_hook=init_hook) - class MyTransformer(nn.Module): - def __init__(self, config): - ... - - def forward(self, hidden_states, attention_mask): - ... - -.. function:: smp.tp_register_with_module(module_cls, dist_module, init_hook=None, forward_hook=None, return_hook=None) - :noindex: - - - When you do not have direct access to model definition code, you - can use this API to similarly register a distributed module with - an existing module class. - - - **Arguments:** - - - ``module_cls``: The existing module class that will be - distributed. - - ``dist_module``: A subclass of ``smp.nn.DistributedModule`` - that implements the distributed version of the module class the - decorator is attached to. Any distributed module class defined - in ``smp.nn`` module can be used. - - ``init_hook``: A callable that translates the arguments of the - original module ``__init__`` method to an ``(args, kwargs)`` - tuple compatible with the arguments of the corresponding - distributed module ``__init__`` method. Must return a tuple, - whose first element is an iterable representing the positional - arguments, and second element is a ``dict`` representing the - keyword arguments. The input signature of the ``init_hook`` - must **exactly** match the signature of the original - ``__init__`` method (including argument order and default - values), except it must exclude ``self``. - - ``forward_hook``: A callable that translates the arguments of - the original module ``forward`` method to an ``(args, kwargs)`` - tuple compatible with the arguments of the corresponding - distributed module ``forward`` method. Must return a tuple, - whose first element is an iterable representing the positional - arguments, and second element is a ``dict`` representing the - keyword arguments. The input signature of the ``init_hook`` - must **exactly** match the signature of the original - ``forward`` method (including argument order and default - values), except it must exclude ``self``. - - ``return_hook``: A callable that translates the object returned - from the distributed module to the return object expected of - the original module. - - - **Example:** - - .. code:: python - - from somelibrary import MyTransformer - - init_hook = lambda config: ((), config.to_dict()) - - # register smp.nn.DistributedTransformer as the distributed version of MyTransformer - smp.tp_register_with_module(MyTransformer, - smp.nn.DistributedTransformer, - init_hook=init_hook) - -.. 
_smdmp-supported-modules-for-tp: - :noindex: - -Supported Modules for Tensor Parallelism ----------------------------------------- - -The following modules are supported for tensor -parallelism. - -- ``smp.nn.DistributedLinear`` (implements ``nn.Linear``) -- ``smp.nn.DistributedTransformerLMHead`` -- ``smp.nn.DistributedTransformer`` -- ``smp.nn.DistributedTransformerLayer`` -- ``smp.nn.DistributedAttentionLayer`` -- ``smp.nn.DistributedTransformerOutputLayer`` -- ``smp.nn.DistributedEmbedding`` - -.. contents:: Topics - :depth: 3 - :local: - -.. _tp-module-api: - :noindex: - -Tensor Parallelism Module APIs -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. class:: smp.nn.DistributedLinear(in_features, out_features) - :noindex: - - - Tensor-parallel implementation of the ``nn.Linear`` class. - Functionally equivalent to an ``nn.Linear`` module with the same - ``in_features`` and ``out_features``. In other words, - ``in_features`` and ``out_features`` are the number of *global* - channels across tensor-parallel ranks. - - **Arguments:** - - - ``in_features``: The total number of input channels for the - linear layer across all tensor-parallel ranks. - - ``out_features``: The total number of output channels for the - linear layer across all tensor-parallel ranks. - -.. class:: smp.nn.DistributedTransformerLMHead(num_layers=12, num_attention_heads=32, attention_head_size=32, hidden_size=1024, intermediate_size=4096, vocab_size=30522, num_positions=1024, attention_dropout_prob=0.1, hidden_dropout_prob=0.1, activation="gelu", layernorm_epsilon=1e-5, num_token_types=0, causal_mask_size=None, add_cross_attention=False, add_lm_head=True, initializer_range=0.02, use_normal_initialization=False, pre_layernorm=False, post_layernorm=True) - :noindex: - - - Constructs a distributed transformer model, including embeddings - and a single LM head. A word embedding of size - ``(vocab_size, hidden_size)`` is created, as well as a positional - embedding of size ``(num_positions, hidden_size)``, and the - embeddings are added together. If ``num_token_types`` is larger - than 0, a separate embedding of size - ``(num_token_types, hidden_size)`` is created, and further added - on top. - - The embeddings are fed through a ``DistributedTransformer``, and - if ``add_lm_head`` is ``True``, the output passes through a single - LM head, which is a linear module without bias whose weight is - tied to the word embeddings. - - See ``DistributedTransformerLayer`` for a description of the rest - of the arguments. - - **Methods:** - - - ``forward(self, inputs)`` - - - If ``add_cross_attention`` is ``True``, ``inputs`` must be a - tuple - ``(input_ids, attention_mask, token_type_ids, position_ids, cross_states, cross_states, cross_mask, labels)``. - - Otherwise, ``inputs`` must be a tuple - ``(input_ids, attention_mask, token_type_ids, position_ids, labels)``. - - If ``token_type_ids`` is ``None``, token type embedding will - not be used. - - ``input_ids`` is assumed to be of shape ``[N, S]``, where - ``N`` is the batch size and ``S`` is sequence length. - - ``attention_mask`` is assumed to be a 0-1 tensor of shape - ``[N, S]``, where 1 represents a masked position. - -.. 
class:: smp.nn.DistributedTransformer(num_layers=12, num_attention_heads=32, attention_head_size=32, hidden_size=1024, intermediate_size=4096, attention_dropout_prob=0.1, hidden_dropout_prob=0.1, activation="gelu", layernorm_epsilon=1e-5, initializer_range=0.02, use_normal_initialization=False, causal_mask_size=None, add_cross_attention=False, pre_layernorm=False, post_layernorm=True) - :noindex: - - - A sequence of ``smp.nn.DistributedTransformerLayer``\ s, whose - number is given by ``num_layers`` argument. For the other - arguments and methods, refer to - ``smp.nn.DistributedTransformerLayer``. - - If both ``pre_layernorm`` and ``post_layernorm`` are ``True``, - layer normalization is applied to both the input and the output of - the ``DistributedTransformer``, in addition to the intermediate - attention and transformer-output layers. - -.. class:: smp.nn.DistributedTransformerLayer(num_attention_heads=32, attention_head_size=32, hidden_size=1024, intermediate_size=4096, attention_dropout_prob=0.1, hidden_dropout_prob=0.1, activation="gelu", layernorm_epsilon=1e-5, initializer_range=0.02, use_normal_initialization=False, causal_mask_size=None, add_cross_attention=False, pre_layernorm=False, post_layernorm=True) - :noindex: - - - Tensor-parallel implementation of a single transformer layer. - Number of attention heads, hidden size, and intermediate size - refer to the global quantities across all tensor-parallel ranks. - - **Arguments:** - - - ``num_attention_heads``: The total number of attention heads - across tensor-parallel ranks - - ``attention_head_size``: The number of channels of a single - attention head. - - ``hidden_size``: The hidden dimension of the transformer. The - input tensor ``hidden_states`` is assumed to have its last - dimension size equal to ``hidden_size``. - - ``intermediate_size``: The number of output channels in the - first linear transformation of the transformer output layer. - ``DistributedTransformerOutputLayer`` first maps - ``hidden_size`` dimensions of its input tensor into - ``intermediate_size`` dimensions, and then maps it back into - ``hidden_size`` dimensions. - - ``attention_dropout_prob``: The dropout probability applied to - the attention probabilities. - - ``hidden_dropout_prob``: The dropout probability used in - dropout layers other than the one applied to the attention - probabilities. - - ``activation``: Choice of activation function to use at the - output layer. Must be ``"gelu"`` or ``"relu"``. - - ``layernorm_epsilon``: The epsilon added to the denominator of - layer normalization for numerical stability. - - ``initializer_range``: If ``use_normal_initialization`` is - ``True``, the standard deviation of the normal random variable - to initialize the weights with. - - ``use_normal_initialization``: If ``True``, the weights are - initialized with normal distribution with standard deviation - given by ``initializer_range``. Otherwise, default PyTorch - initialization is used. - - ``causal_mask_size``: If ``None``, no causal mask is used on - attentions. Otherwise, should be set to maximum sequence length - to apply a causal mask to the attention scores. This is used, - for instance, in GPT-2. - - ``add_cross_attention``: If ``True``, a cross-attention layer - will be added after the self-attention block. The - cross-attention layer computes the attention keys and values - based on the ``cross_states`` input (instead of - ``hidden_states`` input, as in self-attention. This is used in - the decoder block of encoder-decoder architectures. 
For - encoder-only architectures that only use self-attention, this - should be kept ``False``. - - ``pre_layernorm``: If ``True``, inserts layer normalization at - the input. At least one of ``pre_layernorm`` and - ``post_layernorm`` must be ``True``. - - ``post_layernorm``: If ``True``, inserts layer normalization at - the output. At least one of ``pre_layernorm`` and - ``post_layernorm`` must be ``True``. - - - **Methods:** - - - ``forward(self, inputs)``: Forward pass for the transformer - layer. - - - **Arguments:** - - - If ``add_cross_attention=False``, ``inputs`` must be a - tuple ``(hidden_states, attention_mask)``, where - ``hidden_states`` is assumed to be a tensor of dimensions - ``[N, S, H]``, where ``N`` is batch size, ``S`` is - sequence length, and ``H`` is ``hidden_size``. - ``attention_mask`` is assumed to be a tensor of - dimensions ``[N, 1, 1, S]``, where ``N`` is the batch - size, and ``S`` is the sequence length. - - If ``add_cross_attention=True``, ``inputs`` must be a - tuple - ``(hidden_states, cross_states, attention_mask, cross_mask)``, - where ``hidden_states`` is assumed to be a tensor of - dimensions ``[N, S_1, H]``, where ``N`` is batch size, - ``S_1`` is sequence length, and ``H`` is ``hidden_size``. - ``cross_states`` is assumed to be a tensor of size - ``[N, S_2, H]``, similarly interpreted. - ``attention_mask`` is assumed to be a tensor of - dimensions ``[N, 1, 1, S_1]``, where ``N`` is the batch - size, and ``S_1`` is the sequence length, and - ``cross_mask`` is assumed to be a tensor of size - ``[N, 1, 1, S_2]``. Keys and values for the attention - heads in the cross-attention layer (but not the - self-attention layer) are computed using - ``cross_states``, and ``cross_mask`` is applied as the - attention mask in the cross-attention layer (but not the - self-attention layer). - - - **Returns:** - - - If ``add_cross_attention=False``, a tuple - ``(hidden_states, attention_mask)``, where - ``hidden_states`` is the output of the transformer, and - ``attention_mask`` is the same the ``attention_mask`` - argument. - - If ``add_cross_attention=True``, a tuple - ``(hidden_states, cross_states, attention_mask, cross_mask)``, - where ``hidden_states`` is the output of the transformer, - and the next three tensors are the same as the input - arguments. - -.. class:: smp.nn.DistributedAttentionLayer(num_attention_heads=32, attention_head_size=32, hidden_size=1024, attention_dropout_prob=0.1, hidden_dropout_prob=0.1, layernorm_epsilon=1e-5, initializer_range=0.02, use_normal_initialization=False, cross_attention=False, causal_mask_size=None, pre_layernorm=False, post_layernorm=True) - :noindex: - - - A distributed implementation for the attention block. Includes the - computation of the self- or cross-attention (context layer), - followed by a linear mapping and dropout, which is optionally - followed by the residual-connection and layer normalization. - - **Arguments:** - - - See ``DistributedTransformerLayer`` for a description of the - arguments. - - If ``cross_attention`` is ``True``, computes the attentions - with respect to the ``cross_states`` tensor of the ``forward`` - method input tuple. - - - **Methods:** - - - ``forward(self, inputs)``: Forward pass for the attention - layer. 
- - - **Arguments:** - - - If ``cross_attention=False``, ``inputs`` must be a tuple - ``(hidden_states, attention_mask)``, where - ``hidden_states`` is assumed to be a tensor of dimensions - ``[N, S, H]``, where ``N`` is batch size, ``S`` is - sequence length, and ``H`` is ``hidden_size``. - ``attention_mask`` is assumed to be a tensor of - dimensions ``[N, 1, 1, S]``, \***\* where ``N`` is the - batch size, and ``S`` is the sequence length. - - If ``cross_attention=True``, ``inputs`` must be a tuple - ``(hidden_states, cross_states, attention_mask)``, where - ``hidden_states`` is assumed to be a tensor of dimensions - ``[N, S_1, H]``, where ``N`` is batch size, ``S_1`` is - sequence length, and ``H`` is ``hidden_size``. - ``cross_states`` is assumed to be a tensor of size - ``[N, S_2, H]``, similarly interpreted. - ``attention_mask`` is assumed to be a tensor of - dimensions ``[N, 1, 1, S_2]``, where ``N`` is the batch - size, and ``S_2`` is the sequence length. Keys and values - for the attention heads are computed using - ``cross_states``. - - - **Returns:** - - - A single tensor that is the output of the attention - layer. - -.. class:: smp.nn.DistributedTransformerOutputLayer(hidden_size=1024, intermediate_size=4096, hidden_dropout_prob=0.1, activation="gelu", layernorm_epsilon=1e-5, initializer_range=0.02, use_normal_initialization=False, pre_layernorm=False, post_layernorm=True) - :noindex: - - - Distributed implementation of a single transformer output layer. A - single ``DistributedTransformerLayer`` with - ``add_cross_attention=False`` consists of a single - ``DistributedAttentionLayer`` immediately followed by a single - ``DistributedTransformerOutputLayer``. The latter linearly maps - the last channel of the input tensor from ``hidden_size`` to - ``intermediate_size``, and then maps it back to ``hidden_size``. - - **Arguments:** - - - See ``DistributedTransformerLayer`` for a description of the - arguments. - -.. class:: smp.nn.DistributedEmbedding(num_embeddings,embedding_dim, padding_idx=None, max_norm=None, norm_type=2.0, scale_grad_by_freq=False, sparse=False, _weight=None, initializer_range=0.02, _skip_allgather=False,_skip_scatter_and_merge=False,) - :noindex: - - - Distributed implementation of a single Embedding Layer. Currently - only supports splitting across the embedding_dim. - - **Arguments:** - - - See ``DistributedEmbedding`` for a description of the - arguments. - -.. _enabling-tp: - :noindex: - -Enabling Tensor Parallelism -~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -There are two ways tensor parallelism can be enabled. - -First, you can use -the distributed module implementations in ``smp.nn`` module directly in -your model definition. See :ref:`smdmp-supported-modules-for-tp` -for a complete list of built-in distributed modules. Here is an example -of how this can be done: - -.. 
code:: python - - import torch.nn as nn - import smdistributed.modelparallel.torch as smp - - class TransformerModel: - def __init__(self): - self.embedding = nn.Embedding(vocab_size, hidden_size) - - # directly instantiate smp.nn.DistributedTransformer and use it - self.encoder = smp.nn.DistributedTransformer(num_layers, hidden_size, **kwargs) - - self.pooler = nn.Linear(hidden_size, hidden_size) - - def forward(self, hidden_states): - emb_out = self.embedding(hidden_states) - enc_out = self.encoder(emb_out) - return self.pooler(enc_out) - -Second, you can enable tensor parallelism for specific modules or blocks -of code, which will automatically enable tensor parallelism for the -supported modules within that scope. To do this, you can use the -following API: - -.. decorator:: smp.tensor_parallelism(enabled=True, **kwargs) - :noindex: - - - A context manager that enables or disables tensor parallelism for - any supported module that is created inside. If there are nested - contexts, the innermost will override the rest. If there are - multiple supported modules created within the context, where one - is the submodule of the other, only the outermost module will be - distributed. If a supported module shares weights with another - (supported or unsupported) module, or if its hyperparameters do - not support distribution (e.g., not divisible by the tensor - parallelism degree), tensor parallelism will **not** be enabled - for this module even if this API is used. - - **Example:** - - .. code:: python - - with smp.tensor_parallelism(): - self.m0 = nn.Linear(20, 20) # will be distributed - with smp.tensor_parallelism(enabled=False): - self.m1 = nn.Linear(20, 20) # will not be distributed - - - Keyword arguments `kwargs` can be used to modify the configurations of the distributed modules created inside the context. If a keyword argument provided here matches any `__init__` method arguments of a `DistributedModule` that substitutes a module created inside the `smp.tensor_parallelism` context, this keyword will override the value defined in the `init_hook`. - -.. function:: smp.set_tensor_parallelism(module, enabled=True, **kwargs) - :noindex: - - - Enables or disables tensor parallelism for the supported - submodules of ``module``. If enabling, the outermost supported - modules will be distributed. If disabling, tensor parallelism will - be disabled for the entire module subtree of ``module``. Unlike - the context manager, this API can be used after the model creation - (but before wrapping with :class:`smp.DistributedModel`), so direct - access to model definition code is not required. If a supported - module shares weights with another (supported or unsupported) - module, or if its hyperparameters do not support distribution - (e.g., not divisible by the tensor parallelism degree), tensor - parallelism will **not** be enabled for this module. - - Keyword arguments ``kwargs`` can be used to modify the - configurations of the distributed modules created inside the - context. If a keyword argument provided here matches any - ``__init__`` method arguments of a :class:`smp.DistributedModel` that - substitutes a module created inside the ``smp.tensor_parallelism`` - context, this keyword will override the value defined in the - ``init_hook``. - - **Example:** - - .. 
code:: python - - model = MyModel() - smp.set_tensor_parallelism(model.encoder, True) - smp.set_tensor_parallelism(model.encoder.embedding, True) - - # outermost supported submodules in model.encoder will be distributed, except for - # model.encoder.embedding - model = smp.DistributedModel(model) - optimizer = smp.DistributedOptimizer(optimizer) - -.. _activation-checkpointing-api: - :noindex: - -Activation Checkpointing APIs ------------------------------ - -``smdistributed.modelparallel`` provides three APIs to enable -activation checkpointing: one for checkpointing modules, -one for checkpointing sequential modules, and -one for checkpointing pretrained models. - -For a conceptual guide and examples, see -`Activation Checkpointing `_ -in the *SageMaker's Distributed Model Parallel developer guide*. - -.. class:: smdistributed.modelparallel.torch.patches.checkpoint.checkpoint(module, *args, preserve_rng_state=True) - :noindex: - - - Checkpoints the module passed. Throws error if, during manual - partitioning, all children of module are not on same rank as the - module itself, i.e. the module tree is split across multiple - partitions. During auto-partitioning, if the module is split - across multiple partitions, then this call is ignored(with a - warning). Note that this call applies to the module instance only, - not to the module class. - - - **Arguments:** - - - ``module (Instance of nn.Module)``: The module to be - checkpointed. Note that unlike native checkpointing in - PyTorch’s, activation checkpointing in - ``smdistributed.modelparallel`` is at the granularity of a - module. A generic function cannot be passed here. - - ``args``: Tuple containing inputs to the module. - - ``preserve_rng_state (bool, default=True)``: Omit stashing and - restoring the RNG state during each checkpoint. - -.. class:: smdistributed.modelparallel.torch.patches.checkpoint.checkpoint_sequential(sequential_module, input, strategy="each", preserve_rng_state=True, pack_args_as_tuple=False) - :noindex: - - - Checkpoints the modules inside - `nn.Sequential `__. - This can be used even if different layers that are part of the - sequential container lie on different partitions. Each layer part - of the sequential module that is checkpointed must lie completely - within one partition. If this is not the case during manual - partitioning, then an error will be thrown. If this is not the - case during auto partitioning, a warning will be raised and this - module will be run without checkpointing. - - - **Arguments** - - - ``sequential_module (nn.Sequential)``: the sequential module to - be checkpointed. - - ``input (torch.Tensor or a tuple of torch.Tensors)``: input to - the module, which can be a tensor or a tuple of tensors. If a - tuple is passed, then pack_args_as_tuple should be set to True. - - ``strategy (string, default=“each”)`` : Strategy determines how - many layers part of the sequential module need to be grouped - together for one checkpointing call. This determines how much - memory can be reduced. It can take the following values - - - ``each`` : The default is to checkpoint each module inside - the sequential separately. - - ``contiguous``: Groups consecutive layers on the same - partition together. For example, if a sequential consists of - [a, b, c, d] where a,b are on pp_rank0 and c,d are on - pp_rank 1, then this strategy would checkpoint a,b together - and then c,d together. 
This means effectively, inputs of a, - outputs of b, inputs of c, and outputs of d are in memory; - the remaining activations are recomputed. - - ``group_2, group_3, group_4, etc:`` More generally, - ``group_x`` where x is an integer. This strategy provides - more flexibility in how many layers to group together. - ``group_x`` groups x layers together on a best effort basis. - It can group x layers together if there are x layers - consecutively on the same partition. For example: - [a,b,c,d,e] where a,b are on pp_rank0 and c,d,e are on - pp_rank 1. If the strategy is ``group_3``, then a,b are - checkpointed together on pp_rank0 and c,d,e are checkpointed - together on pp_rank1. - - - ``preserve_rng_state (bool, default=True)``: Set to ``False`` - to omit stashing and restoring the RNG state during each - checkpoint. - - ``pack_args_as_tuple (bool, default=False)``: To ensure that - backward works correctly, the autograd function has to unpack - any tuples received. If the checkpointed layer takes a tuple as - input, then this needs to be set to ``True``. - -.. class:: smp.set_activation_checkpointing(module, preserve_rng_state=True, pack_args_as_tuple=False, strategy="each") - :noindex: - - - This API is recommended when importing pretrained models from - libraries, such as PyTorch and Hugging Face Transformers. This is - particularly useful when you don’t have access to the model - definition code and are not able to replace a module call with - checkpoint. - - - **Arguments**: - - - ``module (Instance of nn.Module or nn.Sequential)``: The module - to checkpoint. - - ``preserve_rng_state (bool, default=True)``: Set to ``False`` - to omit stashing and restoring the RNG state during each - checkpoint. - - ``pack_args_as_tuple (bool, default=False)``: *Can only be - passed when module is a sequential module.* To ensure that - backward works correctly, the autograd function has to unpack - any tuples received. If the checkpointed layer takes a tuple as - input, then this needs to be set to ``True``. - - ``strategy: (string, default=“each”)``: *Can only be passed - when module is a sequential module.* Strategy determines how - many layers of the sequential module need to be grouped - together for one checkpointing call. - - This determines how much memory can be reduced. It can take the - following values - - - ``each`` : The default is to checkpoint each module inside - the sequential separately. - - ``contiguous``: Groups consecutive layers on the same - partition together. For example if a sequential consists of - ``[a, b, c, d]`` where ``a, b`` are on ``pp_rank0`` and ``c, d`` are on - ``pp_rank 1``, then this strategy would checkpoint a,b together - and then ``c, d`` together. This means effectively, the inputs of - ``a``, outputs of ``b``, inputs of ``c``, and outputs of ``d`` are in - memory, and the rest of the activations are recomputed. - - ``group_2, group_3, group_4, etc:`` More generally, - ``group_x`` where x is an integer. This strategy provides - more flexibility in how many layers to group together. - ``group_x`` groups x layers together on a best - effort basis if there are x layers consecutively in the same - partition. **Example**: Assume a module with layers ``[a, b, - c, d, e]``. The layers a and b are on pp_rank0, and ``c``, ``d``, and - ``e`` are on ``pp_rank 1``. If the strategy is ``group_3``, then ``a``, - ``b`` are checkpointed together on ``pp_rank0``, and ``c``, ``d``, ``e`` are - checkpointed together on ``pp_rank1``. - -.. 
_smdmp-tp-appendix: - :noindex: - -Appendix: Reference Implementations for Modules ------------------------------------------------ - -The following are reference implementations for transformer-related -modules. Note that this is not the actual ``smdistributed`` source code, -but the distributed implementations provided in the library are the -distributed versions of these reference implementations, and can be used -to determine whether the distributed modules perform the same operations -as the custom modules in your script. - -To keep the implementations simple, we only assume keyword arguments, -and assume the existence of a method ``parse_args(kwargs)``, which -parses the arguments to ``__init__`` methods and sets the relevant -attributes of the module, such as ``hidden_size`` and -``num_attention_heads``. - -``smp.nn.DistributedTransformer`` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. code:: python - - class Transformer(nn.Module): - def __init__(self, **kwargs): - super(Transformer, self).__init__() - self.parse_args(kwargs) - - self.layers = [] - for l in range(self.num_layers): - self.layers.append(TransformerLayer(**kwargs)) - - self.seq_layers = nn.Sequential(*self.layers) - - def forward(self, inp): - return self.seq_layers(inp) - -``smp.nn.DistributedTransformerLayer`` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. code:: python - - class TransformerLayer(nn.Module): - def __init__(self, **kwargs): - super(TransformerLayer, self).__init__() - self.parse_args(kwargs) - - self.attention = AttentionLayer(**kwargs) - self.output = TransformerOutputLayer(**kwargs) - - if self.add_cross_attention: - self.cross_attention = AttentionLayer(cross_attention=True, **kwargs) - - def forward(self, inp): - if self.add_cross_attention: - hidden_states, cross_states, attention_mask, cross_mask = inp - else: - hidden_states, attention_mask = inp - - attention_output = self.attention((hidden_states, attention_mask)) - if self.add_cross_attention: - attention_output = self.cross_attention((attention_output, - cross_states, - cross_mask)) - - output = self.output(attention_output) - - if self.add_cross_attention: - return output, cross_states, attention_mask, cross_mask - else: - return output, attention_mask - -``smp.nn.DistributedAttentionLayer`` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. 
code:: python - - class AttentionLayer(nn.Module): - def __init__(self, **kwargs): - super(AttentionLayer, self).__init__() - self.parse_args(kwargs) - self.attention_head_size = self.hidden_size // self.num_attention_heads - - self.query = nn.Linear(self.hidden_size, self.hidden_size) - self.key = nn.Linear(self.hidden_size, self.hidden_size) - self.value = nn.Linear(self.hidden_size, self.hidden_size) - self.dense = nn.Linear(self.hidden_size, self.hidden_size) - - self.dropout1 = nn.Dropout(self.attention_dropout_prob) - self.dropout2 = nn.Dropout(self.hidden_dropout_prob) - - if self.pre_layernorm: - self.pre_layernorm = nn.LayerNorm(self.hidden_size, - eps=self.layernorm_epsilon) - - if self.post_layernorm: - self.layernorm = nn.LayerNorm(self.hidden_size, - eps=self.layernorm_epsilon) - - def transpose(self, tensor, key=False): - shape = tensor.size()[:-1] + - (self.num_attention_heads, self.attention_head_size) - tensor = torch.reshape(tensor, shape) - if key: - return tensor.permute(0, 2, 3, 1) - else: - return tensor.permute(0, 2, 1, 3) - - def forward(self, inp): - if self.cross_attention: - hidden_states, cross_states, attention_mask = inp - else: - hidden_states, attention_mask = inp - - if self.pre_layernorm: - norm_states = self.pre_layernorm(hidden_states) - else: - norm_states = hidden_states - - query_layer = self.query(norm_states) - - if self.cross_attention: - key_layer = self.key(cross_states) - value_layer = self.value(cross_states) - else: - key_layer = self.key(norm_states) - value_layer = self.value(norm_states) - - query_layer = self.transpose(query_layer) - key_layer = self.transpose(key_layer, key=True) - value_layer = self.transpose(value_layer) - - attention_scores = torch.matmul(query_layer, key_layer) - attention_scores = attention_scores / math.sqrt(self.attention_head_size) - - if not self.cross_attention and self.causal_mask is not None: - attention_scores = self.apply_causal_mask(attention_scores) - - attention_scores = attention_scores + attention_mask - - attention_probs = F.softmax(attention_scores, dim=-1) - attention_probs = self.dropout1(attention_probs) - - context_layer = torch.matmul(attention_probs, value_layer) - context_layer = context_layer.permute(0, 2, 1, 3) - new_context_layer_shape = context_layer.size()[:-2] + \ - (self.local_attention_size,) - context_layer = torch.reshape(context_layer, new_context_layer_shape) - - self_attention = self.dense(context_layer) - self_attention = self.dropout2(self_attention) - - if self.post_layernorm: - return self.layernorm(self_attention + hidden_states) - else: - return self_attention - -``smp.nn.DistributedTransformerOutputLayer`` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. 
code:: python - - class TransformerOutputLayer(nn.Module): - def __init__(self, **kwargs): - super(TransformerOutputLayer, self).__init__() - self.parse_args(kwargs) - - self.dense1 = nn.Linear(self.hidden_size, self.intermediate_size) - self.dense2 = nn.Linear(self.intermediate_size, self.hidden_size) - - self.dropout = nn.Dropout(self.attention_dropout_prob) - - if self.pre_layernorm: - self.pre_layernorm = nn.LayerNorm(self.hidden_size, - eps=self.layernorm_epsilon) - - if self.post_layernorm: - self.layernorm = nn.LayerNorm(self.hidden_size, - eps=self.layernorm_epsilon) - - def forward(self, inp): - if self.pre_layernorm: - norm_inp = self.pre_layernorm(inp) - else: - norm_inp = inp - - dense1_output = self.dense1(norm_inp) - if self.activation == "gelu": - act_output = F.gelu(dense1_output) - else: - act_output = F.relu(dense1_output) - - dense2_output = self.dense2(act_output) - output = self.dropout(dense2_output) - - if self.post_layernorm: - return self.layernorm(inp + output) - else: - return output diff --git a/doc/api/training/smp_versions/v1.6.0/smd_model_parallel_tensorflow.rst b/doc/api/training/smp_versions/v1.6.0/smd_model_parallel_tensorflow.rst deleted file mode 100644 index 6630371b94..0000000000 --- a/doc/api/training/smp_versions/v1.6.0/smd_model_parallel_tensorflow.rst +++ /dev/null @@ -1,171 +0,0 @@ -TensorFlow API -============== - -To use the TensorFlow-specific APIs for SageMaker distributed model parallism, -you need to add the following import statement at the top of your training script. - -.. code:: python - - import smdistributed.modelparallel.tensorflow as smp - -.. tip:: - - Refer to - `Modify a TensorFlow Training Script - `_ - to learn how to use the following APIs in your TensorFlow training script. - -.. class:: smp.DistributedModel - :noindex: - - A sub-class of the Keras \ ``Model`` class, which defines the model to - be partitioned. Model definition is done by sub-classing - ``smp.DistributedModel`` class, and implementing the ``call()`` method, - in the same way as the Keras model sub-classing API. Any operation that - is part of the \ ``smp.DistributedModel.call()`` method is subject to - partitioning, meaning that every operation placed inside executes in - exactly one of the devices (the operations outside run on all devices). - - - Similar to the regular Keras API, the forward pass is done by directly - calling the model object on the input tensors. For example: - - .. code:: python - - predictions = model(inputs)   # model is a smp.DistributedModel object - - However, ``model()`` calls can only be made inside a - ``smp.step``-decorated function. - - The outputs from a ``smp.DistributedModel`` are available in all ranks, - regardless of which rank computed the last operation. - - **Methods:** - - .. function:: save_model(save_path="/opt/ml/model") - :noindex: - - **Inputs** - - ``save_path`` (``string``): A path to save an unpartitioned model with latest training weights. - - Saves the entire, - unpartitioned model with the latest trained weights to ``save_path`` in - TensorFlow ``SavedModel`` format. Defaults to ``"/opt/ml/model"``, which - SageMaker monitors to upload the model artifacts to Amazon S3. - -.. function:: smp.partition(index) - :noindex: - - **Inputs** - - - ``index`` (``int``): The index of the partition. - - A context manager which places all operations defined inside into the - partition whose ID is equal to ``index``. When - ``smp.partition`` contexts are nested, the innermost context overrides - the rest. 
The ``index`` argument must be smaller than the number of - partitions. - - ``smp.partition`` is used in the manual partitioning API; - if \ ``"auto_partition"`` parameter is set to ``True`` while launching - training, then ``smp.partition`` contexts are ignored. Any operation - that is not placed in any ``smp.partition`` context is placed in the - ``default_partition``, as shown in the following example: - - .. code:: python - - # auto_partition: False - # default_partition: 0 - smp.init() - [...] - x = tf.constant(1.2)                     # placed in partition 0 - with smp.partition(1): -     y = tf.add(x, tf.constant(2.3))      # placed in partition 1 -     with smp.partition(3): -         z = tf.reduce_sum(y)             # placed in partition 3 - - -.. function:: register_post_partition_hook(hook) - :noindex: - - Registers a callable ``hook`` to - be executed after the model is partitioned. This is useful in situations - where an operation needs to be executed after the model partition during - the first call to ``smp.step``, but before the actual execution of the - first forward pass. - - .. code:: python - - @smp.register_post_partition_hook - def test_eager(): - # All statements here will be executed right after partition but before the first forward pass - tf.print("Entered hook through eager context") - -.. class:: smp.CheckpointManager - :noindex: - - - A subclass of TensorFlow - `CheckpointManager `__, - which is used to manage checkpoints. The usage is similar to TensorFlow - ``CheckpointManager``. - - The following returns a ``CheckpointManager`` object. - - .. code:: python - - smp.CheckpointManager(checkpoint, -                       directory="/opt/ml/checkpoints", -                       max_to_keep=None, -                       checkpoint_name="ckpt") - - **Parameters** - - - ``checkpoint``: A `tf.train.Checkpoint - `__ instance - that represents a model checkpoint. - - - ``directory``: (``str``) The path to a directory in which to write - checkpoints. A file named "checkpoint" is also written to this - directory (in a human-readable text format) which contains the state - of the ``CheckpointManager``. Defaults to - ``"/opt/ml/checkpoints"``, which is the directory that SageMaker - monitors for uploading the checkpoints to Amazon S3. - - ``max_to_keep`` (``int``): The number of checkpoints to keep. If - ``None``, all checkpoints are kept. - - ``checkpoint_name`` (``str``): Custom name for the checkpoint file. - Defaults to ``"ckpt"``. - - - **Methods:** - - .. function:: save( ) - :noindex: - - Saves a new checkpoint in the specified directory. Internally uses ``tf.train.CheckpointManager.save()``. - - .. function:: restore( ) - :noindex: - - Restores the latest checkpoint in the specified directory. - Internally uses ``tf.train.CheckpointManager.restore()``. - - - **Examples:** - - .. code:: python - - checkpoint = tf.train.Checkpoint(optimizer=optimizer, model=model) - ckpt_manager = smp.CheckpointManager(checkpoint, max_to_keep=5)  # use /opt/ml/checkpoints - - for inputs in train_ds: -     loss = train_step(inputs) -     # [...] -     ckpt_manager.save()  # save a new checkpoint in /opt/ml/checkpoints - - .. 
code:: python - - for step, inputs in enumerate(train_ds): -     if step == 0: -         ckpt_manager.restore() -     loss = train_step(inputs) diff --git a/doc/api/training/smp_versions/v1.9.0/smd_model_parallel_common_api.rst b/doc/api/training/smp_versions/v1.9.0/smd_model_parallel_common_api.rst deleted file mode 100644 index b4713b2707..0000000000 --- a/doc/api/training/smp_versions/v1.9.0/smd_model_parallel_common_api.rst +++ /dev/null @@ -1,538 +0,0 @@ -Common API -========== - -The following SageMaker distribute model parallel APIs are common across all frameworks. - -.. contents:: Table of Contents - :depth: 3 - :local: - -The Library's Core APIs ------------------------ - -This API document assumes you use the following import statement in your training scripts. - -**TensorFlow** - -.. code:: python - - import smdistributed.modelparallel.tensorflow as smp - -**PyTorch** - -.. code:: python - - import smdistributed.modelparallel.torch as smp - - -.. function:: smp.init( ) - :noindex: - - Initialize the library. Must be called at the beginning of training script. - -.. function:: @smp.step(non_split_inputs, input_split_axes, [*args, **kwargs]) - :noindex: - - A decorator that must be placed over a function that represents a single - forward and backward pass (for training use cases), or a single forward - pass (for evaluation use cases). Any computation that is defined inside - the ``smp.step``-decorated function is executed in a pipelined manner. - - By default, every tensor input to the function is split across its batch - dimension into a number of microbatches specified while launching the - training job. This behavior can be customized through the arguments to - ``smp.step``, described below. The library then orchestrates the execution of - each microbatch across all partitions, based on the chosen pipeline - type. - - In a typical use case, forward pass and back-propagation are executed - inside an \ ``smp.step``-decorated function and gradients, loss, and - other relevant metrics (such as accuracy, etc.) are returned from - ``smp.step``-decorated function. - - Any gradient post-processing operation, such as gradient clipping and - allreduce, as well as ``optimizer.apply_gradients`` calls (for TF) or - ``optimizer.step`` (for PT) should be applied on the gradients returned - from the ``smp.step`` function, and not inside the ``smp.step`` - function. This is because every operation inside ``smp.step`` is - executed once per microbatch, so having these operations inside - ``smp.step`` can either be inefficient (in the case of allreduce), or - lead to wrong results (in the case of ``apply_gradients`` / - ``optimizer.step``). - - If the objects returned from the ``smp.step``-decorated function contain - ``tf.Tensor``\ s / ``torch.Tensor``\ s, they are converted to - ``StepOutput`` objects. A ``StepOutput`` object encapsulates all - versions of the tensor across different microbatches - (see ``StepOutput`` entry for more information). - - The argument to ``smp.step`` decorated function should either be a tensor - or an instance of list, tuple, dict or set for it to be split across - microbatches. If your object doesn't fall into this category, you can make - the library split your object, by implementing ``smp_slice`` method. - - Below is an example of how to use it with PyTorch. - - .. 
code:: python - - class CustomType: - def __init__(self, tensor): - self.data = tensor - - # The library will call this to invoke slicing on the object passing in total microbatches (num_mb) - # and the current microbatch index (mb). - def smp_slice(self, num_mb, mb, axis): - dim_size = list(self.data.size())[axis] - - split_size = dim_size // num_mb - sliced_tensor = self.data.narrow(axis, mb * split_size, split_size) - return CustomType(sliced_tensor, self.other) - - custom_obj = CustomType(torch.ones(4,)) - - @smp.step() - def step(custom_obj): - loss = model(custom_obj) - model.backward(loss) - return loss - - - **Important:** ``smp.step`` splits the batch into microbatches, and - executes everything inside the decorated function once per microbatch. - This might affect the behavior of batch normalization, any operation - that explicitly uses the batch size information, or any other Python - code that is expected to run once. - - **TensorFlow-specific behavior** - - ``smp.step`` is a wrapper that - inherits from and extends the behavior of ``tf.function``, and as such, - all the caveats that apply to the use of ``tf.function``\ s also apply - to ``smp.step``. In particular, any operation that is inside - ``smp.step`` executes in graph mode, and not eager mode. - - In the first call, ``smp.step`` performs tracing of the wrapped function every time - one of the tensor arguments changes their shape or dtype, or for every - new value of a Python argument, if there is one. Tracing is expensive, - so such scenarios should be avoided as much as possible or, - alternatively, an ``input_signature`` argument must be provided. For - more information on the usage of ``tf.function``, refer to the - TensorFlow documentation: - - - https://www.tensorflow.org/api_docs/python/tf/function\ - - https://www.tensorflow.org/guide/function\ - - Each ``smp.step`` decorated function must have a return value that depends on the - output of ``smp.DistributedModel``. - - **Common parameters** - - - ``non_split_inputs`` (``list``): The list of arguments to the decorated function - that should not be split along the batch dimension. Should be used - for all input tensors that do not have a batch dimension. Should be a - list of argument names as ``str``, as they appear in the signature of - the ``smp.step``-decorated function. By default it is considered an - empty list. - - - ``input_split_axes`` (``dict``): A dict that maps the argument name to its batch - axis. The keys should be the argument names as ``str``, as they - appear in the signature of the ``smp.step``-decorated function.  By - default all batch axes are assumed to be the 0-axis. - - **TensorFlow-only parameters** - - - All arguments of ``tf.function``. Note: - The \ ``experimental_compile`` argument of ``tf.function`` may not - work as expected with ``smp.step``, since it interferes with - pipelining and model partitioning. To enable XLA with the library, you can - instead use \ ``tf.config.optimizer.set_jit(True)``. - - **PyTorch-only parameters** - - - ``detach_outputs`` (``bool``) : If ``True``, calls ``torch.Tensor.detach()`` on - all returned ``torch.Tensor`` outputs. Setting it to ``False`` - increases memory consumption, unless ``detach()`` is manually called - on the returned tensors, because the model graph is not cleared from - memory after the training step. Set to \ ``True`` by default. - - **Returns** - - - The same object(s) returned from the decorated function. 
All - returned \ ``tf.Tensor``, \ ``tf.Variable``  objects (for TF) or - ``torch.Tensor`` objects (for PT) are wrapped inside - a \ ``StepOutput`` object, even when they are inside a Python - ``list``, ``tuple``, or ``dict``. - - - -.. class:: StepOutput - :noindex: - - - A class that encapsulates all versions of a ``tf.Tensor`` - or \ ``torch.Tensor`` across all microbatches. - - When a particular ``tf.Tensor`` or ``torch.Tensor`` is computed inside - ``smp.step``, different versions of the tensor are computed for each - microbatch. - - When this tensor is returned from ``smp.step`` and is accessed outside - of the decorated function, it appears as a ``StepOutput`` object, which - contains all such versions. For example, - - - In the case of Tensorflow, the gradient for a particular - ``tf.Variable`` is computed on each microbatch individually, and if - this gradient is returned from ``smp.step``, all gradients for this - ``tf.Variable`` become part of the same ``StepOutput`` object. The - ``StepOutput`` class offers the following API for commonly-used - post-processing operations on such tensors. - - In the case of PyTorch, the loss for each microbatch is computed - individually and all the ``torch.Tensor``\ s that represent the loss - for different microbatches become part of same ``StepOutput`` object, - if loss is returned from the ``smp.step`` function. - - - The ``StepOutput`` class offers the following API for commonly-used - post-processing operations on tensors. - - .. data:: StepOutput.outputs - :noindex: - - Returns a list of the underlying tensors, indexed by microbatch. - - .. function:: StepOutput.reduce_mean( ) - :noindex: - - Returns a ``tf.Tensor``, ``torch.Tensor`` that averages the constituent ``tf.Tensor`` s - ``torch.Tensor`` s. This is commonly used for averaging loss and gradients across microbatches. - - .. function:: StepOutput.reduce_sum( ) - :noindex: - - Returns a ``tf.Tensor`` / - ``torch.Tensor`` that sums the constituent - ``tf.Tensor``\ s/\ ``torch.Tensor``\ s. - - .. function:: StepOutput.concat( ) - :noindex: - - Returns a - ``tf.Tensor``/``torch.Tensor`` that concatenates tensors along the - batch dimension using ``tf.concat`` / ``torch.cat``. - - .. function:: StepOutput.stack( ) - :noindex: - - Applies ``tf.stack`` / ``torch.stack`` - operation to the list of constituent ``tf.Tensor``\ s / - ``torch.Tensor``\ s. - - **TensorFlow-only methods** - - .. function:: StepOutput.merge( ) - :noindex: - - Returns a ``tf.Tensor`` that - concatenates the constituent ``tf.Tensor``\ s along the batch - dimension. This is commonly used for merging the model predictions - across microbatches. - - .. function:: StepOutput.accumulate(method="variable", var=None) - :noindex: - - Functionally the same as ``StepOutput.reduce_mean()``. However, it is - more memory-efficient, especially for large numbers of microbatches, - since it does not wait for all constituent \ ``tf.Tensor``\ s to be - ready to start averaging them, thereby saving memory. - - In some cases (XLA for example) ``StepOutput.reduce_mean()`` might end - up being more memory-efficient than ``StepOutput.accumulate()``. - - **Parameters** - - - ``method`` (``"add_n"`` or ``"accumulate_n"`` or ``"variable"``): - If ``"add_n"`` or ``"accumulate_n"``, the library uses - ``tf.add_n`` and ``tf.accumulate_n``, respectively, to implement - accumulation. If ``"variable"``, the library uses an internal ``tf.Variable`` - into which to accumulate the tensors. Default is \ ``"variable"``. 
- Note: Memory usage behavior of these choices can depend on the model - and implementation. - - - ``var``: A ``tf.Variable`` into which, if provided, the library uses to - accumulate the tensors. If \ ``None``, the library internally creates a - variable. If ``method`` is not ``"variable"``, this argument is - ignored. - -.. _mpi_basics: - :noindex: - -MPI Basics ----------- - -The library exposes the following basic MPI primitives to its Python API: - -**Global** - -- ``smp.rank()`` : The global rank of the current process. -- ``smp.size()`` : The total number of processes. -- ``smp.get_world_process_group()`` : - ``torch.distributed.ProcessGroup`` that contains all processes. -- ``smp.CommGroup.WORLD``: The communication group corresponding to all processes. -- ``smp.local_rank()``: The rank among the processes on the current instance. -- ``smp.local_size()``: The total number of processes on the current instance. -- ``smp.get_mp_group()``: The list of ranks over which the current model replica is partitioned. -- ``smp.get_dp_group()``: The list of ranks that hold different replicas of the same model partition. - -**Tensor Parallelism** - -- ``smp.tp_rank()`` : The rank of the process within its - tensor-parallelism group. -- ``smp.tp_size()`` : The size of the tensor-parallelism group. -- ``smp.get_tp_process_group()`` : Equivalent to - ``torch.distributed.ProcessGroup`` that contains the processes in the - current tensor-parallelism group. -- ``smp.CommGroup.TP_GROUP`` : The communication group corresponding to - the current tensor parallelism group. - -**Pipeline Parallelism** - -- ``smp.pp_rank()`` : The rank of the process within its - pipeline-parallelism group. -- ``smp.pp_size()`` : The size of the pipeline-parallelism group. -- ``smp.get_pp_process_group()`` : ``torch.distributed.ProcessGroup`` - that contains the processes in the current pipeline-parallelism group. -- ``smp.CommGroup.PP_GROUP`` : The communication group corresponding to - the current pipeline parallelism group. - -**Reduced-Data Parallelism** - -- ``smp.rdp_rank()`` : The rank of the process within its - reduced-data-parallelism group. -- ``smp.rdp_size()`` : The size of the reduced-data-parallelism group. -- ``smp.get_rdp_process_group()`` : ``torch.distributed.ProcessGroup`` - that contains the processes in the current reduced data parallelism - group. -- ``smp.CommGroup.RDP_GROUP`` : The communication group corresponding - to the current reduced data parallelism group. - -**Model Parallelism** - -- ``smp.mp_rank()`` : The rank of the process within its model-parallelism - group. -- ``smp.mp_size()`` : The size of the model-parallelism group. -- ``smp.get_mp_process_group()`` : ``torch.distributed.ProcessGroup`` - that contains the processes in the current model-parallelism group. -- ``smp.CommGroup.MP_GROUP`` : The communication group corresponding to - the current model parallelism group. - -**Data Parallelism** - -- ``smp.dp_rank()`` : The rank of the process within its data-parallelism - group. -- ``smp.dp_size()`` : The size of the data-parallelism group. -- ``smp.get_dp_process_group()`` : ``torch.distributed.ProcessGroup`` - that contains the processes in the current data-parallelism group. -- ``smp.CommGroup.DP_GROUP`` : The communication group corresponding to - the current data-parallelism group. - -.. _communication_api: - :noindex: - -Communication API ------------------ - -The library provides a few communication primitives which can be helpful while -developing the training script. 
These primitives use the following -``enum`` s as arguments to specify which processes the communication -should involve. -​ - -**Helper structures** - -.. data:: smp.CommGroup - :noindex: - - An ``enum`` that takes the values - ``CommGroup.WORLD``, ``CommGroup.MP_GROUP``, and ``CommGroup.DP_GROUP``. - These values can also be accessed as ``smp.WORLD``, ``smp.MP_GROUP``, - and ``smp.DP_GROUP`` respectively. - - - ``CommGroup.WORLD``: Represents the entire group of processes used in - training - - ``CommGroup.MP_GROUP``: Represents the group of processes that hold - the same model replica as the current process. The processes in a - single ``MP_GROUP`` collectively store an entire replica of the - model. - - ``CommGroup.DP_GROUP``: Represents the group of processes that hold - the same model partition as the current process. The processes in a - single ``DP_GROUP`` perform data parallelism/allreduce among - themselves. - -.. data:: smp.RankType - :noindex: - - An ``enum`` that takes the values - ``RankType.WORLD_RANK``, ``RankType.MP_RANK``, and ``RankType.DP_RANK``. - - - ``RankType.WORLD_RANK``: The associated rank is to be interpreted as - the rank of the process across all processes used in training. - - ``RankType.MP_RANK``: The associated rank is to be interpreted as the - rank of the process within the ``MP_GROUP``. - - ``RankType.DP_RANK``: The associated rank is to be interpreted as the - rank of the process within the ``DP_GROUP``. - - -**Communication primitives:** - -.. function:: smp.broadcast(obj, group) - :noindex: - - Sends the object to all processes in the - group. The receiving process must call ``smp.recv_from`` to receive the - sent object. - - **Inputs** - - - ``obj``: An arbitrary picklable Python object that will be broadcast. - - - ``group``: A ``CommGroup`` argument that represents to which group of - processes the object will be sent. - - **Notes** - - - When you use ``broadcast`` on the sender process, there needs - to be an accompanying ``smp.recv_from()`` call on the receiver - processes. - - - This is a synchronous call; the ``broadcast`` statement - returns only after all ranks participating in the call have made a - matching ``recv_from`` call. - - **Example** - - .. code:: python - - if smp.rank() == 0: -     smp.broadcast(something, group=smp.CommGroup.WORLD) - else: -     smp.recv_from(0, rank_type=smp.RankType.WORLD_RANK) - -.. function:: smp.send(obj, dest_rank, rank_type) - :noindex: - - Sends the object ``obj`` to - ``dest_rank``, which is of a type specified by ``rank_type``. - - **Inputs** - - - ``obj``: An arbitrary picklable Python object that will be sent. - - - ``dest_rank`` (``int``): An integer denoting the rank of the receiving process. - - - ``rank_type`` (``enum``): A ``smp.RankType`` ``enum`` that determines how - ``dest_rank`` is to be interpreted. For example if ``dest_rank`` is 1 - and ``rank_type`` is ``MP_RANK``, then ``obj`` is sent to process - with ``mp_rank`` 1 in the ``MP_GROUP`` which contains the current - process. - - **Notes** - - - Note: \ This is a synchronous call; the ``send`` statement returns - only after the destination rank has made a matching - ``recv_from`` call. - -.. function:: smp.recv_from(src_rank, rank_type) - :noindex: - - Receive an object from a peer process. Can be used with a matching - ``smp.send`` or a ``smp.broadcast`` call. - - **Inputs** - - - ``src_rank`` (``int``): An integer denoting rank of the sending process. 
- - - ``rank_type`` (``enum``): A ``smp.RankType`` ``enum`` that determines how - ``dest_rank`` is to be interpreted. For example if ``src_rank`` is 1 - and ``rank_type`` is ``MP_RANK``, then the object is received from - the process with ``mp_rank`` 1 in the ``MP_GROUP`` which contains the - current process. - - **Returns** - - Returns the python object that is sent by the peer process. - - **Notes** - - - Note: This is a synchronous call; the ``recv_from`` statement returns - only after the source rank has made a matching ``send`` or - ``broadcast`` call, and the object is received. - -.. function:: smp.allgather(obj, group) - :noindex: - - A collective call that gathers all the - submitted objects across all ranks in the specified ``group``. Returns a - list whose ``i``\ th index contains the object submitted by the - ``i``\ th rank in ``group``. - - **Inputs** - - - ``obj``: An arbitrary picklable Python object that will be - allgathered. - - - ``group`` : A ``CommGroup`` argument that represents which group of - processes participate in ``allgather``. - - **Notes** - - - Note: This is a synchronous call; the ``allgather`` statement returns - only after all ranks participating in the call have made a matching - ``allgather`` call, and all the objects are received at the current - rank. - - **Examples** - - .. code:: python - - # assuming mp_size() == 2 - - if smp.mp_rank() == 0: -     out = smp.allgather(obj1, smp.CommGroup.MP_GROUP)  # returns [obj1, obj2] - else: -     out = smp.allgather(obj2, smp.CommGroup.MP_GROUP)  # returns [obj1, obj2] - -.. function:: smp.barrier(group=smp.WORLD) - :noindex: - - A statement that hangs until all - processes in the specified group reach the barrier statement, similar to - ``MPI_Barrier()``. - - **Inputs** - - - ``group``: An ``smp.CommGroup`` ``enum`` that specifies the group of - processes participating in the barrier call. Defaults to - ``smp.WORLD``. - - **Examples** - - - Assume there are 8 processes and 2 model partitions, and - therefore 4 \ ``mp_group``\ s, and 2 ``dp_group``\ s. If - the \ ``barrier`` call is passed the value ``smp.MP_GROUP`` for its - group argument, then each process only waits until the other process - of its own ``mp_group`` reaches that point. It does not wait for - processes outside that ``mp_group``. - -.. function:: smp.dp_barrier() - :noindex: - - Same as passing ``smp.DP_GROUP``\ to ``smp.barrier()``. - Waits for the processes in the same \ ``dp_group`` as - the current process to reach the same point in execution. - -.. function:: smp.mp_barrier() - :noindex: - - Same as passing ``smp.MP_GROUP`` to - ``smp.barrier()``. Waits for the processes in the same ``mp_group`` as - the current process to reach the same point in execution. diff --git a/doc/api/training/smp_versions/v1.9.0/smd_model_parallel_pytorch.rst b/doc/api/training/smp_versions/v1.9.0/smd_model_parallel_pytorch.rst deleted file mode 100644 index 88d1a42165..0000000000 --- a/doc/api/training/smp_versions/v1.9.0/smd_model_parallel_pytorch.rst +++ /dev/null @@ -1,677 +0,0 @@ -PyTorch API -=========== - -To use the PyTorch-specific APIs for SageMaker distributed model parallism, -you need to add the following import statement at the top of your training script. - -.. code:: python - - import smdistributed.modelparallel.torch as smp - - -.. tip:: - - Refer to - `Modify a PyTorch Training Script - `_ - to learn how to use the following API in your PyTorch training script. - -.. 
class:: smp.DistributedModel - :noindex: - - A sub-class of ``torch.nn.Module`` which specifies the model to be - partitioned. Accepts a ``torch.nn.Module`` object ``module`` which is - the model to be partitioned. The returned ``DistributedModel`` object - internally manages model parallelism and data parallelism. Only one - model in the training script can be wrapped with - ``smp.DistributedModel``. - - **Example:** - - .. code:: python - - model = smp.DistributedModel(model) - - **Important**: The ``__call__`` and  ``backward`` method calls on the - ``smp.DistributedModel`` object (in the following example, the object - is \ ``model``) can only be made inside a ``smp.step``-decorated - function. - - Since ``DistributedModel``  is a ``torch.nn.Module``, a forward pass can - be performed by calling the \ ``DistributedModel`` object on the input - tensors. - - .. code:: python - - predictions = model(inputs)   # model is a smp.DistributedModel object - - For a backward pass, one needs to call the backward function on - the \ ``DistributedModel`` object, with tensors and gradients as - arguments, replacing the PyTorch operations \ ``torch.Tensor.backward`` - or ``torch.autograd.backward``. - - The API for ``model.backward`` is very similar to - ``torch.autograd.backward``. For example, the following - ``backward`` calls: - - .. code:: python - - torch.autograd.backward(loss) or loss.backward() - - should be replaced with: - - .. code:: python - - model.backward(loss) # loss is a tensor with only one element as its data - - Similarly, for non-scalar tensors, replace the following - ``backward`` call containing incoming gradient arguments: - - .. code:: python - - torch.autograd.backward(outputs, out_grads) - - with the following line: - - .. code:: python - - model.backward(outputs, out_grads) - - In these examples, all ``__call__``  and ``backward`` method calls on - the model objects (``model(inputs)`` and ``model.backward(loss)``) must be made inside - a ``smp.step``-decorated function. - - **Using DDP** - - If DDP is enabled with the SageMaker model parallel library, do not not place a PyTorch - ``DistributedDataParallel`` wrapper around the ``DistributedModel`` because - the ``DistributedModel`` wrapper will also handle data parallelism. - - Unlike the original DDP wrapper, when you use ``DistributedModel``, - model parameters and buffers are not immediately broadcast across - processes when the wrapper is called. Instead, the broadcast is deferred to the first call of the - ``smp.step``-decorated function when the partition is done. - - **Parameters** - - - ``module`` (``torch.nn.Module``): Module to be distributed (data parallelism and model parallelism). - - - ``trace_device`` (``"cpu"`` or ``"gpu"``) (default: ``"gpu"``) - Whether to perform the tracing step on the GPU or CPU. The tracing step gathers - information on the order of execution of modules, the shapes of - intermediate outputs, and execution times, to be used by the - partitioning algorithm. If ``trace_device`` is set to GPU, accurate - module execution times can be gathered during tracing for potentially - improved partitioning decision. However, if the model is too large to - fit in a single GPU, then ``trace_device`` should be set to CPU. - - - ``trace_execution_times`` (``bool``) (default: ``False``): If ``True``, - the library profiles the execution time of each module during tracing, and uses - it in the partitioning decision. This improves the partitioning - decision, but it might make the tracing slower. 
It may also introduce - some degree of non-determinism in partitioning results, because of the - inherent randomness in module execution times. Must be ``False`` if - ``trace_device`` is ``"cpu"``. - - - ``overlapping_allreduce`` (``bool``) (default: ``True``): This is only - applicable for hybrid data parallelism/model parallelism use cases (when - ``ddp`` is set to ``True`` while launching training). The library uses this flag - to decide whether to do overlapping allreduce whenever a parameter - gradients are ready. This leads to overlapping of communication and - computation and can improve performance. If this is set to ``False`` , - allreduce is performed at the end of the step. - - - ``backward_passes_per_step`` (``int``) (default: 1): This is only - applicable for hybrid data parallelism/model parallelism use cases (when - ``ddp`` is set to ``True`` in config). This parameter indicates the - number of backward passes to perform before calling allreduce on DDP. - This allows accumulating updates over multiple mini-batches before - reducing and applying them. - - - ``average_grads_across_microbatches`` (``bool``) (default: ``True``): - Whether or not the computed gradients should be averaged across - microbatches. If ``False``, the computed gradients will be summed across - microbatches, but not divided by the number of microbatches. In typical - use case where the computed loss is averaged over the mini-batch, this - should be left as ``True``. If you use a loss function that only sums - the per-sample loss across the batch (and not divide by the batch size), - then this must be set to ``False`` for correctness. - - - ``bucket_cap_mb`` (default: 25): \ ``DistributedDataParallel`` buckets - parameters into multiple buckets so that gradient reduction of each - bucket can potentially overlap with backward - computation. \ ``bucket_cap_mb``\ controls the bucket size in MegaBytes - (MB). - - - ``trace_memory_usage`` (default: False): When set to True, the library attempts - to measure memory usage per module during tracing. If this is disabled, - memory usage will be estimated through the sizes of tensors returned from - the module. - - - ``broadcast_buffers`` (default: True): Flag to be used with ``ddp=True``. - This parameter is forwarded to the underlying ``DistributedDataParallel`` wrapper. - Please see: `broadcast_buffer `__. - - - ``gradient_as_bucket_view`` (default: False): To be - used with ``ddp=True``. This parameter is forwarded to the underlying - ``DistributedDataParallel`` wrapper. Please see `gradient_as_bucket_view `__. - - **Properties** - - - ``partitioned``: Is ``True`` if the model is partitioned, ``False`` - otherwise. Initialized to ``False`` when ``DistributedModel`` is first - created. It becomes be ``True`` during the first call - to ``smp.step``-decorated function. Once the model is partitioned, the - local parameters or local ``state_dict`` can be fetched using the - following methods. - - **Methods** - - .. function:: backward(tensors, grad_tensors) - :noindex: - - Triggers a distributed backward - pass across model partitions. Example usage provided in the previous - section. The API is very similar - to https://pytorch.org/docs/stable/autograd.html#torch.autograd.backward. - ``retain_grad`` and ``create_graph``  flags are not supported. - - .. function:: local_buffers( ) - :noindex: - - Returns an iterator over buffers for the modules in - the partitioned model that have been assigned to the current process. - - .. 
function:: local_named_buffers( ) - :noindex: - - Returns an iterator over buffers for the - modules in the partitioned model that have been assigned to the current - process. This yields both the name of the buffer as well as the buffer - itself. - - .. function:: local_parameters( ) - :noindex: - - Returns an iterator over parameters for the - modules in the partitioned model that have been assigned to the current - process. - - .. function:: local_named_parameters( ) - :noindex: - - Returns an iterator over parameters for - the modules in the partitioned model that have been assigned to the - current process. This yields both the name of the parameter as well as - the parameter itself. - - .. function:: local_modules( ) - :noindex: - - Returns an iterator over the modules in the - partitioned model that have been assigned to the current process. - - .. function:: local_named_modules( ) - :noindex: - - Returns an iterator over the modules in the - partitioned model that have been assigned to the current process. This - yields both the name of the module as well as the module itself. - - .. function:: local_state_dict( ) - :noindex: - - Returns the ``state_dict`` that contains local - parameters that belong to the current \ ``mp_rank``. This ``state_dict`` - contains a key \ ``_smp_is_partial`` to indicate this is a - partial \ ``state_dict``, which indicates whether the - ``state_dict`` contains elements corresponding to only the current - partition, or to the entire model. - - .. function:: state_dict( ) - :noindex: - - Returns the ``state_dict`` that contains parameters - for the entire model. It first collects the \ ``local_state_dict``  and - gathers and merges the \ ``local_state_dict`` from all ``mp_rank``\ s to - create a full ``state_dict``. Please note that this needs to be called on all ranks with - ``dp_rank()==0`` to ensure the gather happens properly. - If it is only called on all such ranks, it can hang. - - .. function:: load_state_dict( ) - :noindex: - - Same as the ``torch.module.load_state_dict()`` , - except: It first gathers and merges the ``state_dict``\ s across - ``mp_rank``\ s, if they are partial. The actual loading happens after the - model partition so that each rank knows its local parameters. - - .. function:: register_post_partition_hook(hook) - :noindex: - - Registers a callable ``hook`` to - be executed after the model is partitioned. This is useful in situations - where an operation needs to be executed after the model partition during - the first call to ``smp.step``, but before the actual execution of the - first forward pass. Returns a ``RemovableHandle`` object ``handle``, - which can be used to remove the hook by calling ``handle.remove()``. - - .. function:: cpu( ) - :noindex: - - Allgathers parameters and buffers across all ``mp_rank``\ s and moves them - to the CPU. - - .. function:: join( ) - :noindex: - - A context manager to be used in conjunction with an instance of - ``smp.DistributedModel`` to be able to train with uneven inputs across - participating processes. This is only supported when ``ddp=True``. This will use the join with the wrapped - ``DistributedDataParallel`` instance. For more information, see: - `join `__ - in the PyTorch documentation. - - .. 
function:: register_comm_hook( state, callable ) - :noindex: - - **Available for PyTorch 1.8.1 only** - Registers a communication hook which is an enhancement that provides - a flexible hook ``callable`` to users where they can specify how - gradients are aggregated across multiple workers. This method will be called on the wrapped ``DistributedDataParallel`` instance. - - Please note that when you register a comm hook you have full control of how the gradients are processed. - When using only data parallelism with Torch DDP you are expected to average grads across data parallel replicas within the hook. - Similarly, when using DistributedModel you have to averaging grads across data parallel replicas within the hook. - In addition to that, you also have to average grads across microbatches within the hook unless you explicitly desire to not average based on your loss function. - See ``average_grads_across_microbatches`` for more information about averaging grads across microbatches. - - This is only supported when ``ddp=True`` and ``overlapping_allreduce=True`` (default). - For more information, see: - `register_comm_hook `__ - in the PyTorch documentation. - - **Behavior of** ``smp.DistributedModel`` **with Tensor Parallelism** - - When a model is wrapped by ``smp.DistributedModel``, the library - immediately traverses the modules of the model object, and replaces the - modules that are supported for tensor parallelism with their distributed - counterparts. This replacement happens in place. If there are no other - references to the original modules in the script, they are - garbage-collected. The module attributes that previously referred to the - original submodules now refer to the distributed versions of those - submodules. - - **Example:** - - .. code:: python - - # register DistributedSubmodule as the distributed version of Submodule - # (note this is a hypothetical example, smp.nn.DistributedSubmodule does not exist) - smp.tp_register_with_module(Submodule, smp.nn.DistributedSubmodule) - - class MyModule(nn.Module): - def __init__(self): - ... - - self.submodule = Submodule() - ... - - # enabling tensor parallelism for the entire model - with smp.tensor_parallelism(): - model = MyModule() - - # here model.submodule is still a Submodule object - assert isinstance(model.submodule, Submodule) - - model = smp.DistributedModel(model) - - # now model.submodule is replaced with an equivalent instance - # of smp.nn.DistributedSubmodule - assert isinstance(model.module.submodule, smp.nn.DistributedSubmodule) - - If ``pipeline_parallel_degree`` (equivalently, ``partitions``) is 1, the - placement of model partitions into GPUs and the initial broadcast of - model parameters and buffers across data-parallel ranks take place - immediately. This is because it does not need to wait for the model - partition when ``smp.DistributedModel`` wrapper is called. For other - cases with ``pipeline_parallel_degree`` greater than 1, the broadcast - and device placement will be deferred until the first call of an - ``smp.step``-decorated function happens. This is because the first - ``smp.step``-decorated function call is when the model partitioning - happens if pipeline parallelism is enabled. - - Because of the module replacement during the ``smp.DistributedModel`` - call, any ``load_state_dict`` calls on the model, as well as any direct - access to model parameters, such as during the optimizer creation, - should be done **after** the ``smp.DistributedModel`` call. 
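-
-   For example, a minimal ordering sketch (``MyModel``, the checkpoint path, and the
-   optimizer settings are hypothetical placeholders) could look as follows:
-
-   .. code:: python
-
-      import torch
-      import smdistributed.modelparallel.torch as smp
-
-      model = MyModel()
-      model = smp.DistributedModel(model)  # may replace supported submodules in place
-
-      # Create the optimizer and load weights only after the wrapper call,
-      # so that they refer to the (possibly replaced) distributed modules.
-      optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
-      optimizer = smp.DistributedOptimizer(optimizer)
-
-      checkpoint = smp.load("/checkpoint.pt", partial=True)
-      model.load_state_dict(checkpoint["model_state_dict"])
-      optimizer.load_state_dict(checkpoint["optimizer_state_dict"])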
- - Since the broadcast of the model parameters and buffers happens - immediately during ``smp.DistributedModel`` call when the degree of - pipeline parallelism is 1, using ``@smp.step`` decorators is not - required when tensor parallelism is used by itself (without pipeline - parallelism). - - For more information about the library's tensor parallelism APIs for PyTorch, - see :ref:`smdmp-pytorch-tensor-parallel`. - - **Additional Methods of** ``smp.DistributedModel`` **for Tensor Parallelism** - - The following are the new methods of ``smp.DistributedModel``, in - addition to the ones listed in the - `documentation `__. - - .. function:: distributed_modules() - :noindex: - - - An iterator that runs over the set of distributed - (tensor-parallelized) modules in the model - - .. function:: is_distributed_parameter(param) - :noindex: - - - Returns ``True`` if the given ``nn.Parameter`` is distributed over - tensor-parallel ranks. - - .. function:: is_distributed_buffer(buf) - :noindex: - - - Returns ``True`` if the given buffer is distributed over - tensor-parallel ranks. - - .. function:: is_scaled_batch_parameter(param) - :noindex: - - - Returns ``True`` if the given ``nn.Parameter`` is operates on the - scaled batch (batch over the entire ``TP_GROUP``, and not only the - local batch). - - .. function:: is_scaled_batch_buffer(buf) - :noindex: - - - Returns ``True`` if the parameter corresponding to the given - buffer operates on the scaled batch (batch over the entire - ``TP_GROUP``, and not only the local batch). - - .. function:: default_reducer_named_parameters() - :noindex: - - - Returns an iterator that runs over ``(name, param)`` tuples, for - ``param`` that is allreduced over the ``DP_GROUP``. - - .. function:: scaled_batch_reducer_named_parameters() - :noindex: - - - Returns an iterator that runs over ``(name, param)`` tuples, for - ``param`` that is allreduced over the ``RDP_GROUP``. - - - -.. class:: smp.DistributedOptimizer - :noindex: - - **Parameters** - - ``optimizer`` - - An optimizer wrapper for saving/loading optimizer states. This wrapper - returns ``optimizer`` with the following methods overridden: - - .. function:: state_dict( ) - :noindex: - - Returns the ``state_dict`` that contains optimizer state for the entire model. - It first collects the ``local_state_dict`` and gathers and merges - the ``local_state_dict`` from all ``mp_rank``s to create a full - ``state_dict``. - - .. function:: load_state_dict( ) - :noindex: - - Same as the ``torch.optimizer.load_state_dict()`` , except: - - - It first gathers and merges the local ``state_dict``\ s if they are - partial. - - The actual loading happens after the model partition so that each - rank knows its local parameters. - - .. function:: local_state_dict( ) - :noindex: - - Returns the ``state_dict`` that contains the - local optimizer state that belongs to the current \ ``mp_rank``. This - ``state_dict`` contains a key \ ``_smp_is_partial`` to indicate this is - a partial \ ``state_dict``, which indicates whether the - ``state_dict`` contains elements corresponding to only the current - partition, or to the entire model. - - ​ -.. function:: smp.partition(index) - :noindex: - - **Inputs** - - - ``index`` (int) - The index of the partition. - - A context manager which places all modules defined inside into the - partition with ID ``index``.  The ``index`` argument must be less than - the number of partitions. - - Use ``smp.partition`` to implement manual partitioning. 
- If ``"auto_partition"`` is ``True``, then the - ``smp.partition`` contexts are ignored. Any module that is not placed in - any ``smp.partition`` context is placed in the - ``default_partition`` defined through the SageMaker Python SDK. - - When ``smp.partition`` contexts are nested, the innermost context - overrides the rest (see the following example). In PyTorch, manual - partitioning should be done inside the module \ ``__init__``, and the - partition assignment applies to the modules that are *created* inside - the ``smp.partition`` context. - - Example: - - .. code:: python - - class Model(torch.nn.Module): -     def __init__(self): -         with smp.partition(1): -             self.child0 = Child0()            # child0 on partition 1 -             with smp.partition(2): -                 self.child1 = Child1()        # child1 on partition 2 -             self.child2 = Child2()            # child2 on partition 1 -         self.child3 = Child3()                # child3 on default_partition - -.. function:: smp.get_world_process_group( ) - :noindex: - - Returns a ``torch.distributed`` ``ProcessGroup`` that consists of all - processes, which can be used with the ``torch.distributed`` API. - Requires ``"ddp": True`` in SageMaker Python SDK parameters. - -.. function:: smp.get_mp_process_group( ) - :noindex: - - Returns a ``torch.distributed`` ``ProcessGroup`` that consists of the - processes in the ``MP_GROUP`` which contains the current process, which - can be used with the \ ``torch.distributed`` API. Requires - ``"ddp": True`` in SageMaker Python SDK parameters. - -.. function:: smp.get_dp_process_group( ) - :noindex: - - Returns a ``torch.distributed`` ``ProcessGroup`` that consists of the - processes in the ``DP_GROUP`` which contains the current process, which - can be used with the \ ``torch.distributed`` API. Requires - ``"ddp": True`` in SageMaker Python SDK parameters. - -.. function:: smp.is_initialized( ) - :noindex: - - Returns ``True`` if ``smp.init`` has already been called for the - process, and ``False`` otherwise. - -.. function::smp.is_tracing( ) - :noindex: - :noindex: - - Returns ``True`` if the current process is running the tracing step, and - ``False`` otherwise. - -.. data:: smp.nn.FusedLayerNorm - :noindex: - - `Apex Fused Layer Norm `__ is currently not - supported by the library. ``smp.nn.FusedLayerNorm`` replaces ``apex`` - ``FusedLayerNorm`` and provides the same functionality. This requires - ``apex`` to be installed on the system. - -.. data:: smp.optimizers.FusedNovoGrad - :noindex: - - `Fused Novo Grad optimizer `__ is - currently not supported by the library. ``smp.optimizers.FusedNovoGrad`` replaces ``apex`` ``FusedNovoGrad`` - optimizer and provides the same functionality. This requires ``apex`` to - be installed on the system. - -.. data:: smp.optimizers.FusedLamb - :noindex: - - `FusedLamb optimizer `__ - currently doesn’t work with the library. ``smp.optimizers.FusedLamb`` replaces - ``apex`` ``FusedLamb`` optimizer and provides the same functionality. - This requires ``apex`` to be installed on the system. - -.. data:: smp.amp.GradScaler - :noindex: - - `Torch AMP Gradscaler `__ - currently doesn’t work with the library. ``smp.amp.GradScaler`` replaces - ``torch.amp.GradScaler`` and provides the same functionality. - -.. _pytorch_saving_loading: - :noindex: - -APIs for Saving and Loading -^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -.. function:: smp.save( ) - :noindex: - - Saves an object. 
This operation is similar to ``torch.save()``, except - it has an additional keyword argument, ``partial``, and accepts only - string type for the argument ``f`` (file). If ``partial=True``, each - ``mp_rank`` saves a separate checkpoint file and the library adds an ``mp_rank`` - index to your saved file. - - **Parameters** - - - ``obj`` (dict): A saved object. - - ``f`` (str): A string containing a file name. - - ``partial`` (bool, default= ``True``):  When set to ``True``, each - ``mp_rank`` saves a separate checkpoint file and the library adds an - ``mp_rank`` index to the saved file. If you want to be able to load - and further train a model that you save with ``smp.save()``, you must - set ``partial=True``. - - ``pickle_module`` (picklemodule, default = module ``"pickle"`` from ``"/opt/conda/lib/python3.6/pickle.py"``): - A module used for pickling metadata and objects. - - ``pickle_protocol``  (int, default=2): Can be specified to - override the defaultprotocol. - -.. function:: smp.load( ) - :noindex: - - Loads an object saved with ``smp.save()`` from a file. - - Similar to, `torch.load() `__, - except it has an additional keyword argument, ``partial``, and accepts - only string type for the argument ``f`` (file). If \ ``partial=True``, - then each ``mp_rank`` loads a separate checkpoint file. - - **Parameters** - - - ``f`` (string): A string containing a file name. - - ``map_location`` (function): A function - `torch.device `__, - a string, or a dict specifying how to remap storage locations. - - ``pickle_module`` (pickle module): A module used for unpickling - metadata and objects (has to match the \ ``pickle_module``\ used to - serialize file). - - ``pickle_load_args`` (Python 3 only): Optional keyword arguments - passed to ``pickle_module.load()`` and ``pickle_module.Unpickler()``. - - ``partial`` (bool, default= ``True``): When set to ``True``, each - ``mp_rank`` loads the checkpoint corresponding to the ``mp_rank``. - Should be used when loading a model trained with the library. - -.. _pytorch_saving_loading_instructions: - :noindex: - -General Instruction For Saving and Loading -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -The library can save partial or full checkpoints. - -- For partial checkpoints, each ``mp_rank`` saves its own checkpoint - file with only the parameters that belong to that rank. -- For full checkpoints, the library saves a single checkpoint that contains - entire model parameters. - -When **saving** using ``smp.save()``, each rank only holds its own -parameters. If you want to save the full model, there will be some -communication between the ranks to create the full model. If you save -checkpoints often, you should save partial checkpoints for best -performance. - -When **loading** using ``smp.load()``, the library can load either partial or | -full checkpoints or full checkpoints saved by a non-model-parallel model. If you -want to resume training with a non-model-parallel model or do inference, you need -a full checkpoint. - -The following is an example of how you can save and load a checkpoint: - -.. code:: python - - # Original model and optimizer - model = MyModel(...) - optimizer = MyOpt(...) 
- - # model parallel wrapper - model = smp.DistributedModel(model) - optimizer = smp.DistributedOptimizer(optimizer) - - # To save, always save on dp_rank 0 to avoid data racing - if partial: -     # To save the partial model on each mp rank -     # the library will create `checkpoint.pt_{mprank}` for each mp rank -     if save_partial_model: -         if smp.dp_rank() == 0: -             model_dict = model.local_state_dict() # save the partial model -             opt_dict = optimizer.local_state_dict() # save the partial optimizer state -             smp.save( -                 {"model_state_dict": model_dict, "optimizer_state_dict": opt_dict}, -                 f"/checkpoint.pt", -                 partial=True, -             ) - -     # To save the full model -     if save_full_model: -         if smp.dp_rank() == 0: -             model_dict = model.state_dict() # save the full model -             opt_dict = optimizer.state_dict() # save the full optimizer state -             smp.save( -                 {"model_state_dict": model_dict, "optimizer_state_dict": opt_dict}, -                 "/checkpoint.pt", -                 partial=False, -             ) - - # To load, load on all ranks. - # The only difference for partial/full loading is the partial flag in smp.load - # Load partial checkpoint - if partial_checkpoint: -    checkpoint = smp.load("/checkpoint.pt", partial=True) -    model.load_state_dict(checkpoint["model_state_dict"]) -    optimizer.load_state_dict(checkpoint["optimizer_state_dict"]) - # Load full checkpoint - if full_checkpoint: -    checkpoint = smp.load("/checkpoint.pt", partial=False) -    model.load_state_dict(checkpoint["model_state_dict"]) -    optimizer.load_state_dict(checkpoint["optimizer_state_dict"]) diff --git a/doc/api/training/smp_versions/v1.9.0/smd_model_parallel_pytorch_tensor_parallel.rst b/doc/api/training/smp_versions/v1.9.0/smd_model_parallel_pytorch_tensor_parallel.rst deleted file mode 100644 index c66595ddf2..0000000000 --- a/doc/api/training/smp_versions/v1.9.0/smd_model_parallel_pytorch_tensor_parallel.rst +++ /dev/null @@ -1,876 +0,0 @@ -.. _smdmp-pytorch-tensor-parallel: - :noindex: - -PyTorch API for Tensor Parallelism -================================== - -SageMaker distributed tensor parallelism works by replacing specific submodules -in the model with their distributed implementations. The distributed modules -have their parameters and optimizer states partitioned across tensor-parallel -ranks. This is to compute the same output as it would have been computed by -the original modules. Since tensor parallelism occurs across data-parallel -ranks, a rank might collect slices of the activations corresponding to the -data shards on other devices that are part of the same tensor parallelism group. - -You can enable or disable tensor parallelism for specific parts of the model. -Within the enabled parts, the replacements with distributed modules will take -place on a best-effort basis for those module supported for tensor parallelism. -Alternatively, you can directly import and use the library’s distributed -modules in the model definition. - -Some of the supported modules (such as ``smp.nn.Transformer``) are high-level -blocks that contain many operations. Because custom implementations -(as opposed to the built-in PyTorch modules) are typically used for these -high-level blocks, the library offers an API that you can use to register -specific distributed versions with such custom modules (provided that they -are functionally equivalent). 
This allows the library to automatically replace -the occurrences of such PyTorch modules with their distributed counterparts -provided by the library. -For more information, see the following topics. - -.. contents:: Topics - :depth: 3 - :local: - -.. _registering-tp-modules: - :noindex: - -Registering Tensor Parallelism Distributed Modules --------------------------------------------------- - -Although PyTorch natively provides some of the commonly used (and -tensor-parallelizable) building blocks such as Transformer, users often -use custom implementations for such higher-level modules. To distribute -such modules with tensor parallelism, you need to register the -distributed modules to the custom module implementation in your class, -so that the library knows how to distribute the custom module. When you -register the distributed modules, make sure the custom module that you -use is functionally equivalent to the distributed module. You can verify -this by taking a look at the equivalent reference implementations in the -:ref:`smdmp-tp-appendix`. -These implementations are functionally equivalent to their distributed -versions in ``smp.nn`` module. - -.. decorator:: @smp.tp_register(dist_module, init_hook=None, forward_hook=None, return_hook=None) - - - A class decorator that registers the ``dist_module`` class with - the module class that it is attached to. The hooks can be used to - adapt to different interfaces used with ``__init__`` and - ``forward`` methods. - - **Arguments:** - - - ``dist_module``: A subclass of ``smp.nn.DistributedModule`` - that implements the distributed version of the module class the - decorator is attached to. Any distributed module class defined - in ``smp.nn`` module can be used. - - ``init_hook``: A callable that translates the arguments of the - original module ``__init__`` method to an ``(args, kwargs)`` - tuple compatible with the arguments of the corresponding - distributed module ``__init__`` method. Must return a tuple, - whose first element is an iterable representing the positional - arguments, and second element is a ``dict`` representing the - keyword arguments. The input signature of the ``init_hook`` - must **exactly** match the signature of the original - ``__init__`` method (including argument order and default - values), except it must exclude ``self``. - - ``forward_hook``: A callable that translates the arguments of - the original module ``forward`` method to an ``(args, kwargs)`` - tuple compatible with the arguments of the corresponding - distributed module ``forward`` method. Must return a tuple, - whose first element is an iterable representing the positional - arguments, and second element is a ``dict`` representing the - keyword arguments. The input signature of the ``init_hook`` - must **exactly** match the signature of the original - ``forward`` method (including argument order and default - values), except it must exclude ``self``. - - ``return_hook``: A callable that translates the object returned - from the distributed module to the return object expected of - the original module. - - - **Example:** - - .. code:: python - - init_hook = lambda config: ((), config.to_dict()) - - # register smp.nn.DistributedTransformer - # as the distributed version of MyTransformer - @smp.tp_register(smp.nn.DistributedTransformer, init_hook=init_hook) - class MyTransformer(nn.Module): - def __init__(self, config): - ... - - def forward(self, hidden_states, attention_mask): - ... - -.. 
function:: smp.tp_register_with_module(module_cls, dist_module, init_hook=None, forward_hook=None, return_hook=None) - :noindex: - - - When you do not have direct access to model definition code, you - can use this API to similarly register a distributed module with - an existing module class. - - - **Arguments:** - - - ``module_cls``: The existing module class that will be - distributed. - - ``dist_module``: A subclass of ``smp.nn.DistributedModule`` - that implements the distributed version of the module class the - decorator is attached to. Any distributed module class defined - in ``smp.nn`` module can be used. - - ``init_hook``: A callable that translates the arguments of the - original module ``__init__`` method to an ``(args, kwargs)`` - tuple compatible with the arguments of the corresponding - distributed module ``__init__`` method. Must return a tuple, - whose first element is an iterable representing the positional - arguments, and second element is a ``dict`` representing the - keyword arguments. The input signature of the ``init_hook`` - must **exactly** match the signature of the original - ``__init__`` method (including argument order and default - values), except it must exclude ``self``. - - ``forward_hook``: A callable that translates the arguments of - the original module ``forward`` method to an ``(args, kwargs)`` - tuple compatible with the arguments of the corresponding - distributed module ``forward`` method. Must return a tuple, - whose first element is an iterable representing the positional - arguments, and second element is a ``dict`` representing the - keyword arguments. The input signature of the ``init_hook`` - must **exactly** match the signature of the original - ``forward`` method (including argument order and default - values), except it must exclude ``self``. - - ``return_hook``: A callable that translates the object returned - from the distributed module to the return object expected of - the original module. - - - **Example:** - - .. code:: python - - from somelibrary import MyTransformer - - init_hook = lambda config: ((), config.to_dict()) - - # register smp.nn.DistributedTransformer as the distributed version of MyTransformer - smp.tp_register_with_module(MyTransformer, - smp.nn.DistributedTransformer, - init_hook=init_hook) - -.. _smdmp-supported-modules-for-tp: - :noindex: - -Supported Modules for Tensor Parallelism ----------------------------------------- - -The following modules are supported for tensor -parallelism. - -- ``smp.nn.DistributedLinear`` (implements ``nn.Linear``) -- ``smp.nn.DistributedTransformerLMHead`` -- ``smp.nn.DistributedTransformer`` -- ``smp.nn.DistributedTransformerLayer`` -- ``smp.nn.DistributedAttentionLayer`` -- ``smp.nn.DistributedTransformerOutputLayer`` -- ``smp.nn.DistributedEmbedding`` - -.. contents:: Topics - :depth: 3 - :local: - -.. _tp-module-api: - :noindex: - -Tensor Parallelism Module APIs -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. class:: smp.nn.DistributedLinear(in_features, out_features) - :noindex: - - - Tensor-parallel implementation of the ``nn.Linear`` class. - Functionally equivalent to an ``nn.Linear`` module with the same - ``in_features`` and ``out_features``. In other words, - ``in_features`` and ``out_features`` are the number of *global* - channels across tensor-parallel ranks. - - **Arguments:** - - - ``in_features``: The total number of input channels for the - linear layer across all tensor-parallel ranks. 
- - ``out_features``: The total number of output channels for the - linear layer across all tensor-parallel ranks. - -.. class:: smp.nn.DistributedTransformerLMHead(num_layers=12, num_attention_heads=32, attention_head_size=32, hidden_size=1024, intermediate_size=4096, vocab_size=30522, num_positions=1024, attention_dropout_prob=0.1, hidden_dropout_prob=0.1, activation="gelu", layernorm_epsilon=1e-5, num_token_types=0, causal_mask_size=None, add_cross_attention=False, add_lm_head=True, initializer_range=0.02, use_normal_initialization=False, pre_layernorm=False, post_layernorm=True) - :noindex: - - - Constructs a distributed transformer model, including embeddings - and a single LM head. A word embedding of size - ``(vocab_size, hidden_size)`` is created, as well as a positional - embedding of size ``(num_positions, hidden_size)``, and the - embeddings are added together. If ``num_token_types`` is larger - than 0, a separate embedding of size - ``(num_token_types, hidden_size)`` is created, and further added - on top. - - The embeddings are fed through a ``DistributedTransformer``, and - if ``add_lm_head`` is ``True``, the output passes through a single - LM head, which is a linear module without bias whose weight is - tied to the word embeddings. - - See :class:`smp.nn.DistributedTransformerLayer` for descriptions of the rest - of the arguments. - - **Methods:** - - - ``forward(self, inputs)`` - - - If ``add_cross_attention`` is ``True``, ``inputs`` must be a - tuple - ``(input_ids, attention_mask, token_type_ids, position_ids, cross_states, cross_states, cross_mask, labels)``. - - Otherwise, ``inputs`` must be a tuple - ``(input_ids, attention_mask, token_type_ids, position_ids, labels)``. - - If ``token_type_ids`` is ``None``, token type embedding will - not be used. - - ``input_ids`` is assumed to be of shape ``[N, S]``, where - ``N`` is the batch size and ``S`` is sequence length. - - ``attention_mask`` is assumed to be a 0-1 tensor of shape - ``[N, S]``, where 1 represents a masked position. - -.. class:: smp.nn.DistributedTransformer(num_layers=12, num_attention_heads=32, attention_head_size=32, hidden_size=1024, intermediate_size=4096, attention_dropout_prob=0.1, hidden_dropout_prob=0.1, activation="gelu", layernorm_epsilon=1e-5, initializer_range=0.02, use_normal_initialization=False, causal_mask_size=None, add_cross_attention=False, pre_layernorm=False, post_layernorm=True) - :noindex: - - - A sequence of ``smp.nn.DistributedTransformerLayer``\ s, whose - number is given by ``num_layers`` argument. For the other - arguments and methods, refer to - ``smp.nn.DistributedTransformerLayer``. - - If both ``pre_layernorm`` and ``post_layernorm`` are ``True``, - layer normalization is applied to both the input and the output of - the ``DistributedTransformer``, in addition to the intermediate - attention and transformer-output layers. - -.. class:: smp.nn.DistributedTransformerLayer(num_attention_heads=32, attention_head_size=32, hidden_size=1024, intermediate_size=4096, attention_dropout_prob=0.1, hidden_dropout_prob=0.1, activation="gelu", layernorm_epsilon=1e-5, initializer_range=0.02, use_normal_initialization=False, causal_mask_size=None, add_cross_attention=False, pre_layernorm=False, post_layernorm=True) - :noindex: - - - Tensor-parallel implementation of a single transformer layer. - Number of attention heads, hidden size, and intermediate size - refer to the global quantities across all tensor-parallel ranks. 
- - **Arguments:** - - - ``num_attention_heads``: The total number of attention heads - across tensor-parallel ranks - - ``attention_head_size``: The number of channels of a single - attention head. - - ``hidden_size``: The hidden dimension of the transformer. The - input tensor ``hidden_states`` is assumed to have its last - dimension size equal to ``hidden_size``. - - ``intermediate_size``: The number of output channels in the - first linear transformation of the transformer output layer. - ``DistributedTransformerOutputLayer`` first maps - ``hidden_size`` dimensions of its input tensor into - ``intermediate_size`` dimensions, and then maps it back into - ``hidden_size`` dimensions. - - ``attention_dropout_prob``: The dropout probability applied to - the attention probabilities. - - ``hidden_dropout_prob``: The dropout probability used in - dropout layers other than the one applied to the attention - probabilities. - - ``activation``: Choice of activation function to use at the - output layer. Must be ``"gelu"`` or ``"relu"``. - - ``layernorm_epsilon``: The epsilon added to the denominator of - layer normalization for numerical stability. - - ``initializer_range``: If ``use_normal_initialization`` is - ``True``, the standard deviation of the normal random variable - to initialize the weights with. - - ``use_normal_initialization``: If ``True``, the weights are - initialized with normal distribution with standard deviation - given by ``initializer_range``. Otherwise, default PyTorch - initialization is used. - - ``causal_mask_size``: If ``None``, no causal mask is used on - attentions. Otherwise, should be set to maximum sequence length - to apply a causal mask to the attention scores. This is used, - for instance, in GPT-2. - - ``add_cross_attention``: If ``True``, a cross-attention layer - will be added after the self-attention block. The - cross-attention layer computes the attention keys and values - based on the ``cross_states`` input (instead of - ``hidden_states`` input, as in self-attention. This is used in - the decoder block of encoder-decoder architectures. For - encoder-only architectures that only use self-attention, this - should be kept ``False``. - - ``pre_layernorm``: If ``True``, inserts layer normalization at - the input. At least one of ``pre_layernorm`` and - ``post_layernorm`` must be ``True``. - - ``post_layernorm``: If ``True``, inserts layer normalization at - the output. At least one of ``pre_layernorm`` and - ``post_layernorm`` must be ``True``. - - - **Methods:** - - - ``forward(self, inputs)``: Forward pass for the transformer - layer. - - - **Arguments:** - - - If ``add_cross_attention=False``, ``inputs`` must be a - tuple ``(hidden_states, attention_mask)``, where - ``hidden_states`` is assumed to be a tensor of dimensions - ``[N, S, H]``, where ``N`` is batch size, ``S`` is - sequence length, and ``H`` is ``hidden_size``. - ``attention_mask`` is assumed to be a tensor of - dimensions ``[N, 1, 1, S]``, where ``N`` is the batch - size, and ``S`` is the sequence length. - - If ``add_cross_attention=True``, ``inputs`` must be a - tuple - ``(hidden_states, cross_states, attention_mask, cross_mask)``, - where ``hidden_states`` is assumed to be a tensor of - dimensions ``[N, S_1, H]``, where ``N`` is batch size, - ``S_1`` is sequence length, and ``H`` is ``hidden_size``. - ``cross_states`` is assumed to be a tensor of size - ``[N, S_2, H]``, similarly interpreted. 
- ``attention_mask`` is assumed to be a tensor of - dimensions ``[N, 1, 1, S_1]``, where ``N`` is the batch - size, and ``S_1`` is the sequence length, and - ``cross_mask`` is assumed to be a tensor of size - ``[N, 1, 1, S_2]``. Keys and values for the attention - heads in the cross-attention layer (but not the - self-attention layer) are computed using - ``cross_states``, and ``cross_mask`` is applied as the - attention mask in the cross-attention layer (but not the - self-attention layer). - - - **Returns:** - - - If ``add_cross_attention=False``, a tuple - ``(hidden_states, attention_mask)``, where - ``hidden_states`` is the output of the transformer, and - ``attention_mask`` is the same the ``attention_mask`` - argument. - - If ``add_cross_attention=True``, a tuple - ``(hidden_states, cross_states, attention_mask, cross_mask)``, - where ``hidden_states`` is the output of the transformer, - and the next three tensors are the same as the input - arguments. - -.. class:: smp.nn.DistributedAttentionLayer(num_attention_heads=32, attention_head_size=32, hidden_size=1024, attention_dropout_prob=0.1, hidden_dropout_prob=0.1, layernorm_epsilon=1e-5, initializer_range=0.02, use_normal_initialization=False, cross_attention=False, causal_mask_size=None, pre_layernorm=False, post_layernorm=True) - :noindex: - - - A distributed implementation for the attention block. Includes the - computation of the self- or cross-attention (context layer), - followed by a linear mapping and dropout, which is optionally - followed by the residual-connection and layer normalization. - - **Arguments:** - - - See :class:`smp.nn.DistributedTransformerLayer` for descriptions of the - arguments. - - ``cross_attention``: If ``True``, it computes the attentions - with respect to the ``cross_states`` tensor of the ``forward`` - method input tuple. (Default: ``False``) - - - **Methods:** - - - ``forward(self, inputs)``: Forward pass for the attention - layer. - - - **Arguments:** - - - If ``cross_attention=False``, ``inputs`` must be a tuple - ``(hidden_states, attention_mask)``, where - ``hidden_states`` is assumed to be a tensor of dimensions - ``[N, S, H]``, where ``N`` is batch size, ``S`` is - sequence length, and ``H`` is ``hidden_size``. - ``attention_mask`` is assumed to be a tensor of - dimensions ``[N, 1, 1, S]``, where ``N`` is the - batch size, and ``S`` is the sequence length. - - If ``cross_attention=True``, ``inputs`` must be a tuple - ``(hidden_states, cross_states, attention_mask)``, where - ``hidden_states`` is assumed to be a tensor of dimensions - ``[N, S_1, H]``, where ``N`` is batch size, ``S_1`` is - sequence length, and ``H`` is ``hidden_size``. - ``cross_states`` is assumed to be a tensor of size - ``[N, S_2, H]``, similarly interpreted. - ``attention_mask`` is assumed to be a tensor of - dimensions ``[N, 1, 1, S_2]``, where ``N`` is the batch - size, and ``S_2`` is the sequence length. Keys and values - for the attention heads are computed using - ``cross_states``. - - - **Returns:** - - - A single tensor that is the output of the attention - layer. - -.. class:: smp.nn.DistributedTransformerOutputLayer(hidden_size=1024, intermediate_size=4096, hidden_dropout_prob=0.1, activation="gelu", layernorm_epsilon=1e-5, initializer_range=0.02, use_normal_initialization=False, pre_layernorm=False, post_layernorm=True, fp32_residual_addition=False) - :noindex: - - - Distributed implementation of a single transformer output layer. 
A - single :class:`smp.nn.DistributedTransformerLayer` with - ``add_cross_attention=False`` consists of a single - ``DistributedAttentionLayer`` immediately followed by a single - ``DistributedTransformerOutputLayer``. The latter linearly maps - the last channel of the input tensor from ``hidden_size`` to - ``intermediate_size``, and then maps it back to ``hidden_size``. - - **Arguments:** - - - See :class:`smp.nn.DistributedTransformerLayer` for descriptions of the - arguments. - - ``fp32_residual_addition``: Set to ``True`` if you want to avoid overflow - (NaN loss values) for large models with more than 100 billion parameters - when using FP16. (Default: False) - -.. class:: smp.nn.DistributedEmbedding(num_embeddings,embedding_dim, padding_idx=None, max_norm=None, norm_type=2.0, scale_grad_by_freq=False, sparse=False, _weight=None, initializer_range=0.02, _skip_allgather=False,_skip_scatter_and_merge=False,) - :noindex: - - - Distributed implementation of a single Embedding Layer. Currently - only supports splitting across the embedding_dim. - - **Arguments:** - - - See :class:`smp.nn.DistributedEmbedding` for descriptions of the - arguments. - -.. _enabling-tp: - :noindex: - -Enabling Tensor Parallelism -~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -There are two ways tensor parallelism can be enabled. - -First, you can use -the distributed module implementations in ``smp.nn`` module directly in -your model definition. See :ref:`smdmp-supported-modules-for-tp` -for a complete list of built-in distributed modules. Here is an example -of how this can be done: - -.. code:: python - - import torch.nn as nn - import smdistributed.modelparallel.torch as smp - - class TransformerModel: - def __init__(self): - self.embedding = nn.Embedding(vocab_size, hidden_size) - - # directly instantiate smp.nn.DistributedTransformer and use it - self.encoder = smp.nn.DistributedTransformer(num_layers, hidden_size, **kwargs) - - self.pooler = nn.Linear(hidden_size, hidden_size) - - def forward(self, hidden_states): - emb_out = self.embedding(hidden_states) - enc_out = self.encoder(emb_out) - return self.pooler(enc_out) - -Second, you can enable tensor parallelism for specific modules or blocks -of code, which will automatically enable tensor parallelism for the -supported modules within that scope. To do this, you can use the -following API: - -.. decorator:: smp.tensor_parallelism(enabled=True, **kwargs) - :noindex: - - - A context manager that enables or disables tensor parallelism for - any supported module that is created inside. If there are nested - contexts, the innermost overrides the rest. If there are - multiple supported modules created within the context, where one - is the submodule of the other, only the outermost module will be - distributed. If a supported module shares weights with another - (supported or unsupported) module, or if its hyperparameters do - not support distribution (e.g., not divisible by the tensor - parallelism degree), tensor parallelism will **not** be enabled - for this module even if this API is used. - - **Example:** - - .. code:: python - - with smp.tensor_parallelism(): - self.m0 = nn.Linear(20, 20) # will be distributed - with smp.tensor_parallelism(enabled=False): - self.m1 = nn.Linear(20, 20) # will not be distributed - - - ``kwargs`` - Keyword arguments that can be used to modify the configurations of - the distributed modules created inside the context. 
- If a keyword argument provided through it matches any ``__init__`` method arguments - of a ``DistributedModule`` that substitutes a module created inside - the ``smp.tensor_parallelism`` context, this keyword will override - the value defined in the ``init_hook``. - - - (*For v1.7.0 and later*) Through the following additional keyword arguments, - the library supports `NVIDIA Megatron’s fused kernels - `_ - - - ``fused_softmax`` (bool) - Fusion of attention masking and softmax. - By default, it is set to ``True``. You can deactivate it by setting - ``fused_softmax=False`` in the ``smp.tensor_parallelism`` context manager. - - ``fused_bias_gelu`` (bool) - Fusion of bias addition and Gelu activation. - By default, it is set to ``False``. You can activate it by setting - ``fused_bias_gelu=True`` in the ``smp.tensor_parallelism`` context manager. - - - -.. function:: smp.set_tensor_parallelism(module, enabled=True, **kwargs) - :noindex: - - - Enables or disables tensor parallelism for the supported - submodules of ``module``. If enabling, the outermost supported - modules will be distributed. If disabling, tensor parallelism will - be disabled for the entire module subtree of ``module``. Unlike - the context manager, this API can be used after the model creation - (but before wrapping with :class:`smp.DistributedModel`), so direct - access to model definition code is not required. If a supported - module shares weights with another (supported or unsupported) - module, or if its hyperparameters do not support distribution - (e.g., not divisible by the tensor parallelism degree), tensor - parallelism will **not** be enabled for this module. - - Keyword arguments ``kwargs`` can be used to modify the - configurations of the distributed modules created inside the - context. If a keyword argument provided here matches any - ``__init__`` method arguments of a :class:`smp.DistributedModel` that - substitutes a module created inside the ``smp.tensor_parallelism`` - context, this keyword will override the value defined in the - ``init_hook``. - - **Example:** - - .. code:: python - - model = MyModel() - smp.set_tensor_parallelism(model.encoder, True) - smp.set_tensor_parallelism(model.encoder.embedding, True) - - # outermost supported submodules in model.encoder will be distributed, except for - # model.encoder.embedding - model = smp.DistributedModel(model) - optimizer = smp.DistributedOptimizer(optimizer) - -.. _activation-checkpointing-api: - :noindex: - -Activation Checkpointing APIs ------------------------------ - -``smdistributed.modelparallel`` provides three APIs to enable -activation checkpointing: one for checkpointing modules, -one for checkpointing sequential modules, and -one for checkpointing pretrained models. - -For a conceptual guide and examples, see -`Activation Checkpointing `_ -in the *SageMaker's Distributed Model Parallel developer guide*. - -.. class:: smdistributed.modelparallel.torch.patches.checkpoint.checkpoint(module, *args, preserve_rng_state=True) - :noindex: - - - Checkpoints the module passed. Throws error if, during manual - partitioning, all children of module are not on same rank as the - module itself, i.e. the module tree is split across multiple - partitions. During auto-partitioning, if the module is split - across multiple partitions, then this call is ignored(with a - warning). Note that this call applies to the module instance only, - not to the module class. - - - **Arguments:** - - - ``module (Instance of nn.Module)``: The module to be - checkpointed. 
Note that unlike native checkpointing in - PyTorch’s, activation checkpointing in - ``smdistributed.modelparallel`` is at the granularity of a - module. A generic function cannot be passed here. - - ``args``: Tuple containing inputs to the module. - - ``preserve_rng_state (bool, default=True)``: Omit stashing and - restoring the RNG state during each checkpoint. - -.. class:: smdistributed.modelparallel.torch.patches.checkpoint.checkpoint_sequential(sequential_module, input, strategy="each", preserve_rng_state=True, pack_args_as_tuple=False) - :noindex: - - - Checkpoints the modules inside - `nn.Sequential `__. - This can be used even if different layers that are part of the - sequential container lie on different partitions. Each layer part - of the sequential module that is checkpointed must lie completely - within one partition. If this is not the case during manual - partitioning, then an error will be thrown. If this is not the - case during auto partitioning, a warning will be raised and this - module will be run without checkpointing. - - - **Arguments** - - - ``sequential_module (nn.Sequential)``: the sequential module to - be checkpointed. - - ``input (torch.Tensor or a tuple of torch.Tensors)``: input to - the module, which can be a tensor or a tuple of tensors. If a - tuple is passed, then pack_args_as_tuple should be set to True. - - ``strategy (string, default=“each”)`` : Strategy determines how - many layers part of the sequential module need to be grouped - together for one checkpointing call. This determines how much - memory can be reduced. It can take the following values - - - ``each`` : The default is to checkpoint each module inside - the sequential separately. - - ``contiguous``: Groups consecutive layers on the same - partition together. For example, if a sequential consists of - [a, b, c, d] where a,b are on pp_rank0 and c,d are on - pp_rank 1, then this strategy would checkpoint a,b together - and then c,d together. This means effectively, inputs of a, - outputs of b, inputs of c, and outputs of d are in memory; - the reamining activations are recomputed. - - ``group_2, group_3, group_4, etc:`` More generally, - ``group_x`` where x is an integer. This strategy provides - more flexibility in how many layers to group together. - ``group_x`` groups x layers together on a best effort basis. - It can group x layers together if there are x layers - consecutively on the same partition. For example: - [a,b,c,d,e] where a,b are on pp_rank0 and c,d,e are on - pp_rank 1. If the strategy is ``group_3,`` then a,b are - checkpointed together on pp_rank0 and c,d,e are checkpointed - together on pp_rank1. - - - ``preserve_rng_state (bool, default=True)``: Set to ``False`` - to omit stashing and restoring the RNG state during each - checkpoint. - - ``pack_args_as_tuple (bool, default=False)``: To ensure that - backward works correctly, the autograd function has to unpack - any tuples received. If the checkpointed layer takes a tuple as - input, then this needs to be set to True. - -.. class:: smp.set_activation_checkpointing(module, preserve_rng_state=True, pack_args_as_tuple=False, strategy="each") - :noindex: - - - This API is recommended when importing pretrained models from - libraries, such as PyTorch and Hugging Face Transformers. This is - particularly useful when you don’t have access to the model - definition code and not be able to replace a module call with - checkpoint. 
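   For reference, a minimal usage sketch. The model and submodule names below are
   illustrative assumptions only; the call signature is the one documented for this API.

   .. code:: python

      import smdistributed.modelparallel.torch as smp

      model = load_pretrained_model()   # assumed: returns an nn.Module whose source you cannot edit

      # checkpoint a single submodule without touching its definition
      smp.set_activation_checkpointing(model.encoder)   # "encoder" is an assumed attribute name

      # for an nn.Sequential submodule, a grouping strategy can also be passed
      smp.set_activation_checkpointing(model.decoder.layers, strategy="group_2")  # assumed names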
- - - **Arguments**: - - - ``module (Instance of nn.Module or nn.Sequential)``: The module - to checkpoint. - - ``preserve_rng_state (bool, default=True)``: Set to ``False`` - to omit stashing and restoring the RNG state during each - checkpoint. - - ``pack_args_as_tuple (bool, default=False)``: *Can only be - passed when module is a sequential module.* To ensure that - backward works correctly, the autograd function has to unpack - any tuples received. If the layer checkpointed takes a tuple as - input, then this needs to be set to True. - - ``strategy: (string, default=“each”)``: *Can only be passed - when module is a sequential module.* Strategy determines how - many layers part of the sequential module need to be grouped - together for one checkpointing call. - - This determines how much memory can be reduced. It can take the - following values - - - ``each`` : The default is to checkpoint each module inside - the sequential separately. - - ``contiguous``: Groups consecutive layers on the same - partition together. For example if a sequential consists of - ``[a, b, c, d]`` where ``a, b`` are on ``pp_rank0`` and ``c, d`` are on - ``pp_rank 1``, then this strategy would checkpoint a,b together - and then ``c, d`` together. This means effectively, the inputs of - ``a``, outputs of ``b``, inputs of ``c``, and outputs of ``d`` are in - memory, and the rest of the activations are recomputed. - - ``group_2, group_3, group_4, etc:`` More generally, - ``group_x`` where x is an integer. This strategy provides - more flexibility in how many layers to group together. - ``group_x`` groups x number of layers together on a best - effort basis if there are x layers consecutively in the same - partition. **Example**: Assume a module with layers ``[a, b, - c, d, e]``. The layers a and b are on pp_rank0, and ``c``, ``d``, and - ``e`` are on ``pp_rank 1``. If the strategy is ``group_3,`` then ``a``, - ``b`` are checkpointed together on ``pp_rank0``, and ``c``, ``d``, ``e`` are - checkpointed together on ``pp_rank1``. - -.. _smdmp-tp-appendix: - :noindex: - -Appendix: Reference Implementations for Modules ------------------------------------------------ - -The following are reference implementations for transformer-related -modules. Note that this is not the actual ``smdistributed`` source code, -but the distributed implementations provided in the library are the -distributed versions of these reference implementations, and can be used -to determine whether the distributed modules perform the same operations -as the custom modules in your script. - -To keep the implementations simple, we only assume keyword arguments, -and assume the existence of a method ``parse_args(kwargs)``, which -parses the arguments to ``__init__`` methods and sets the relevant -attributes of the module, such as ``hidden_size`` and -``num_attention_heads``. - -``smp.nn.DistributedTransformer`` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. code:: python - - class Transformer(nn.Module): - def __init__(self, **kwargs): - super(Transformer, self).__init__() - self.parse_args(kwargs) - - self.layers = [] - for l in range(self.num_layers): - self.layers.append(TransformerLayer(**kwargs)) - - self.seq_layers = nn.Sequential(*self.layers) - - def forward(self, inp): - return self.seq_layers(inp) - -``smp.nn.DistributedTransformerLayer`` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. 
code:: python - - class TransformerLayer(nn.Module): - def __init__(self, **kwargs): - super(TransformerLayer, self).__init__() - self.parse_args(kwargs) - - self.attention = AttentionLayer(**kwargs) - self.output = TransformerOutputLayer(**kwargs) - - if self.add_cross_attention: - self.cross_attention = AttentionLayer(cross_attention=True, **kwargs) - - def forward(self, inp): - if self.add_cross_attention: - hidden_states, cross_states, attention_mask, cross_mask = inp - else: - hidden_states, attention_mask = inp - - attention_output = self.attention((hidden_states, attention_mask)) - if self.add_cross_attention: - attention_output = self.cross_attention((attention_output, - cross_states, - cross_mask)) - - output = self.output(attention_output) - - if self.add_cross_attention: - return output, cross_states, attention_mask, cross_mask - else: - return output, attention_mask - -``smp.nn.DistributedAttentionLayer`` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. code:: python - - class AttentionLayer(nn.Module): - def __init__(self, **kwargs): - super(AttentionLayer, self).__init__() - self.parse_args(kwargs) - self.attention_head_size = self.hidden_size // self.num_attention_heads - - self.query = nn.Linear(self.hidden_size, self.hidden_size) - self.key = nn.Linear(self.hidden_size, self.hidden_size) - self.value = nn.Linear(self.hidden_size, self.hidden_size) - self.dense = nn.Linear(self.hidden_size, self.hidden_size) - - self.dropout1 = nn.Dropout(self.attention_dropout_prob) - self.dropout2 = nn.Dropout(self.hidden_dropout_prob) - - if self.pre_layernorm: - self.pre_layernorm = nn.LayerNorm(self.hidden_size, - eps=self.layernorm_epsilon) - - if self.post_layernorm: - self.layernorm = nn.LayerNorm(self.hidden_size, - eps=self.layernorm_epsilon) - - def transpose(self, tensor, key=False): - shape = tensor.size()[:-1] + - (self.num_attention_heads, self.attention_head_size) - tensor = torch.reshape(tensor, shape) - if key: - return tensor.permute(0, 2, 3, 1) - else: - return tensor.permute(0, 2, 1, 3) - - def forward(self, inp): - if self.cross_attention: - hidden_states, cross_states, attention_mask = inp - else: - hidden_states, attention_mask = inp - - if self.pre_layernorm: - norm_states = self.pre_layernorm(hidden_states) - else: - norm_states = hidden_states - - query_layer = self.query(norm_states) - - if self.cross_attention: - key_layer = self.key(cross_states) - value_layer = self.value(cross_states) - else: - key_layer = self.key(norm_states) - value_layer = self.value(norm_states) - - query_layer = self.transpose(query_layer) - key_layer = self.transpose(key_layer, key=True) - value_layer = self.transpose(value_layer) - - attention_scores = torch.matmul(query_layer, key_layer) - attention_scores = attention_scores / math.sqrt(self.attention_head_size) - - if not self.cross_attention and self.causal_mask is not None: - attention_scores = self.apply_causal_mask(attention_scores) - - attention_scores = attention_scores + attention_mask - - attention_probs = F.softmax(attention_scores, dim=-1) - attention_probs = self.dropout1(attention_probs) - - context_layer = torch.matmul(attention_probs, value_layer) - context_layer = context_layer.permute(0, 2, 1, 3) - new_context_layer_shape = context_layer.size()[:-2] + \ - (self.local_attention_size,) - context_layer = torch.reshape(context_layer, new_context_layer_shape) - - self_attention = self.dense(context_layer) - self_attention = self.dropout2(self_attention) - - if self.post_layernorm: - return self.layernorm(self_attention + 
hidden_states) - else: - return self_attention - -``smp.nn.DistributedTransformerOutputLayer`` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. code:: python - - class TransformerOutputLayer(nn.Module): - def __init__(self, **kwargs): - super(TransformerOutputLayer, self).__init__() - self.parse_args(kwargs) - - self.dense1 = nn.Linear(self.hidden_size, self.intermediate_size) - self.dense2 = nn.Linear(self.intermediate_size, self.hidden_size) - - self.dropout = nn.Dropout(self.attention_dropout_prob) - - if self.pre_layernorm: - self.pre_layernorm = nn.LayerNorm(self.hidden_size, - eps=self.layernorm_epsilon) - - if self.post_layernorm: - self.layernorm = nn.LayerNorm(self.hidden_size, - eps=self.layernorm_epsilon) - - def forward(self, inp): - if self.pre_layernorm: - norm_inp = self.pre_layernorm(inp) - else: - norm_inp = inp - - dense1_output = self.dense1(norm_inp) - if self.activation == "gelu": - act_output = F.gelu(dense1_output) - else: - act_output = F.relu(dense1_output) - - dense2_output = self.dense2(act_output) - output = self.dropout(dense2_output) - - if self.post_layernorm: - return self.layernorm(inp + output) - else: - return output diff --git a/doc/api/training/smp_versions/v1.9.0/smd_model_parallel_tensorflow.rst b/doc/api/training/smp_versions/v1.9.0/smd_model_parallel_tensorflow.rst deleted file mode 100644 index 2c658b487c..0000000000 --- a/doc/api/training/smp_versions/v1.9.0/smd_model_parallel_tensorflow.rst +++ /dev/null @@ -1,171 +0,0 @@ -TensorFlow API -============== - -To use the TensorFlow-specific APIs for SageMaker distributed model parallism, -you need to add the following import statement at the top of your training script. - -.. code:: python - - import smdistributed.modelparallel.tensorflow as smp - -.. tip:: - - Refer to - `Modify a TensorFlow Training Script - `_ - to learn how to use the following APIs in your TensorFlow training script. - -.. class:: smp.DistributedModel - :noindex: - - A sub-class of the Keras \ ``Model`` class, which defines the model to - be partitioned. Model definition is done by sub-classing - ``smp.DistributedModel`` class, and implementing the ``call()`` method, - in the same way as the Keras model sub-classing API. Any operation that - is part of the \ ``smp.DistributedModel.call()`` method is subject to - partitioning, meaning that every operation placed inside executes in - exactly one of the devices (the operations outside run on all devices). - - - Similar to the regular Keras API, the forward pass is done by directly - calling the model object on the input tensors. For example: - - .. code:: python - - predictions = model(inputs)   # model is a smp.DistributedModel object - - However, ``model()`` calls can only be made inside a - ``smp.step``-decorated function. - - The outputs from a ``smp.DistributedModel`` are available in all ranks, - regardless of which rank computed the last operation. - - **Methods:** - - .. function:: save_model(save_path="/opt/ml/model") - :noindex: - - **Inputs** - - ``save_path`` (``string``): A path to save an unpartitioned model with latest training weights. - - Saves the entire, - unpartitioned model with the latest trained weights to ``save_path`` in - TensorFlow ``SavedModel`` format. Defaults to ``"/opt/ml/model"``, which - SageMaker monitors to upload the model artifacts to Amazon S3. - -.. function:: smp.partition(index) - :noindex: - - **Inputs** - - - ``index`` (``int``): The index of the partition. 
- - A context manager which places all operations defined inside into the - partition whose ID is equal to ``index``. When - ``smp.partition`` contexts are nested, the innermost context overrides - the rest. The ``index`` argument must be smaller than the number of - partitions. - - ``smp.partition`` is used in the manual partitioning API; - if \ ``"auto_partition"`` parameter is set to ``True`` while launching - training, then ``smp.partition`` contexts are ignored. Any operation - that is not placed in any ``smp.partition`` context is placed in the - ``default_partition``, as shown in the following example: - - .. code:: python - - # auto_partition: False - # default_partition: 0 - smp.init() - [...] - x = tf.constant(1.2)                     # placed in partition 0 - with smp.partition(1): -     y = tf.add(x, tf.constant(2.3))      # placed in partition 1 -     with smp.partition(3): -         z = tf.reduce_sum(y)             # placed in partition 3 - - -.. function:: register_post_partition_hook(hook) - :noindex: - - Registers a callable ``hook`` to - be executed after the model is partitioned. This is useful in situations - where an operation needs to be executed after the model partition during - the first call to ``smp.step``, but before the actual execution of the - first forward pass. - - .. code:: python - - @smp.register_post_partition_hook - def test_eager(): - # All statements here will be executed right after partition but before the first forward pass - tf.print("Entered hook through eager context") - -.. class:: smp.CheckpointManager - :noindex: - - - A subclass of TensorFlow - `CheckpointManager `__, - which is used to manage checkpoints. The usage is similar to TensorFlow - ``CheckpointManager``. - - The following returns a ``CheckpointManager`` object. - - .. code:: python - - smp.CheckpointManager(checkpoint, -                       directory="/opt/ml/checkpoints", -                       max_to_keep=None, -                       checkpoint_name="ckpt") - - **Parameters** - - - ``checkpoint``: A `tf.train.Checkpoint - `__ instance - that represents a model checkpoint. - - - ``directory``: (``str``) The path to a directory in which to write - checkpoints. A file named "checkpoint" is also written to this - directory (in a human-readable text format) which contains the state - of the ``CheckpointManager``. Defaults to - ``"/opt/ml/checkpoints"``, which is the directory that SageMaker - monitors for uploading the checkpoints to Amazon S3. - - ``max_to_keep`` (``int``): The number of checkpoints to keep. If - ``None``, all checkpoints are kept. - - ``checkpoint_name`` (``str``): Custom name for the checkpoint file. - Defaults to ``"ckpt"``. - - - **Methods:** - - .. function:: save( ) - :noindex: - - Saves a new checkpoint in the specified directory. Internally uses ``tf.train.CheckpointManager.save()``. - - .. function:: restore( ) - :noindex: - - Restores the latest checkpoint in the specified directory. - Internally uses ``tf.train.CheckpointManager.restore()``. - - - **Examples:** - - .. code:: python - - checkpoint = tf.train.Checkpoint(optimizer=optimizer, model=model) - ckpt_manager = smp.CheckpointManager(checkpoint, max_to_keep=5)  # use /opt/ml/checkpoints - - for inputs in train_ds: -     loss = train_step(inputs) -     # [...] -     ckpt_manager.save()  # save a new checkpoint in /opt/ml/checkpoints - - .. 
code:: python - - for step, inputs in enumerate(train_ds): -     if step == 0: -         ckpt_manager.restore() -     loss = train_step(inputs) diff --git a/doc/api/training/smp_versions/v1_10_0.rst b/doc/api/training/smp_versions/v1_10_0.rst deleted file mode 100644 index dc2c1d18d1..0000000000 --- a/doc/api/training/smp_versions/v1_10_0.rst +++ /dev/null @@ -1,13 +0,0 @@ - -Version 1.10.0 -============== - -To use the library, reference the Common API documentation alongside the framework specific API documentation. - -.. toctree:: - :maxdepth: 1 - - v1.10.0/smd_model_parallel_common_api - v1.10.0/smd_model_parallel_pytorch - v1.10.0/smd_model_parallel_pytorch_tensor_parallel - v1.10.0/smd_model_parallel_tensorflow diff --git a/doc/api/training/smp_versions/v1_1_0.rst b/doc/api/training/smp_versions/v1_1_0.rst deleted file mode 100644 index 34b2d83b6b..0000000000 --- a/doc/api/training/smp_versions/v1_1_0.rst +++ /dev/null @@ -1,12 +0,0 @@ - -Version 1.1.0 -============= - -To use the library, reference the Common API documentation alongside the framework specific API documentation. - -.. toctree:: - :maxdepth: 1 - - v1.1.0/smd_model_parallel_common_api - v1.1.0/smd_model_parallel_pytorch - v1.1.0/smd_model_parallel_tensorflow diff --git a/doc/api/training/smp_versions/v1_2_0.rst b/doc/api/training/smp_versions/v1_2_0.rst deleted file mode 100644 index 4201de0b52..0000000000 --- a/doc/api/training/smp_versions/v1_2_0.rst +++ /dev/null @@ -1,12 +0,0 @@ - -Version 1.2.0 -============= - -To use the library, reference the Common API documentation alongside the framework specific API documentation. - -.. toctree:: - :maxdepth: 1 - - v1.2.0/smd_model_parallel_common_api - v1.2.0/smd_model_parallel_pytorch - v1.2.0/smd_model_parallel_tensorflow diff --git a/doc/api/training/smp_versions/v1_3_0.rst b/doc/api/training/smp_versions/v1_3_0.rst deleted file mode 100644 index 80d73acbd9..0000000000 --- a/doc/api/training/smp_versions/v1_3_0.rst +++ /dev/null @@ -1,12 +0,0 @@ - -Version 1.3.x -============= - -To use the library, reference the Common API documentation alongside the framework specific API documentation. - -.. toctree:: - :maxdepth: 1 - - v1.3.0/smd_model_parallel_common_api - v1.3.0/smd_model_parallel_pytorch - v1.3.0/smd_model_parallel_tensorflow diff --git a/doc/api/training/smp_versions/v1_4_0.rst b/doc/api/training/smp_versions/v1_4_0.rst deleted file mode 100644 index 4485ae6a40..0000000000 --- a/doc/api/training/smp_versions/v1_4_0.rst +++ /dev/null @@ -1,12 +0,0 @@ - -Version 1.4.x -============= - -To use the library, reference the Common API documentation alongside the framework specific API documentation. - -.. toctree:: - :maxdepth: 1 - - v1.4.0/smd_model_parallel_common_api - v1.4.0/smd_model_parallel_pytorch - v1.4.0/smd_model_parallel_tensorflow diff --git a/doc/api/training/smp_versions/v1_5_0.rst b/doc/api/training/smp_versions/v1_5_0.rst deleted file mode 100644 index c93761efa4..0000000000 --- a/doc/api/training/smp_versions/v1_5_0.rst +++ /dev/null @@ -1,12 +0,0 @@ - -Version 1.5.x -============= - -To use the library, reference the Common API documentation alongside the framework specific API documentation. - -.. 
toctree:: - :maxdepth: 1 - - v1.5.0/smd_model_parallel_common_api - v1.5.0/smd_model_parallel_pytorch - v1.5.0/smd_model_parallel_tensorflow diff --git a/doc/api/training/smp_versions/v1_6_0.rst b/doc/api/training/smp_versions/v1_6_0.rst deleted file mode 100644 index fe02479853..0000000000 --- a/doc/api/training/smp_versions/v1_6_0.rst +++ /dev/null @@ -1,13 +0,0 @@ - -Version 1.6.0 -============= - -To use the library, reference the Common API documentation alongside the framework specific API documentation. - -.. toctree:: - :maxdepth: 1 - - v1.6.0/smd_model_parallel_common_api - v1.6.0/smd_model_parallel_pytorch - v1.6.0/smd_model_parallel_pytorch_tensor_parallel - v1.6.0/smd_model_parallel_tensorflow diff --git a/doc/api/training/smp_versions/v1_9_0.rst b/doc/api/training/smp_versions/v1_9_0.rst deleted file mode 100644 index e2e9acd83a..0000000000 --- a/doc/api/training/smp_versions/v1_9_0.rst +++ /dev/null @@ -1,13 +0,0 @@ - -Version 1.7.0, 1.8.0, 1.8.1, 1.9.0 -================================== - -To use the library, reference the Common API documentation alongside the framework specific API documentation. - -.. toctree:: - :maxdepth: 1 - - v1.9.0/smd_model_parallel_common_api - v1.9.0/smd_model_parallel_pytorch - v1.9.0/smd_model_parallel_pytorch_tensor_parallel - v1.9.0/smd_model_parallel_tensorflow diff --git a/src/sagemaker/pytorch/estimator.py b/src/sagemaker/pytorch/estimator.py index d127a2a2d6..a4e24d1ff0 100644 --- a/src/sagemaker/pytorch/estimator.py +++ b/src/sagemaker/pytorch/estimator.py @@ -107,57 +107,65 @@ def __init__( If ``framework_version`` or ``py_version`` are ``None``, then ``image_uri`` is required. If also ``None``, then a ``ValueError`` will be raised. - distribution (dict): A dictionary with information on how to run distributed training - (default: None). Currently, the following are supported: - distributed training with parameter servers, SageMaker Distributed (SMD) Data - and Model Parallelism, and MPI. SMD Model Parallelism can only be used with MPI. + distribution (dict): A dictionary with information on how to configure and + run distributed training + (default: None). The following options are available. - **To enable the SageMaker distributed data parallelism:** + **To enable the SageMaker distributed data parallelism (SMDDP) library:** .. code:: python { "smdistributed": { "dataparallel": { "enabled": True } } } - .. seealso:: + Beside activating the SMDDP library through this parameter, + you also need to add few lines of code in your training script + for initializing PyTorch Distributed with the SMDDP setups. + To learn how to configure your training job with the SMDDP library v2, see + `Run distributed training with the SageMaker distributed data parallelism + library + `_ + in the *Amazon SageMaker User Guide*. - To learn more, see :ref:`sdp_api_docs_toc`. - - **To enable the SageMaker distributed model parallelism:** + **To enable the SageMaker distributed model parallelism (SMP) library v2:** .. code:: python { + "torch_distributed": { "enabled": True }, "smdistributed": { "modelparallel": { - "enabled":True, + "enabled": True, "parameters": { - "partitions": 2, - "microbatches": 4, - "placement_strategy": "spread", - "pipeline": "interleaved", - "optimize": "speed", - "ddp": True, - } + "tensor_parallel_degree": 8, + "hybrid_shard_degree": 1, + ... + }, + } }, - "mpi": { - "enabled" : True, - "processes_per_host" : 8, - } } - .. 
note:: + Beside activating the SMP library v2 through this parameter, + you also need to add few lines of code in your training script + for initializing PyTorch Distributed with the SMP setups. + To learn how to configure your training job with the SMP library v2, see + `Run distributed training with the SageMaker model parallelism library v2 + `_ + in the *Amazon SageMaker User Guide*. - The SageMaker distributed model parallel library internally uses MPI. - In order to use model parallelism, MPI also must be enabled. - - .. seealso:: + .. note:: - To learn more, see :ref:`smp_api_docs_toc`. + The SageMaker distributed model parallel library v2 requires with + ``torch_distributed``. - .. seealso:: + .. note:: - To find a complete list of parameters for SageMaker model parallelism, - see :ref:`sm-sdk-modelparallel-general`. + The documentation for the SMP library v1.x is archived and available at + `Run distributed training with the SageMaker model parallelism library + `_ + in the *Amazon SageMaker User Guide*, + and the SMP v1 API reference is available in the + `SageMaker Python SDK v2.199.0 documentation + `_. **To enable PyTorch DDP:** diff --git a/src/sagemaker/tensorflow/estimator.py b/src/sagemaker/tensorflow/estimator.py index eb4366f0a7..523b70ec38 100644 --- a/src/sagemaker/tensorflow/estimator.py +++ b/src/sagemaker/tensorflow/estimator.py @@ -86,56 +86,7 @@ def __init__( ``image_uri`` is required. If also ``None``, then a ``ValueError`` will be raised. distribution (dict): A dictionary with information on how to run distributed training - (default: None). Currently, the following are supported: - distributed training with parameter servers, SageMaker Distributed (SMD) Data - and Model Parallelism, and MPI. SMD Model Parallelism can only be used with MPI. - - **To enable the SageMaker distributed data parallelism:** - - .. code:: python - - { "smdistributed": { "dataparallel": { "enabled": True } } } - - .. seealso:: - - To learn more, see :ref:`sdp_api_docs_toc`. - - **To enable the SageMaker distributed model parallelism:** - - .. code:: python - - { - "smdistributed": { - "modelparallel": { - "enabled":True, - "parameters": { - "partitions": 2, - "microbatches": 4, - "placement_strategy": "spread", - "pipeline": "interleaved", - "optimize": "speed", - "ddp": True, - } - }, - "mpi": { - "enabled" : True, - "processes_per_host" : 8, - } - } - - .. note:: - - The SageMaker distributed model parallel library internally uses MPI. - In order to use model parallelism, MPI also must be enabled. - - .. seealso:: - - To learn more, see :ref:`smp_api_docs_toc`. - - .. seealso:: - - To find a complete list of parameters for SageMaker model parallelism, - see :ref:`sm-sdk-modelparallel-general`. + (default: None). **To enable Multi Worker Mirrored Strategy:** @@ -179,6 +130,31 @@ def __init__( To learn more, see `Training with parameter servers `_. + + .. note:: + + The SageMaker distributed data parallelism (SMDDP) library + discontinued support for TensorFlow. + The documentation for the SMDDP library v1.x is still available at + `Use the SMDDP library in your TensorFlow training script (deprecated) + `_ + in the *Amazon SageMaker User Guide*, + and the `SMDDP v1 API reference in the + SageMaker Python SDK v2.199.0 documentation + `_. + + .. note:: + + The SageMaker model parallelism (SMP) library v2 discontinued support + for TensorFlow. 
+ The documentation for the SMP library v1.x is archived and available at + `Run distributed training with the SageMaker model parallelism library + `_ + in the *Amazon SageMaker User Guide*, + and the `SMP v1 API reference in the + SageMaker Python SDK v2.199.0 documentation + `_. + compiler_config (:class:`~sagemaker.tensorflow.TrainingCompilerConfig`): Configures SageMaker Training Compiler to accelerate training. From 76fc506413baebe9cfffeb046aa99f241df72408 Mon Sep 17 00:00:00 2001 From: ci Date: Wed, 20 Dec 2023 02:50:48 +0000 Subject: [PATCH 12/76] prepare release v2.201.0 --- CHANGELOG.md | 17 +++++++++++++++++ VERSION | 2 +- 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 50ed614047..56d6ebc8b7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,22 @@ # Changelog +## v2.201.0 (2023-12-20) + +### Features + + * Use specific images for SMP v2 jobs + * Added update for model package + +### Bug Fixes and Other Changes + + * Add write permission to job output dirs for remote and step decorator running on non-root job user + * Move func and args serialization of function step to step level + +### Documentation Changes + + * SMP v2 doc updates (#1423) + * fix ModelBuilder sample notebook links + ## v2.200.1 (2023-12-14) ### Bug Fixes and Other Changes diff --git a/VERSION b/VERSION index d59c8f162b..b485cb5d05 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -2.200.2.dev0 +2.201.0 From 890a3989403514a9871e9bdf2979d05e7ea56fa5 Mon Sep 17 00:00:00 2001 From: ci Date: Wed, 20 Dec 2023 02:50:50 +0000 Subject: [PATCH 13/76] update development version to v2.201.1.dev0 --- VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VERSION b/VERSION index b485cb5d05..9c20e02093 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -2.201.0 +2.201.1.dev0 From f8629d7554366df072591dcf658caa373265ce7f Mon Sep 17 00:00:00 2001 From: Gary Wang <38331932+gwang111@users.noreply.github.com> Date: Wed, 20 Dec 2023 11:04:50 -0800 Subject: [PATCH 14/76] Fix: Add additional model builder telemetry (#4334) * move telemetry code to public * add additional test --------- Co-authored-by: EC2 Default User --- src/sagemaker/serve/utils/telemetry_logger.py | 35 +++- .../serve/utils/test_telemetry_logger.py | 154 +++++++++++++++++- 2 files changed, 176 insertions(+), 13 deletions(-) diff --git a/src/sagemaker/serve/utils/telemetry_logger.py b/src/sagemaker/serve/utils/telemetry_logger.py index 4fcce555ca..cb57a9f0a7 100644 --- a/src/sagemaker/serve/utils/telemetry_logger.py +++ b/src/sagemaker/serve/utils/telemetry_logger.py @@ -59,21 +59,35 @@ def wrapper(self, *args, **kwargs): caught_ex = None image_uri_tail = self.image_uri.split("/")[1] - extra = f"{func_name}&{MODEL_SERVER_TO_CODE[str(self.model_server)]}&{image_uri_tail}" + extra = ( + f"{func_name}" + f"&x-modelServer={MODEL_SERVER_TO_CODE[str(self.model_server)]}" + f"&x-imageTag={image_uri_tail}" + ) if self.model_server == ModelServer.DJL_SERVING or self.model_server == ModelServer.TGI: - extra += f"&{self.model}" + extra += f"&x-modelName={self.model}" try: response = func(self, *args, **kwargs) if not self.serve_settings.telemetry_opt_out: _send_telemetry( - "1", MODE_TO_CODE[str(self.mode)], self.sagemaker_session, None, extra + "1", + MODE_TO_CODE[str(self.mode)], + self.sagemaker_session, + None, + None, + extra, ) except ModelBuilderException as e: if not self.serve_settings.telemetry_opt_out: _send_telemetry( - "0", MODE_TO_CODE[str(self.mode)], self.sagemaker_session, str(e), extra + "0", 
+ MODE_TO_CODE[str(self.mode)], + self.sagemaker_session, + str(e), + e.__class__.__name__, + extra, ) caught_ex = e except Exception as e: # pylint: disable=W0703 @@ -93,13 +107,22 @@ def _send_telemetry( mode: int, session: Session, failure_reason: str = None, + failure_type: str = None, extra_info: str = None, ) -> None: """Make GET request to an empty object in S3 bucket""" try: accountId = _get_accountId(session) region = _get_region_or_default(session) - url = _construct_url(accountId, str(mode), status, failure_reason, extra_info, region) + url = _construct_url( + accountId, + str(mode), + status, + failure_reason, + failure_type, + extra_info, + region, + ) _requests_helper(url, 2) logger.debug("ModelBuilder metrics emitted.") except Exception: # pylint: disable=W0703 @@ -111,6 +134,7 @@ def _construct_url( mode: str, status: str, failure_reason: str, + failure_type: str, extra_info: str, region: str, ) -> str: @@ -124,6 +148,7 @@ def _construct_url( ) if failure_reason: base_url += f"&x-failureReason={failure_reason}" + base_url += f"&x-failureType={failure_type}" if extra_info: base_url += f"&x-extra={extra_info}" return base_url diff --git a/tests/unit/sagemaker/serve/utils/test_telemetry_logger.py b/tests/unit/sagemaker/serve/utils/test_telemetry_logger.py index 34b8fad74e..7c4b014e8a 100644 --- a/tests/unit/sagemaker/serve/utils/test_telemetry_logger.py +++ b/tests/unit/sagemaker/serve/utils/test_telemetry_logger.py @@ -12,19 +12,46 @@ # language governing permissions and limitations under the License. from __future__ import absolute_import import unittest -from unittest.mock import MagicMock, patch -from sagemaker.serve.utils.telemetry_logger import _send_telemetry +from unittest.mock import Mock, patch +from sagemaker.serve import Mode, ModelServer +from sagemaker.serve.utils.telemetry_logger import ( + _send_telemetry, + _capture_telemetry, + _construct_url, +) +from sagemaker.serve.utils.exceptions import ModelBuilderException, LocalModelOutOfMemoryException -mock_session = MagicMock() +MOCK_SESSION = Mock() +MOCK_FUNC_NAME = "Mock.deploy" +MOCK_DJL_CONTAINER = ( + "763104351884.dkr.ecr.us-west-2.amazonaws.com/" "djl-inference:0.25.0-deepspeed0.11.0-cu118" +) +MOCK_TGI_CONTAINER = ( + "763104351884.dkr.ecr.us-east-1.amazonaws.com/" + "huggingface-pytorch-inference:2.0.0-transformers4.28.1-cpu-py310-ubuntu20.04" +) +MOCK_HUGGINGFACE_ID = "meta-llama/Llama-2-7b-hf" +MOCK_EXCEPTION = LocalModelOutOfMemoryException("mock raise ex") + + +class ModelBuilderMock: + def __init__(self): + self.serve_settings = Mock() + self.sagemaker_session = MOCK_SESSION + + @_capture_telemetry(MOCK_FUNC_NAME) + def mock_deploy(self, mock_exception_func=None): + if mock_exception_func: + mock_exception_func() class TestTelemetryLogger(unittest.TestCase): @patch("sagemaker.serve.utils.telemetry_logger._requests_helper") @patch("sagemaker.serve.utils.telemetry_logger._get_accountId") def test_log_sucessfully(self, mocked_get_accountId, mocked_request_helper): - mock_session.boto_session.region_name = "ap-south-1" + MOCK_SESSION.boto_session.region_name = "ap-south-1" mocked_get_accountId.return_value = "testAccountId" - _send_telemetry("someStatus", 1, mock_session) + _send_telemetry("someStatus", 1, MOCK_SESSION) mocked_request_helper.assert_called_with( "https://dev-exp-t-ap-south-1.s3.ap-south-1.amazonaws.com/" "telemetry?x-accountId=testAccountId&x-mode=1&x-status=someStatus", @@ -34,9 +61,120 @@ def test_log_sucessfully(self, mocked_get_accountId, mocked_request_helper): 
@patch("sagemaker.serve.utils.telemetry_logger._get_accountId") def test_log_handle_exception(self, mocked_get_accountId): mocked_get_accountId.side_effect = Exception("Internal error") - _send_telemetry("someStatus", 1, mock_session) + _send_telemetry("someStatus", 1, MOCK_SESSION) self.assertRaises(Exception) + @patch("sagemaker.serve.utils.telemetry_logger._send_telemetry") + def test_capture_telemetry_decorator_djl_success(self, mock_send_telemetry): + mock_model_builder = ModelBuilderMock() + mock_model_builder.serve_settings.telemetry_opt_out = False + mock_model_builder.image_uri = MOCK_DJL_CONTAINER + mock_model_builder.model = MOCK_HUGGINGFACE_ID + mock_model_builder.mode = Mode.LOCAL_CONTAINER + mock_model_builder.model_server = ModelServer.DJL_SERVING + + mock_model_builder.mock_deploy() + + expected_extra_str = ( + f"{MOCK_FUNC_NAME}" + "&x-modelServer=4" + "&x-imageTag=djl-inference:0.25.0-deepspeed0.11.0-cu118" + f"&x-modelName={MOCK_HUGGINGFACE_ID}" + ) + mock_send_telemetry.assert_called_once_with( + "1", 2, MOCK_SESSION, None, None, expected_extra_str + ) + + @patch("sagemaker.serve.utils.telemetry_logger._send_telemetry") + def test_capture_telemetry_decorator_tgi_success(self, mock_send_telemetry): + mock_model_builder = ModelBuilderMock() + mock_model_builder.serve_settings.telemetry_opt_out = False + mock_model_builder.image_uri = MOCK_TGI_CONTAINER + mock_model_builder.model = MOCK_HUGGINGFACE_ID + mock_model_builder.mode = Mode.LOCAL_CONTAINER + mock_model_builder.model_server = ModelServer.TGI + + mock_model_builder.mock_deploy() + + expected_extra_str = ( + f"{MOCK_FUNC_NAME}" + "&x-modelServer=6" + "&x-imageTag=huggingface-pytorch-inference:2.0.0-transformers4.28.1-cpu-py310-ubuntu20.04" + f"&x-modelName={MOCK_HUGGINGFACE_ID}" + ) + mock_send_telemetry.assert_called_once_with( + "1", 2, MOCK_SESSION, None, None, expected_extra_str + ) + + @patch("sagemaker.serve.utils.telemetry_logger._send_telemetry") + def test_capture_telemetry_decorator_no_call_when_disabled(self, mock_send_telemetry): + mock_model_builder = ModelBuilderMock() + mock_model_builder.serve_settings.telemetry_opt_out = True + mock_model_builder.image_uri = MOCK_DJL_CONTAINER + mock_model_builder.model = MOCK_HUGGINGFACE_ID + mock_model_builder.model_server = ModelServer.DJL_SERVING -if __name__ == "__main__": - unittest.main() + mock_model_builder.mock_deploy() + + assert not mock_send_telemetry.called + + @patch("sagemaker.serve.utils.telemetry_logger._send_telemetry") + def test_capture_telemetry_decorator_handle_exception_success(self, mock_send_telemetry): + mock_model_builder = ModelBuilderMock() + mock_model_builder.serve_settings.telemetry_opt_out = False + mock_model_builder.image_uri = MOCK_DJL_CONTAINER + mock_model_builder.model = MOCK_HUGGINGFACE_ID + mock_model_builder.mode = Mode.LOCAL_CONTAINER + mock_model_builder.model_server = ModelServer.DJL_SERVING + + mock_exception = Mock() + mock_exception_obj = MOCK_EXCEPTION + mock_exception.side_effect = mock_exception_obj + + with self.assertRaises(ModelBuilderException) as _: + mock_model_builder.mock_deploy(mock_exception) + + expected_extra_str = ( + f"{MOCK_FUNC_NAME}" + "&x-modelServer=4" + "&x-imageTag=djl-inference:0.25.0-deepspeed0.11.0-cu118" + f"&x-modelName={MOCK_HUGGINGFACE_ID}" + ) + mock_send_telemetry.assert_called_once_with( + "0", + 2, + MOCK_SESSION, + str(mock_exception_obj), + mock_exception_obj.__class__.__name__, + expected_extra_str, + ) + + def test_construct_url_with_failure_reason_and_extra_info(self): + 
mock_accountId = "12345678910" + mock_mode = Mode.LOCAL_CONTAINER + mock_status = "0" + mock_failure_reason = str(MOCK_EXCEPTION) + mock_failure_type = MOCK_EXCEPTION.__class__.__name__ + mock_extra_info = "mock_extra_info" + mock_region = "us-west-2" + + ret_url = _construct_url( + accountId=mock_accountId, + mode=mock_mode, + status=mock_status, + failure_reason=mock_failure_reason, + failure_type=mock_failure_type, + extra_info=mock_extra_info, + region=mock_region, + ) + + expected_base_url = ( + f"https://dev-exp-t-{mock_region}.s3.{mock_region}.amazonaws.com/telemetry?" + f"x-accountId={mock_accountId}" + f"&x-mode={mock_mode}" + f"&x-status={mock_status}" + f"&x-failureReason={mock_failure_reason}" + f"&x-failureType={mock_failure_type}" + f"&x-extra={mock_extra_info}" + ) + self.assertEquals(ret_url, expected_base_url) From 8110cc430b8dda98f78f66074b0407a2fa15ff10 Mon Sep 17 00:00:00 2001 From: Xinyu Xie Date: Wed, 20 Dec 2023 12:37:05 -0800 Subject: [PATCH 15/76] feature: support remote debug for sagemaker training job (#4315) * feature: support remote debug for sagemaker training job * change: Replace update_remote_config with 2 helper methods for enable and disable respectively * change: add new argument enable_remote_debug to skip set of test_jumpstart_estimator_kwargs_match_parent_class * chore: add jumpstart support for remote debug --------- Co-authored-by: Xinyu Xie Co-authored-by: Evan Kravitz --- src/sagemaker/estimator.py | 66 ++++++++++++++++- src/sagemaker/jumpstart/estimator.py | 4 ++ src/sagemaker/jumpstart/factory/estimator.py | 2 + src/sagemaker/jumpstart/types.py | 3 + src/sagemaker/session.py | 48 +++++++++++++ tests/unit/test_estimator.py | 76 ++++++++++++++++++++ tests/unit/test_session.py | 12 ++++ 7 files changed, 208 insertions(+), 3 deletions(-) diff --git a/src/sagemaker/estimator.py b/src/sagemaker/estimator.py index db8d572a75..551a42ad55 100644 --- a/src/sagemaker/estimator.py +++ b/src/sagemaker/estimator.py @@ -178,6 +178,7 @@ def __init__( container_entry_point: Optional[List[str]] = None, container_arguments: Optional[List[str]] = None, disable_output_compression: bool = False, + enable_remote_debug: Optional[Union[bool, PipelineVariable]] = None, **kwargs, ): """Initialize an ``EstimatorBase`` instance. @@ -540,6 +541,8 @@ def __init__( to Amazon S3 without compression after training finishes. enable_infra_check (bool or PipelineVariable): Optional. Specifies whether it is running Sagemaker built-in infra check jobs. + enable_remote_debug (bool or PipelineVariable): Optional. + Specifies whether RemoteDebug is enabled for the training job """ instance_count = renamed_kwargs( "train_instance_count", "instance_count", instance_count, kwargs @@ -777,6 +780,8 @@ def __init__( self.tensorboard_app = TensorBoardApp(region=self.sagemaker_session.boto_region_name) + self._enable_remote_debug = enable_remote_debug + @abstractmethod def training_image_uri(self): """Return the Docker image to use for training. 
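The new flag is supplied at construction time like any other estimator argument. A hedged
sketch of typical usage follows; the image URI, role, and S3 locations are placeholders and
are not part of this change.

.. code:: python

    from sagemaker.estimator import Estimator

    estimator = Estimator(
        image_uri="<training-image-uri>",      # placeholder
        role="<execution-role-arn>",           # placeholder
        instance_count=1,
        instance_type="ml.m5.xlarge",
        enable_remote_debug=True,              # sends {"EnableRemoteDebug": True} with the job
    )
    estimator.fit("s3://<bucket>/<training-data-prefix>")   # placeholder input location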
@@ -1958,6 +1963,11 @@ def _prepare_init_params_from_job_description(cls, job_details, model_channel_na max_wait = job_details.get("StoppingCondition", {}).get("MaxWaitTimeInSeconds") if max_wait: init_params["max_wait"] = max_wait + + if "RemoteDebugConfig" in job_details: + init_params["enable_remote_debug"] = job_details["RemoteDebugConfig"].get( + "EnableRemoteDebug" + ) return init_params def _get_instance_type(self): @@ -2292,6 +2302,32 @@ def update_profiler( _TrainingJob.update(self, profiler_rule_configs, profiler_config_request_dict) + def get_remote_debug_config(self): + """dict: Return the configuration of RemoteDebug""" + return ( + None + if self._enable_remote_debug is None + else {"EnableRemoteDebug": self._enable_remote_debug} + ) + + def enable_remote_debug(self): + """Enable remote debug for a training job.""" + self._update_remote_debug(True) + + def disable_remote_debug(self): + """Disable remote debug for a training job.""" + self._update_remote_debug(False) + + def _update_remote_debug(self, enable_remote_debug: bool): + """Update to enable or disable remote debug for a training job. + + This method updates the ``_enable_remote_debug`` parameter + and enables or disables remote debug for a training job + """ + self._ensure_latest_training_job() + _TrainingJob.update(self, remote_debug_config={"EnableRemoteDebug": enable_remote_debug}) + self._enable_remote_debug = enable_remote_debug + def get_app_url( self, app_type, @@ -2520,6 +2556,9 @@ def _get_train_args(cls, estimator, inputs, experiment_config): if estimator.profiler_config: train_args["profiler_config"] = estimator.profiler_config._to_request_dict() + if estimator.get_remote_debug_config() is not None: + train_args["remote_debug_config"] = estimator.get_remote_debug_config() + return train_args @classmethod @@ -2549,7 +2588,12 @@ def _is_local_channel(cls, input_uri): @classmethod def update( - cls, estimator, profiler_rule_configs=None, profiler_config=None, resource_config=None + cls, + estimator, + profiler_rule_configs=None, + profiler_config=None, + resource_config=None, + remote_debug_config=None, ): """Update a running Amazon SageMaker training job. @@ -2562,20 +2606,31 @@ def update( resource_config (dict): Configuration of the resources for the training job. You can update the keep-alive period if the warm pool status is `Available`. No other fields can be updated. (default: None). + remote_debug_config (dict): Configuration for RemoteDebug. (default: ``None``) + The dict can contain 'EnableRemoteDebug'(bool). + For example, + + .. code:: python + + remote_debug_config = { + "EnableRemoteDebug": True, + } (default: None). Returns: sagemaker.estimator._TrainingJob: Constructed object that captures all information about the updated training job. """ update_args = cls._get_update_args( - estimator, profiler_rule_configs, profiler_config, resource_config + estimator, profiler_rule_configs, profiler_config, resource_config, remote_debug_config ) estimator.sagemaker_session.update_training_job(**update_args) return estimator.latest_training_job @classmethod - def _get_update_args(cls, estimator, profiler_rule_configs, profiler_config, resource_config): + def _get_update_args( + cls, estimator, profiler_rule_configs, profiler_config, resource_config, remote_debug_config + ): """Constructs a dict of arguments for updating an Amazon SageMaker training job. 
Args: @@ -2596,6 +2651,7 @@ def _get_update_args(cls, estimator, profiler_rule_configs, profiler_config, res update_args.update(build_dict("profiler_rule_configs", profiler_rule_configs)) update_args.update(build_dict("profiler_config", profiler_config)) update_args.update(build_dict("resource_config", resource_config)) + update_args.update(build_dict("remote_debug_config", remote_debug_config)) return update_args @@ -2694,6 +2750,7 @@ def __init__( container_arguments: Optional[List[str]] = None, disable_output_compression: bool = False, enable_infra_check: Optional[Union[bool, PipelineVariable]] = None, + enable_remote_debug: Optional[Union[bool, PipelineVariable]] = None, **kwargs, ): """Initialize an ``Estimator`` instance. @@ -3055,6 +3112,8 @@ def __init__( to Amazon S3 without compression after training finishes. enable_infra_check (bool or PipelineVariable): Optional. Specifies whether it is running Sagemaker built-in infra check jobs. + enable_remote_debug (bool or PipelineVariable): Optional. + Specifies whether RemoteDebug is enabled for the training job """ self.image_uri = image_uri self._hyperparameters = hyperparameters.copy() if hyperparameters else {} @@ -3106,6 +3165,7 @@ def __init__( container_entry_point=container_entry_point, container_arguments=container_arguments, disable_output_compression=disable_output_compression, + enable_remote_debug=enable_remote_debug, **kwargs, ) diff --git a/src/sagemaker/jumpstart/estimator.py b/src/sagemaker/jumpstart/estimator.py index 4f7a041df0..e6047e9009 100644 --- a/src/sagemaker/jumpstart/estimator.py +++ b/src/sagemaker/jumpstart/estimator.py @@ -106,6 +106,7 @@ def __init__( container_entry_point: Optional[List[str]] = None, container_arguments: Optional[List[str]] = None, disable_output_compression: Optional[bool] = None, + enable_remote_debug: Optional[Union[bool, PipelineVariable]] = None, ): """Initializes a ``JumpStartEstimator``. @@ -495,6 +496,8 @@ def __init__( a training job. disable_output_compression (Optional[bool]): When set to true, Model is uploaded to Amazon S3 without compression after training finishes. + enable_remote_debug (bool or PipelineVariable): Optional. + Specifies whether RemoteDebug is enabled for the training job Raises: ValueError: If the model ID is not recognized by JumpStart. 
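Because ``JumpStartEstimator`` ultimately derives from ``EstimatorBase``, the toggle methods
introduced earlier in this change apply to it as well. A minimal sketch, assuming
``estimator`` already has a training job in progress (for example after
``estimator.fit(..., wait=False)``):

.. code:: python

    # flips RemoteDebugConfig on the running job via UpdateTrainingJob
    estimator.enable_remote_debug()
    print(estimator.get_remote_debug_config())   # {"EnableRemoteDebug": True}

    # and back off again
    estimator.disable_remote_debug()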
@@ -569,6 +572,7 @@ def _is_valid_model_id_hook(): container_arguments=container_arguments, disable_output_compression=disable_output_compression, enable_infra_check=enable_infra_check, + enable_remote_debug=enable_remote_debug, ) self.model_id = estimator_init_kwargs.model_id diff --git a/src/sagemaker/jumpstart/factory/estimator.py b/src/sagemaker/jumpstart/factory/estimator.py index baa9d55085..7479c23832 100644 --- a/src/sagemaker/jumpstart/factory/estimator.py +++ b/src/sagemaker/jumpstart/factory/estimator.py @@ -127,6 +127,7 @@ def get_init_kwargs( container_arguments: Optional[List[str]] = None, disable_output_compression: Optional[bool] = None, enable_infra_check: Optional[Union[bool, PipelineVariable]] = None, + enable_remote_debug: Optional[Union[bool, PipelineVariable]] = None, ) -> JumpStartEstimatorInitKwargs: """Returns kwargs required to instantiate `sagemaker.estimator.Estimator` object.""" @@ -183,6 +184,7 @@ def get_init_kwargs( container_arguments=container_arguments, disable_output_compression=disable_output_compression, enable_infra_check=enable_infra_check, + enable_remote_debug=enable_remote_debug, ) estimator_init_kwargs = _add_model_version_to_kwargs(estimator_init_kwargs) diff --git a/src/sagemaker/jumpstart/types.py b/src/sagemaker/jumpstart/types.py index de9e2c10a3..7c06282894 100644 --- a/src/sagemaker/jumpstart/types.py +++ b/src/sagemaker/jumpstart/types.py @@ -1280,6 +1280,7 @@ class JumpStartEstimatorInitKwargs(JumpStartKwargs): "container_arguments", "disable_output_compression", "enable_infra_check", + "enable_remote_debug", ] SERIALIZATION_EXCLUSION_SET = { @@ -1344,6 +1345,7 @@ def __init__( container_arguments: Optional[List[str]] = None, disable_output_compression: Optional[bool] = None, enable_infra_check: Optional[Union[bool, PipelineVariable]] = None, + enable_remote_debug: Optional[Union[bool, PipelineVariable]] = None, ) -> None: """Instantiates JumpStartEstimatorInitKwargs object.""" @@ -1401,6 +1403,7 @@ def __init__( self.container_arguments = container_arguments self.disable_output_compression = disable_output_compression self.enable_infra_check = enable_infra_check + self.enable_remote_debug = enable_remote_debug class JumpStartEstimatorFitKwargs(JumpStartKwargs): diff --git a/src/sagemaker/session.py b/src/sagemaker/session.py index 3b2de0239e..5b5df7a792 100644 --- a/src/sagemaker/session.py +++ b/src/sagemaker/session.py @@ -748,6 +748,7 @@ def train( # noqa: C901 profiler_config=None, environment: Optional[Dict[str, str]] = None, retry_strategy=None, + remote_debug_config=None, ): """Create an Amazon SageMaker training job. @@ -858,6 +859,15 @@ def train( # noqa: C901 configurations.src/sagemaker/lineage/artifact.py:285 profiler_config (dict): Configuration for how profiling information is emitted with SageMaker Profiler. (default: ``None``). + remote_debug_config(dict): Configuration for RemoteDebug. (default: ``None``) + The dict can contain 'EnableRemoteDebug'(bool). + For example, + + .. code:: python + + remote_debug_config = { + "EnableRemoteDebug": True, + } environment (dict[str, str]) : Environment variables to be set for use during training job (default: ``None``) retry_strategy(dict): Defines RetryStrategy for InternalServerFailures. 
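The session-level APIs in this change accept the same setting as a plain dictionary. A hedged
sketch using the update path; the job name is a placeholder and the job is assumed to be in a
state that allows updates.

.. code:: python

    import sagemaker

    session = sagemaker.Session()
    session.update_training_job(
        job_name="my-training-job",                        # placeholder
        remote_debug_config={"EnableRemoteDebug": False},  # turn remote debugging off
    )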
@@ -950,6 +960,7 @@ def train( # noqa: C901 enable_sagemaker_metrics=enable_sagemaker_metrics, profiler_rule_configs=profiler_rule_configs, profiler_config=inferred_profiler_config, + remote_debug_config=remote_debug_config, environment=environment, retry_strategy=retry_strategy, ) @@ -992,6 +1003,7 @@ def _get_train_request( # noqa: C901 enable_sagemaker_metrics=None, profiler_rule_configs=None, profiler_config=None, + remote_debug_config=None, environment=None, retry_strategy=None, ): @@ -1103,6 +1115,15 @@ def _get_train_request( # noqa: C901 profiler_rule_configs (list[dict]): A list of profiler rule configurations. profiler_config(dict): Configuration for how profiling information is emitted with SageMaker Profiler. (default: ``None``). + remote_debug_config(dict): Configuration for RemoteDebug. (default: ``None``) + The dict can contain 'EnableRemoteDebug'(bool). + For example, + + .. code:: python + + remote_debug_config = { + "EnableRemoteDebug": True, + } environment (dict[str, str]) : Environment variables to be set for use during training job (default: ``None``) retry_strategy(dict): Defines RetryStrategy for InternalServerFailures. @@ -1206,6 +1227,9 @@ def _get_train_request( # noqa: C901 if profiler_config is not None: train_request["ProfilerConfig"] = profiler_config + if remote_debug_config is not None: + train_request["RemoteDebugConfig"] = remote_debug_config + if retry_strategy is not None: train_request["RetryStrategy"] = retry_strategy @@ -1217,6 +1241,7 @@ def update_training_job( profiler_rule_configs=None, profiler_config=None, resource_config=None, + remote_debug_config=None, ): """Calls the UpdateTrainingJob API for the given job name and returns the response. @@ -1228,6 +1253,15 @@ def update_training_job( resource_config (dict): Configuration of the resources for the training job. You can update the keep-alive period if the warm pool status is `Available`. No other fields can be updated. (default: ``None``). + remote_debug_config(dict): Configuration for RemoteDebug. (default: ``None``) + The dict can contain 'EnableRemoteDebug'(bool). + For example, + + .. code:: python + + remote_debug_config = { + "EnableRemoteDebug": True, + } """ # No injections from sagemaker_config because the UpdateTrainingJob API's resource_config # object accepts fewer parameters than the CreateTrainingJob API, and none that the @@ -1240,6 +1274,7 @@ def update_training_job( profiler_rule_configs=profiler_rule_configs, profiler_config=inferred_profiler_config, resource_config=resource_config, + remote_debug_config=remote_debug_config, ) LOGGER.info("Updating training job with name %s", job_name) LOGGER.debug("Update request: %s", json.dumps(update_training_job_request, indent=4)) @@ -1251,6 +1286,7 @@ def _get_update_training_job_request( profiler_rule_configs=None, profiler_config=None, resource_config=None, + remote_debug_config=None, ): """Constructs a request compatible for updating an Amazon SageMaker training job. @@ -1262,6 +1298,15 @@ def _get_update_training_job_request( resource_config (dict): Configuration of the resources for the training job. You can update the keep-alive period if the warm pool status is `Available`. No other fields can be updated. (default: ``None``). + remote_debug_config(dict): Configuration for RemoteDebug. (default: ``None``) + The dict can contain 'EnableRemoteDebug'(bool). + For example, + + .. 
code:: python + + remote_debug_config = { + "EnableRemoteDebug": True, + } Returns: Dict: an update training request dict @@ -1279,6 +1324,9 @@ def _get_update_training_job_request( if resource_config is not None: update_training_job_request["ResourceConfig"] = resource_config + if remote_debug_config is not None: + update_training_job_request["RemoteDebugConfig"] = remote_debug_config + return update_training_job_request def process( diff --git a/tests/unit/test_estimator.py b/tests/unit/test_estimator.py index 437c150c8b..3d8b0c454d 100644 --- a/tests/unit/test_estimator.py +++ b/tests/unit/test_estimator.py @@ -2012,6 +2012,82 @@ def test_sagemaker_model_custom_channel_name(sagemaker_session): ] +def test_framework_with_remote_debug_config(sagemaker_session): + f = DummyFramework( + entry_point=SCRIPT_PATH, + role=ROLE, + sagemaker_session=sagemaker_session, + instance_groups=[ + InstanceGroup("group1", "ml.c4.xlarge", 1), + InstanceGroup("group2", "ml.m4.xlarge", 2), + ], + enable_remote_debug=True, + ) + f.fit("s3://mydata") + sagemaker_session.train.assert_called_once() + _, args = sagemaker_session.train.call_args + assert args["remote_debug_config"]["EnableRemoteDebug"] + assert f.get_remote_debug_config()["EnableRemoteDebug"] + + +def test_framework_without_remote_debug_config(sagemaker_session): + f = DummyFramework( + entry_point=SCRIPT_PATH, + role=ROLE, + sagemaker_session=sagemaker_session, + instance_groups=[ + InstanceGroup("group1", "ml.c4.xlarge", 1), + InstanceGroup("group2", "ml.m4.xlarge", 2), + ], + ) + f.fit("s3://mydata") + sagemaker_session.train.assert_called_once() + _, args = sagemaker_session.train.call_args + assert args.get("remote_debug_config") is None + assert f.get_remote_debug_config() is None + + +def test_framework_enable_remote_debug(sagemaker_session): + f = DummyFramework( + entry_point=SCRIPT_PATH, + role=ROLE, + sagemaker_session=sagemaker_session, + instance_count=INSTANCE_COUNT, + instance_type=INSTANCE_TYPE, + ) + f.fit("s3://mydata") + f.enable_remote_debug() + + sagemaker_session.update_training_job.assert_called_once() + _, args = sagemaker_session.update_training_job.call_args + assert args["remote_debug_config"] == { + "EnableRemoteDebug": True, + } + assert f.get_remote_debug_config()["EnableRemoteDebug"] + assert len(args) == 2 + + +def test_framework_disable_remote_debug(sagemaker_session): + f = DummyFramework( + entry_point=SCRIPT_PATH, + role=ROLE, + sagemaker_session=sagemaker_session, + instance_count=INSTANCE_COUNT, + instance_type=INSTANCE_TYPE, + enable_remote_debug=True, + ) + f.fit("s3://mydata") + f.disable_remote_debug() + + sagemaker_session.update_training_job.assert_called_once() + _, args = sagemaker_session.update_training_job.call_args + assert args["remote_debug_config"] == { + "EnableRemoteDebug": False, + } + assert not f.get_remote_debug_config()["EnableRemoteDebug"] + assert len(args) == 2 + + @patch("time.strftime", return_value=TIMESTAMP) def test_custom_code_bucket(time, sagemaker_session): code_bucket = "codebucket" diff --git a/tests/unit/test_session.py b/tests/unit/test_session.py index 57ba8daad5..d3bba53504 100644 --- a/tests/unit/test_session.py +++ b/tests/unit/test_session.py @@ -1876,6 +1876,15 @@ def test_update_training_job_with_sagemaker_config_injection(sagemaker_session): ) +def test_update_training_job_with_remote_debug_config(sagemaker_session): + sagemaker_session.sagemaker_config = SAGEMAKER_CONFIG_TRAINING_JOB + sagemaker_session.update_training_job( + job_name="MyTestJob", 
remote_debug_config={"EnableRemoteDebug": False} + ) + _, _, actual_train_args = sagemaker_session.sagemaker_client.method_calls[0] + assert not actual_train_args["RemoteDebugConfig"]["EnableRemoteDebug"] + + def test_train_with_sagemaker_config_injection(sagemaker_session): sagemaker_session.sagemaker_config = SAGEMAKER_CONFIG_TRAINING_JOB @@ -2128,6 +2137,7 @@ def test_train_pack_to_request_with_optional_params(sagemaker_session): } CONTAINER_ENTRY_POINT = ["bin/bash", "test.sh"] CONTAINER_ARGUMENTS = ["--arg1", "value1", "--arg2", "value2"] + remote_debug_config = {"EnableRemoteDebug": True} sagemaker_session.train( image_uri=IMAGE, @@ -2152,6 +2162,7 @@ def test_train_pack_to_request_with_optional_params(sagemaker_session): training_image_config=TRAINING_IMAGE_CONFIG, container_entry_point=CONTAINER_ENTRY_POINT, container_arguments=CONTAINER_ARGUMENTS, + remote_debug_config=remote_debug_config, ) _, _, actual_train_args = sagemaker_session.sagemaker_client.method_calls[0] @@ -2174,6 +2185,7 @@ def test_train_pack_to_request_with_optional_params(sagemaker_session): actual_train_args["AlgorithmSpecification"]["ContainerEntrypoint"] == CONTAINER_ENTRY_POINT ) assert actual_train_args["AlgorithmSpecification"]["ContainerArguments"] == CONTAINER_ARGUMENTS + assert actual_train_args["RemoteDebugConfig"]["EnableRemoteDebug"] def test_create_transform_job_with_sagemaker_config_injection(sagemaker_session): From 5de95fc8798f9a55346bb9b4a22ac78eea62921e Mon Sep 17 00:00:00 2001 From: martinRenou Date: Thu, 21 Dec 2023 01:51:03 +0100 Subject: [PATCH 16/76] Update tblib constraint (#4317) --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 804cf16128..ff833b7a5c 100644 --- a/setup.py +++ b/setup.py @@ -63,7 +63,7 @@ def read_requirements(filename): "PyYAML~=6.0", "jsonschema", "platformdirs", - "tblib==1.7.0", + "tblib>=1.7.0,<3", "urllib3<1.27", "uvicorn==0.22.0", "fastapi==0.95.2", From 894628277285e307634c06de50a47efdc56d05f5 Mon Sep 17 00:00:00 2001 From: martinRenou Date: Thu, 21 Dec 2023 06:53:53 +0100 Subject: [PATCH 17/76] Fix: Fix job_objective type (#4303) --- src/sagemaker/automl/automl.py | 2 +- tests/unit/sagemaker/workflow/test_automl_step.py | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/sagemaker/automl/automl.py b/src/sagemaker/automl/automl.py index 334c1d5c88..ce71d50977 100644 --- a/src/sagemaker/automl/automl.py +++ b/src/sagemaker/automl/automl.py @@ -332,7 +332,7 @@ def attach(cls, auto_ml_job_name, sagemaker_session=None): total_job_runtime_in_seconds=auto_ml_job_desc.get("AutoMLJobConfig", {}) .get("CompletionCriteria", {}) .get("MaxAutoMLJobRuntimeInSeconds"), - job_objective=auto_ml_job_desc.get("AutoMLJobObjective", {}).get("MetricName"), + job_objective=auto_ml_job_desc.get("AutoMLJobObjective", {}), generate_candidate_definitions_only=auto_ml_job_desc.get( "GenerateCandidateDefinitionsOnly", False ), diff --git a/tests/unit/sagemaker/workflow/test_automl_step.py b/tests/unit/sagemaker/workflow/test_automl_step.py index 6f02eccf4a..d831729241 100644 --- a/tests/unit/sagemaker/workflow/test_automl_step.py +++ b/tests/unit/sagemaker/workflow/test_automl_step.py @@ -41,7 +41,7 @@ def test_single_automl_step(pipeline_session): max_candidates=1, max_runtime_per_training_job_in_seconds=3600, total_job_runtime_in_seconds=36000, - job_objective="fake job objective", + job_objective={"MetricName": "F1"}, generate_candidate_definitions_only=False, tags=[{"Name": "some-tag", "Value": 
"value-for-tag"}], content_type="x-application/vnd.amazon+parquet", @@ -111,7 +111,7 @@ def test_single_automl_step(pipeline_session): "VpcConfig": {"SecurityGroupIds": ["group"], "Subnets": ["subnet"]}, }, }, - "AutoMLJobObjective": "fake job objective", + "AutoMLJobObjective": {"MetricName": "F1"}, "InputDataConfig": [ { "ChannelType": "training", @@ -165,7 +165,7 @@ def test_single_automl_step_with_parameter(pipeline_session): max_candidates=1, max_runtime_per_training_job_in_seconds=3600, total_job_runtime_in_seconds=36000, - job_objective="fake job objective", + job_objective={"MetricName": "F1"}, generate_candidate_definitions_only=False, tags=[{"Name": "some-tag", "Value": "value-for-tag"}], content_type="x-application/vnd.amazon+parquet", @@ -239,7 +239,7 @@ def test_single_automl_step_with_parameter(pipeline_session): "VpcConfig": {"SecurityGroupIds": ["group"], "Subnets": ["subnet"]}, }, }, - "AutoMLJobObjective": "fake job objective", + "AutoMLJobObjective": {"MetricName": "F1"}, "InputDataConfig": [ { "ChannelType": "training", @@ -290,7 +290,7 @@ def test_get_best_auto_ml_model(pipeline_session): max_candidates=1, max_runtime_per_training_job_in_seconds=3600, total_job_runtime_in_seconds=36000, - job_objective="fake job objective", + job_objective={"MetricName": "F1"}, generate_candidate_definitions_only=False, tags=[{"Name": "some-tag", "Value": "value-for-tag"}], content_type="x-application/vnd.amazon+parquet", @@ -399,7 +399,7 @@ def test_automl_step_with_invalid_mode(pipeline_session): max_candidates=1, max_runtime_per_training_job_in_seconds=3600, total_job_runtime_in_seconds=36000, - job_objective="fake job objective", + job_objective={"MetricName": "F1"}, generate_candidate_definitions_only=False, tags=[{"Name": "some-tag", "Value": "value-for-tag"}], content_type="x-application/vnd.amazon+parquet", @@ -455,7 +455,7 @@ def test_automl_step_with_no_mode(pipeline_session): max_candidates=1, max_runtime_per_training_job_in_seconds=3600, total_job_runtime_in_seconds=36000, - job_objective="fake job objective", + job_objective={"MetricName": "F1"}, generate_candidate_definitions_only=False, tags=[{"Name": "some-tag", "Value": "value-for-tag"}], content_type="x-application/vnd.amazon+parquet", From 5f77b7f2d6db89384e6d2d75fcd831a173d544b2 Mon Sep 17 00:00:00 2001 From: sagemaker-bot Date: Thu, 21 Dec 2023 16:32:41 +0000 Subject: [PATCH 18/76] change: update image_uri_configs 12-21-2023 08:32:41 PST --- src/sagemaker/image_uri_config/spark.json | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/sagemaker/image_uri_config/spark.json b/src/sagemaker/image_uri_config/spark.json index 8fac03e716..9a33ca87d9 100644 --- a/src/sagemaker/image_uri_config/spark.json +++ b/src/sagemaker/image_uri_config/spark.json @@ -21,6 +21,7 @@ "ap-southeast-3": "800295151634", "ap-southeast-4": "819679513684", "ca-central-1": "446299261295", + "ca-west-1": "000907499111", "cn-north-1": "671472414489", "cn-northwest-1": "844356804704", "eu-central-1": "906073651304", @@ -61,6 +62,7 @@ "ap-southeast-3": "800295151634", "ap-southeast-4": "819679513684", "ca-central-1": "446299261295", + "ca-west-1": "000907499111", "cn-north-1": "671472414489", "cn-northwest-1": "844356804704", "eu-central-1": "906073651304", @@ -101,6 +103,7 @@ "ap-southeast-3": "800295151634", "ap-southeast-4": "819679513684", "ca-central-1": "446299261295", + "ca-west-1": "000907499111", "cn-north-1": "671472414489", "cn-northwest-1": "844356804704", "eu-central-1": "906073651304", @@ -141,6 +144,7 @@ "ap-southeast-3": 
"800295151634", "ap-southeast-4": "819679513684", "ca-central-1": "446299261295", + "ca-west-1": "000907499111", "cn-north-1": "671472414489", "cn-northwest-1": "844356804704", "eu-central-1": "906073651304", @@ -181,6 +185,7 @@ "ap-southeast-3": "800295151634", "ap-southeast-4": "819679513684", "ca-central-1": "446299261295", + "ca-west-1": "000907499111", "cn-north-1": "671472414489", "cn-northwest-1": "844356804704", "eu-central-1": "906073651304", From c6acdca45ed740bd3458b1baeb19b20271c006dd Mon Sep 17 00:00:00 2001 From: ci Date: Thu, 21 Dec 2023 19:52:34 +0000 Subject: [PATCH 19/76] prepare release v2.202.0 --- CHANGELOG.md | 11 +++++++++++ VERSION | 2 +- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 56d6ebc8b7..965f933d24 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,16 @@ # Changelog +## v2.202.0 (2023-12-21) + +### Features + + * support remote debug for sagemaker training job + +### Bug Fixes and Other Changes + + * update image_uri_configs 12-21-2023 08:32:41 PST + * Update tblib constraint + ## v2.201.0 (2023-12-20) ### Features diff --git a/VERSION b/VERSION index 9c20e02093..ac44a2625b 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -2.201.1.dev0 +2.202.0 From d5f60b923e2e5a657462bc5dd72034ae3769c60b Mon Sep 17 00:00:00 2001 From: ci Date: Thu, 21 Dec 2023 19:52:36 +0000 Subject: [PATCH 20/76] update development version to v2.202.1.dev0 --- VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VERSION b/VERSION index ac44a2625b..b97ecdc293 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -2.202.0 +2.202.1.dev0 From bfd6a5dc81d7e10170b5e7f7a022802187c37c9a Mon Sep 17 00:00:00 2001 From: martinRenou Date: Thu, 21 Dec 2023 22:48:15 +0100 Subject: [PATCH 21/76] Using logging instead of prints (#4133) --- src/sagemaker/base_predictor.py | 5 +- src/sagemaker/cli/compatibility/v2/files.py | 5 +- src/sagemaker/fw_utils.py | 2 +- .../inference_recommender_mixin.py | 19 +-- src/sagemaker/local/entities.py | 21 ++-- src/sagemaker/local/image.py | 4 +- src/sagemaker/model.py | 11 +- .../model_monitor/clarify_model_monitoring.py | 37 +++--- .../model_monitor/model_monitoring.py | 100 ++++++++------- .../model_monitor/monitoring_files.py | 17 ++- src/sagemaker/session.py | 118 +++++++++--------- tests/integ/timeout.py | 27 ++-- 12 files changed, 189 insertions(+), 177 deletions(-) diff --git a/src/sagemaker/base_predictor.py b/src/sagemaker/base_predictor.py index 3baecd8014..99ef6ef55f 100644 --- a/src/sagemaker/base_predictor.py +++ b/src/sagemaker/base_predictor.py @@ -63,6 +63,9 @@ LOGGER = logging.getLogger("sagemaker") +logger = logging.getLogger(__name__) + + class PredictorBase(abc.ABC): """An object that encapsulates a deployed model.""" @@ -714,7 +717,7 @@ def list_monitors(self): endpoint_name=self.endpoint_name ) if len(monitoring_schedules_dict["MonitoringScheduleSummaries"]) == 0: - print("No monitors found for endpoint. endpoint: {}".format(self.endpoint_name)) + logger.debug("No monitors found for endpoint. 
endpoint: %s", self.endpoint_name) return [] monitors = [] diff --git a/src/sagemaker/cli/compatibility/v2/files.py b/src/sagemaker/cli/compatibility/v2/files.py index 0d118b1c73..c231b06ced 100644 --- a/src/sagemaker/cli/compatibility/v2/files.py +++ b/src/sagemaker/cli/compatibility/v2/files.py @@ -22,7 +22,8 @@ from sagemaker.cli.compatibility.v2.ast_transformer import ASTTransformer -LOGGER = logging.getLogger(__name__) +# Setting LOGGER for backward compatibility, in case users import this... +logger = LOGGER = logging.getLogger(__name__) class FileUpdater(object): @@ -59,7 +60,7 @@ def _make_output_dirs_if_needed(self): os.makedirs(output_dir) if os.path.exists(self.output_path): - LOGGER.warning("Overwriting file %s", self.output_path) + logger.warning("Overwriting file %s", self.output_path) class PyFileUpdater(FileUpdater): diff --git a/src/sagemaker/fw_utils.py b/src/sagemaker/fw_utils.py index 4c97032384..d33b71ad10 100644 --- a/src/sagemaker/fw_utils.py +++ b/src/sagemaker/fw_utils.py @@ -474,7 +474,7 @@ def tar_and_upload_dir( if s3_resource is None: s3_resource = session.resource("s3", region_name=session.region_name) else: - print("Using provided s3_resource") + logger.debug("Using provided s3_resource") s3_resource.Object(bucket, key).upload_file(tar_file, ExtraArgs=extra_args) finally: diff --git a/src/sagemaker/inference_recommender/inference_recommender_mixin.py b/src/sagemaker/inference_recommender/inference_recommender_mixin.py index ed3357a532..cd9a1abb58 100644 --- a/src/sagemaker/inference_recommender/inference_recommender_mixin.py +++ b/src/sagemaker/inference_recommender/inference_recommender_mixin.py @@ -28,7 +28,8 @@ "mxnet": "MXNET", } -LOGGER = logging.getLogger("sagemaker") +# Setting LOGGER for backward compatibility, in case users import it... +logger = LOGGER = logging.getLogger("sagemaker") class Phase: @@ -145,10 +146,10 @@ def right_size( ) if endpoint_configurations or traffic_pattern or stopping_conditions or resource_limit: - LOGGER.info("Advanced Job parameters were specified. Running Advanced job...") + logger.info("Advanced Job parameters were specified. Running Advanced job...") job_type = "Advanced" else: - LOGGER.info("Advanced Job parameters were not specified. Running Default job...") + logger.info("Advanced Job parameters were not specified. Running Default job...") job_type = "Default" self._init_sagemaker_session_if_does_not_exist() @@ -173,7 +174,7 @@ def right_size( vpc_config=self.vpc_config, enable_network_isolation=self.enable_network_isolation(), ) - LOGGER.warning("Attempting to create new model with name %s", self.name) + logger.warning("Attempting to create new model with name %s", self.name) self.sagemaker_session.create_model(**create_model_args) ret_name = self.sagemaker_session.create_inference_recommendations_job( @@ -281,23 +282,23 @@ def _update_params_for_right_size( if accelerator_type: raise ValueError("accelerator_type is not compatible with right_size().") if instance_type or initial_instance_count: - LOGGER.warning( + logger.warning( "instance_type or initial_instance_count specified." "Overriding right_size() recommendations." ) return None if async_inference_config: - LOGGER.warning( + logger.warning( "async_inference_config is specified. Overriding right_size() recommendations." ) return None if serverless_inference_config: - LOGGER.warning( + logger.warning( "serverless_inference_config is specified. Overriding right_size() recommendations." 
) return None if explainer_config: - LOGGER.warning( + logger.warning( "explainer_config is specified. Overriding right_size() recommendations." ) return None @@ -359,7 +360,7 @@ def _update_params_for_recommendation_id( """ if instance_type is not None and initial_instance_count is not None: - LOGGER.warning( + logger.warning( "Both instance_type and initial_instance_count are specified," "overriding the recommendation result." ) diff --git a/src/sagemaker/local/entities.py b/src/sagemaker/local/entities.py index 89c9c7025d..3eb4ab2b34 100644 --- a/src/sagemaker/local/entities.py +++ b/src/sagemaker/local/entities.py @@ -683,8 +683,10 @@ def start(self, **kwargs): ) self._executions[execution_id] = execution - print( - f"Starting execution for pipeline {self.pipeline.name}. Execution ID is {execution_id}" + logger.info( + "Starting execution for pipeline %s. Execution ID is %s", + self.pipeline.name, + execution_id, ) self.last_modified_time = datetime.datetime.now().timestamp() @@ -771,31 +773,32 @@ def update_execution_success(self): """Mark execution as succeeded.""" self.status = _LocalExecutionStatus.SUCCEEDED.value self.last_modified_time = datetime.datetime.now().timestamp() - print(f"Pipeline execution {self.pipeline_execution_name} SUCCEEDED") + logger.info("Pipeline execution %s SUCCEEDED", self.pipeline_execution_name) def update_execution_failure(self, step_name, failure_message): """Mark execution as failed.""" self.status = _LocalExecutionStatus.FAILED.value self.failure_reason = f"Step '{step_name}' failed with message: {failure_message}" self.last_modified_time = datetime.datetime.now().timestamp() - print( - f"Pipeline execution {self.pipeline_execution_name} FAILED because step " - f"'{step_name}' failed." + logger.info( + "Pipeline execution %s FAILED because step '%s' failed.", + self.pipeline_execution_name, + step_name, ) def update_step_properties(self, step_name, step_properties): """Update pipeline step execution output properties.""" self.step_execution.get(step_name).update_step_properties(step_properties) - print(f"Pipeline step '{step_name}' SUCCEEDED.") + logger.info("Pipeline step '%s' SUCCEEDED.", step_name) def update_step_failure(self, step_name, failure_message): """Mark step_name as failed.""" - print(f"Pipeline step '{step_name}' FAILED. Failure message is: {failure_message}") + logger.info("Pipeline step '%s' FAILED. Failure message is: %s", step_name, failure_message) self.step_execution.get(step_name).update_step_failure(failure_message) def mark_step_executing(self, step_name): """Update pipelines step's status to EXECUTING and start_time to now.""" - print(f"Starting pipeline step: '{step_name}'") + logger.info("Starting pipeline step: '%s'", step_name) self.step_execution.get(step_name).mark_step_executing() def _initialize_step_execution(self, steps): diff --git a/src/sagemaker/local/image.py b/src/sagemaker/local/image.py index 98a5a7c629..22a15c0570 100644 --- a/src/sagemaker/local/image.py +++ b/src/sagemaker/local/image.py @@ -230,7 +230,7 @@ def process( # Print our Job Complete line to have a similar experience to training on SageMaker where # you see this line at the end. - print("===== Job Complete =====") + logger.info("===== Job Complete =====") def train(self, input_data_config, output_data_config, hyperparameters, environment, job_name): """Run a training job locally using docker-compose. 
@@ -310,7 +310,7 @@ def train(self, input_data_config, output_data_config, hyperparameters, environm # Print our Job Complete line to have a similar experience to training on SageMaker where # you see this line at the end. - print("===== Job Complete =====") + logger.info("===== Job Complete =====") return artifacts def serve(self, model_dir, environment): diff --git a/src/sagemaker/model.py b/src/sagemaker/model.py index 9caca5feff..56f68372ae 100644 --- a/src/sagemaker/model.py +++ b/src/sagemaker/model.py @@ -77,7 +77,8 @@ from sagemaker.enums import EndpointType from sagemaker.session import get_add_model_package_inference_args -LOGGER = logging.getLogger("sagemaker") +# Setting LOGGER for backward compatibility, in case users import it... +logger = LOGGER = logging.getLogger("sagemaker") NEO_ALLOWED_FRAMEWORKS = set( ["mxnet", "tensorflow", "keras", "pytorch", "onnx", "xgboost", "tflite"] @@ -737,7 +738,7 @@ def _upload_code(self, key_prefix: str, repack: bool = False) -> None: script_name=os.path.basename(self.entry_point), ) - LOGGER.info( + logger.info( "Repacking model artifact (%s), script artifact " "(%s), and dependencies (%s) " "into single tar.gz file located at %s. " @@ -1258,13 +1259,13 @@ def compile( self.image_uri = job_status.get("InferenceImage", None) self._is_compiled_model = True else: - LOGGER.warning( + logger.warning( "The instance type %s is not supported for deployment via SageMaker." "Please deploy the model manually.", target_instance_family, ) else: - LOGGER.warning( + logger.warning( "Devices described by Target Platform OS, Architecture and Accelerator are not" "supported for deployment via SageMaker. Please deploy the model manually." ) @@ -1484,7 +1485,7 @@ def deploy( and instance_type.startswith("ml.inf") and not self._is_compiled_model ): - LOGGER.warning( + logger.warning( "Your model is not compiled. Please compile your model before using Inferentia." ) diff --git a/src/sagemaker/model_monitor/clarify_model_monitoring.py b/src/sagemaker/model_monitor/clarify_model_monitoring.py index 8905731b08..bc572827cd 100644 --- a/src/sagemaker/model_monitor/clarify_model_monitoring.py +++ b/src/sagemaker/model_monitor/clarify_model_monitoring.py @@ -29,7 +29,8 @@ from sagemaker.clarify import SageMakerClarifyProcessor, ModelPredictedLabelConfig from sagemaker.lineage._utils import get_resource_name_from_arn -_LOGGER = logging.getLogger(__name__) +# Setting _LOGGER for backward compatibility, in case users import it... +logger = _LOGGER = logging.getLogger(__name__) class ClarifyModelMonitor(mm.ModelMonitor): @@ -223,7 +224,7 @@ def _upload_analysis_config(self, analysis_config, output_s3_uri, job_definition str(uuid.uuid4()), "analysis_config.json", ) - _LOGGER.info("Uploading analysis config to {s3_uri}.") + logger.info("Uploading analysis config to {s3_uri}.") return s3.S3Uploader.upload_string_as_file_body( json.dumps(analysis_config), desired_s3_uri=s3_uri, @@ -604,7 +605,7 @@ def create_monitoring_schedule( "Monitoring Schedule. To create another, first delete the existing one " "using my_monitor.delete_monitoring_schedule()." ) - _LOGGER.error(message) + logger.error(message) raise ValueError(message) if (batch_transform_input is not None) ^ (endpoint_input is None): @@ -613,7 +614,7 @@ def create_monitoring_schedule( "Amazon Model Monitoring Schedule. 
" "Please provide only one of the above required inputs" ) - _LOGGER.error(message) + logger.error(message) raise ValueError(message) self._check_monitoring_schedule_cron_validity( @@ -667,7 +668,7 @@ def create_monitoring_schedule( self.job_definition_name = new_job_definition_name self.monitoring_schedule_name = monitor_schedule_name except Exception: - _LOGGER.exception("Failed to create monitoring schedule.") + logger.exception("Failed to create monitoring schedule.") # noinspection PyBroadException try: self.sagemaker_session.sagemaker_client.delete_model_bias_job_definition( @@ -675,7 +676,7 @@ def create_monitoring_schedule( ) except Exception: # pylint: disable=W0703 message = "Failed to delete job definition {}.".format(new_job_definition_name) - _LOGGER.exception(message) + logger.exception(message) raise # noinspection PyMethodOverriding @@ -756,7 +757,7 @@ def update_monitoring_schedule( "Amazon Model Monitoring Schedule. " "Please provide only one of the above required inputs" ) - _LOGGER.error(message) + logger.error(message) raise ValueError(message) # Only need to update schedule expression @@ -820,7 +821,7 @@ def update_monitoring_schedule( if network_config is not None: self.network_config = network_config except Exception: - _LOGGER.exception("Failed to update monitoring schedule.") + logger.exception("Failed to update monitoring schedule.") # noinspection PyBroadException try: self.sagemaker_session.sagemaker_client.delete_model_bias_job_definition( @@ -828,7 +829,7 @@ def update_monitoring_schedule( ) except Exception: # pylint: disable=W0703 message = "Failed to delete job definition {}.".format(new_job_definition_name) - _LOGGER.exception(message) + logger.exception(message) raise def delete_monitoring_schedule(self): @@ -838,7 +839,7 @@ def delete_monitoring_schedule(self): message = "Deleting Model Bias Job Definition with name: {}".format( self.job_definition_name ) - _LOGGER.info(message) + logger.info(message) self.sagemaker_session.sagemaker_client.delete_model_bias_job_definition( JobDefinitionName=self.job_definition_name ) @@ -1045,7 +1046,7 @@ def create_monitoring_schedule( "Monitoring Schedule. To create another, first delete the existing one " "using my_monitor.delete_monitoring_schedule()." ) - _LOGGER.error(message) + logger.error(message) raise ValueError(message) if (batch_transform_input is not None) ^ (endpoint_input is None): @@ -1054,7 +1055,7 @@ def create_monitoring_schedule( "Amazon Model Monitoring Schedule." "Please provide only one of the above required inputs" ) - _LOGGER.error(message) + logger.error(message) raise ValueError(message) self._check_monitoring_schedule_cron_validity( @@ -1107,7 +1108,7 @@ def create_monitoring_schedule( self.job_definition_name = new_job_definition_name self.monitoring_schedule_name = monitor_schedule_name except Exception: - _LOGGER.exception("Failed to create monitoring schedule.") + logger.exception("Failed to create monitoring schedule.") # noinspection PyBroadException try: self.sagemaker_session.sagemaker_client.delete_model_explainability_job_definition( @@ -1115,7 +1116,7 @@ def create_monitoring_schedule( ) except Exception: # pylint: disable=W0703 message = "Failed to delete job definition {}.".format(new_job_definition_name) - _LOGGER.exception(message) + logger.exception(message) raise # noinspection PyMethodOverriding @@ -1198,7 +1199,7 @@ def update_monitoring_schedule( "Amazon Model Monitoring Schedule. 
" "Please provide only one of the above required inputs" ) - _LOGGER.error(message) + logger.error(message) raise ValueError(message) # Only need to update schedule expression @@ -1265,7 +1266,7 @@ def update_monitoring_schedule( if network_config is not None: self.network_config = network_config except Exception: - _LOGGER.exception("Failed to update monitoring schedule.") + logger.exception("Failed to update monitoring schedule.") # noinspection PyBroadException try: self.sagemaker_session.sagemaker_client.delete_model_explainability_job_definition( @@ -1273,7 +1274,7 @@ def update_monitoring_schedule( ) except Exception: # pylint: disable=W0703 message = "Failed to delete job definition {}.".format(new_job_definition_name) - _LOGGER.exception(message) + logger.exception(message) raise def delete_monitoring_schedule(self): @@ -1283,7 +1284,7 @@ def delete_monitoring_schedule(self): message = "Deleting Model Explainability Job Definition with name: {}".format( self.job_definition_name ) - _LOGGER.info(message) + logger.info(message) self.sagemaker_session.sagemaker_client.delete_model_explainability_job_definition( JobDefinitionName=self.job_definition_name ) diff --git a/src/sagemaker/model_monitor/model_monitoring.py b/src/sagemaker/model_monitor/model_monitoring.py index a28eaa184b..b949c6538b 100644 --- a/src/sagemaker/model_monitor/model_monitoring.py +++ b/src/sagemaker/model_monitor/model_monitoring.py @@ -103,7 +103,8 @@ _PROBABILITY_THRESHOLD_ATTRIBUTE_ENV_NAME = "probability_threshold_attribute" _CATEGORICAL_DRIFT_METHOD_ENV_NAME = "categorical_drift_method" -_LOGGER = logging.getLogger(__name__) +# Setting _LOGGER for backward compatibility, in case users import it... +logger = _LOGGER = logging.getLogger(__name__) framework_name = "model-monitor" @@ -348,7 +349,7 @@ def create_monitoring_schedule( "Monitoring Schedule. To create another, first delete the existing one " "using my_monitor.delete_monitoring_schedule()." ) - print(message) + logger.warning(message) raise ValueError(message) if not output: @@ -360,7 +361,7 @@ def create_monitoring_schedule( "Amazon Model Monitoring Schedule. " "Please provide only one of the above required inputs" ) - _LOGGER.error(message) + logger.error(message) raise ValueError(message) self._check_monitoring_schedule_cron_validity( @@ -518,7 +519,7 @@ def update_monitoring_schedule( "Amazon Model Monitoring Schedule. " "Please provide atmost one of the above required inputs" ) - _LOGGER.error(message) + logger.error(message) raise ValueError(message) if endpoint_input is not None: @@ -696,10 +697,9 @@ def latest_monitoring_statistics(self, file_name=STATISTICS_JSON_DEFAULT_FILE_NA """ executions = self.list_executions() if len(executions) == 0: - print( - "No executions found for schedule. monitoring_schedule_name: {}".format( - self.monitoring_schedule_name - ) + logger.warning( + "No executions found for schedule. monitoring_schedule_name: %s", + self.monitoring_schedule_name, ) return None @@ -724,10 +724,9 @@ def latest_monitoring_constraint_violations( """ executions = self.list_executions() if len(executions) == 0: - print( - "No executions found for schedule. monitoring_schedule_name: {}".format( - self.monitoring_schedule_name - ) + logger.warning( + "No executions found for schedule. monitoring_schedule_name: %s", + self.monitoring_schedule_name, ) return None @@ -770,10 +769,9 @@ def list_executions(self): ) if len(monitoring_executions_dict["MonitoringExecutionSummaries"]) == 0: - print( - "No executions found for schedule. 
monitoring_schedule_name: {}".format( - self.monitoring_schedule_name - ) + logger.warning( + "No executions found for schedule. monitoring_schedule_name: %s", + self.monitoring_schedule_name, ) return [] @@ -833,7 +831,7 @@ def update_monitoring_alert( if self.monitoring_schedule_name is None: message = "Nothing to update, please create a schedule first." - _LOGGER.error(message) + logger.error(message) raise ValueError(message) if not data_points_to_alert and not evaluation_period: @@ -862,7 +860,7 @@ def list_monitoring_alerts( """ if self.monitoring_schedule_name is None: message = "No alert to list, please create a schedule first." - _LOGGER.warning(message) + logger.warning(message) return [], None monitoring_alert_dict: Dict = self.sagemaker_session.list_monitoring_alerts( @@ -931,7 +929,7 @@ def list_monitoring_alert_history( """ if self.monitoring_schedule_name is None: message = "No alert history to list, please create a schedule first." - _LOGGER.warning(message) + logger.warning(message) return [], None monitoring_alert_history_dict: Dict = self.sagemaker_session.list_monitoring_alert_history( @@ -1554,7 +1552,7 @@ def _create_monitoring_schedule_from_job_definition( for the one time monitoring schedule (NOW), e.g. "-PT1H" (default: None) """ message = "Creating Monitoring Schedule with name: {}".format(monitor_schedule_name) - _LOGGER.info(message) + logger.info(message) self._check_monitoring_schedule_cron_validity( schedule_cron_expression=schedule_cron_expression, @@ -1653,7 +1651,7 @@ def _update_monitoring_schedule( """ if self.job_definition_name is None or self.monitoring_schedule_name is None: message = "Nothing to update, please create a schedule first." - _LOGGER.error(message) + logger.error(message) raise ValueError(message) self._check_monitoring_schedule_cron_validity( @@ -1991,7 +1989,7 @@ def create_monitoring_schedule( "Monitoring Schedule. To create another, first delete the existing one " "using my_monitor.delete_monitoring_schedule()." ) - _LOGGER.error(message) + logger.error(message) raise ValueError(message) if (batch_transform_input is not None) ^ (endpoint_input is None): @@ -2000,7 +1998,7 @@ def create_monitoring_schedule( "Amazon Model Monitoring Schedule. " "Please provide only one of the above required inputs" ) - _LOGGER.error(message) + logger.error(message) raise ValueError(message) self._check_monitoring_schedule_cron_validity( @@ -2054,7 +2052,7 @@ def create_monitoring_schedule( self.job_definition_name = new_job_definition_name self.monitoring_schedule_name = monitor_schedule_name except Exception: - _LOGGER.exception("Failed to create monitoring schedule.") + logger.exception("Failed to create monitoring schedule.") # noinspection PyBroadException try: self.sagemaker_session.sagemaker_client.delete_data_quality_job_definition( @@ -2062,7 +2060,7 @@ def create_monitoring_schedule( ) except Exception: # pylint: disable=W0703 message = "Failed to delete job definition {}.".format(new_job_definition_name) - _LOGGER.exception(message) + logger.exception(message) raise def update_monitoring_schedule( @@ -2143,7 +2141,7 @@ def update_monitoring_schedule( "Amazon Model Monitoring Schedule. 
" "Please provide atmost one of the above required inputs" ) - _LOGGER.error(message) + logger.error(message) raise ValueError(message) # check if this schedule is in v2 format and update as per v2 format if it is @@ -2430,7 +2428,7 @@ def _update_data_quality_monitoring_schedule( if network_config is not None: self.network_config = network_config except Exception: - _LOGGER.exception("Failed to update monitoring schedule.") + logger.exception("Failed to update monitoring schedule.") # noinspection PyBroadException try: self.sagemaker_session.sagemaker_client.delete_data_quality_job_definition( @@ -2438,7 +2436,7 @@ def _update_data_quality_monitoring_schedule( ) except Exception: # pylint: disable=W0703 message = "Failed to delete job definition {}.".format(new_job_definition_name) - _LOGGER.exception(message) + logger.exception(message) raise def delete_monitoring_schedule(self): @@ -2449,7 +2447,7 @@ def delete_monitoring_schedule(self): message = "Deleting Data Quality Job Definition with name: {}".format( self.job_definition_name ) - _LOGGER.info(message) + logger.info(message) self.sagemaker_session.sagemaker_client.delete_data_quality_job_definition( JobDefinitionName=self.job_definition_name ) @@ -2570,10 +2568,9 @@ def latest_monitoring_statistics(self): """ executions = self.list_executions() if len(executions) == 0: - print( - "No executions found for schedule. monitoring_schedule_name: {}".format( - self.monitoring_schedule_name - ) + logger.warning( + "No executions found for schedule. monitoring_schedule_name: %s", + self.monitoring_schedule_name, ) return None @@ -2583,9 +2580,10 @@ def latest_monitoring_statistics(self): return latest_monitoring_execution.statistics() except ClientError: status = latest_monitoring_execution.describe()["ProcessingJobStatus"] - print( - "Unable to retrieve statistics as job is in status '{}'. Latest statistics only " - "available for completed executions.".format(status) + logger.warning( + "Unable to retrieve statistics as job is in status '%s'. Latest statistics only " + "available for completed executions.", + status, ) def latest_monitoring_constraint_violations(self): @@ -2600,10 +2598,9 @@ def latest_monitoring_constraint_violations(self): """ executions = self.list_executions() if len(executions) == 0: - print( - "No executions found for schedule. monitoring_schedule_name: {}".format( - self.monitoring_schedule_name - ) + logger.warning( + "No executions found for schedule. monitoring_schedule_name: %s", + self.monitoring_schedule_name, ) return None @@ -2612,9 +2609,10 @@ def latest_monitoring_constraint_violations(self): return latest_monitoring_execution.constraint_violations() except ClientError: status = latest_monitoring_execution.describe()["ProcessingJobStatus"] - print( - "Unable to retrieve constraint violations as job is in status '{}'. Latest " - "violations only available for completed executions.".format(status) + logger.warning( + "Unable to retrieve constraint violations as job is in status '%s'. Latest " + "violations only available for completed executions.", + status, ) @staticmethod @@ -3109,7 +3107,7 @@ def create_monitoring_schedule( "Monitoring Schedule. To create another, first delete the existing one " "using my_monitor.delete_monitoring_schedule()." ) - _LOGGER.error(message) + logger.error(message) raise ValueError(message) if (batch_transform_input is not None) ^ (endpoint_input is None): @@ -3118,7 +3116,7 @@ def create_monitoring_schedule( "Amazon Model Monitoring Schedule. 
" "Please provide only one of the above required inputs" ) - _LOGGER.error(message) + logger.error(message) raise ValueError(message) self._check_monitoring_schedule_cron_validity( @@ -3173,7 +3171,7 @@ def create_monitoring_schedule( self.job_definition_name = new_job_definition_name self.monitoring_schedule_name = monitor_schedule_name except Exception: - _LOGGER.exception("Failed to create monitoring schedule.") + logger.exception("Failed to create monitoring schedule.") # noinspection PyBroadException try: self.sagemaker_session.sagemaker_client.delete_model_quality_job_definition( @@ -3181,7 +3179,7 @@ def create_monitoring_schedule( ) except Exception: # pylint: disable=W0703 message = "Failed to delete job definition {}.".format(new_job_definition_name) - _LOGGER.exception(message) + logger.exception(message) raise def update_monitoring_schedule( @@ -3279,7 +3277,7 @@ def update_monitoring_schedule( "Amazon Model Monitoring Schedule. " "Please provide atmost one of the above required inputs" ) - _LOGGER.error(message) + logger.error(message) raise ValueError(message) # Need to update schedule with a new job definition @@ -3340,7 +3338,7 @@ def update_monitoring_schedule( if network_config is not None: self.network_config = network_config except Exception: - _LOGGER.exception("Failed to update monitoring schedule.") + logger.exception("Failed to update monitoring schedule.") # noinspection PyBroadException try: self.sagemaker_session.sagemaker_client.delete_model_quality_job_definition( @@ -3348,7 +3346,7 @@ def update_monitoring_schedule( ) except Exception: # pylint: disable=W0703 message = "Failed to delete job definition {}.".format(new_job_definition_name) - _LOGGER.exception(message) + logger.exception(message) raise def delete_monitoring_schedule(self): @@ -3358,7 +3356,7 @@ def delete_monitoring_schedule(self): message = "Deleting Model Quality Job Definition with name: {}".format( self.job_definition_name ) - _LOGGER.info(message) + logger.info(message) self.sagemaker_session.sagemaker_client.delete_model_quality_job_definition( JobDefinitionName=self.job_definition_name ) diff --git a/src/sagemaker/model_monitor/monitoring_files.py b/src/sagemaker/model_monitor/monitoring_files.py index 90ec627087..6768a5270f 100644 --- a/src/sagemaker/model_monitor/monitoring_files.py +++ b/src/sagemaker/model_monitor/monitoring_files.py @@ -18,6 +18,7 @@ from __future__ import print_function, absolute_import import json +import logging import os import uuid @@ -28,6 +29,8 @@ NO_SUCH_KEY_CODE = "NoSuchKey" +logger = logging.getLogger(__name__) + class ModelMonitoringFile(object): """Represents a file with a body and an S3 uri.""" @@ -123,11 +126,12 @@ def from_s3_uri(cls, statistics_file_s3_uri, kms_key=None, sagemaker_session=Non ) ) except ClientError as error: - print( - "\nCould not retrieve statistics file at location '{}'. " + logger.warning( + "\nCould not retrieve statistics file at location '%s'. " "To manually retrieve Statistics object from a given uri, " "use 'my_model_monitor.statistics(my_s3_uri)' or " - "'Statistics.from_s3_uri(my_s3_uri)'".format(statistics_file_s3_uri) + "'Statistics.from_s3_uri(my_s3_uri)'", + statistics_file_s3_uri, ) raise error @@ -253,11 +257,12 @@ def from_s3_uri(cls, constraints_file_s3_uri, kms_key=None, sagemaker_session=No ) ) except ClientError as error: - print( - "\nCould not retrieve constraints file at location '{}'. " + logger.warning( + "\nCould not retrieve constraints file at location '%s'. 
" "To manually retrieve Constraints object from a given uri, " "use 'my_model_monitor.constraints(my_s3_uri)' or " - "'Constraints.from_s3_uri(my_s3_uri)'".format(constraints_file_s3_uri) + "'Constraints.from_s3_uri(my_s3_uri)'", + constraints_file_s3_uri, ) raise error diff --git a/src/sagemaker/session.py b/src/sagemaker/session.py index 5b5df7a792..28097a72bf 100644 --- a/src/sagemaker/session.py +++ b/src/sagemaker/session.py @@ -135,7 +135,8 @@ from sagemaker import exceptions from sagemaker.session_settings import SessionSettings -LOGGER = logging.getLogger("sagemaker") +# Setting LOGGER for backward compatibility, in case users import it... +logger = LOGGER = logging.getLogger("sagemaker") NOTEBOOK_METADATA_FILE = "/opt/ml/metadata/resource-metadata.json" MODEL_MONITOR_ONE_TIME_SCHEDULE = "NOW" @@ -485,7 +486,7 @@ def download_data(self, path, bucket, key_prefix="", extra_args=None): response = s3.list_objects_v2(**request_parameters) contents = response.get("Contents", None) if not contents: - LOGGER.info( + logger.info( "Nothing to download from bucket: %s, key_prefix: %s.", bucket, key_prefix ) return [] @@ -630,7 +631,7 @@ def _create_s3_bucket_if_it_does_not_exist(self, bucket_name, region): CreateBucketConfiguration={"LocationConstraint": region}, ) - LOGGER.info("Created S3 bucket: %s", bucket_name) + logger.info("Created S3 bucket: %s", bucket_name) except ClientError as e: error_code = e.response["Error"]["Code"] message = e.response["Error"]["Message"] @@ -645,7 +646,7 @@ def _create_s3_bucket_if_it_does_not_exist(self, bucket_name, region): else: raise elif error_code == "403" and message == "Forbidden": - LOGGER.error( + logger.error( "Bucket %s exists, but access is forbidden. Please try again after " "adding appropriate access.", bucket.name, @@ -966,8 +967,8 @@ def train( # noqa: C901 ) def submit(request): - LOGGER.info("Creating training-job with name: %s", job_name) - LOGGER.debug("train request: %s", json.dumps(request, indent=4)) + logger.info("Creating training-job with name: %s", job_name) + logger.debug("train request: %s", json.dumps(request, indent=4)) self.sagemaker_client.create_training_job(**request) self._intercept_create_request(train_request, submit, self.train.__name__) @@ -1276,8 +1277,8 @@ def update_training_job( resource_config=resource_config, remote_debug_config=remote_debug_config, ) - LOGGER.info("Updating training job with name %s", job_name) - LOGGER.debug("Update request: %s", json.dumps(update_training_job_request, indent=4)) + logger.info("Updating training job with name %s", job_name) + logger.debug("Update request: %s", json.dumps(update_training_job_request, indent=4)) self.sagemaker_client.update_training_job(**update_training_job_request) def _get_update_training_job_request( @@ -1444,8 +1445,8 @@ def process( ) def submit(request): - LOGGER.info("Creating processing-job with name %s", job_name) - LOGGER.debug("process request: %s", json.dumps(request, indent=4)) + logger.info("Creating processing-job with name %s", job_name) + logger.debug("process request: %s", json.dumps(request, indent=4)) self.sagemaker_client.create_processing_job(**request) self._intercept_create_request(process_request, submit, self.process.__name__) @@ -1723,8 +1724,8 @@ def create_monitoring_schedule( if tags is not None: monitoring_schedule_request["Tags"] = tags - LOGGER.info("Creating monitoring schedule name %s.", monitoring_schedule_name) - LOGGER.debug( + logger.info("Creating monitoring schedule name %s.", monitoring_schedule_name) + logger.debug( 
"monitoring_schedule_request= %s", json.dumps(monitoring_schedule_request, indent=4) ) self.sagemaker_client.create_monitoring_schedule(**monitoring_schedule_request) @@ -2059,8 +2060,8 @@ def update_monitoring_schedule( "NetworkConfig" ] = _network_config - LOGGER.info("Updating monitoring schedule with name: %s .", monitoring_schedule_name) - LOGGER.debug( + logger.info("Updating monitoring schedule with name: %s .", monitoring_schedule_name) + logger.debug( "monitoring_schedule_request= %s", json.dumps(monitoring_schedule_request, indent=4) ) self.sagemaker_client.update_monitoring_schedule(**monitoring_schedule_request) @@ -2072,8 +2073,7 @@ def start_monitoring_schedule(self, monitoring_schedule_name): monitoring_schedule_name (str): The name of the Amazon SageMaker Monitoring Schedule to start. """ - print() - print("Starting Monitoring Schedule with name: {}".format(monitoring_schedule_name)) + logger.info("Starting Monitoring Schedule with name: %s", monitoring_schedule_name) self.sagemaker_client.start_monitoring_schedule( MonitoringScheduleName=monitoring_schedule_name ) @@ -2085,8 +2085,7 @@ def stop_monitoring_schedule(self, monitoring_schedule_name): monitoring_schedule_name (str): The name of the Amazon SageMaker Monitoring Schedule to stop. """ - print() - print("Stopping Monitoring Schedule with name: {}".format(monitoring_schedule_name)) + logger.info("Stopping Monitoring Schedule with name: %s", monitoring_schedule_name) self.sagemaker_client.stop_monitoring_schedule( MonitoringScheduleName=monitoring_schedule_name ) @@ -2098,8 +2097,7 @@ def delete_monitoring_schedule(self, monitoring_schedule_name): monitoring_schedule_name (str): The name of the Amazon SageMaker Monitoring Schedule to delete. """ - print() - print("Deleting Monitoring Schedule with name: {}".format(monitoring_schedule_name)) + logger.info("Deleting Monitoring Schedule with name: %s", monitoring_schedule_name) self.sagemaker_client.delete_monitoring_schedule( MonitoringScheduleName=monitoring_schedule_name ) @@ -2397,8 +2395,8 @@ def auto_ml( ) def submit(request): - LOGGER.info("Creating auto-ml-job with name: %s", job_name) - LOGGER.debug("auto ml request: %s", json.dumps(request), indent=4) + logger.info("Creating auto-ml-job with name: %s", job_name) + logger.debug("auto ml request: %s", json.dumps(request), indent=4) self.sagemaker_client.create_auto_ml_job(**request) self._intercept_create_request(auto_ml_job_request, submit, self.auto_ml.__name__) @@ -2684,7 +2682,7 @@ def compile_model( if tags is not None: compilation_job_request["Tags"] = tags - LOGGER.info("Creating compilation-job with name: %s", job_name) + logger.info("Creating compilation-job with name: %s", job_name) self.sagemaker_client.create_compilation_job(**compilation_job_request) def package_model_for_edge( @@ -2736,7 +2734,7 @@ def package_model_for_edge( if resource_key is not None: edge_packaging_job_request["ResourceKey"] = resource_key - LOGGER.info("Creating edge-packaging-job with name: %s", job_name) + logger.info("Creating edge-packaging-job with name: %s", job_name) self.sagemaker_client.create_edge_packaging_job(**edge_packaging_job_request) def tune( # noqa: C901 @@ -2935,8 +2933,8 @@ def tune( # noqa: C901 if tags is not None: tune_request["Tags"] = tags - LOGGER.info("Creating hyperparameter tuning job with name: %s", job_name) - LOGGER.debug("tune request: %s", json.dumps(tune_request, indent=4)) + logger.info("Creating hyperparameter tuning job with name: %s", job_name) + logger.debug("tune request: %s", 
json.dumps(tune_request, indent=4)) self.sagemaker_client.create_hyper_parameter_tuning_job(**tune_request) def create_tuning_job( @@ -2989,8 +2987,8 @@ def create_tuning_job( ) def submit(request): - LOGGER.info("Creating hyperparameter tuning job with name: %s", job_name) - LOGGER.debug("tune request: %s", json.dumps(request, indent=4)) + logger.info("Creating hyperparameter tuning job with name: %s", job_name) + logger.debug("tune request: %s", json.dumps(request, indent=4)) self.sagemaker_client.create_hyper_parameter_tuning_job(**request) self._intercept_create_request(tune_request, submit, self.create_tuning_job.__name__) @@ -3346,15 +3344,15 @@ def stop_tuning_job(self, name): ClientError: If an error occurs while trying to stop the hyperparameter tuning job. """ try: - LOGGER.info("Stopping tuning job: %s", name) + logger.info("Stopping tuning job: %s", name) self.sagemaker_client.stop_hyper_parameter_tuning_job(HyperParameterTuningJobName=name) except ClientError as e: error_code = e.response["Error"]["Code"] # allow to pass if the job already stopped if error_code == "ValidationException": - LOGGER.info("Tuning job: %s is already stopped or not running.", name) + logger.info("Tuning job: %s is already stopped or not running.", name) else: - LOGGER.error( + logger.error( "Error occurred while attempting to stop tuning job: %s. Please try again.", name, ) @@ -3554,8 +3552,8 @@ def transform( ) def submit(request): - LOGGER.info("Creating transform job with name: %s", job_name) - LOGGER.debug("Transform request: %s", json.dumps(request, indent=4)) + logger.info("Creating transform job with name: %s", job_name) + logger.debug("Transform request: %s", json.dumps(request, indent=4)) self.sagemaker_client.create_transform_job(**request) self._intercept_create_request(transform_request, submit, self.transform.__name__) @@ -3697,8 +3695,8 @@ def create_model( ) def submit(request): - LOGGER.info("Creating model with name: %s", name) - LOGGER.debug("CreateModel request: %s", json.dumps(request, indent=4)) + logger.info("Creating model with name: %s", name) + logger.debug("CreateModel request: %s", json.dumps(request, indent=4)) try: self.sagemaker_client.create_model(**request) except ClientError as e: @@ -3708,7 +3706,7 @@ def submit(request): error_code == "ValidationException" and "Cannot create already existing model" in message ): - LOGGER.warning("Using already existing model: %s", name) + logger.warning("Using already existing model: %s", name) else: raise @@ -3808,14 +3806,14 @@ def create_model_package_from_algorithm(self, name, description, algorithm_arn, }, } try: - LOGGER.info("Creating model package with name: %s", name) + logger.info("Creating model package with name: %s", name) self.sagemaker_client.create_model_package(**request) except ClientError as e: error_code = e.response["Error"]["Code"] message = e.response["Error"]["Message"] if error_code == "ValidationException" and "ModelPackage already exists" in message: - LOGGER.warning("Using already existing model package: %s", name) + logger.warning("Using already existing model package: %s", name) else: raise @@ -4059,7 +4057,7 @@ def create_endpoint_config( Returns: str: Name of the endpoint point configuration created. 
""" - LOGGER.info("Creating endpoint-config with name %s", name) + logger.info("Creating endpoint-config with name %s", name) tags = tags or [] provided_production_variant = production_variant( @@ -4161,7 +4159,7 @@ def create_endpoint_config_from_existing( Returns: str: Name of the endpoint point configuration created. """ - LOGGER.info("Creating endpoint-config with name %s", new_config_name) + logger.info("Creating endpoint-config with name %s", new_config_name) existing_endpoint_config_desc = self.sagemaker_client.describe_endpoint_config( EndpointConfigName=existing_config_name @@ -4275,7 +4273,7 @@ def create_endpoint(self, endpoint_name, config_name, tags=None, wait=True, live Returns: str: Name of the Amazon SageMaker ``Endpoint`` created. """ - LOGGER.info("Creating endpoint with name %s", endpoint_name) + logger.info("Creating endpoint with name %s", endpoint_name) tags = tags or [] tags = _append_project_tags(tags) @@ -4366,7 +4364,7 @@ def delete_endpoint(self, endpoint_name): Args: endpoint_name (str): Name of the Amazon SageMaker ``Endpoint`` to delete. """ - LOGGER.info("Deleting endpoint with name: %s", endpoint_name) + logger.info("Deleting endpoint with name: %s", endpoint_name) self.sagemaker_client.delete_endpoint(EndpointName=endpoint_name) def delete_endpoint_config(self, endpoint_config_name): @@ -4376,7 +4374,7 @@ def delete_endpoint_config(self, endpoint_config_name): endpoint_config_name (str): Name of the Amazon SageMaker endpoint configuration to delete. """ - LOGGER.info("Deleting endpoint configuration with name: %s", endpoint_config_name) + logger.info("Deleting endpoint configuration with name: %s", endpoint_config_name) self.sagemaker_client.delete_endpoint_config(EndpointConfigName=endpoint_config_name) def create_inference_component( @@ -4710,7 +4708,7 @@ def delete_model(self, model_name): Args: model_name (str): Name of the Amazon SageMaker model to delete. """ - LOGGER.info("Deleting model with name: %s", model_name) + logger.info("Deleting model with name: %s", model_name) self.sagemaker_client.delete_model(ModelName=model_name) def list_group_resources(self, group, filters, next_token: str = ""): @@ -4829,7 +4827,7 @@ def list_tags(self, resource_arn, max_results=50): non_aws_tags.append(tag) return non_aws_tags except ClientError as error: - print("Error retrieving tags. resource_arn: {}".format(resource_arn)) + logger.error("Error retrieving tags. resource_arn: %s", resource_arn) raise error def wait_for_job(self, job, poll=5): @@ -4963,15 +4961,15 @@ def stop_transform_job(self, name): ClientError: If an error occurs while trying to stop the batch transform job. 
""" try: - LOGGER.info("Stopping transform job: %s", name) + logger.info("Stopping transform job: %s", name) self.sagemaker_client.stop_transform_job(TransformJobName=name) except ClientError as e: error_code = e.response["Error"]["Code"] # allow to pass if the job already stopped if error_code == "ValidationException": - LOGGER.info("Transform job: %s is already stopped or not running.", name) + logger.info("Transform job: %s is already stopped or not running.", name) else: - LOGGER.error("Error occurred while attempting to stop transform job: %s.", name) + logger.error("Error occurred while attempting to stop transform job: %s.", name) raise def wait_for_endpoint(self, endpoint, poll=DEFAULT_EP_POLL, live_logging=False): @@ -5373,7 +5371,7 @@ def endpoint_from_production_variants( if role is not None: config_options["ExecutionRoleArn"] = role - LOGGER.info("Creating endpoint-config with name %s", name) + logger.info("Creating endpoint-config with name %s", name) self.sagemaker_client.create_endpoint_config(**config_options) return self.create_endpoint( @@ -5437,7 +5435,7 @@ def get_caller_identity_arn(self): domain_desc = self.sagemaker_client.describe_domain(DomainId=domain_id) return domain_desc["DefaultUserSettings"]["ExecutionRole"] except ClientError: - LOGGER.debug( + logger.debug( "Couldn't call 'describe_notebook_instance' to get the Role " "ARN of the instance %s.", instance_name, @@ -5456,7 +5454,7 @@ def get_caller_identity_arn(self): try: role = self.boto_session.client("iam").get_role(RoleName=role_name)["Role"]["Arn"] except ClientError: - LOGGER.warning( + logger.warning( "Couldn't call 'get_role' to get Role ARN from role name %s to get Role path.", role_name, ) @@ -5465,7 +5463,7 @@ def get_caller_identity_arn(self): # Guessing this conditional's purpose was to handle lack of IAM permissions # https://github.com/aws/sagemaker-python-sdk/issues/2089#issuecomment-791802713 if "AmazonSageMaker-ExecutionRole" in assumed_role: - LOGGER.warning( + logger.warning( "Assuming role was created in SageMaker AWS console, " "as the name contains `AmazonSageMaker-ExecutionRole`. " "Defaulting to Role ARN with service-role in path. 
" @@ -6084,7 +6082,7 @@ def wait_for_athena_query(self, query_execution_id: str, poll: int = 5): .get("State") ) while query_state not in ("SUCCEEDED", "FAILED"): - LOGGER.info("Query %s is being executed.", query_execution_id) + logger.info("Query %s is being executed.", query_execution_id) time.sleep(poll) query_state = ( self.get_query_execution(query_execution_id=query_execution_id) @@ -6093,9 +6091,9 @@ def wait_for_athena_query(self, query_execution_id: str, poll: int = 5): .get("State") ) if query_state == "SUCCEEDED": - LOGGER.info("Query %s successfully executed.", query_execution_id) + logger.info("Query %s successfully executed.", query_execution_id) else: - LOGGER.error("Failed to execute query %s.", query_execution_id) + logger.error("Failed to execute query %s.", query_execution_id) def download_athena_query_result( self, @@ -6351,8 +6349,8 @@ def create_inference_recommendations_job( ) def submit(request): - LOGGER.info("Creating Inference Recommendations job with name: %s", job_name) - LOGGER.debug("process request: %s", json.dumps(request, indent=4)) + logger.info("Creating Inference Recommendations job with name: %s", job_name) + logger.debug("process request: %s", json.dumps(request, indent=4)) self.sagemaker_client.create_inference_recommendations_job(**request) self._intercept_create_request( @@ -7386,7 +7384,7 @@ def _wait_until_training_done(callable_fn, desc, poll=5): # access policy based on resource tags, The caveat here is for true AccessDenied # cases the routine will fail after 5 mins if err.response["Error"]["Code"] == "AccessDeniedException" and elapsed_time <= 300: - LOGGER.warning( + logger.warning( "Received AccessDeniedException. This could mean the IAM role does not " "have the resource permissions, in which case please add resource access " "and retry. For cases where the role has tag based resource policy, " @@ -7412,7 +7410,7 @@ def _wait_until(callable_fn, poll=5): # access policy based on resource tags, The caveat here is for true AccessDenied # cases the routine will fail after 5 mins if err.response["Error"]["Code"] == "AccessDeniedException" and elapsed_time <= 300: - LOGGER.warning( + logger.warning( "Received AccessDeniedException. This could mean the IAM role does not " "have the resource permissions, in which case please add resource access " "and retry. For cases where the role has tag based resource policy, " @@ -7627,7 +7625,7 @@ def _check_job_status(job, desc, status_key_name): status = _STATUS_CODE_TABLE.get(status, status) if status == "Stopped": - LOGGER.warning( + logger.warning( "Job ended with status 'Stopped' rather than 'Completed'. " "This could mean the job timed out or stopped early for some other reason: " "Consider checking whether it completed as you expect." diff --git a/tests/integ/timeout.py b/tests/integ/timeout.py index b86249eaf8..49f447e5a3 100644 --- a/tests/integ/timeout.py +++ b/tests/integ/timeout.py @@ -23,7 +23,8 @@ from sagemaker import Predictor from tests.integ.retry import retries -LOGGER = logging.getLogger("timeout") +# Setting LOGGER for backward compatibility, in case users import it... 
+logger = LOGGER = logging.getLogger("timeout") @contextmanager @@ -73,7 +74,7 @@ def timeout_and_delete_endpoint_by_name( sagemaker_session=sagemaker_session, endpoint_name=endpoint_name ) sagemaker_session.delete_endpoint(endpoint_name) - LOGGER.info("deleted endpoint {}".format(endpoint_name)) + logger.info("deleted endpoint %s", endpoint_name) _show_logs(endpoint_name, "Endpoints", sagemaker_session) if no_errors: @@ -111,7 +112,7 @@ def timeout_and_delete_model_with_transformer( attempts -= 1 try: transformer.delete_model() - LOGGER.info("deleted SageMaker model {}".format(transformer.model_name)) + logger.info("deleted SageMaker model %s", transformer.model_name) _show_logs(transformer.model_name, "Models", sagemaker_session) if no_errors: @@ -147,7 +148,7 @@ def timeout_and_delete_model_by_name( attempts -= 1 try: sagemaker_session.delete_model(model_name) - LOGGER.info("deleted model {}".format(model_name)) + logger.info("deleted model %s", model_name) _show_logs(model_name, "Models", sagemaker_session) if no_errors: @@ -200,10 +201,10 @@ def _delete_schedules_associated_with_endpoint(sagemaker_session, endpoint_name) # Delete schedules. monitor.delete_monitoring_schedule() except Exception as e: - LOGGER.warning( - "Failed to delete monitor {},\nError: {}".format( - monitor.monitoring_schedule_name, e - ) + logger.warning( + "Failed to delete monitor %s,\nError: %s", + monitor.monitoring_schedule_name, + e, ) @@ -211,7 +212,7 @@ def _show_logs(resource_name, resource_type, sagemaker_session): log_group = "/aws/sagemaker/{}/{}".format(resource_type, resource_name) try: # print out logs before deletion for debuggability - LOGGER.info("cloudwatch logs for log group {}:".format(log_group)) + logger.info("cloudwatch logs for log group %s:", log_group) logs = AWSLogs( log_group_name=log_group, log_stream_name="ALL", @@ -220,7 +221,7 @@ def _show_logs(resource_name, resource_type, sagemaker_session): ) logs.list_logs() except Exception: - LOGGER.exception( + logger.exception( "Failure occurred while listing cloudwatch log group %s. Swallowing exception but printing " "stacktrace for debugging.", log_group, @@ -231,12 +232,12 @@ def _cleanup_logs(resource_name, resource_type, sagemaker_session): log_group = "/aws/sagemaker/{}/{}".format(resource_type, resource_name) try: # print out logs before deletion for debuggability - LOGGER.info("deleting cloudwatch log group {}:".format(log_group)) + logger.info("deleting cloudwatch log group %s:", log_group) cwl_client = sagemaker_session.boto_session.client("logs") cwl_client.delete_log_group(logGroupName=log_group) - LOGGER.info("deleted cloudwatch log group: {}".format(log_group)) + logger.info("deleted cloudwatch log group: %s", log_group) except Exception: - LOGGER.exception( + logger.exception( "Failure occurred while cleaning up cloudwatch log group %s. " "Swallowing exception but printing stacktrace for debugging.", log_group, From 27f9c883bfb5efb37d681feeaea983722188800e Mon Sep 17 00:00:00 2001 From: Duc Trung Le Date: Thu, 21 Dec 2023 22:52:06 +0100 Subject: [PATCH 22/76] documentation: update issue template. (#4337) --- .github/ISSUE_TEMPLATE/bug_report.md | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md index 036f6a3e9e..048133d265 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.md +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -12,6 +12,7 @@ A clear and concise description of what the bug is. 
**To reproduce** A clear, step-by-step set of instructions to reproduce the bug. +The provided code need to be **complete** and **runnable**, if additional data is needed, please include them in the issue. **Expected behavior** A clear and concise description of what you expected to happen. From e08e5fb647029f92ec41a2ac024baddc6c96eddc Mon Sep 17 00:00:00 2001 From: Duc Trung Le Date: Fri, 22 Dec 2023 01:33:50 +0100 Subject: [PATCH 23/76] change: update model path in local mode (#4296) * Update model path in local mode * Add test --- src/sagemaker/local/image.py | 1 + src/sagemaker/local/utils.py | 6 ++++-- tests/unit/sagemaker/local/test_local_utils.py | 12 ++++++++++++ 3 files changed, 17 insertions(+), 2 deletions(-) diff --git a/src/sagemaker/local/image.py b/src/sagemaker/local/image.py index 22a15c0570..60f28d3b0c 100644 --- a/src/sagemaker/local/image.py +++ b/src/sagemaker/local/image.py @@ -430,6 +430,7 @@ def retrieve_artifacts(self, compose_data, output_data_config, job_name): output_data_config["S3OutputPath"], job_name, self.sagemaker_session, + prefix="output", ) _delete_tree(model_artifacts) diff --git a/src/sagemaker/local/utils.py b/src/sagemaker/local/utils.py index 298c95acb6..16375de7d4 100644 --- a/src/sagemaker/local/utils.py +++ b/src/sagemaker/local/utils.py @@ -53,7 +53,7 @@ def copy_directory_structure(destination_directory, relative_path): os.makedirs(destination_directory, relative_path) -def move_to_destination(source, destination, job_name, sagemaker_session): +def move_to_destination(source, destination, job_name, sagemaker_session, prefix=""): """Move source to destination. Can handle uploading to S3. @@ -64,6 +64,8 @@ def move_to_destination(source, destination, job_name, sagemaker_session): job_name (str): SageMaker job name. 
sagemaker_session (sagemaker.Session): a sagemaker_session to interact with S3 if needed + prefix (str, optional): the directory on S3 used to save files, default + to the root of ``destination`` Returns: (str): destination URI @@ -75,7 +77,7 @@ def move_to_destination(source, destination, job_name, sagemaker_session): final_uri = destination elif parsed_uri.scheme == "s3": bucket = parsed_uri.netloc - path = s3.s3_path_join(parsed_uri.path, job_name) + path = s3.s3_path_join(parsed_uri.path, job_name, prefix) final_uri = s3.s3_path_join("s3://", bucket, path) sagemaker_session.upload_data(source, bucket, path) else: diff --git a/tests/unit/sagemaker/local/test_local_utils.py b/tests/unit/sagemaker/local/test_local_utils.py index 2db8c83351..39b9e2b392 100644 --- a/tests/unit/sagemaker/local/test_local_utils.py +++ b/tests/unit/sagemaker/local/test_local_utils.py @@ -66,6 +66,18 @@ def test_move_to_destination_s3(recursive_copy): sms.upload_data.assert_called_with("/tmp/data", "bucket", "job") +@patch("shutil.rmtree", Mock()) +def test_move_to_destination_s3_with_prefix(): + sms = Mock( + settings=SessionSettings(), + ) + uri = sagemaker.local.utils.move_to_destination( + "/tmp/data", "s3://bucket/path", "job", sms, "foo_prefix" + ) + sms.upload_data.assert_called_with("/tmp/data", "bucket", "path/job/foo_prefix") + assert uri == "s3://bucket/path/job/foo_prefix" + + def test_move_to_destination_illegal_destination(): with pytest.raises(ValueError): sagemaker.local.utils.move_to_destination("/tmp/data", "ftp://ftp/in/2018", "job", None) From ce269e18a798dd0506c02b7b3773d958802dff8f Mon Sep 17 00:00:00 2001 From: sagemaker-bot Date: Fri, 22 Dec 2023 14:17:36 +0000 Subject: [PATCH 24/76] change: update image_uri_configs 12-22-2023 06:17:35 PST --- .../image_uri_config/blazingtext.json | 1 + .../factorization-machines.json | 1 + .../image_uri_config/forecasting-deepar.json | 1 + .../image_uri_config/image-classification.json | 1 + src/sagemaker/image_uri_config/ipinsights.json | 1 + src/sagemaker/image_uri_config/kmeans.json | 1 + src/sagemaker/image_uri_config/knn.json | 1 + .../image_uri_config/linear-learner.json | 1 + src/sagemaker/image_uri_config/ntm.json | 1 + .../image_uri_config/object-detection.json | 1 + src/sagemaker/image_uri_config/object2vec.json | 1 + src/sagemaker/image_uri_config/pca.json | 1 + .../image_uri_config/randomcutforest.json | 1 + .../semantic-segmentation.json | 1 + src/sagemaker/image_uri_config/seq2seq.json | 1 + src/sagemaker/image_uri_config/sklearn.json | 9 +++++++++ src/sagemaker/image_uri_config/xgboost.json | 18 ++++++++++++++++++ 17 files changed, 42 insertions(+) diff --git a/src/sagemaker/image_uri_config/blazingtext.json b/src/sagemaker/image_uri_config/blazingtext.json index 2c5601b356..eba76fc80c 100644 --- a/src/sagemaker/image_uri_config/blazingtext.json +++ b/src/sagemaker/image_uri_config/blazingtext.json @@ -18,6 +18,7 @@ "ap-southeast-3": "951798379941", "ap-southeast-4": "106583098589", "ca-central-1": "469771592824", + "ca-west-1": "190319476487", "cn-north-1": "390948362332", "cn-northwest-1": "387376663083", "eu-central-1": "813361260812", diff --git a/src/sagemaker/image_uri_config/factorization-machines.json b/src/sagemaker/image_uri_config/factorization-machines.json index 610f36e000..a97ef3b374 100644 --- a/src/sagemaker/image_uri_config/factorization-machines.json +++ b/src/sagemaker/image_uri_config/factorization-machines.json @@ -18,6 +18,7 @@ "ap-southeast-3": "951798379941", "ap-southeast-4": "106583098589", "ca-central-1": 
"469771592824", + "ca-west-1": "190319476487", "cn-north-1": "390948362332", "cn-northwest-1": "387376663083", "eu-central-1": "664544806723", diff --git a/src/sagemaker/image_uri_config/forecasting-deepar.json b/src/sagemaker/image_uri_config/forecasting-deepar.json index 1adf88d7f3..5bff449425 100644 --- a/src/sagemaker/image_uri_config/forecasting-deepar.json +++ b/src/sagemaker/image_uri_config/forecasting-deepar.json @@ -18,6 +18,7 @@ "ap-southeast-3": "951798379941", "ap-southeast-4": "106583098589", "ca-central-1": "469771592824", + "ca-west-1": "190319476487", "cn-north-1": "390948362332", "cn-northwest-1": "387376663083", "eu-central-1": "495149712605", diff --git a/src/sagemaker/image_uri_config/image-classification.json b/src/sagemaker/image_uri_config/image-classification.json index ae2fc4a31b..67c926f779 100644 --- a/src/sagemaker/image_uri_config/image-classification.json +++ b/src/sagemaker/image_uri_config/image-classification.json @@ -18,6 +18,7 @@ "ap-southeast-3": "951798379941", "ap-southeast-4": "106583098589", "ca-central-1": "469771592824", + "ca-west-1": "190319476487", "cn-north-1": "390948362332", "cn-northwest-1": "387376663083", "eu-central-1": "813361260812", diff --git a/src/sagemaker/image_uri_config/ipinsights.json b/src/sagemaker/image_uri_config/ipinsights.json index aa4012ce94..8840b01473 100644 --- a/src/sagemaker/image_uri_config/ipinsights.json +++ b/src/sagemaker/image_uri_config/ipinsights.json @@ -18,6 +18,7 @@ "ap-southeast-3": "951798379941", "ap-southeast-4": "106583098589", "ca-central-1": "469771592824", + "ca-west-1": "190319476487", "cn-north-1": "390948362332", "cn-northwest-1": "387376663083", "eu-central-1": "664544806723", diff --git a/src/sagemaker/image_uri_config/kmeans.json b/src/sagemaker/image_uri_config/kmeans.json index 091e99ada8..9b181a75f5 100644 --- a/src/sagemaker/image_uri_config/kmeans.json +++ b/src/sagemaker/image_uri_config/kmeans.json @@ -18,6 +18,7 @@ "ap-southeast-3": "951798379941", "ap-southeast-4": "106583098589", "ca-central-1": "469771592824", + "ca-west-1": "190319476487", "cn-north-1": "390948362332", "cn-northwest-1": "387376663083", "eu-central-1": "664544806723", diff --git a/src/sagemaker/image_uri_config/knn.json b/src/sagemaker/image_uri_config/knn.json index 7d54e730f4..4d561f694d 100644 --- a/src/sagemaker/image_uri_config/knn.json +++ b/src/sagemaker/image_uri_config/knn.json @@ -18,6 +18,7 @@ "ap-southeast-3": "951798379941", "ap-southeast-4": "106583098589", "ca-central-1": "469771592824", + "ca-west-1": "190319476487", "cn-north-1": "390948362332", "cn-northwest-1": "387376663083", "eu-central-1": "664544806723", diff --git a/src/sagemaker/image_uri_config/linear-learner.json b/src/sagemaker/image_uri_config/linear-learner.json index f59384e2af..c3dafc49bc 100644 --- a/src/sagemaker/image_uri_config/linear-learner.json +++ b/src/sagemaker/image_uri_config/linear-learner.json @@ -18,6 +18,7 @@ "ap-southeast-3": "951798379941", "ap-southeast-4": "106583098589", "ca-central-1": "469771592824", + "ca-west-1": "190319476487", "cn-north-1": "390948362332", "cn-northwest-1": "387376663083", "eu-central-1": "664544806723", diff --git a/src/sagemaker/image_uri_config/ntm.json b/src/sagemaker/image_uri_config/ntm.json index a942c68bbb..d753ccec48 100644 --- a/src/sagemaker/image_uri_config/ntm.json +++ b/src/sagemaker/image_uri_config/ntm.json @@ -18,6 +18,7 @@ "ap-southeast-3": "951798379941", "ap-southeast-4": "106583098589", "ca-central-1": "469771592824", + "ca-west-1": "190319476487", "cn-north-1": 
"390948362332", "cn-northwest-1": "387376663083", "eu-central-1": "664544806723", diff --git a/src/sagemaker/image_uri_config/object-detection.json b/src/sagemaker/image_uri_config/object-detection.json index 079ef594ec..d036f2ff15 100644 --- a/src/sagemaker/image_uri_config/object-detection.json +++ b/src/sagemaker/image_uri_config/object-detection.json @@ -18,6 +18,7 @@ "ap-southeast-3": "951798379941", "ap-southeast-4": "106583098589", "ca-central-1": "469771592824", + "ca-west-1": "190319476487", "cn-north-1": "390948362332", "cn-northwest-1": "387376663083", "eu-central-1": "813361260812", diff --git a/src/sagemaker/image_uri_config/object2vec.json b/src/sagemaker/image_uri_config/object2vec.json index be4258a207..53f6686945 100644 --- a/src/sagemaker/image_uri_config/object2vec.json +++ b/src/sagemaker/image_uri_config/object2vec.json @@ -18,6 +18,7 @@ "ap-southeast-3": "951798379941", "ap-southeast-4": "106583098589", "ca-central-1": "469771592824", + "ca-west-1": "190319476487", "cn-north-1": "390948362332", "cn-northwest-1": "387376663083", "eu-central-1": "664544806723", diff --git a/src/sagemaker/image_uri_config/pca.json b/src/sagemaker/image_uri_config/pca.json index 5b87591d9f..64792a8e7b 100644 --- a/src/sagemaker/image_uri_config/pca.json +++ b/src/sagemaker/image_uri_config/pca.json @@ -18,6 +18,7 @@ "ap-southeast-3": "951798379941", "ap-southeast-4": "106583098589", "ca-central-1": "469771592824", + "ca-west-1": "190319476487", "cn-north-1": "390948362332", "cn-northwest-1": "387376663083", "eu-central-1": "664544806723", diff --git a/src/sagemaker/image_uri_config/randomcutforest.json b/src/sagemaker/image_uri_config/randomcutforest.json index fe4b0cbf91..74ab6898cc 100644 --- a/src/sagemaker/image_uri_config/randomcutforest.json +++ b/src/sagemaker/image_uri_config/randomcutforest.json @@ -18,6 +18,7 @@ "ap-southeast-3": "951798379941", "ap-southeast-4": "106583098589", "ca-central-1": "469771592824", + "ca-west-1": "190319476487", "cn-north-1": "390948362332", "cn-northwest-1": "387376663083", "eu-central-1": "664544806723", diff --git a/src/sagemaker/image_uri_config/semantic-segmentation.json b/src/sagemaker/image_uri_config/semantic-segmentation.json index 37671ed7a1..e6e2b4350b 100644 --- a/src/sagemaker/image_uri_config/semantic-segmentation.json +++ b/src/sagemaker/image_uri_config/semantic-segmentation.json @@ -18,6 +18,7 @@ "ap-southeast-3": "951798379941", "ap-southeast-4": "106583098589", "ca-central-1": "469771592824", + "ca-west-1": "190319476487", "cn-north-1": "390948362332", "cn-northwest-1": "387376663083", "eu-central-1": "813361260812", diff --git a/src/sagemaker/image_uri_config/seq2seq.json b/src/sagemaker/image_uri_config/seq2seq.json index cc73055bb0..143f966a99 100644 --- a/src/sagemaker/image_uri_config/seq2seq.json +++ b/src/sagemaker/image_uri_config/seq2seq.json @@ -18,6 +18,7 @@ "ap-southeast-3": "951798379941", "ap-southeast-4": "106583098589", "ca-central-1": "469771592824", + "ca-west-1": "190319476487", "cn-north-1": "390948362332", "cn-northwest-1": "387376663083", "eu-central-1": "813361260812", diff --git a/src/sagemaker/image_uri_config/sklearn.json b/src/sagemaker/image_uri_config/sklearn.json index 7d25792e8c..656758d607 100644 --- a/src/sagemaker/image_uri_config/sklearn.json +++ b/src/sagemaker/image_uri_config/sklearn.json @@ -21,6 +21,7 @@ "ap-southeast-3": "951798379941", "ap-southeast-4": "106583098589", "ca-central-1": "341280168497", + "ca-west-1": "190319476487", "cn-north-1": "450853457545", "cn-northwest-1": 
"451049120500", "eu-central-1": "492215442770", @@ -66,6 +67,7 @@ "ap-southeast-3": "951798379941", "ap-southeast-4": "106583098589", "ca-central-1": "341280168497", + "ca-west-1": "190319476487", "cn-north-1": "450853457545", "cn-northwest-1": "451049120500", "eu-central-1": "492215442770", @@ -111,6 +113,7 @@ "ap-southeast-3": "951798379941", "ap-southeast-4": "106583098589", "ca-central-1": "341280168497", + "ca-west-1": "190319476487", "cn-north-1": "450853457545", "cn-northwest-1": "451049120500", "eu-central-1": "492215442770", @@ -156,6 +159,7 @@ "ap-southeast-3": "951798379941", "ap-southeast-4": "106583098589", "ca-central-1": "341280168497", + "ca-west-1": "190319476487", "cn-north-1": "450853457545", "cn-northwest-1": "451049120500", "eu-central-1": "492215442770", @@ -205,6 +209,7 @@ "ap-southeast-3": "951798379941", "ap-southeast-4": "106583098589", "ca-central-1": "341280168497", + "ca-west-1": "190319476487", "cn-north-1": "450853457545", "cn-northwest-1": "451049120500", "eu-central-1": "492215442770", @@ -250,6 +255,7 @@ "ap-southeast-3": "951798379941", "ap-southeast-4": "106583098589", "ca-central-1": "341280168497", + "ca-west-1": "190319476487", "cn-north-1": "450853457545", "cn-northwest-1": "451049120500", "eu-central-1": "492215442770", @@ -295,6 +301,7 @@ "ap-southeast-3": "951798379941", "ap-southeast-4": "106583098589", "ca-central-1": "341280168497", + "ca-west-1": "190319476487", "cn-north-1": "450853457545", "cn-northwest-1": "451049120500", "eu-central-1": "492215442770", @@ -340,6 +347,7 @@ "ap-southeast-3": "951798379941", "ap-southeast-4": "106583098589", "ca-central-1": "341280168497", + "ca-west-1": "190319476487", "cn-north-1": "450853457545", "cn-northwest-1": "451049120500", "eu-central-1": "492215442770", @@ -389,6 +397,7 @@ "ap-southeast-3": "951798379941", "ap-southeast-4": "106583098589", "ca-central-1": "341280168497", + "ca-west-1": "190319476487", "cn-north-1": "450853457545", "cn-northwest-1": "451049120500", "eu-central-1": "492215442770", diff --git a/src/sagemaker/image_uri_config/xgboost.json b/src/sagemaker/image_uri_config/xgboost.json index b3883eabe7..573a2db10e 100644 --- a/src/sagemaker/image_uri_config/xgboost.json +++ b/src/sagemaker/image_uri_config/xgboost.json @@ -62,6 +62,7 @@ "ap-southeast-3": "951798379941", "ap-southeast-4": "106583098589", "ca-central-1": "341280168497", + "ca-west-1": "190319476487", "cn-north-1": "450853457545", "cn-northwest-1": "451049120500", "eu-central-1": "492215442770", @@ -107,6 +108,7 @@ "ap-southeast-3": "951798379941", "ap-southeast-4": "106583098589", "ca-central-1": "341280168497", + "ca-west-1": "190319476487", "cn-north-1": "450853457545", "cn-northwest-1": "451049120500", "eu-central-1": "492215442770", @@ -152,6 +154,7 @@ "ap-southeast-3": "951798379941", "ap-southeast-4": "106583098589", "ca-central-1": "341280168497", + "ca-west-1": "190319476487", "cn-north-1": "450853457545", "cn-northwest-1": "451049120500", "eu-central-1": "492215442770", @@ -191,6 +194,7 @@ "ap-southeast-3": "951798379941", "ap-southeast-4": "106583098589", "ca-central-1": "341280168497", + "ca-west-1": "190319476487", "cn-north-1": "450853457545", "cn-northwest-1": "451049120500", "eu-central-1": "492215442770", @@ -230,6 +234,7 @@ "ap-southeast-3": "951798379941", "ap-southeast-4": "106583098589", "ca-central-1": "341280168497", + "ca-west-1": "190319476487", "cn-north-1": "450853457545", "cn-northwest-1": "451049120500", "eu-central-1": "492215442770", @@ -269,6 +274,7 @@ "ap-southeast-3": "951798379941", 
"ap-southeast-4": "106583098589", "ca-central-1": "341280168497", + "ca-west-1": "190319476487", "cn-north-1": "450853457545", "cn-northwest-1": "451049120500", "eu-central-1": "492215442770", @@ -308,6 +314,7 @@ "ap-southeast-3": "951798379941", "ap-southeast-4": "106583098589", "ca-central-1": "341280168497", + "ca-west-1": "190319476487", "cn-north-1": "450853457545", "cn-northwest-1": "451049120500", "eu-central-1": "492215442770", @@ -347,6 +354,7 @@ "ap-southeast-3": "951798379941", "ap-southeast-4": "106583098589", "ca-central-1": "341280168497", + "ca-west-1": "190319476487", "cn-north-1": "450853457545", "cn-northwest-1": "451049120500", "eu-central-1": "492215442770", @@ -437,6 +445,7 @@ "ap-southeast-3": "951798379941", "ap-southeast-4": "106583098589", "ca-central-1": "341280168497", + "ca-west-1": "190319476487", "cn-north-1": "450853457545", "cn-northwest-1": "451049120500", "eu-central-1": "492215442770", @@ -482,6 +491,7 @@ "ap-southeast-3": "951798379941", "ap-southeast-4": "106583098589", "ca-central-1": "341280168497", + "ca-west-1": "190319476487", "cn-north-1": "450853457545", "cn-northwest-1": "451049120500", "eu-central-1": "492215442770", @@ -527,6 +537,7 @@ "ap-southeast-3": "951798379941", "ap-southeast-4": "106583098589", "ca-central-1": "341280168497", + "ca-west-1": "190319476487", "cn-north-1": "450853457545", "cn-northwest-1": "451049120500", "eu-central-1": "492215442770", @@ -566,6 +577,7 @@ "ap-southeast-3": "951798379941", "ap-southeast-4": "106583098589", "ca-central-1": "341280168497", + "ca-west-1": "190319476487", "cn-north-1": "450853457545", "cn-northwest-1": "451049120500", "eu-central-1": "492215442770", @@ -605,6 +617,7 @@ "ap-southeast-3": "951798379941", "ap-southeast-4": "106583098589", "ca-central-1": "341280168497", + "ca-west-1": "190319476487", "cn-north-1": "450853457545", "cn-northwest-1": "451049120500", "eu-central-1": "492215442770", @@ -644,6 +657,7 @@ "ap-southeast-3": "951798379941", "ap-southeast-4": "106583098589", "ca-central-1": "341280168497", + "ca-west-1": "190319476487", "cn-north-1": "450853457545", "cn-northwest-1": "451049120500", "eu-central-1": "492215442770", @@ -683,6 +697,7 @@ "ap-southeast-3": "951798379941", "ap-southeast-4": "106583098589", "ca-central-1": "341280168497", + "ca-west-1": "190319476487", "cn-north-1": "450853457545", "cn-northwest-1": "451049120500", "eu-central-1": "492215442770", @@ -722,6 +737,7 @@ "ap-southeast-3": "951798379941", "ap-southeast-4": "106583098589", "ca-central-1": "341280168497", + "ca-west-1": "190319476487", "cn-north-1": "450853457545", "cn-northwest-1": "451049120500", "eu-central-1": "492215442770", @@ -765,6 +781,7 @@ "ap-southeast-3": "951798379941", "ap-southeast-4": "106583098589", "ca-central-1": "341280168497", + "ca-west-1": "190319476487", "cn-north-1": "450853457545", "cn-northwest-1": "451049120500", "eu-central-1": "492215442770", @@ -804,6 +821,7 @@ "ap-southeast-3": "951798379941", "ap-southeast-4": "106583098589", "ca-central-1": "341280168497", + "ca-west-1": "190319476487", "cn-north-1": "450853457545", "cn-northwest-1": "451049120500", "eu-central-1": "492215442770", From d0cb07166b377f0204aaf61887d86848587cf255 Mon Sep 17 00:00:00 2001 From: ci Date: Fri, 22 Dec 2023 18:29:00 +0000 Subject: [PATCH 25/76] prepare release v2.202.1 --- CHANGELOG.md | 12 ++++++++++++ VERSION | 2 +- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 965f933d24..ad5477d6db 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,17 @@ 
# Changelog +## v2.202.1 (2023-12-22) + +### Bug Fixes and Other Changes + + * update image_uri_configs 12-22-2023 06:17:35 PST + * update model path in local mode + * Using logging instead of prints + +### Documentation Changes + + * update issue template. + ## v2.202.0 (2023-12-21) ### Features diff --git a/VERSION b/VERSION index b97ecdc293..c4588d4f9a 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -2.202.1.dev0 +2.202.1 From 7c7fb173d07db371df3523cbe2ee576565b5b677 Mon Sep 17 00:00:00 2001 From: ci Date: Fri, 22 Dec 2023 18:29:02 +0000 Subject: [PATCH 26/76] update development version to v2.202.2.dev0 --- VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VERSION b/VERSION index c4588d4f9a..8a14f9bf50 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -2.202.1 +2.202.2.dev0 From 0a2f9e4679e742a5509e1cd9e945e63ffc026b20 Mon Sep 17 00:00:00 2001 From: Duc Trung Le Date: Fri, 22 Dec 2023 21:48:17 +0100 Subject: [PATCH 27/76] change: create role if needed in `get_execution_role` (#4323) * Create role if needed in get_execution_role * Add tests --- src/sagemaker/session.py | 42 ++++++++++++++++++++++++-- tests/unit/test_session.py | 62 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 102 insertions(+), 2 deletions(-) diff --git a/src/sagemaker/session.py b/src/sagemaker/session.py index 28097a72bf..fe0d259428 100644 --- a/src/sagemaker/session.py +++ b/src/sagemaker/session.py @@ -6902,13 +6902,16 @@ def production_variant( return production_variant_configuration -def get_execution_role(sagemaker_session=None): +def get_execution_role(sagemaker_session=None, use_default=False): """Return the role ARN whose credentials are used to call the API. Throws an exception if role doesn't exist. Args: - sagemaker_session(Session): Current sagemaker session + sagemaker_session (Session): Current sagemaker session. + use_default (bool): Use a default role if ``get_caller_identity_arn`` does not + return a correct role. This default role will be created if needed. + Defaults to ``False``. 
Returns: (str): The role ARN @@ -6919,6 +6922,41 @@ def get_execution_role(sagemaker_session=None): if ":role/" in arn: return arn + + if use_default: + default_role_name = "AmazonSageMaker-DefaultRole" + + LOGGER.warning("Using default role: %s", default_role_name) + + boto3_session = sagemaker_session.boto_session + permissions_policy = json.dumps( + { + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Principal": {"Service": ["sagemaker.amazonaws.com"]}, + "Action": "sts:AssumeRole", + } + ], + } + ) + iam_client = boto3_session.client("iam") + try: + iam_client.get_role(RoleName=default_role_name) + except iam_client.exceptions.NoSuchEntityException: + iam_client.create_role( + RoleName=default_role_name, AssumeRolePolicyDocument=str(permissions_policy) + ) + + LOGGER.warning("Created new sagemaker execution role: %s", default_role_name) + + iam_client.attach_role_policy( + PolicyArn="arn:aws:iam::aws:policy/AmazonSageMakerFullAccess", + RoleName=default_role_name, + ) + return iam_client.get_role(RoleName=default_role_name)["Role"]["Arn"] + message = ( "The current AWS identity is not a role: {}, therefore it cannot be used as a " "SageMaker execution role" diff --git a/tests/unit/test_session.py b/tests/unit/test_session.py index d3bba53504..d08a155c7c 100644 --- a/tests/unit/test_session.py +++ b/tests/unit/test_session.py @@ -15,6 +15,7 @@ import copy import datetime import io +import json import logging import os @@ -532,6 +533,67 @@ def test_get_execution_role_throws_exception_if_arn_is_not_role_with_role_in_nam assert "The current AWS identity is not a role" in str(error.value) +def test_get_execution_role_get_default_role(caplog): + session = Mock() + session.get_caller_identity_arn.return_value = "arn:aws:iam::369233609183:user/marcos" + + iam_client = Mock() + iam_client.get_role.return_value = {"Role": {"Arn": "foo-role"}} + boto_session = Mock() + boto_session.client.return_value = iam_client + + session.boto_session = boto_session + actual = get_execution_role(session, use_default=True) + + iam_client.get_role.assert_called_with(RoleName="AmazonSageMaker-DefaultRole") + iam_client.attach_role_policy.assert_called_with( + PolicyArn="arn:aws:iam::aws:policy/AmazonSageMakerFullAccess", + RoleName="AmazonSageMaker-DefaultRole", + ) + assert "Using default role: AmazonSageMaker-DefaultRole" in caplog.text + assert actual == "foo-role" + + +def test_get_execution_role_create_default_role(caplog): + session = Mock() + session.get_caller_identity_arn.return_value = "arn:aws:iam::369233609183:user/marcos" + permissions_policy = json.dumps( + { + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Principal": {"Service": ["sagemaker.amazonaws.com"]}, + "Action": "sts:AssumeRole", + } + ], + } + ) + iam_client = Mock() + iam_client.exceptions.NoSuchEntityException = Exception + iam_client.get_role = Mock(side_effect=[Exception(), {"Role": {"Arn": "foo-role"}}]) + + boto_session = Mock() + boto_session.client.return_value = iam_client + + session.boto_session = boto_session + + actual = get_execution_role(session, use_default=True) + + iam_client.create_role.assert_called_with( + RoleName="AmazonSageMaker-DefaultRole", AssumeRolePolicyDocument=str(permissions_policy) + ) + + iam_client.attach_role_policy.assert_called_with( + PolicyArn="arn:aws:iam::aws:policy/AmazonSageMakerFullAccess", + RoleName="AmazonSageMaker-DefaultRole", + ) + + assert "Created new sagemaker execution role: AmazonSageMaker-DefaultRole" in caplog.text + + assert actual == 
"foo-role" + + @patch( "six.moves.builtins.open", mock_open(read_data='{"ResourceName": "SageMakerInstance"}'), From 069b41cb7dd8466defd341451ffcc4e84240e297 Mon Sep 17 00:00:00 2001 From: martinRenou Date: Fri, 22 Dec 2023 21:56:17 +0100 Subject: [PATCH 28/76] Change: More pythonic tags (#4327) * Change: More pythonic tags * Fix broken tags * More tags formatting and add a test * Fix tests --- src/sagemaker/algorithm.py | 9 +- src/sagemaker/apiutils/_base_types.py | 5 +- src/sagemaker/automl/automl.py | 13 +-- src/sagemaker/base_predictor.py | 4 +- src/sagemaker/clarify.py | 7 +- src/sagemaker/djl_inference/model.py | 6 +- src/sagemaker/estimator.py | 41 ++++--- src/sagemaker/experiments/experiment.py | 9 +- src/sagemaker/experiments/run.py | 13 ++- src/sagemaker/experiments/trial.py | 9 +- src/sagemaker/experiments/trial_component.py | 9 +- src/sagemaker/feature_store/feature_group.py | 10 +- .../_event_bridge_rule_helper.py | 3 +- src/sagemaker/huggingface/model.py | 6 +- src/sagemaker/huggingface/processing.py | 5 +- src/sagemaker/jumpstart/estimator.py | 16 +-- src/sagemaker/jumpstart/factory/estimator.py | 10 +- src/sagemaker/jumpstart/factory/model.py | 6 +- src/sagemaker/jumpstart/model.py | 12 +- src/sagemaker/jumpstart/types.py | 14 +-- src/sagemaker/jumpstart/utils.py | 10 +- src/sagemaker/lineage/action.py | 5 +- src/sagemaker/lineage/artifact.py | 6 +- src/sagemaker/lineage/association.py | 3 +- src/sagemaker/lineage/context.py | 3 +- src/sagemaker/local/entities.py | 6 +- src/sagemaker/local/local_session.py | 16 ++- src/sagemaker/model.py | 43 ++++--- .../model_monitor/clarify_model_monitoring.py | 10 +- .../model_monitor/model_monitoring.py | 21 ++-- src/sagemaker/multidatamodel.py | 4 +- src/sagemaker/mxnet/processing.py | 5 +- src/sagemaker/pipeline.py | 5 +- src/sagemaker/predictor_async.py | 4 +- src/sagemaker/processing.py | 29 +++-- src/sagemaker/pytorch/processing.py | 5 +- src/sagemaker/remote_function/job.py | 20 ++-- src/sagemaker/session.py | 108 +++++++++--------- src/sagemaker/sklearn/processing.py | 8 +- src/sagemaker/spark/processing.py | 18 +-- src/sagemaker/tensorflow/estimator.py | 7 +- src/sagemaker/tensorflow/model.py | 3 +- src/sagemaker/tensorflow/processing.py | 5 +- src/sagemaker/transformer.py | 14 ++- src/sagemaker/tuner.py | 16 +-- src/sagemaker/utils.py | 14 ++- src/sagemaker/workflow/_utils.py | 7 +- src/sagemaker/workflow/airflow.py | 7 +- src/sagemaker/workflow/check_job_config.py | 5 +- src/sagemaker/workflow/function_step.py | 10 +- src/sagemaker/workflow/notebook_job_step.py | 13 +-- src/sagemaker/workflow/pipeline.py | 14 +-- src/sagemaker/workflow/step_collections.py | 6 +- src/sagemaker/wrangler/processing.py | 9 +- src/sagemaker/xgboost/processing.py | 5 +- tests/unit/sagemaker/model/test_model.py | 2 +- .../sagemaker/model/test_model_package.py | 4 +- .../sagemaker/tensorflow/test_estimator.py | 2 +- tests/unit/test_predictor.py | 2 +- tests/unit/test_predictor_async.py | 2 +- tests/unit/test_session.py | 19 ++- tests/unit/test_transformer.py | 4 +- 62 files changed, 400 insertions(+), 306 deletions(-) diff --git a/src/sagemaker/algorithm.py b/src/sagemaker/algorithm.py index f4124fff2a..a177b93f03 100644 --- a/src/sagemaker/algorithm.py +++ b/src/sagemaker/algorithm.py @@ -28,6 +28,7 @@ from sagemaker.session import Session from sagemaker.workflow.entities import PipelineVariable from sagemaker.workflow.pipeline_context import runnable_by_pipeline +from sagemaker.utils import format_tags, Tags from sagemaker.workflow import 
is_pipeline_variable @@ -58,7 +59,7 @@ def __init__( base_job_name: Optional[str] = None, sagemaker_session: Optional[Session] = None, hyperparameters: Optional[Dict[str, Union[str, PipelineVariable]]] = None, - tags: Optional[List[Dict[str, Union[str, PipelineVariable]]]] = None, + tags: Optional[Tags] = None, subnets: Optional[List[Union[str, PipelineVariable]]] = None, security_group_ids: Optional[List[Union[str, PipelineVariable]]] = None, model_uri: Optional[str] = None, @@ -121,7 +122,7 @@ def __init__( interactions with Amazon SageMaker APIs and any other AWS services needed. If not specified, the estimator creates one using the default AWS configuration chain. - tags (list[dict[str, str] or list[dict[str, PipelineVariable]]): List of tags for + tags (Union[Tags]): Tags for labeling a training job. For more, see https://docs.aws.amazon.com/sagemaker/latest/dg/API_Tag.html. subnets (list[str] or list[PipelineVariable]): List of subnet ids. If not specified @@ -170,7 +171,7 @@ def __init__( output_kms_key=output_kms_key, base_job_name=base_job_name, sagemaker_session=sagemaker_session, - tags=tags, + tags=format_tags(tags), subnets=subnets, security_group_ids=security_group_ids, model_uri=model_uri, @@ -391,7 +392,7 @@ def transformer( if self._is_marketplace(): transform_env = None - tags = tags or self.tags + tags = format_tags(tags) or self.tags else: raise RuntimeError("No finished training job found associated with this estimator") diff --git a/src/sagemaker/apiutils/_base_types.py b/src/sagemaker/apiutils/_base_types.py index 9a7359e12b..acee3d4d67 100644 --- a/src/sagemaker/apiutils/_base_types.py +++ b/src/sagemaker/apiutils/_base_types.py @@ -14,6 +14,7 @@ from __future__ import absolute_import from sagemaker.apiutils import _boto_functions, _utils +from sagemaker.utils import format_tags class ApiObject(object): @@ -194,13 +195,13 @@ def _set_tags(self, resource_arn=None, tags=None): Args: resource_arn (str): The arn of the Record - tags (dict): An array of Tag objects that set to Record + tags (Optional[Tags]): An array of Tag objects that set to Record Returns: A list of key, value pair objects. i.e. [{"key":"value"}] """ tag_list = self.sagemaker_session.sagemaker_client.add_tags( - ResourceArn=resource_arn, Tags=tags + ResourceArn=resource_arn, Tags=format_tags(tags) )["Tags"] return tag_list diff --git a/src/sagemaker/automl/automl.py b/src/sagemaker/automl/automl.py index ce71d50977..1413f3aa29 100644 --- a/src/sagemaker/automl/automl.py +++ b/src/sagemaker/automl/automl.py @@ -28,7 +28,7 @@ ) from sagemaker.job import _Job from sagemaker.session import Session -from sagemaker.utils import name_from_base, resolve_value_from_config +from sagemaker.utils import name_from_base, resolve_value_from_config, format_tags, Tags from sagemaker.workflow.entities import PipelineVariable from sagemaker.workflow.pipeline_context import runnable_by_pipeline @@ -127,7 +127,7 @@ def __init__( total_job_runtime_in_seconds: Optional[int] = None, job_objective: Optional[Dict[str, str]] = None, generate_candidate_definitions_only: Optional[bool] = False, - tags: Optional[List[Dict[str, str]]] = None, + tags: Optional[Tags] = None, content_type: Optional[str] = None, s3_data_type: Optional[str] = None, feature_specification_s3_uri: Optional[str] = None, @@ -167,8 +167,7 @@ def __init__( In the format of: {"MetricName": str} generate_candidate_definitions_only (bool): Whether to generates possible candidates without training the models. 
- tags (List[dict[str, str]]): The list of tags to attach to this - specific endpoint. + tags (Optional[Tags]): Tags to attach to this specific endpoint. content_type (str): The content type of the data from the input source. s3_data_type (str): The data type for S3 data source. Valid values: ManifestFile or S3Prefix. @@ -203,7 +202,7 @@ def __init__( self.target_attribute_name = target_attribute_name self.job_objective = job_objective self.generate_candidate_definitions_only = generate_candidate_definitions_only - self.tags = tags + self.tags = format_tags(tags) self.content_type = content_type self.s3_data_type = s3_data_type self.feature_specification_s3_uri = feature_specification_s3_uri @@ -581,7 +580,7 @@ def deploy( be selected on each ``deploy``. endpoint_name (str): The name of the endpoint to create (default: None). If not specified, a unique endpoint name will be created. - tags (List[dict[str, str]]): The list of tags to attach to this + tags (Optional[Tags]): The list of tags to attach to this specific endpoint. wait (bool): Whether the call should wait until the deployment of model completes (default: True). @@ -633,7 +632,7 @@ def deploy( deserializer=deserializer, endpoint_name=endpoint_name, kms_key=model_kms_key, - tags=tags, + tags=format_tags(tags), wait=wait, volume_size=volume_size, model_data_download_timeout=model_data_download_timeout, diff --git a/src/sagemaker/base_predictor.py b/src/sagemaker/base_predictor.py index 99ef6ef55f..882cfafc39 100644 --- a/src/sagemaker/base_predictor.py +++ b/src/sagemaker/base_predictor.py @@ -53,7 +53,7 @@ NumpySerializer, ) from sagemaker.session import production_variant, Session -from sagemaker.utils import name_from_base, stringify_object +from sagemaker.utils import name_from_base, stringify_object, format_tags from sagemaker.model_monitor.model_monitoring import DEFAULT_REPOSITORY_NAME @@ -409,7 +409,7 @@ def update_endpoint( self.sagemaker_session.create_endpoint_config_from_existing( current_endpoint_config_name, new_endpoint_config_name, - new_tags=tags, + new_tags=format_tags(tags), new_kms_key=kms_key, new_data_capture_config_dict=data_capture_config_dict, new_production_variants=production_variants, diff --git a/src/sagemaker/clarify.py b/src/sagemaker/clarify.py index 9421d0e419..11bc43c43a 100644 --- a/src/sagemaker/clarify.py +++ b/src/sagemaker/clarify.py @@ -33,6 +33,7 @@ from sagemaker.session import Session from sagemaker.network import NetworkConfig from sagemaker.processing import ProcessingInput, ProcessingOutput, Processor +from sagemaker.utils import format_tags, Tags logger = logging.getLogger(__name__) @@ -1417,7 +1418,7 @@ def __init__( max_runtime_in_seconds: Optional[int] = None, sagemaker_session: Optional[Session] = None, env: Optional[Dict[str, str]] = None, - tags: Optional[List[Dict[str, str]]] = None, + tags: Optional[Tags] = None, network_config: Optional[NetworkConfig] = None, job_name_prefix: Optional[str] = None, version: Optional[str] = None, @@ -1454,7 +1455,7 @@ def __init__( using the default AWS configuration chain. env (dict[str, str]): Environment variables to be passed to the processing jobs (default: None). - tags (list[dict]): List of tags to be passed to the processing job + tags (Optional[Tags]): Tags to be passed to the processing job (default: None). For more, see https://docs.aws.amazon.com/sagemaker/latest/dg/API_Tag.html. network_config (:class:`~sagemaker.network.NetworkConfig`): @@ -1482,7 +1483,7 @@ def __init__( None, # We set method-specific job names below. 
sagemaker_session, env, - tags, + format_tags(tags), network_config, ) diff --git a/src/sagemaker/djl_inference/model.py b/src/sagemaker/djl_inference/model.py index 118a4af5a0..8308215e81 100644 --- a/src/sagemaker/djl_inference/model.py +++ b/src/sagemaker/djl_inference/model.py @@ -30,7 +30,7 @@ from sagemaker.s3_utils import s3_path_join from sagemaker.serializers import JSONSerializer, BaseSerializer from sagemaker.session import Session -from sagemaker.utils import _tmpdir, _create_or_update_code_dir +from sagemaker.utils import _tmpdir, _create_or_update_code_dir, format_tags from sagemaker.workflow.entities import PipelineVariable from sagemaker.estimator import Estimator from sagemaker.s3 import S3Uploader @@ -610,7 +610,7 @@ def deploy( default deserializer is set by the ``predictor_cls``. endpoint_name (str): The name of the endpoint to create (default: None). If not specified, a unique endpoint name will be created. - tags (List[dict[str, str]]): The list of tags to attach to this + tags (Optional[Tags]): The list of tags to attach to this specific endpoint. kms_key (str): The ARN of the KMS key that is used to encrypt the data on the storage volume attached to the instance hosting the @@ -651,7 +651,7 @@ def deploy( serializer=serializer, deserializer=deserializer, endpoint_name=endpoint_name, - tags=tags, + tags=format_tags(tags), kms_key=kms_key, wait=wait, data_capture_config=data_capture_config, diff --git a/src/sagemaker/estimator.py b/src/sagemaker/estimator.py index 551a42ad55..f899570775 100644 --- a/src/sagemaker/estimator.py +++ b/src/sagemaker/estimator.py @@ -98,6 +98,8 @@ to_string, check_and_get_run_experiment_config, resolve_value_from_config, + format_tags, + Tags, ) from sagemaker.workflow import is_pipeline_variable from sagemaker.workflow.entities import PipelineVariable @@ -144,7 +146,7 @@ def __init__( output_kms_key: Optional[Union[str, PipelineVariable]] = None, base_job_name: Optional[str] = None, sagemaker_session: Optional[Session] = None, - tags: Optional[List[Dict[str, Union[str, PipelineVariable]]]] = None, + tags: Optional[Tags] = None, subnets: Optional[List[Union[str, PipelineVariable]]] = None, security_group_ids: Optional[List[Union[str, PipelineVariable]]] = None, model_uri: Optional[str] = None, @@ -270,8 +272,8 @@ def __init__( manages interactions with Amazon SageMaker APIs and any other AWS services needed. If not specified, the estimator creates one using the default AWS configuration chain. - tags (list[dict[str, str] or list[dict[str, PipelineVariable]]): - List of tags for labeling a training job. For more, see + tags (Optional[Tags]): + Tags for labeling a training job. For more, see https://docs.aws.amazon.com/sagemaker/latest/dg/API_Tag.html. subnets (list[str] or list[PipelineVariable]): List of subnet ids. If not specified training job will be created without VPC config. @@ -604,6 +606,7 @@ def __init__( else: self.sagemaker_session = sagemaker_session or Session() + tags = format_tags(tags) self.tags = ( add_jumpstart_uri_tags( tags=tags, training_model_uri=self.model_uri, training_script_uri=self.source_dir @@ -1352,7 +1355,7 @@ def compile_model( framework=None, framework_version=None, compile_max_run=15 * 60, - tags=None, + tags: Optional[Tags] = None, target_platform_os=None, target_platform_arch=None, target_platform_accelerator=None, @@ -1378,7 +1381,7 @@ def compile_model( compile_max_run (int): Timeout in seconds for compilation (default: 15 * 60). 
After this amount of time Amazon SageMaker Neo terminates the compilation job regardless of its current status. - tags (list[dict]): List of tags for labeling a compilation job. For + tags (list[dict]): Tags for labeling a compilation job. For more, see https://docs.aws.amazon.com/sagemaker/latest/dg/API_Tag.html. target_platform_os (str): Target Platform OS, for example: 'LINUX'. @@ -1420,7 +1423,7 @@ def compile_model( input_shape, output_path, self.role, - tags, + format_tags(tags), self._compilation_job_name(), compile_max_run, framework=framework, @@ -1532,7 +1535,7 @@ def deploy( model_name=None, kms_key=None, data_capture_config=None, - tags=None, + tags: Optional[Tags] = None, serverless_inference_config=None, async_inference_config=None, volume_size=None, @@ -1601,8 +1604,10 @@ def deploy( empty object passed through, will use pre-defined values in ``ServerlessInferenceConfig`` class to deploy serverless endpoint. Deploy an instance based endpoint if it's None. (default: None) - tags(List[dict[str, str]]): Optional. The list of tags to attach to this specific + tags(Optional[Tags]): Optional. Tags to attach to this specific endpoint. Example: + >>> tags = {'tagname', 'tagvalue'} + Or >>> tags = [{'Key': 'tagname', 'Value': 'tagvalue'}] For more information about tags, see https://boto3.amazonaws.com/v1/documentation\ @@ -1664,7 +1669,7 @@ def deploy( model.name = model_name tags = update_inference_tags_with_jumpstart_training_tags( - inference_tags=tags, training_tags=self.tags + inference_tags=format_tags(tags), training_tags=self.tags ) return model.deploy( @@ -2017,7 +2022,7 @@ def transformer( env=None, max_concurrent_transforms=None, max_payload=None, - tags=None, + tags: Optional[Tags] = None, role=None, volume_kms_key=None, vpc_config_override=vpc_utils.VPC_CONFIG_DEFAULT, @@ -2051,7 +2056,7 @@ def transformer( to be made to each individual transform container at one time. max_payload (int): Maximum size of the payload in a single HTTP request to the container in MB. - tags (list[dict]): List of tags for labeling a transform job. If + tags (Optional[Tags]): Tags for labeling a transform job. If none specified, then the tags used for the training job are used for the transform job. role (str): The ``ExecutionRoleArn`` IAM Role ARN for the ``Model``, @@ -2078,7 +2083,7 @@ def transformer( model. If not specified, the estimator generates a default job name based on the training image name and current timestamp. """ - tags = tags or self.tags + tags = format_tags(tags) or self.tags model_name = self._get_or_create_name(model_name) if self.latest_training_job is None: @@ -2717,7 +2722,7 @@ def __init__( base_job_name: Optional[str] = None, sagemaker_session: Optional[Session] = None, hyperparameters: Optional[Dict[str, Union[str, PipelineVariable]]] = None, - tags: Optional[List[Dict[str, Union[str, PipelineVariable]]]] = None, + tags: Optional[Tags] = None, subnets: Optional[List[Union[str, PipelineVariable]]] = None, security_group_ids: Optional[List[Union[str, PipelineVariable]]] = None, model_uri: Optional[str] = None, @@ -2847,7 +2852,7 @@ def __init__( hyperparameters. SageMaker rejects the training job request and returns an validation error for detected credentials, if such user input is found. - tags (list[dict[str, str] or list[dict[str, PipelineVariable]]): List of tags for + tags (Optional[Tags]): Tags for labeling a training job. For more, see https://docs.aws.amazon.com/sagemaker/latest/dg/API_Tag.html. 
subnets (list[str] or list[PipelineVariable]): List of subnet ids. @@ -3130,7 +3135,7 @@ def __init__( output_kms_key, base_job_name, sagemaker_session, - tags, + format_tags(tags), subnets, security_group_ids, model_uri=model_uri, @@ -3762,7 +3767,7 @@ def transformer( env=None, max_concurrent_transforms=None, max_payload=None, - tags=None, + tags: Optional[Tags] = None, role=None, model_server_workers=None, volume_kms_key=None, @@ -3798,7 +3803,7 @@ def transformer( to be made to each individual transform container at one time. max_payload (int): Maximum size of the payload in a single HTTP request to the container in MB. - tags (list[dict]): List of tags for labeling a transform job. If + tags (Optional[Tags]): Tags for labeling a transform job. If none specified, then the tags used for the training job are used for the transform job. role (str): The ``ExecutionRoleArn`` IAM Role ARN for the ``Model``, @@ -3837,7 +3842,7 @@ def transformer( SageMaker Batch Transform job. """ role = role or self.role - tags = tags or self.tags + tags = format_tags(tags) or self.tags model_name = self._get_or_create_name(model_name) if self.latest_training_job is not None: diff --git a/src/sagemaker/experiments/experiment.py b/src/sagemaker/experiments/experiment.py index 584fbed27e..6f33fafb0f 100644 --- a/src/sagemaker/experiments/experiment.py +++ b/src/sagemaker/experiments/experiment.py @@ -20,6 +20,7 @@ from sagemaker.apiutils import _base_types from sagemaker.experiments.trial import _Trial from sagemaker.experiments.trial_component import _TrialComponent +from sagemaker.utils import format_tags class Experiment(_base_types.Record): @@ -111,7 +112,7 @@ def create( manages interactions with Amazon SageMaker APIs and any other AWS services needed. If not specified, one is created using the default AWS configuration chain. - tags (List[Dict[str, str]]): A list of tags to associate with the experiment + tags (Optional[Tags]): A list of tags to associate with the experiment (default: None). Returns: @@ -122,7 +123,7 @@ def create( experiment_name=experiment_name, display_name=display_name, description=description, - tags=tags, + tags=format_tags(tags), sagemaker_session=sagemaker_session, ) @@ -149,7 +150,7 @@ def _load_or_create( manages interactions with Amazon SageMaker APIs and any other AWS services needed. If not specified, one is created using the default AWS configuration chain. - tags (List[Dict[str, str]]): A list of tags to associate with the experiment + tags (Optional[Tags]): A list of tags to associate with the experiment (default: None). This is used only when the given `experiment_name` does not exist and a new experiment has to be created. 
@@ -161,7 +162,7 @@ def _load_or_create( experiment_name=experiment_name, display_name=display_name, description=description, - tags=tags, + tags=format_tags(tags), sagemaker_session=sagemaker_session, ) except ClientError as ce: diff --git a/src/sagemaker/experiments/run.py b/src/sagemaker/experiments/run.py index bfef1191c3..6068880844 100644 --- a/src/sagemaker/experiments/run.py +++ b/src/sagemaker/experiments/run.py @@ -44,6 +44,9 @@ from sagemaker.utils import ( get_module, unique_name_from_base, + format_tags, + Tags, + TagsDict, ) from sagemaker.experiments._utils import ( @@ -97,7 +100,7 @@ def __init__( run_name: Optional[str] = None, experiment_display_name: Optional[str] = None, run_display_name: Optional[str] = None, - tags: Optional[List[Dict[str, str]]] = None, + tags: Optional[Tags] = None, sagemaker_session: Optional["Session"] = None, artifact_bucket: Optional[str] = None, artifact_prefix: Optional[str] = None, @@ -152,7 +155,7 @@ def __init__( run_display_name (str): The display name of the run used in UI (default: None). This display name is used in a create run call. If a run with the specified name already exists, this display name won't take effect. - tags (List[Dict[str, str]]): A list of tags to be used for all create calls, + tags (Optional[Tags]): Tags to be used for all create calls, e.g. to create an experiment, a run group, etc. (default: None). sagemaker_session (sagemaker.session.Session): Session object which manages interactions with Amazon SageMaker APIs and any other @@ -172,6 +175,8 @@ def __init__( # avoid confusion due to mis-match in casing between run name and TC name self.run_name = self.run_name.lower() + tags = format_tags(tags) + trial_component_name = Run._generate_trial_component_name( run_name=self.run_name, experiment_name=self.experiment_name ) @@ -676,11 +681,11 @@ def _extract_run_name_from_tc_name(trial_component_name: str, experiment_name: s ) @staticmethod - def _append_run_tc_label_to_tags(tags: Optional[List[Dict[str, str]]] = None) -> list: + def _append_run_tc_label_to_tags(tags: Optional[List[TagsDict]] = None) -> list: """Append the run trial component label to tags used to create a trial component. Args: - tags (List[Dict[str, str]]): The tags supplied by users to initialize a Run object. + tags (List[TagsDict]): The tags supplied by users to initialize a Run object. Returns: list: The updated tags with the appended run trial component label. diff --git a/src/sagemaker/experiments/trial.py b/src/sagemaker/experiments/trial.py index ce8deb4862..466ba39158 100644 --- a/src/sagemaker/experiments/trial.py +++ b/src/sagemaker/experiments/trial.py @@ -18,6 +18,7 @@ from sagemaker.apiutils import _base_types from sagemaker.experiments import _api_types from sagemaker.experiments.trial_component import _TrialComponent +from sagemaker.utils import format_tags class _Trial(_base_types.Record): @@ -101,7 +102,7 @@ def create( trial_name: (str): Name of the Trial. display_name (str): Name of the trial that will appear in UI, such as SageMaker Studio (default: None). - tags (List[dict]): A list of tags to associate with the trial (default: None). + tags (Optional[Tags]): A list of tags to associate with the trial (default: None). sagemaker_session (sagemaker.session.Session): Session object which manages interactions with Amazon SageMaker APIs and any other AWS services needed. 
If not specified, one is created using the @@ -115,7 +116,7 @@ def create( trial_name=trial_name, experiment_name=experiment_name, display_name=display_name, - tags=tags, + tags=format_tags(tags), sagemaker_session=sagemaker_session, ) return trial @@ -259,7 +260,7 @@ def _load_or_create( display_name (str): Name of the trial that will appear in UI, such as SageMaker Studio (default: None). This is used only when the given `trial_name` does not exist and a new trial has to be created. - tags (List[dict]): A list of tags to associate with the trial (default: None). + tags (Optional[Tags]): A list of tags to associate with the trial (default: None). This is used only when the given `trial_name` does not exist and a new trial has to be created. sagemaker_session (sagemaker.session.Session): Session object which @@ -275,7 +276,7 @@ def _load_or_create( experiment_name=experiment_name, trial_name=trial_name, display_name=display_name, - tags=tags, + tags=format_tags(tags), sagemaker_session=sagemaker_session, ) except ClientError as ce: diff --git a/src/sagemaker/experiments/trial_component.py b/src/sagemaker/experiments/trial_component.py index 061948a9d2..bdd5cd0634 100644 --- a/src/sagemaker/experiments/trial_component.py +++ b/src/sagemaker/experiments/trial_component.py @@ -20,6 +20,7 @@ from sagemaker.apiutils import _base_types from sagemaker.experiments import _api_types from sagemaker.experiments._api_types import TrialComponentSearchResult +from sagemaker.utils import format_tags class _TrialComponent(_base_types.Record): @@ -191,7 +192,7 @@ def create(cls, trial_component_name, display_name=None, tags=None, sagemaker_se Args: trial_component_name (str): The name of the trial component. display_name (str): Display name of the trial component used by Studio (default: None). - tags (List[Dict[str, str]]): Tags to add to the trial component (default: None). + tags (Optional[Tags]): Tags to add to the trial component (default: None). sagemaker_session (sagemaker.session.Session): Session object which manages interactions with Amazon SageMaker APIs and any other AWS services needed. If not specified, one is created using the @@ -204,7 +205,7 @@ def create(cls, trial_component_name, display_name=None, tags=None, sagemaker_se cls._boto_create_method, trial_component_name=trial_component_name, display_name=display_name, - tags=tags, + tags=format_tags(tags), sagemaker_session=sagemaker_session, ) @@ -316,7 +317,7 @@ def _load_or_create( display_name (str): Display name of the trial component used by Studio (default: None). This is used only when the given `trial_component_name` does not exist and a new trial component has to be created. - tags (List[Dict[str, str]]): Tags to add to the trial component (default: None). + tags (Optional[Tags]): Tags to add to the trial component (default: None). This is used only when the given `trial_component_name` does not exist and a new trial component has to be created. 
sagemaker_session (sagemaker.session.Session): Session object which @@ -333,7 +334,7 @@ def _load_or_create( run_tc = _TrialComponent.create( trial_component_name=trial_component_name, display_name=display_name, - tags=tags, + tags=format_tags(tags), sagemaker_session=sagemaker_session, ) except ClientError as ce: diff --git a/src/sagemaker/feature_store/feature_group.py b/src/sagemaker/feature_store/feature_group.py index 977fc302e0..0e503e192d 100644 --- a/src/sagemaker/feature_store/feature_group.py +++ b/src/sagemaker/feature_store/feature_group.py @@ -28,7 +28,7 @@ import tempfile from concurrent.futures import as_completed from concurrent.futures import ThreadPoolExecutor -from typing import Sequence, List, Dict, Any, Union +from typing import Optional, Sequence, List, Dict, Any, Union from urllib.parse import urlparse from multiprocessing.pool import AsyncResult @@ -65,7 +65,7 @@ OnlineStoreConfigUpdate, OnlineStoreStorageTypeEnum, ) -from sagemaker.utils import resolve_value_from_config +from sagemaker.utils import resolve_value_from_config, format_tags, Tags logger = logging.getLogger(__name__) @@ -538,7 +538,7 @@ def create( disable_glue_table_creation: bool = False, data_catalog_config: DataCatalogConfig = None, description: str = None, - tags: List[Dict[str, str]] = None, + tags: Optional[Tags] = None, table_format: TableFormatEnum = None, online_store_storage_type: OnlineStoreStorageTypeEnum = None, ) -> Dict[str, Any]: @@ -566,7 +566,7 @@ def create( data_catalog_config (DataCatalogConfig): configuration for Metadata store (default: None). description (str): description of the FeatureGroup (default: None). - tags (List[Dict[str, str]]): list of tags for labeling a FeatureGroup (default: None). + tags (Optional[Tags]): Tags for labeling a FeatureGroup (default: None). table_format (TableFormatEnum): format of the offline store table (default: None). online_store_storage_type (OnlineStoreStorageTypeEnum): storage type for the online store (default: None). @@ -602,7 +602,7 @@ def create( ], role_arn=role_arn, description=description, - tags=tags, + tags=format_tags(tags), ) # online store configuration diff --git a/src/sagemaker/feature_store/feature_processor/_event_bridge_rule_helper.py b/src/sagemaker/feature_store/feature_processor/_event_bridge_rule_helper.py index 8f47a2e712..d47a37f5cb 100644 --- a/src/sagemaker/feature_store/feature_processor/_event_bridge_rule_helper.py +++ b/src/sagemaker/feature_store/feature_processor/_event_bridge_rule_helper.py @@ -32,6 +32,7 @@ from sagemaker.feature_store.feature_processor._enums import ( FeatureProcessorPipelineExecutionStatus, ) +from sagemaker.utils import TagsDict logger = logging.getLogger("sagemaker") @@ -175,7 +176,7 @@ def disable_rule(self, rule_name: str) -> None: self.event_bridge_rule_client.disable_rule(Name=rule_name) logger.info("Disabled EventBridge Rule for pipeline %s.", rule_name) - def add_tags(self, rule_arn: str, tags: List[Dict[str, str]]) -> None: + def add_tags(self, rule_arn: str, tags: List[TagsDict]) -> None: """Adds tags to the EventBridge Rule. 
Args: diff --git a/src/sagemaker/huggingface/model.py b/src/sagemaker/huggingface/model.py index da294c89e2..efe6a85288 100644 --- a/src/sagemaker/huggingface/model.py +++ b/src/sagemaker/huggingface/model.py @@ -29,7 +29,7 @@ from sagemaker.predictor import Predictor from sagemaker.serializers import JSONSerializer from sagemaker.session import Session -from sagemaker.utils import to_string +from sagemaker.utils import to_string, format_tags from sagemaker.workflow import is_pipeline_variable from sagemaker.workflow.entities import PipelineVariable @@ -255,7 +255,7 @@ def deploy( https://docs.aws.amazon.com/sagemaker/latest/dg/ei.html endpoint_name (str): The name of the endpoint to create (default: None). If not specified, a unique endpoint name will be created. - tags (List[dict[str, str]]): The list of tags to attach to this + tags (Optional[Tags]): The list of tags to attach to this specific endpoint. kms_key (str): The ARN of the KMS key that is used to encrypt the data on the storage volume attached to the instance hosting the @@ -319,7 +319,7 @@ def deploy( deserializer, accelerator_type, endpoint_name, - tags, + format_tags(tags), kms_key, wait, data_capture_config, diff --git a/src/sagemaker/huggingface/processing.py b/src/sagemaker/huggingface/processing.py index 332148891f..b8721928f0 100644 --- a/src/sagemaker/huggingface/processing.py +++ b/src/sagemaker/huggingface/processing.py @@ -25,6 +25,7 @@ from sagemaker.huggingface.estimator import HuggingFace from sagemaker.workflow.entities import PipelineVariable +from sagemaker.utils import format_tags, Tags class HuggingFaceProcessor(FrameworkProcessor): @@ -51,7 +52,7 @@ def __init__( base_job_name: Optional[str] = None, sagemaker_session: Optional[Session] = None, env: Optional[Dict[str, Union[str, PipelineVariable]]] = None, - tags: Optional[List[Dict[str, Union[str, PipelineVariable]]]] = None, + tags: Optional[Tags] = None, network_config: Optional[NetworkConfig] = None, ): """This processor executes a Python script in a HuggingFace execution environment. @@ -101,7 +102,7 @@ def __init__( base_job_name, sagemaker_session, env, - tags, + format_tags(tags), network_config, ) diff --git a/src/sagemaker/jumpstart/estimator.py b/src/sagemaker/jumpstart/estimator.py index e6047e9009..36a188ed55 100644 --- a/src/sagemaker/jumpstart/estimator.py +++ b/src/sagemaker/jumpstart/estimator.py @@ -37,7 +37,7 @@ is_valid_model_id, resolve_model_sagemaker_config_field, ) -from sagemaker.utils import stringify_object +from sagemaker.utils import stringify_object, format_tags, Tags from sagemaker.model_monitor.data_capture_config import DataCaptureConfig from sagemaker.predictor import PredictorBase @@ -73,7 +73,7 @@ def __init__( base_job_name: Optional[str] = None, sagemaker_session: Optional[session.Session] = None, hyperparameters: Optional[Dict[str, Union[str, PipelineVariable]]] = None, - tags: Optional[List[Dict[str, Union[str, PipelineVariable]]]] = None, + tags: Optional[Tags] = None, subnets: Optional[List[Union[str, PipelineVariable]]] = None, security_group_ids: Optional[List[Union[str, PipelineVariable]]] = None, model_uri: Optional[str] = None, @@ -225,8 +225,8 @@ def __init__( validation error for detected credentials, if such user input is found. (Default: None). - tags (Optional[Union[list[dict[str, str], list[dict[str, PipelineVariable]]]]): - List of tags for labeling a training job. For more, see + tags (Optional[Tags]): + Tags for labeling a training job. 
For more, see https://docs.aws.amazon.com/sagemaker/latest/dg/API_Tag.html. (Default: None). subnets (Optional[Union[list[str], list[PipelineVariable]]]): List of subnet ids. @@ -535,7 +535,7 @@ def _is_valid_model_id_hook(): output_kms_key=output_kms_key, base_job_name=base_job_name, sagemaker_session=sagemaker_session, - tags=tags, + tags=format_tags(tags), subnets=subnets, security_group_ids=security_group_ids, model_uri=model_uri, @@ -728,7 +728,7 @@ def deploy( deserializer: Optional[BaseDeserializer] = None, accelerator_type: Optional[str] = None, endpoint_name: Optional[str] = None, - tags: List[Dict[str, str]] = None, + tags: Optional[Tags] = None, kms_key: Optional[str] = None, wait: Optional[bool] = True, data_capture_config: Optional[DataCaptureConfig] = None, @@ -794,7 +794,7 @@ def deploy( endpoint_name (Optional[str]): The name of the endpoint to create (default: None). If not specified, a unique endpoint name will be created. (Default: None). - tags (Optional[List[dict[str, str]]]): The list of tags to attach to this + tags (Optional[Tags]): Tags to attach to this specific endpoint. (Default: None). kms_key (Optional[str]): The ARN of the KMS key that is used to encrypt the data on the storage volume attached to the instance hosting the @@ -1014,7 +1014,7 @@ def deploy( deserializer=deserializer, accelerator_type=accelerator_type, endpoint_name=endpoint_name, - tags=tags, + tags=format_tags(tags), kms_key=kms_key, wait=wait, data_capture_config=data_capture_config, diff --git a/src/sagemaker/jumpstart/factory/estimator.py b/src/sagemaker/jumpstart/factory/estimator.py index 7479c23832..7ccf57983b 100644 --- a/src/sagemaker/jumpstart/factory/estimator.py +++ b/src/sagemaker/jumpstart/factory/estimator.py @@ -70,7 +70,7 @@ from sagemaker.model_monitor.data_capture_config import DataCaptureConfig from sagemaker.serverless.serverless_inference_config import ServerlessInferenceConfig -from sagemaker.utils import name_from_base +from sagemaker.utils import name_from_base, format_tags, Tags from sagemaker.workflow.entities import PipelineVariable @@ -94,7 +94,7 @@ def get_init_kwargs( base_job_name: Optional[str] = None, sagemaker_session: Optional[Session] = None, hyperparameters: Optional[Dict[str, Union[str, PipelineVariable]]] = None, - tags: Optional[List[Dict[str, Union[str, PipelineVariable]]]] = None, + tags: Optional[Tags] = None, subnets: Optional[List[Union[str, PipelineVariable]]] = None, security_group_ids: Optional[List[Union[str, PipelineVariable]]] = None, model_uri: Optional[str] = None, @@ -149,7 +149,7 @@ def get_init_kwargs( output_kms_key=output_kms_key, base_job_name=base_job_name, sagemaker_session=sagemaker_session, - tags=tags, + tags=format_tags(tags), subnets=subnets, security_group_ids=security_group_ids, model_uri=model_uri, @@ -253,7 +253,7 @@ def get_deploy_kwargs( deserializer: Optional[BaseDeserializer] = None, accelerator_type: Optional[str] = None, endpoint_name: Optional[str] = None, - tags: List[Dict[str, str]] = None, + tags: Optional[Tags] = None, kms_key: Optional[str] = None, wait: Optional[bool] = None, data_capture_config: Optional[DataCaptureConfig] = None, @@ -297,7 +297,7 @@ def get_deploy_kwargs( deserializer=deserializer, accelerator_type=accelerator_type, endpoint_name=endpoint_name, - tags=tags, + tags=format_tags(tags), kms_key=kms_key, wait=wait, data_capture_config=data_capture_config, diff --git a/src/sagemaker/jumpstart/factory/model.py b/src/sagemaker/jumpstart/factory/model.py index 185beefc59..64e4727116 100644 --- 
a/src/sagemaker/jumpstart/factory/model.py +++ b/src/sagemaker/jumpstart/factory/model.py @@ -56,7 +56,7 @@ from sagemaker.serverless.serverless_inference_config import ServerlessInferenceConfig from sagemaker.session import Session -from sagemaker.utils import name_from_base +from sagemaker.utils import name_from_base, format_tags, Tags from sagemaker.workflow.entities import PipelineVariable from sagemaker.compute_resource_requirements.resource_requirements import ResourceRequirements from sagemaker import resource_requirements @@ -496,7 +496,7 @@ def get_deploy_kwargs( deserializer: Optional[BaseDeserializer] = None, accelerator_type: Optional[str] = None, endpoint_name: Optional[str] = None, - tags: List[Dict[str, str]] = None, + tags: Optional[Tags] = None, kms_key: Optional[str] = None, wait: Optional[bool] = None, data_capture_config: Optional[DataCaptureConfig] = None, @@ -528,7 +528,7 @@ def get_deploy_kwargs( deserializer=deserializer, accelerator_type=accelerator_type, endpoint_name=endpoint_name, - tags=tags, + tags=format_tags(tags), kms_key=kms_key, wait=wait, data_capture_config=data_capture_config, diff --git a/src/sagemaker/jumpstart/model.py b/src/sagemaker/jumpstart/model.py index e921add6d7..1742f860e4 100644 --- a/src/sagemaker/jumpstart/model.py +++ b/src/sagemaker/jumpstart/model.py @@ -31,7 +31,7 @@ ) from sagemaker.jumpstart.types import JumpStartSerializablePayload from sagemaker.jumpstart.utils import is_valid_model_id -from sagemaker.utils import stringify_object +from sagemaker.utils import stringify_object, format_tags, Tags from sagemaker.model import ( Model, ModelPackage, @@ -388,7 +388,7 @@ def _create_sagemaker_model( attach to an endpoint for model loading and inference, for example, 'ml.eia1.medium'. If not specified, no Elastic Inference accelerator will be attached to the endpoint. (Default: None). - tags (List[dict[str, str]]): Optional. The list of tags to add to + tags (Optional[Tags]): Optional. The list of tags to add to the model. Example: >>> tags = [{'Key': 'tagname', 'Value': 'tagvalue'}] For more information about tags, see https://boto3.amazonaws.com/v1/documentation @@ -402,6 +402,8 @@ def _create_sagemaker_model( any so they are ignored. """ + tags = format_tags(tags) + # if the user inputs a model artifact uri, do not use model package arn to create # inference endpoint. if self.model_package_arn and not self._model_data_is_set: @@ -446,7 +448,7 @@ def deploy( deserializer: Optional[BaseDeserializer] = None, accelerator_type: Optional[str] = None, endpoint_name: Optional[str] = None, - tags: List[Dict[str, str]] = None, + tags: Optional[Tags] = None, kms_key: Optional[str] = None, wait: Optional[bool] = True, data_capture_config: Optional[DataCaptureConfig] = None, @@ -502,7 +504,7 @@ def deploy( endpoint_name (Optional[str]): The name of the endpoint to create (default: None). If not specified, a unique endpoint name will be created. (Default: None). - tags (Optional[List[dict[str, str]]]): The list of tags to attach to this + tags (Optional[Tags]): Tags to attach to this specific endpoint. (Default: None). 
kms_key (Optional[str]): The ARN of the KMS key that is used to encrypt the data on the storage volume attached to the instance hosting the @@ -570,7 +572,7 @@ def deploy( deserializer=deserializer, accelerator_type=accelerator_type, endpoint_name=endpoint_name, - tags=tags, + tags=format_tags(tags), kms_key=kms_key, wait=wait, data_capture_config=data_capture_config, diff --git a/src/sagemaker/jumpstart/types.py b/src/sagemaker/jumpstart/types.py index 7c06282894..21b624d7a4 100644 --- a/src/sagemaker/jumpstart/types.py +++ b/src/sagemaker/jumpstart/types.py @@ -15,7 +15,7 @@ from copy import deepcopy from enum import Enum from typing import Any, Dict, List, Optional, Set, Union -from sagemaker.utils import get_instance_type_family +from sagemaker.utils import get_instance_type_family, format_tags, Tags from sagemaker.model_metrics import ModelMetrics from sagemaker.metadata_properties import MetadataProperties from sagemaker.drift_check_baselines import DriftCheckBaselines @@ -1172,7 +1172,7 @@ def __init__( deserializer: Optional[Any] = None, accelerator_type: Optional[str] = None, endpoint_name: Optional[str] = None, - tags: List[Dict[str, str]] = None, + tags: Optional[Tags] = None, kms_key: Optional[str] = None, wait: Optional[bool] = None, data_capture_config: Optional[Any] = None, @@ -1203,7 +1203,7 @@ def __init__( self.deserializer = deserializer self.accelerator_type = accelerator_type self.endpoint_name = endpoint_name - self.tags = deepcopy(tags) + self.tags = format_tags(tags) self.kms_key = kms_key self.wait = wait self.data_capture_config = data_capture_config @@ -1310,7 +1310,7 @@ def __init__( base_job_name: Optional[str] = None, sagemaker_session: Optional[Any] = None, hyperparameters: Optional[Dict[str, Union[str, Any]]] = None, - tags: Optional[List[Dict[str, Union[str, Any]]]] = None, + tags: Optional[Tags] = None, subnets: Optional[List[Union[str, Any]]] = None, security_group_ids: Optional[List[Union[str, Any]]] = None, model_uri: Optional[str] = None, @@ -1370,7 +1370,7 @@ def __init__( self.output_kms_key = output_kms_key self.base_job_name = base_job_name self.sagemaker_session = sagemaker_session - self.tags = deepcopy(tags) + self.tags = format_tags(tags) self.subnets = subnets self.security_group_ids = security_group_ids self.model_channel_name = model_channel_name @@ -1526,7 +1526,7 @@ def __init__( deserializer: Optional[Any] = None, accelerator_type: Optional[str] = None, endpoint_name: Optional[str] = None, - tags: List[Dict[str, str]] = None, + tags: Optional[Tags] = None, kms_key: Optional[str] = None, wait: Optional[bool] = None, data_capture_config: Optional[Any] = None, @@ -1573,7 +1573,7 @@ def __init__( self.deserializer = deserializer self.accelerator_type = accelerator_type self.endpoint_name = endpoint_name - self.tags = deepcopy(tags) + self.tags = format_tags(tags) self.kms_key = kms_key self.wait = wait self.data_capture_config = data_capture_config diff --git a/src/sagemaker/jumpstart/utils.py b/src/sagemaker/jumpstart/utils.py index 0003081e99..7d84caa0d4 100644 --- a/src/sagemaker/jumpstart/utils.py +++ b/src/sagemaker/jumpstart/utils.py @@ -41,7 +41,7 @@ ) from sagemaker.session import Session from sagemaker.config import load_sagemaker_config -from sagemaker.utils import resolve_value_from_config +from sagemaker.utils import resolve_value_from_config, TagsDict from sagemaker.workflow import is_pipeline_variable @@ -345,10 +345,10 @@ def get_jumpstart_base_name_if_jumpstart_model( def add_jumpstart_model_id_version_tags( - tags: 
Optional[List[Dict[str, str]]], + tags: Optional[List[TagsDict]], model_id: str, model_version: str, -) -> List[Dict[str, str]]: +) -> List[TagsDict]: """Add custom model ID and version tags to JumpStart related resources.""" if model_id is None or model_version is None: return tags @@ -368,12 +368,12 @@ def add_jumpstart_model_id_version_tags( def add_jumpstart_uri_tags( - tags: Optional[List[Dict[str, str]]] = None, + tags: Optional[List[TagsDict]] = None, inference_model_uri: Optional[Union[str, dict]] = None, inference_script_uri: Optional[str] = None, training_model_uri: Optional[str] = None, training_script_uri: Optional[str] = None, -) -> Optional[List[Dict[str, str]]]: +) -> Optional[List[TagsDict]]: """Add custom uri tags to JumpStart models, return the updated tags. No-op if this is not a JumpStart model related resource. diff --git a/src/sagemaker/lineage/action.py b/src/sagemaker/lineage/action.py index 9046a3ccf2..57b7fca5bc 100644 --- a/src/sagemaker/lineage/action.py +++ b/src/sagemaker/lineage/action.py @@ -21,6 +21,7 @@ from sagemaker.lineage import _api_types, _utils from sagemaker.lineage._api_types import ActionSource, ActionSummary from sagemaker.lineage.artifact import Artifact +from sagemaker.utils import format_tags from sagemaker.lineage.query import ( LineageQuery, @@ -159,12 +160,12 @@ def set_tags(self, tags=None): """Add tags to the object. Args: - tags ([{key:value}]): list of key value pairs. + tags (Optional[Tags]): list of key value pairs. Returns: list({str:str}): a list of key value pairs """ - return self._set_tags(resource_arn=self.action_arn, tags=tags) + return self._set_tags(resource_arn=self.action_arn, tags=format_tags(tags)) @classmethod def create( diff --git a/src/sagemaker/lineage/artifact.py b/src/sagemaker/lineage/artifact.py index 718344095a..e693313dbc 100644 --- a/src/sagemaker/lineage/artifact.py +++ b/src/sagemaker/lineage/artifact.py @@ -31,7 +31,7 @@ ) from sagemaker.lineage._utils import _disassociate, get_resource_name_from_arn from sagemaker.lineage.association import Association -from sagemaker.utils import get_module +from sagemaker.utils import get_module, format_tags LOGGER = logging.getLogger("sagemaker") @@ -288,12 +288,12 @@ def set_tags(self, tags=None): """Add tags to the object. Args: - tags ([{key:value}]): list of key value pairs. + tags (Optional[Tags]): list of key value pairs. Returns: list({str:str}): a list of key value pairs """ - return self._set_tags(resource_arn=self.artifact_arn, tags=tags) + return self._set_tags(resource_arn=self.artifact_arn, tags=format_tags(tags)) @classmethod def create( diff --git a/src/sagemaker/lineage/association.py b/src/sagemaker/lineage/association.py index fef79e2f8f..6ad08eb928 100644 --- a/src/sagemaker/lineage/association.py +++ b/src/sagemaker/lineage/association.py @@ -20,6 +20,7 @@ from sagemaker.apiutils import _base_types from sagemaker.lineage import _api_types from sagemaker.lineage._api_types import AssociationSummary +from sagemaker.utils import format_tags logger = logging.getLogger(__name__) @@ -95,7 +96,7 @@ def set_tags(self, tags=None): "set_tags on Association is deprecated. Use set_tags on the source or destination\ entity instead." 
) - return self._set_tags(resource_arn=self.source_arn, tags=tags) + return self._set_tags(resource_arn=self.source_arn, tags=format_tags(tags)) @classmethod def create( diff --git a/src/sagemaker/lineage/context.py b/src/sagemaker/lineage/context.py index aef919e876..46d7693ecf 100644 --- a/src/sagemaker/lineage/context.py +++ b/src/sagemaker/lineage/context.py @@ -33,6 +33,7 @@ from sagemaker.lineage.artifact import Artifact from sagemaker.lineage.action import Action from sagemaker.lineage.lineage_trial_component import LineageTrialComponent +from sagemaker.utils import format_tags class Context(_base_types.Record): @@ -126,7 +127,7 @@ def set_tags(self, tags=None): Returns: list({str:str}): a list of key value pairs """ - return self._set_tags(resource_arn=self.context_arn, tags=tags) + return self._set_tags(resource_arn=self.context_arn, tags=format_tags(tags)) @classmethod def load(cls, context_name: str, sagemaker_session=None) -> "Context": diff --git a/src/sagemaker/local/entities.py b/src/sagemaker/local/entities.py index 3eb4ab2b34..8431d8154a 100644 --- a/src/sagemaker/local/entities.py +++ b/src/sagemaker/local/entities.py @@ -28,7 +28,7 @@ from sagemaker.local.image import _SageMakerContainer from sagemaker.local.utils import copy_directory_structure, move_to_destination, get_docker_host -from sagemaker.utils import DeferredError, get_config_value +from sagemaker.utils import DeferredError, get_config_value, format_tags from sagemaker.local.exceptions import StepExecutionException logger = logging.getLogger(__name__) @@ -552,7 +552,7 @@ class _LocalEndpointConfig(object): def __init__(self, config_name, production_variants, tags=None): self.name = config_name self.production_variants = production_variants - self.tags = tags + self.tags = format_tags(tags) self.creation_time = datetime.datetime.now() def describe(self): @@ -584,7 +584,7 @@ def __init__(self, endpoint_name, endpoint_config_name, tags=None, local_session self.name = endpoint_name self.endpoint_config = local_client.describe_endpoint_config(endpoint_config_name) self.production_variant = self.endpoint_config["ProductionVariants"][0] - self.tags = tags + self.tags = format_tags(tags) model_name = self.production_variant["ModelName"] self.primary_container = local_client.describe_model(model_name)["PrimaryContainer"] diff --git a/src/sagemaker/local/local_session.py b/src/sagemaker/local/local_session.py index f09d64b9be..7d48850077 100644 --- a/src/sagemaker/local/local_session.py +++ b/src/sagemaker/local/local_session.py @@ -42,7 +42,12 @@ _LocalPipeline, ) from sagemaker.session import Session -from sagemaker.utils import get_config_value, _module_import_error, resolve_value_from_config +from sagemaker.utils import ( + get_config_value, + _module_import_error, + resolve_value_from_config, + format_tags, +) logger = logging.getLogger(__name__) @@ -336,7 +341,7 @@ def create_endpoint_config(self, EndpointConfigName, ProductionVariants, Tags=No """ LocalSagemakerClient._endpoint_configs[EndpointConfigName] = _LocalEndpointConfig( - EndpointConfigName, ProductionVariants, Tags + EndpointConfigName, ProductionVariants, format_tags(Tags) ) def describe_endpoint(self, EndpointName): @@ -366,7 +371,12 @@ def create_endpoint(self, EndpointName, EndpointConfigName, Tags=None): Returns: """ - endpoint = _LocalEndpoint(EndpointName, EndpointConfigName, Tags, self.sagemaker_session) + endpoint = _LocalEndpoint( + EndpointName, + EndpointConfigName, + format_tags(Tags), + self.sagemaker_session, + ) 
LocalSagemakerClient._endpoints[EndpointName] = endpoint endpoint.serve() diff --git a/src/sagemaker/model.py b/src/sagemaker/model.py index 56f68372ae..ff340b58e9 100644 --- a/src/sagemaker/model.py +++ b/src/sagemaker/model.py @@ -64,6 +64,8 @@ to_string, resolve_value_from_config, resolve_nested_dict_value_from_config, + format_tags, + Tags, ) from sagemaker.async_inference import AsyncInferenceConfig from sagemaker.predictor_async import AsyncPredictor @@ -554,7 +556,7 @@ def create( instance_type: Optional[str] = None, accelerator_type: Optional[str] = None, serverless_inference_config: Optional[ServerlessInferenceConfig] = None, - tags: Optional[List[Dict[str, Union[str, PipelineVariable]]]] = None, + tags: Optional[Tags] = None, accept_eula: Optional[bool] = None, ): """Create a SageMaker Model Entity @@ -571,10 +573,11 @@ def create( Specifies configuration related to serverless endpoint. Instance type is not provided in serverless inference. So this is used to find image URIs (default: None). - tags (list[dict[str, str] or list[dict[str, PipelineVariable]]): The list of - tags to add to the model (default: None). Example:: + tags (Optional[Tags]): Tags to add to the model (default: None). Example:: tags = [{'Key': 'tagname', 'Value':'tagvalue'}] + # Or + tags = {'tagname', 'tagvalue'} For more information about tags, see `boto3 documentation >> tags = [{'Key': 'tagname', 'Value': 'tagvalue'}] For more information about tags, see https://boto3.amazonaws.com/v1/documentation @@ -843,7 +846,7 @@ def _create_sagemaker_model( model_package._create_sagemaker_model( instance_type=instance_type, accelerator_type=accelerator_type, - tags=tags, + tags=format_tags(tags), serverless_inference_config=serverless_inference_config, ) if self._base_name is None and model_package._base_name is not None: @@ -898,7 +901,7 @@ def _create_sagemaker_model( container_defs=container_def, vpc_config=self.vpc_config, enable_network_isolation=self._enable_network_isolation, - tags=tags, + tags=format_tags(tags), ) self.sagemaker_session.create_model(**create_model_args) @@ -956,7 +959,7 @@ def _edge_packaging_job_config( compilation_job_name (str): what compilation job to source the model from resource_key (str): the kms key to encrypt the disk with s3_kms_key (str): the kms key to encrypt the output with - tags (list[dict]): List of tags for labeling an edge packaging job. For + tags (Optional[Tags]): Tags for labeling an edge packaging job. For more, see https://docs.aws.amazon.com/sagemaker/latest/dg/API_Tag.html. Returns: @@ -971,7 +974,7 @@ def _edge_packaging_job_config( return { "output_model_config": output_model_config, "role": role, - "tags": tags, + "tags": format_tags(tags), "model_name": model_name, "model_version": model_version, "job_name": packaging_job_name, @@ -1063,7 +1066,7 @@ def multi_version_compilation_supported( "output_model_config": output_model_config, "role": role, "stop_condition": {"MaxRuntimeInSeconds": compile_max_run}, - "tags": tags, + "tags": format_tags(tags), "job_name": job_name, } @@ -1091,7 +1094,7 @@ def package_for_edge( job_name (str): The name of the edge packaging job resource_key (str): the kms key to encrypt the disk with s3_kms_key (str): the kms key to encrypt the output with - tags (list[dict]): List of tags for labeling an edge packaging job. For + tags (Optional[Tags]): Tags for labeling an edge packaging job. For more, see https://docs.aws.amazon.com/sagemaker/latest/dg/API_Tag.html. 
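For the ``Model`` entry points above, the user-visible effect is that a plain dict becomes a valid way to tag the created resources alongside the existing Key/Value list. A hypothetical call sketch under this patch; the role ARN, image URI, and S3 path are placeholders rather than values taken from the patch::

    from sagemaker.model import Model

    model = Model(
        image_uri="111122223333.dkr.ecr.us-west-2.amazonaws.com/my-image:latest",  # placeholder
        model_data="s3://my-bucket/model.tar.gz",                                  # placeholder
        role="arn:aws:iam::111122223333:role/SageMakerRole",                       # placeholder
    )

    # Either spelling should be accepted once create()/deploy() route tags
    # through format_tags(); both end up as Key/Value pairs in the request.
    model.deploy(
        initial_instance_count=1,
        instance_type="ml.m5.xlarge",
        tags={"project": "demo", "owner": "data-science"},
        # tags=[{"Key": "project", "Value": "demo"}, {"Key": "owner", "Value": "data-science"}],
    )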
@@ -1126,7 +1129,7 @@ def package_for_edge( self._compilation_job_name, resource_key, s3_kms_key, - tags, + format_tags(tags), ) self.sagemaker_session.package_model_for_edge(**config) job_status = self.sagemaker_session.wait_for_edge_packaging_job(job_name) @@ -1169,7 +1172,7 @@ def compile( https://docs.aws.amazon.com/sagemaker/latest/dg/neo-compilation-preparing-model.html output_path (str): Specifies where to store the compiled model role (str): Execution role - tags (list[dict]): List of tags for labeling a compilation job. For + tags (Optional[Tags]): Tags for labeling a compilation job. For more, see https://docs.aws.amazon.com/sagemaker/latest/dg/API_Tag.html. job_name (str): The name of the compilation job @@ -1242,7 +1245,7 @@ def compile( compile_max_run, job_name, framework, - tags, + format_tags(tags), target_platform_os, target_platform_arch, target_platform_accelerator, @@ -1342,7 +1345,7 @@ def deploy( https://docs.aws.amazon.com/sagemaker/latest/dg/ei.html endpoint_name (str): The name of the endpoint to create (default: None). If not specified, a unique endpoint name will be created. - tags (List[dict[str, str]]): The list of tags to attach to this + tags (Optional[Tags]): Tags to attach to this specific endpoint. kms_key (str): The ARN of the KMS key that is used to encrypt the data on the storage volume attached to the instance hosting the @@ -1430,6 +1433,8 @@ def deploy( sagemaker_session=self.sagemaker_session, ) + tags = format_tags(tags) + if ( getattr(self.sagemaker_session, "settings", None) is not None and self.sagemaker_session.settings.include_jumpstart_tags @@ -1733,7 +1738,7 @@ def transformer( to be made to each individual transform container at one time. max_payload (int): Maximum size of the payload in a single HTTP request to the container in MB. - tags (list[dict]): List of tags for labeling a transform job. If + tags (Optional[Tags]): Tags for labeling a transform job. If none specified, then the tags used for the training job are used for the transform job. volume_kms_key (str): Optional. KMS key ID for encrypting the volume @@ -1741,6 +1746,8 @@ def transformer( """ self._init_sagemaker_session_if_does_not_exist(instance_type) + tags = format_tags(tags) + self._create_sagemaker_model(instance_type, tags=tags) if self.enable_network_isolation(): env = None @@ -2165,7 +2172,7 @@ def _create_sagemaker_model(self, *args, **kwargs): # pylint: disable=unused-ar container_def, vpc_config=self.vpc_config, enable_network_isolation=self.enable_network_isolation(), - tags=kwargs.get("tags"), + tags=format_tags(kwargs.get("tags")), ) def _ensure_base_name_if_needed(self, base_name): diff --git a/src/sagemaker/model_monitor/clarify_model_monitoring.py b/src/sagemaker/model_monitor/clarify_model_monitoring.py index bc572827cd..77f27b37f0 100644 --- a/src/sagemaker/model_monitor/clarify_model_monitoring.py +++ b/src/sagemaker/model_monitor/clarify_model_monitoring.py @@ -25,7 +25,7 @@ from sagemaker.model_monitor import model_monitoring as mm from sagemaker import image_uris, s3 from sagemaker.session import Session -from sagemaker.utils import name_from_base +from sagemaker.utils import name_from_base, format_tags from sagemaker.clarify import SageMakerClarifyProcessor, ModelPredictedLabelConfig from sagemaker.lineage._utils import get_resource_name_from_arn @@ -81,7 +81,7 @@ def __init__( AWS services needed. If not specified, one is created using the default AWS configuration chain. env (dict): Environment variables to be passed to the job. 
- tags ([dict]): List of tags to be passed to the job. + tags (Optional[Tags]): List of tags to be passed to the job. network_config (sagemaker.network.NetworkConfig): A NetworkConfig object that configures network isolation, encryption of inter-container traffic, security group IDs, and subnets. @@ -108,7 +108,7 @@ def __init__( base_job_name=base_job_name, sagemaker_session=session, env=env, - tags=tags, + tags=format_tags(tags), network_config=network_config, ) self.latest_baselining_job_config = None @@ -296,7 +296,7 @@ def _build_create_job_definition_request( time, Amazon SageMaker terminates the job regardless of its current status. Default: 3600 env (dict): Environment variables to be passed to the job. - tags ([dict]): List of tags to be passed to the job. + tags (Optional[Tags]): List of tags to be passed to the job. network_config (sagemaker.network.NetworkConfig): A NetworkConfig object that configures network isolation, encryption of inter-container traffic, security group IDs, and subnets. @@ -458,7 +458,7 @@ def _build_create_job_definition_request( request_dict["StoppingCondition"] = stop_condition if tags is not None: - request_dict["Tags"] = tags + request_dict["Tags"] = format_tags(tags) return request_dict diff --git a/src/sagemaker/model_monitor/model_monitoring.py b/src/sagemaker/model_monitor/model_monitoring.py index b949c6538b..2800082df4 100644 --- a/src/sagemaker/model_monitor/model_monitoring.py +++ b/src/sagemaker/model_monitor/model_monitoring.py @@ -62,6 +62,7 @@ retries, resolve_value_from_config, resolve_class_attribute_from_config, + format_tags, ) from sagemaker.lineage._utils import get_resource_name_from_arn from sagemaker.model_monitor.cron_expression_generator import CronExpressionGenerator @@ -163,7 +164,7 @@ def __init__( AWS services needed. If not specified, one is created using the default AWS configuration chain. env (dict): Environment variables to be passed to the job. - tags ([dict]): List of tags to be passed to the job. + tags (Optional[Tags]): List of tags to be passed to the job. network_config (sagemaker.network.NetworkConfig): A NetworkConfig object that configures network isolation, encryption of inter-container traffic, security group IDs, and subnets. @@ -177,7 +178,7 @@ def __init__( self.max_runtime_in_seconds = max_runtime_in_seconds self.base_job_name = base_job_name self.sagemaker_session = sagemaker_session or Session() - self.tags = tags + self.tags = format_tags(tags) self.baselining_jobs = [] self.latest_baselining_job = None @@ -1738,7 +1739,7 @@ def __init__( AWS services needed. If not specified, one is created using the default AWS configuration chain. env (dict): Environment variables to be passed to the job. - tags ([dict]): List of tags to be passed to the job. + tags (Optional[Tags]): List of tags to be passed to the job. network_config (sagemaker.network.NetworkConfig): A NetworkConfig object that configures network isolation, encryption of inter-container traffic, security group IDs, and subnets. @@ -1757,7 +1758,7 @@ def __init__( base_job_name=base_job_name, sagemaker_session=sagemaker_session, env=env, - tags=tags, + tags=format_tags(tags), network_config=network_config, ) @@ -2685,7 +2686,7 @@ def _build_create_data_quality_job_definition_request( time, Amazon SageMaker terminates the job regardless of its current status. Default: 3600 env (dict): Environment variables to be passed to the job. - tags ([dict]): List of tags to be passed to the job. + tags (Optional[Tags]): List of tags to be passed to the job. 
network_config (sagemaker.network.NetworkConfig): A NetworkConfig object that configures network isolation, encryption of inter-container traffic, security group IDs, and subnets. @@ -2817,7 +2818,7 @@ def _build_create_data_quality_job_definition_request( request_dict["StoppingCondition"] = stop_condition if tags is not None: - request_dict["Tags"] = tags + request_dict["Tags"] = format_tags(tags) return request_dict @@ -2871,7 +2872,7 @@ def __init__( AWS services needed. If not specified, one is created using the default AWS configuration chain. env (dict): Environment variables to be passed to the job. - tags ([dict]): List of tags to be passed to the job. + tags (Optional[Tags]): List of tags to be passed to the job. network_config (sagemaker.network.NetworkConfig): A NetworkConfig object that configures network isolation, encryption of inter-container traffic, security group IDs, and subnets. @@ -2890,7 +2891,7 @@ def __init__( base_job_name=base_job_name, sagemaker_session=session, env=env, - tags=tags, + tags=format_tags(tags), network_config=network_config, ) @@ -3462,7 +3463,7 @@ def _build_create_model_quality_job_definition_request( time, Amazon SageMaker terminates the job regardless of its current status. Default: 3600 env (dict): Environment variables to be passed to the job. - tags ([dict]): List of tags to be passed to the job. + tags (Optional[Tags]): List of tags to be passed to the job. network_config (sagemaker.network.NetworkConfig): A NetworkConfig object that configures network isolation, encryption of inter-container traffic, security group IDs, and subnets. @@ -3594,7 +3595,7 @@ def _build_create_model_quality_job_definition_request( request_dict["StoppingCondition"] = stop_condition if tags is not None: - request_dict["Tags"] = tags + request_dict["Tags"] = format_tags(tags) return request_dict diff --git a/src/sagemaker/multidatamodel.py b/src/sagemaker/multidatamodel.py index b656b4c671..9c1e6ac4f4 100644 --- a/src/sagemaker/multidatamodel.py +++ b/src/sagemaker/multidatamodel.py @@ -23,7 +23,7 @@ from sagemaker.deprecations import removed_kwargs from sagemaker.model import Model from sagemaker.session import Session -from sagemaker.utils import pop_out_unused_kwarg +from sagemaker.utils import pop_out_unused_kwarg, format_tags from sagemaker.workflow.entities import PipelineVariable MULTI_MODEL_CONTAINER_MODE = "MultiModel" @@ -245,6 +245,8 @@ def deploy( if instance_type == "local" and not isinstance(self.sagemaker_session, local.LocalSession): self.sagemaker_session = local.LocalSession() + tags = format_tags(tags) + container_def = self.prepare_container_def(instance_type, accelerator_type=accelerator_type) self.sagemaker_session.create_model( self.name, diff --git a/src/sagemaker/mxnet/processing.py b/src/sagemaker/mxnet/processing.py index d85ab5b526..bb50de2014 100644 --- a/src/sagemaker/mxnet/processing.py +++ b/src/sagemaker/mxnet/processing.py @@ -24,6 +24,7 @@ from sagemaker.mxnet.estimator import MXNet from sagemaker.processing import FrameworkProcessor from sagemaker.workflow.entities import PipelineVariable +from sagemaker.utils import format_tags, Tags class MXNetProcessor(FrameworkProcessor): @@ -48,7 +49,7 @@ def __init__( base_job_name: Optional[str] = None, sagemaker_session: Optional[Session] = None, env: Optional[Dict[str, Union[str, PipelineVariable]]] = None, - tags: Optional[List[Dict[str, Union[str, PipelineVariable]]]] = None, + tags: Optional[Tags] = None, network_config: Optional[NetworkConfig] = None, ): """This processor executes a 
Python script in a managed MXNet execution environment. @@ -81,6 +82,6 @@ def __init__( base_job_name, sagemaker_session, env, - tags, + format_tags(tags), network_config, ) diff --git a/src/sagemaker/pipeline.py b/src/sagemaker/pipeline.py index b9405f568c..a4b7feac69 100644 --- a/src/sagemaker/pipeline.py +++ b/src/sagemaker/pipeline.py @@ -31,6 +31,7 @@ name_from_image, update_container_with_inference_params, resolve_value_from_config, + format_tags, ) from sagemaker.transformer import Transformer from sagemaker.workflow.entities import PipelineVariable @@ -263,6 +264,8 @@ def deploy( if data_capture_config is not None: data_capture_config_dict = data_capture_config._to_request_dict() + tags = format_tags(tags) + if update_endpoint: endpoint_config_name = self.sagemaker_session.create_endpoint_config( name=self.name, @@ -516,7 +519,7 @@ def transformer( max_concurrent_transforms=max_concurrent_transforms, max_payload=max_payload, env=env, - tags=tags, + tags=format_tags(tags), base_transform_job_name=self.name, volume_kms_key=volume_kms_key, sagemaker_session=self.sagemaker_session, diff --git a/src/sagemaker/predictor_async.py b/src/sagemaker/predictor_async.py index 1adfce4c7c..cdf9b141b3 100644 --- a/src/sagemaker/predictor_async.py +++ b/src/sagemaker/predictor_async.py @@ -22,7 +22,7 @@ from sagemaker.async_inference import WaiterConfig, AsyncInferenceResponse from sagemaker.s3 import parse_s3_url from sagemaker.session import Session -from sagemaker.utils import name_from_base, sagemaker_timestamp +from sagemaker.utils import name_from_base, sagemaker_timestamp, format_tags class AsyncPredictor: @@ -375,7 +375,7 @@ def update_endpoint( instance_type=instance_type, accelerator_type=accelerator_type, model_name=model_name, - tags=tags, + tags=format_tags(tags), kms_key=kms_key, data_capture_config_dict=data_capture_config_dict, wait=wait, diff --git a/src/sagemaker/processing.py b/src/sagemaker/processing.py index a020abc140..7b16e3cba3 100644 --- a/src/sagemaker/processing.py +++ b/src/sagemaker/processing.py @@ -50,6 +50,8 @@ check_and_get_run_experiment_config, resolve_value_from_config, resolve_class_attribute_from_config, + Tags, + format_tags, ) from sagemaker.session import Session from sagemaker.workflow import is_pipeline_variable @@ -83,7 +85,7 @@ def __init__( base_job_name: Optional[str] = None, sagemaker_session: Optional[Session] = None, env: Optional[Dict[str, Union[str, PipelineVariable]]] = None, - tags: Optional[List[Dict[str, Union[str, PipelineVariable]]]] = None, + tags: Optional[Tags] = None, network_config: Optional[NetworkConfig] = None, ): """Initializes a ``Processor`` instance. @@ -122,9 +124,8 @@ def __init__( one using the default AWS configuration chain. env (dict[str, str] or dict[str, PipelineVariable]): Environment variables to be passed to the processing jobs (default: None). - tags (list[dict[str, str] or list[dict[str, PipelineVariable]]): List of tags - to be passed to the processing job (default: None). For more, see - https://docs.aws.amazon.com/sagemaker/latest/dg/API_Tag.html. + tags (Optional[Tags]): Tags to be passed to the processing job (default: None). + For more, see https://docs.aws.amazon.com/sagemaker/latest/dg/API_Tag.html. 
network_config (:class:`~sagemaker.network.NetworkConfig`): A :class:`~sagemaker.network.NetworkConfig` object that configures network isolation, encryption of @@ -137,7 +138,7 @@ def __init__( self.volume_size_in_gb = volume_size_in_gb self.max_runtime_in_seconds = max_runtime_in_seconds self.base_job_name = base_job_name - self.tags = tags + self.tags = format_tags(tags) self.jobs = [] self.latest_job = None @@ -515,7 +516,7 @@ def __init__( base_job_name: Optional[str] = None, sagemaker_session: Optional[Session] = None, env: Optional[Dict[str, Union[str, PipelineVariable]]] = None, - tags: Optional[List[Dict[str, Union[str, PipelineVariable]]]] = None, + tags: Optional[Tags] = None, network_config: Optional[NetworkConfig] = None, ): """Initializes a ``ScriptProcessor`` instance. @@ -555,9 +556,8 @@ def __init__( one using the default AWS configuration chain. env (dict[str, str] or dict[str, PipelineVariable])): Environment variables to be passed to the processing jobs (default: None). - tags (list[dict[str, str] or list[dict[str, PipelineVariable]]): List of tags to - be passed to the processing job (default: None). For more, see - https://docs.aws.amazon.com/sagemaker/latest/dg/API_Tag.html. + tags (Optional[Tags]): Tags to be passed to the processing job (default: None). + For more, see https://docs.aws.amazon.com/sagemaker/latest/dg/API_Tag.html. network_config (:class:`~sagemaker.network.NetworkConfig`): A :class:`~sagemaker.network.NetworkConfig` object that configures network isolation, encryption of @@ -579,7 +579,7 @@ def __init__( base_job_name=base_job_name, sagemaker_session=sagemaker_session, env=env, - tags=tags, + tags=format_tags(tags), network_config=network_config, ) @@ -1442,7 +1442,7 @@ def __init__( base_job_name: Optional[str] = None, sagemaker_session: Optional[Session] = None, env: Optional[Dict[str, Union[str, PipelineVariable]]] = None, - tags: Optional[List[Dict[str, Union[str, PipelineVariable]]]] = None, + tags: Optional[Tags] = None, network_config: Optional[NetworkConfig] = None, ): """Initializes a ``FrameworkProcessor`` instance. @@ -1494,9 +1494,8 @@ def __init__( one using the default AWS configuration chain (default: None). env (dict[str, str] or dict[str, PipelineVariable]): Environment variables to be passed to the processing jobs (default: None). - tags (list[dict[str, str] or list[dict[str, PipelineVariable]]): List of tags to - be passed to the processing job (default: None). For more, see - https://docs.aws.amazon.com/sagemaker/latest/dg/API_Tag.html. + tags (Optional[Tags]): Tags to be passed to the processing job (default: None). + For more, see https://docs.aws.amazon.com/sagemaker/latest/dg/API_Tag.html. 
network_config (:class:`~sagemaker.network.NetworkConfig`): A :class:`~sagemaker.network.NetworkConfig` object that configures network isolation, encryption of @@ -1531,7 +1530,7 @@ def __init__( base_job_name=base_job_name, sagemaker_session=sagemaker_session, env=env, - tags=tags, + tags=format_tags(tags), network_config=network_config, ) diff --git a/src/sagemaker/pytorch/processing.py b/src/sagemaker/pytorch/processing.py index 70fc96497e..e04e4ba65a 100644 --- a/src/sagemaker/pytorch/processing.py +++ b/src/sagemaker/pytorch/processing.py @@ -24,6 +24,7 @@ from sagemaker.processing import FrameworkProcessor from sagemaker.pytorch.estimator import PyTorch from sagemaker.workflow.entities import PipelineVariable +from sagemaker.utils import format_tags, Tags class PyTorchProcessor(FrameworkProcessor): @@ -48,7 +49,7 @@ def __init__( base_job_name: Optional[str] = None, sagemaker_session: Optional[Session] = None, env: Optional[Dict[str, Union[str, PipelineVariable]]] = None, - tags: Optional[List[Dict[str, Union[str, PipelineVariable]]]] = None, + tags: Optional[Tags] = None, network_config: Optional[NetworkConfig] = None, ): """This processor executes a Python script in a PyTorch execution environment. @@ -81,6 +82,6 @@ def __init__( base_job_name, sagemaker_session, env, - tags, + format_tags(tags), network_config, ) diff --git a/src/sagemaker/remote_function/job.py b/src/sagemaker/remote_function/job.py index c4570da463..205a2adf41 100644 --- a/src/sagemaker/remote_function/job.py +++ b/src/sagemaker/remote_function/job.py @@ -49,7 +49,13 @@ from sagemaker import image_uris from sagemaker.remote_function.checkpoint_location import CheckpointLocation from sagemaker.session import get_execution_role, _logs_for_job, Session -from sagemaker.utils import name_from_base, _tmpdir, resolve_value_from_config +from sagemaker.utils import ( + name_from_base, + _tmpdir, + resolve_value_from_config, + format_tags, + Tags, +) from sagemaker.s3 import s3_path_join, S3Uploader from sagemaker import vpc_utils from sagemaker.remote_function.core.stored_function import StoredFunction, _SerializedData @@ -200,7 +206,7 @@ def __init__( sagemaker_session: Session = None, security_group_ids: List[Union[str, "PipelineVariable"]] = None, subnets: List[Union[str, "PipelineVariable"]] = None, - tags: List[Dict[str, Union[str, "PipelineVariable"]]] = None, + tags: Optional[Tags] = None, volume_kms_key: Union[str, "PipelineVariable"] = None, volume_size: Union[int, "PipelineVariable"] = 30, encrypt_inter_container_traffic: Union[bool, "PipelineVariable"] = None, @@ -362,9 +368,8 @@ def __init__( subnets (List[str, PipelineVariable]): A list of subnet IDs. Defaults to ``None`` and the job is created without VPC config. - tags (list[dict[str, str] or list[dict[str, PipelineVariable]]): A list of tags - attached to the job. Defaults to ``None`` and the training job is created - without tags. + tags (Optional[Tags]): Tags attached to the job. Defaults to ``None`` + and the training job is created without tags. 
volume_kms_key (str, PipelineVariable): An Amazon Key Management Service (KMS) key used to encrypt an Amazon Elastic Block Storage (EBS) volume attached to the @@ -544,9 +549,8 @@ def __init__( vpc_config = vpc_utils.to_dict(subnets=_subnets, security_group_ids=_security_group_ids) self.vpc_config = vpc_utils.sanitize(vpc_config) - self.tags = self.sagemaker_session._append_sagemaker_config_tags( - [{"Key": k, "Value": v} for k, v in tags] if tags else None, REMOTE_FUNCTION_TAGS - ) + tags = format_tags(tags) + self.tags = self.sagemaker_session._append_sagemaker_config_tags(tags, REMOTE_FUNCTION_TAGS) @staticmethod def _get_default_image(session): diff --git a/src/sagemaker/session.py b/src/sagemaker/session.py index fe0d259428..2cf7e78f41 100644 --- a/src/sagemaker/session.py +++ b/src/sagemaker/session.py @@ -131,6 +131,9 @@ resolve_nested_dict_value_from_config, update_nested_dictionary_with_values_from_config, update_list_of_dicts_with_values_from_config, + format_tags, + Tags, + TagsDict, ) from sagemaker import exceptions from sagemaker.session_settings import SessionSettings @@ -677,7 +680,7 @@ def _create_s3_bucket_if_it_does_not_exist(self, bucket_name, region): ) raise - def _append_sagemaker_config_tags(self, tags: list, config_path_to_tags: str): + def _append_sagemaker_config_tags(self, tags: List[TagsDict], config_path_to_tags: str): """Appends tags specified in the sagemaker_config to the given list of tags. To minimize the chance of duplicate tags being applied, this is intended to be used @@ -787,7 +790,7 @@ def train( # noqa: C901 called to convert them before training. stop_condition (dict): Defines when training shall finish. Contains entries that can be understood by the service like ``MaxRuntimeInSeconds``. - tags (list[dict]): List of tags for labeling a training job. For more, see + tags (Optional[Tags]): Tags for labeling a training job. For more, see https://docs.aws.amazon.com/sagemaker/latest/dg/API_Tag.html. metric_definitions (list[dict]): A list of dictionaries that defines the metric(s) used to evaluate the training jobs. Each dictionary contains two keys: 'Name' for @@ -886,7 +889,7 @@ def train( # noqa: C901 Returns: str: ARN of the training job, if it is created. """ - tags = _append_project_tags(tags) + tags = _append_project_tags(format_tags(tags)) tags = self._append_sagemaker_config_tags( tags, "{}.{}.{}".format(SAGEMAKER, TRAINING_JOB, TAGS) ) @@ -1369,7 +1372,7 @@ def process( jobs. role_arn (str): The Amazon Resource Name (ARN) of an IAM role that Amazon SageMaker can assume to perform tasks on your behalf. - tags ([dict[str,str]]): A list of dictionaries containing key-value + tags (Optional[Tags]): A list of dictionaries containing key-value pairs. experiment_config (dict[str, str]): Experiment management configuration. Optionally, the dict can contain three keys: @@ -1383,7 +1386,7 @@ def process( will be unassociated. * `TrialComponentDisplayName` is used for display in Studio. """ - tags = _append_project_tags(tags) + tags = _append_project_tags(format_tags(tags)) tags = self._append_sagemaker_config_tags( tags, "{}.{}.{}".format(SAGEMAKER, PROCESSING_JOB, TAGS) ) @@ -1597,7 +1600,7 @@ def create_monitoring_schedule( jobs. role_arn (str): The Amazon Resource Name (ARN) of an IAM role that Amazon SageMaker can assume to perform tasks on your behalf. - tags ([dict[str,str]]): A list of dictionaries containing key-value + tags (Optional[Tags]): A list of dictionaries containing key-value pairs. 
data_analysis_start_time (str): Start time for the data analysis window for the one time monitoring schedule (NOW), e.g. "-PT1H" @@ -1717,7 +1720,7 @@ def create_monitoring_schedule( "NetworkConfig" ] = inferred_network_config_from_config - tags = _append_project_tags(tags) + tags = _append_project_tags(format_tags(tags)) tags = self._append_sagemaker_config_tags( tags, "{}.{}.{}".format(SAGEMAKER, MONITORING_SCHEDULE, TAGS) ) @@ -2367,7 +2370,7 @@ def auto_ml( "MetricName" and "Value". generate_candidate_definitions_only (bool): Indicates whether to only generate candidate definitions. If True, AutoML.list_candidates() cannot be called. Default: False. - tags ([dict[str,str]]): A list of dictionaries containing key-value + tags (Optional[Tags]): A list of dictionaries containing key-value pairs. model_deploy_config (dict): Specifies how to generate the endpoint name for an automatic one-click Autopilot model deployment. @@ -2390,7 +2393,7 @@ def auto_ml( problem_type=problem_type, job_objective=job_objective, generate_candidate_definitions_only=generate_candidate_definitions_only, - tags=tags, + tags=format_tags(tags), model_deploy_config=model_deploy_config, ) @@ -2435,7 +2438,7 @@ def _get_auto_ml_request( "MetricName" and "Value". generate_candidate_definitions_only (bool): Indicates whether to only generate candidate definitions. If True, AutoML.list_candidates() cannot be called. Default: False. - tags ([dict[str,str]]): A list of dictionaries containing key-value + tags (Optional[Tags]): A list of dictionaries containing key-value pairs. model_deploy_config (dict): Specifies how to generate the endpoint name for an automatic one-click Autopilot model deployment. @@ -2460,7 +2463,7 @@ def _get_auto_ml_request( if problem_type is not None: auto_ml_job_request["ProblemType"] = problem_type - tags = _append_project_tags(tags) + tags = _append_project_tags(format_tags(tags)) tags = self._append_sagemaker_config_tags( tags, "{}.{}.{}".format(SAGEMAKER, AUTO_ML_JOB, TAGS) ) @@ -2650,7 +2653,7 @@ def compile_model( job_name (str): Name of the compilation job being created. stop_condition (dict): Defines when compilation job shall finish. Contains entries that can be understood by the service like ``MaxRuntimeInSeconds``. - tags (list[dict]): List of tags for labeling a compile model job. For more, see + tags (Optional[Tags]): List of tags for labeling a compile model job. For more, see https://docs.aws.amazon.com/sagemaker/latest/dg/API_Tag.html. Returns: @@ -2675,7 +2678,7 @@ def compile_model( if vpc_config: compilation_job_request["VpcConfig"] = vpc_config - tags = _append_project_tags(tags) + tags = _append_project_tags(format_tags(tags)) tags = self._append_sagemaker_config_tags( tags, "{}.{}.{}".format(SAGEMAKER, COMPILATION_JOB, TAGS) ) @@ -2707,7 +2710,7 @@ def package_model_for_edge( job_name (str): Name of the edge packaging job being created. compilation_job_name (str): Name of the compilation job being created. resource_key (str): KMS key to encrypt the disk used to package the job - tags (list[dict]): List of tags for labeling a compile model job. For more, see + tags (Optional[Tags]): List of tags for labeling a compile model job. For more, see https://docs.aws.amazon.com/sagemaker/latest/dg/API_Tag.html. 
""" role = resolve_value_from_config(role, EDGE_PACKAGING_ROLE_ARN_PATH, sagemaker_session=self) @@ -2725,7 +2728,7 @@ def package_model_for_edge( resource_key = resolve_value_from_config( resource_key, EDGE_PACKAGING_RESOURCE_KEY_PATH, sagemaker_session=self ) - tags = _append_project_tags(tags) + tags = _append_project_tags(format_tags(tags)) tags = self._append_sagemaker_config_tags( tags, "{}.{}.{}".format(SAGEMAKER, EDGE_PACKAGING_JOB, TAGS) ) @@ -2963,7 +2966,7 @@ def create_tuning_job( or training_config_list should be provided, but not both. warm_start_config (dict): Configuration defining the type of warm start and other required configurations. - tags (list[dict]): List of tags for labeling the tuning job. For more, see + tags (Optional[Tags]): List of tags for labeling the tuning job. For more, see https://docs.aws.amazon.com/sagemaker/latest/dg/API_Tag.html. autotune (bool): Whether the parameter ranges or other unset settings of a tuning job should be chosen automatically. @@ -2982,7 +2985,7 @@ def create_tuning_job( training_config=training_config, training_config_list=training_config_list, warm_start_config=warm_start_config, - tags=tags, + tags=format_tags(tags), autotune=autotune, ) @@ -3015,7 +3018,7 @@ def _get_tuning_request( or training_config_list should be provided, but not both. warm_start_config (dict): Configuration defining the type of warm start and other required configurations. - tags (list[dict]): List of tags for labeling the tuning job. For more, see + tags (Optional[Tags]): List of tags for labeling the tuning job. For more, see https://docs.aws.amazon.com/sagemaker/latest/dg/API_Tag.html. autotune (bool): Whether the parameter ranges or other unset settings of a tuning job should be chosen automatically. @@ -3040,7 +3043,7 @@ def _get_tuning_request( if warm_start_config is not None: tune_request["WarmStartConfig"] = warm_start_config - tags = _append_project_tags(tags) + tags = _append_project_tags(format_tags(tags)) if tags is not None: tune_request["Tags"] = tags @@ -3497,7 +3500,7 @@ def transform( * If both `ExperimentName` and `TrialName` are not supplied the trial component will be unassociated. * `TrialComponentDisplayName` is used for display in Studio. - tags (list[dict]): List of tags for labeling a transform job. + tags (Optional[Tags]): List of tags for labeling a transform job. data_processing(dict): A dictionary describing config for combining the input data and transformed data. For more, see https://docs.aws.amazon.com/sagemaker/latest/dg/API_Tag.html. @@ -3507,7 +3510,7 @@ def transform( batch_data_capture_config (BatchDataCaptureConfig): Configuration object which specifies the configurations related to the batch data capture for the transform job """ - tags = _append_project_tags(tags) + tags = _append_project_tags(format_tags(tags)) tags = self._append_sagemaker_config_tags( tags, "{}.{}.{}".format(SAGEMAKER, TRANSFORM_JOB, TAGS) ) @@ -3603,7 +3606,7 @@ def _create_model_request( request["PrimaryContainer"] = container_definition if tags: - request["Tags"] = tags + request["Tags"] = format_tags(tags) if vpc_config: request["VpcConfig"] = vpc_config @@ -3655,7 +3658,7 @@ def create_model( which is used to create more advanced container configurations, including model containers which need artifacts from S3. This field is deprecated, please use container_defs instead. - tags(List[dict[str, str]]): Optional. The list of tags to add to the model. + tags(Optional[Tags]): Optional. The list of tags to add to the model. 
Example: >>> tags = [{'Key': 'tagname', 'Value': 'tagvalue'}] @@ -3665,7 +3668,7 @@ def create_model( Returns: str: Name of the Amazon SageMaker ``Model`` created. """ - tags = _append_project_tags(tags) + tags = _append_project_tags(format_tags(tags)) tags = self._append_sagemaker_config_tags(tags, "{}.{}.{}".format(SAGEMAKER, MODEL, TAGS)) role = resolve_value_from_config( role, MODEL_EXECUTION_ROLE_ARN_PATH, sagemaker_session=self @@ -3745,7 +3748,7 @@ def create_model_from_job( Default: use VpcConfig from training job. * 'Subnets' (list[str]): List of subnet ids. * 'SecurityGroupIds' (list[str]): List of security group ids. - tags(List[dict[str, str]]): Optional. The list of tags to add to the model. + tags(Optional[Tags]): Optional. The list of tags to add to the model. For more, see https://docs.aws.amazon.com/sagemaker/latest/dg/API_Tag.html. Returns: @@ -3786,7 +3789,7 @@ def create_model_from_job( primary_container, enable_network_isolation=enable_network_isolation, vpc_config=vpc_config, - tags=tags, + tags=format_tags(tags), ) def create_model_package_from_algorithm(self, name, description, algorithm_arn, model_data): @@ -4027,7 +4030,7 @@ def create_endpoint_config( accelerator_type (str): Type of Elastic Inference accelerator to attach to the instance. For example, 'ml.eia1.medium'. For more information: https://docs.aws.amazon.com/sagemaker/latest/dg/ei.html - tags(List[dict[str, str]]): Optional. The list of tags to add to the endpoint config. + tags(Optional[Tags]): Optional. The list of tags to add to the endpoint config. kms_key (str): The KMS key that is used to encrypt the data on the storage volume attached to the instance hosting the endpoint. data_capture_config_dict (dict): Specifies configuration related to Endpoint data @@ -4059,7 +4062,7 @@ def create_endpoint_config( """ logger.info("Creating endpoint-config with name %s", name) - tags = tags or [] + tags = format_tags(tags) or [] provided_production_variant = production_variant( model_name, instance_type, @@ -4136,7 +4139,7 @@ def create_endpoint_config_from_existing( new_config_name (str): Name of the Amazon SageMaker endpoint configuration to create. existing_config_name (str): Name of the existing Amazon SageMaker endpoint configuration. - new_tags (list[dict[str, str]]): Optional. The list of tags to add to the endpoint + new_tags (Optional[Tags]): Optional. The list of tags to add to the endpoint config. If not specified, the tags of the existing endpoint configuration are used. If any of the existing tags are reserved AWS ones (i.e. begin with "aws"), they are not carried over to the new endpoint configuration. @@ -4196,7 +4199,7 @@ def create_endpoint_config_from_existing( if "ModelName" not in pv or not pv["ModelName"]: request["ExecutionRoleArn"] = self.get_caller_identity_arn() - request_tags = new_tags or self.list_tags( + request_tags = format_tags(new_tags) or self.list_tags( existing_endpoint_config_desc["EndpointConfigArn"] ) request_tags = _append_project_tags(request_tags) @@ -4267,7 +4270,7 @@ def create_endpoint(self, endpoint_name, config_name, tags=None, wait=True, live config_name (str): Name of the Amazon SageMaker endpoint configuration to deploy. wait (bool): Whether to wait for the endpoint deployment to complete before returning (default: True). - tags (list[dict[str, str]]): A list of key-value pairs for tagging the endpoint + tags (Optional[Tags]): A list of key-value pairs for tagging the endpoint (default: None). 
Returns: @@ -4275,7 +4278,7 @@ def create_endpoint(self, endpoint_name, config_name, tags=None, wait=True, live """ logger.info("Creating endpoint with name %s", endpoint_name) - tags = tags or [] + tags = format_tags(tags) or [] tags = _append_project_tags(tags) tags = self._append_sagemaker_config_tags( tags, "{}.{}.{}".format(SAGEMAKER, ENDPOINT, TAGS) @@ -4384,7 +4387,7 @@ def create_inference_component( variant_name: str, specification: Dict[str, Any], runtime_config: Optional[Dict[str, Any]] = None, - tags: Optional[Dict[str, str]] = None, + tags: Optional[Tags] = None, wait: bool = True, ): """Create an Amazon SageMaker Inference Component. @@ -4399,8 +4402,8 @@ def create_inference_component( specification (Dict[str, Any]): The inference component specification. runtime_config (Optional[Dict[str, Any]]): Optional. The inference component runtime configuration. (Default: None). - tags (Optional[Dict[str, str]]): Optional. A list of dictionaries containing key-value - pairs. (Default: None). + tags (Optional[Tags]): Optional. Either a dictionary or a list + of dictionaries containing key-value pairs. (Default: None). wait (bool) : Optional. Wait for the inference component to finish being created before returning a value. (Default: True). @@ -4424,7 +4427,7 @@ def create_inference_component( "RuntimeConfig": runtime_config, } - tags = tags or [] + tags = format_tags(tags) tags = _append_project_tags(tags) tags = self._append_sagemaker_config_tags( tags, "{}.{}.{}".format(SAGEMAKER, INFERENCE_COMPONENT, TAGS) @@ -5182,7 +5185,7 @@ def endpoint_from_model_data( data_capture_config (sagemaker.model_monitor.DataCaptureConfig): Specifies configuration related to Endpoint data capture for use with Amazon SageMaker Model Monitoring. Default: None. - tags (list[dict[str, str]]): A list of key-value pairs for tagging the endpoint + tags (Optional[Tags]): A list of key-value pairs for tagging the endpoint (default: None). Returns: @@ -5191,8 +5194,8 @@ def endpoint_from_model_data( model_environment_vars = model_environment_vars or {} name = name or name_from_image(image_uri) model_vpc_config = vpc_utils.sanitize(model_vpc_config) - endpoint_config_tags = _append_project_tags(tags) - endpoint_tags = _append_project_tags(tags) + endpoint_config_tags = _append_project_tags(format_tags(tags)) + endpoint_tags = _append_project_tags(format_tags(tags)) endpoint_config_tags = self._append_sagemaker_config_tags( endpoint_config_tags, "{}.{}.{}".format(SAGEMAKER, ENDPOINT_CONFIG, TAGS) ) @@ -5255,7 +5258,7 @@ def endpoint_from_production_variants( Args: name (str): The name of the ``Endpoint`` to create. production_variants (list[dict[str, str]]): The list of production variants to deploy. - tags (list[dict[str, str]]): A list of key-value pairs for tagging the endpoint + tags (Optional[Tags]): A list of key-value pairs for tagging the endpoint (default: None). kms_key (str): The KMS key that is used to encrypt the data on the storage volume attached to the instance hosting the endpoint. @@ -5340,8 +5343,8 @@ def endpoint_from_production_variants( # Use expand_role method to handle this situation. 
role = self.expand_role(role) config_options["ExecutionRoleArn"] = role - endpoint_config_tags = _append_project_tags(tags) - endpoint_tags = _append_project_tags(tags) + endpoint_config_tags = _append_project_tags(format_tags(tags)) + endpoint_tags = _append_project_tags(format_tags(tags)) endpoint_config_tags = self._append_sagemaker_config_tags( endpoint_config_tags, "{}.{}.{}".format(SAGEMAKER, ENDPOINT_CONFIG, TAGS) @@ -5679,7 +5682,7 @@ def create_feature_group( online_store_config: Dict[str, str] = None, offline_store_config: Dict[str, str] = None, description: str = None, - tags: List[Dict[str, str]] = None, + tags: Optional[Tags] = None, ) -> Dict[str, Any]: """Creates a FeatureGroup in the FeatureStore service. @@ -5694,11 +5697,12 @@ def create_feature_group( offline_store_config (Dict[str, str]): dict contains configuration of the feature offline store. description (str): description of the FeatureGroup. - tags (List[Dict[str, str]]): list of tags for labeling a FeatureGroup. + tags (Optional[Tags]): tags for labeling a FeatureGroup. Returns: Response dict from service. """ + tags = format_tags(tags) tags = _append_project_tags(tags) tags = self._append_sagemaker_config_tags( tags, "{}.{}.{}".format(SAGEMAKER, FEATURE_GROUP, TAGS) @@ -6155,7 +6159,7 @@ def _create_inference_recommendations_job_request( framework: str, sample_payload_url: str, supported_content_types: List[str], - tags: Dict[str, str], + tags: Optional[Tags], model_name: str = None, model_package_version_arn: str = None, job_duration_in_seconds: int = None, @@ -6191,8 +6195,8 @@ def _create_inference_recommendations_job_request( benchmarked by Amazon SageMaker Inference Recommender that matches your model. supported_instance_types (List[str]): A list of the instance types that are used to generate inferences in real-time. - tags (Dict[str, str]): Tags used to identify where the Inference Recommendatons Call - was made from. + tags (Optional[Tags]): Tags used to identify where + the Inference Recommendatons Call was made from. endpoint_configurations (List[Dict[str, any]]): Specifies the endpoint configurations to use for a job. Will be used for `Advanced` jobs. traffic_pattern (Dict[str, any]): Specifies the traffic pattern for the job. @@ -6231,7 +6235,7 @@ def _create_inference_recommendations_job_request( "InputConfig": { "ContainerConfig": containerConfig, }, - "Tags": tags, + "Tags": format_tags(tags), } request.get("InputConfig").update( @@ -6443,7 +6447,7 @@ def get_model_package_args( approval_status (str): Model Approval Status, values can be "Approved", "Rejected", or "PendingManualApproval" (default: "PendingManualApproval"). description (str): Model Package description (default: None). - tags (List[dict[str, str]]): A list of dictionaries containing key-value pairs + tags (Optional[Tags]): A list of dictionaries containing key-value pairs (default: None). container_def_list (list): A list of container defintiions (default: None). drift_check_baselines (DriftCheckBaselines): DriftCheckBaselines object (default: None). 
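With the Session methods above accepting either tag form, a caller can now pass a plain dict. A rough sketch only; the account, role ARN and image URI below are placeholders and assume AWS credentials are configured:

    from sagemaker.session import Session

    session = Session()
    session.create_model(
        name="my-model",
        role="arn:aws:iam::111122223333:role/SageMakerRole",  # placeholder role ARN
        container_defs={"Image": "111122223333.dkr.ecr.us-west-2.amazonaws.com/my-image:latest"},
        tags={"team": "ml-platform"},  # normalized to [{"Key": "team", "Value": "ml-platform"}]
    )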
@@ -6498,7 +6502,7 @@ def get_model_package_args( if description is not None: model_package_args["description"] = description if tags is not None: - model_package_args["tags"] = tags + model_package_args["tags"] = format_tags(tags) if customer_metadata_properties is not None: model_package_args["customer_metadata_properties"] = customer_metadata_properties if validation_specification is not None: @@ -6558,7 +6562,7 @@ def get_create_model_package_request( approval_status (str): Model Approval Status, values can be "Approved", "Rejected", or "PendingManualApproval" (default: "PendingManualApproval"). description (str): Model Package description (default: None). - tags (List[dict[str, str]]): A list of dictionaries containing key-value pairs + tags (Optional[Tags]): A list of dictionaries containing key-value pairs (default: None). drift_check_baselines (DriftCheckBaselines): DriftCheckBaselines object (default: None). customer_metadata_properties (dict[str, str]): A dictionary of key-value paired @@ -6585,7 +6589,7 @@ def get_create_model_package_request( if description is not None: request_dict["ModelPackageDescription"] = description if tags is not None: - request_dict["Tags"] = tags + request_dict["Tags"] = format_tags(tags) if model_metrics: request_dict["ModelMetrics"] = model_metrics if drift_check_baselines: diff --git a/src/sagemaker/sklearn/processing.py b/src/sagemaker/sklearn/processing.py index 86d0df9113..ff209b3740 100644 --- a/src/sagemaker/sklearn/processing.py +++ b/src/sagemaker/sklearn/processing.py @@ -24,6 +24,7 @@ from sagemaker.processing import ScriptProcessor from sagemaker.sklearn import defaults from sagemaker.workflow.entities import PipelineVariable +from sagemaker.utils import format_tags, Tags class SKLearnProcessor(ScriptProcessor): @@ -43,7 +44,7 @@ def __init__( base_job_name: Optional[str] = None, sagemaker_session: Optional[Session] = None, env: Optional[Dict[str, Union[str, PipelineVariable]]] = None, - tags: Optional[List[Dict[str, Union[str, PipelineVariable]]]] = None, + tags: Optional[Tags] = None, network_config: Optional[NetworkConfig] = None, ): """Initialize an ``SKLearnProcessor`` instance. @@ -81,8 +82,7 @@ def __init__( using the default AWS configuration chain. env (dict[str, str] or dict[str, PipelineVariable]): Environment variables to be passed to the processing job. - tags (list[dict[str, str] or list[dict[str, PipelineVariable]]): List of tags - to be passed to the processing job. + tags (Optional[Tags]): Tags to be passed to the processing job. network_config (sagemaker.network.NetworkConfig): A NetworkConfig object that configures network isolation, encryption of inter-container traffic, security group IDs, and subnets. @@ -110,6 +110,6 @@ def __init__( base_job_name=base_job_name, sagemaker_session=session, env=env, - tags=tags, + tags=format_tags(tags), network_config=network_config, ) diff --git a/src/sagemaker/spark/processing.py b/src/sagemaker/spark/processing.py index 293b61f835..82634071cc 100644 --- a/src/sagemaker/spark/processing.py +++ b/src/sagemaker/spark/processing.py @@ -41,6 +41,7 @@ from sagemaker.session import Session from sagemaker.network import NetworkConfig from sagemaker.spark import defaults +from sagemaker.utils import format_tags, Tags from sagemaker.workflow import is_pipeline_variable from sagemaker.workflow.pipeline_context import runnable_by_pipeline @@ -135,7 +136,7 @@ def __init__( SageMaker APIs and any other AWS services needed. 
If not specified, the processor creates one using the default AWS configuration chain. env (dict): Environment variables to be passed to the processing job. - tags ([dict]): List of tags to be passed to the processing job. + tags (Optional[Tags]): List of tags to be passed to the processing job. network_config (sagemaker.network.NetworkConfig): A NetworkConfig object that configures network isolation, encryption of inter-container traffic, security group IDs, and subnets. @@ -168,7 +169,7 @@ def __init__( base_job_name=base_job_name, sagemaker_session=session, env=env, - tags=tags, + tags=format_tags(tags), network_config=network_config, ) @@ -703,7 +704,7 @@ def __init__( base_job_name: Optional[str] = None, sagemaker_session: Optional[Session] = None, env: Optional[Dict[str, Union[str, PipelineVariable]]] = None, - tags: Optional[List[Dict[str, Union[str, PipelineVariable]]]] = None, + tags: Optional[Tags] = None, network_config: Optional[NetworkConfig] = None, ): """Initialize an ``PySparkProcessor`` instance. @@ -747,7 +748,7 @@ def __init__( using the default AWS configuration chain. env (dict[str, str] or dict[str, PipelineVariable]): Environment variables to be passed to the processing job. - tags (list[dict[str, str] or list[dict[str, PipelineVariable]]): List of tags to + tags (Optional[Tags]): List of tags to be passed to the processing job. network_config (sagemaker.network.NetworkConfig): A NetworkConfig object that configures network isolation, encryption of @@ -771,7 +772,7 @@ def __init__( base_job_name=base_job_name, sagemaker_session=sagemaker_session, env=env, - tags=tags, + tags=format_tags(tags), network_config=network_config, ) @@ -980,7 +981,7 @@ def __init__( base_job_name: Optional[str] = None, sagemaker_session: Optional[Session] = None, env: Optional[Dict[str, Union[str, PipelineVariable]]] = None, - tags: Optional[List[Dict[str, Union[str, PipelineVariable]]]] = None, + tags: Optional[Tags] = None, network_config: Optional[NetworkConfig] = None, ): """Initialize a ``SparkJarProcessor`` instance. @@ -1024,8 +1025,7 @@ def __init__( using the default AWS configuration chain. env (dict[str, str] or dict[str, PipelineVariable]): Environment variables to be passed to the processing job. - tags (list[dict[str, str] or list[dict[str, PipelineVariable]]): List of tags to - be passed to the processing job. + tags (Optional[Tags]): Tags to be passed to the processing job. network_config (sagemaker.network.NetworkConfig): A NetworkConfig object that configures network isolation, encryption of inter-container traffic, security group IDs, and subnets. @@ -1048,7 +1048,7 @@ def __init__( base_job_name=base_job_name, sagemaker_session=sagemaker_session, env=env, - tags=tags, + tags=format_tags(tags), network_config=network_config, ) diff --git a/src/sagemaker/tensorflow/estimator.py b/src/sagemaker/tensorflow/estimator.py index 523b70ec38..df2bc74935 100644 --- a/src/sagemaker/tensorflow/estimator.py +++ b/src/sagemaker/tensorflow/estimator.py @@ -29,6 +29,7 @@ from sagemaker.workflow import is_pipeline_variable from sagemaker.tensorflow.training_compiler.config import TrainingCompilerConfig from sagemaker.workflow.entities import PipelineVariable +from sagemaker.utils import format_tags logger = logging.getLogger("sagemaker") @@ -474,7 +475,7 @@ def transformer( each individual transform container at one time. max_payload (int): Maximum size of the payload in a single HTTP request to the container in MB. - tags (list[dict]): List of tags for labeling a transform job. 
If none specified, then + tags (Optional[Tags]): Tags for labeling a transform job. If none specified, then the tags used for the training job are used for the transform job. role (str): The IAM Role ARN for the ``TensorFlowModel``, which is also used during transform jobs. If not specified, the role from the Estimator is used. @@ -525,7 +526,7 @@ def transformer( max_concurrent_transforms=max_concurrent_transforms, max_payload=max_payload, env=env or {}, - tags=tags, + tags=format_tags(tags), base_transform_job_name=self.base_job_name, volume_kms_key=volume_kms_key, sagemaker_session=self.sagemaker_session, @@ -553,6 +554,6 @@ def transformer( env=env, max_concurrent_transforms=max_concurrent_transforms, max_payload=max_payload, - tags=tags, + tags=format_tags(tags), volume_kms_key=volume_kms_key, ) diff --git a/src/sagemaker/tensorflow/model.py b/src/sagemaker/tensorflow/model.py index 375a2ea7e5..1b35afbe7c 100644 --- a/src/sagemaker/tensorflow/model.py +++ b/src/sagemaker/tensorflow/model.py @@ -27,6 +27,7 @@ from sagemaker.workflow import is_pipeline_variable from sagemaker.workflow.entities import PipelineVariable from sagemaker.workflow.pipeline_context import PipelineSession +from sagemaker.utils import format_tags logger = logging.getLogger(__name__) @@ -355,7 +356,7 @@ def deploy( deserializer=deserializer, accelerator_type=accelerator_type, endpoint_name=endpoint_name, - tags=tags, + tags=format_tags(tags), kms_key=kms_key, wait=wait, data_capture_config=data_capture_config, diff --git a/src/sagemaker/tensorflow/processing.py b/src/sagemaker/tensorflow/processing.py index e4495a39fd..529920a374 100644 --- a/src/sagemaker/tensorflow/processing.py +++ b/src/sagemaker/tensorflow/processing.py @@ -24,6 +24,7 @@ from sagemaker.processing import FrameworkProcessor from sagemaker.tensorflow.estimator import TensorFlow from sagemaker.workflow.entities import PipelineVariable +from sagemaker.utils import format_tags, Tags class TensorFlowProcessor(FrameworkProcessor): @@ -48,7 +49,7 @@ def __init__( base_job_name: Optional[str] = None, sagemaker_session: Optional[Session] = None, env: Optional[Dict[str, Union[str, PipelineVariable]]] = None, - tags: Optional[List[Dict[str, Union[str, PipelineVariable]]]] = None, + tags: Optional[Tags] = None, network_config: Optional[NetworkConfig] = None, ): """This processor executes a Python script in a TensorFlow execution environment. 
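The SKLearn, Spark, TensorFlow and XGBoost processors touched in this patch all forward tags through format_tags() to the base processor. A hedged sketch using SKLearnProcessor; the framework version, role ARN and tag values are placeholders:

    from sagemaker.sklearn.processing import SKLearnProcessor

    processor = SKLearnProcessor(
        framework_version="1.2-1",                            # placeholder version
        role="arn:aws:iam::111122223333:role/SageMakerRole",  # placeholder role ARN
        instance_type="ml.m5.xlarge",
        instance_count=1,
        tags={"project": "churn", "owner": "data-science"},   # dict form now accepted
    )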
@@ -81,6 +82,6 @@ def __init__( base_job_name, sagemaker_session, env, - tags, + format_tags(tags), network_config, ) diff --git a/src/sagemaker/transformer.py b/src/sagemaker/transformer.py index be8511f570..4ddbbc5451 100644 --- a/src/sagemaker/transformer.py +++ b/src/sagemaker/transformer.py @@ -13,7 +13,7 @@ """Placeholder docstring""" from __future__ import absolute_import -from typing import Union, Optional, List, Dict +from typing import Union, Optional, Dict import logging import copy import time @@ -42,6 +42,8 @@ check_and_get_run_experiment_config, resolve_value_from_config, resolve_class_attribute_from_config, + format_tags, + Tags, ) @@ -62,7 +64,7 @@ def __init__( accept: Optional[Union[str, PipelineVariable]] = None, max_concurrent_transforms: Optional[Union[int, PipelineVariable]] = None, max_payload: Optional[Union[int, PipelineVariable]] = None, - tags: Optional[List[Dict[str, Union[str, PipelineVariable]]]] = None, + tags: Optional[Tags] = None, env: Optional[Dict[str, Union[str, PipelineVariable]]] = None, base_transform_job_name: Optional[str] = None, sagemaker_session: Optional[Session] = None, @@ -92,9 +94,9 @@ def __init__( to be made to each individual transform container at one time. max_payload (int or PipelineVariable): Maximum size of the payload in a single HTTP request to the container in MB. - tags (list[dict[str, str] or list[dict[str, PipelineVariable]]): List of tags for - labeling a transform job (default: None). For more, see the SageMaker API - documentation for `Tag `_. + tags (Optional[Tags]): Tags for labeling a transform job (default: None). + For more, see the SageMaker API documentation for + `Tag `_. env (dict[str, str] or dict[str, PipelineVariable]): Environment variables to be set for use during the transform job (default: None). base_transform_job_name (str): Prefix for the transform job when the @@ -121,7 +123,7 @@ def __init__( self.max_concurrent_transforms = max_concurrent_transforms self.max_payload = max_payload - self.tags = tags + self.tags = format_tags(tags) self.base_transform_job_name = base_transform_job_name self._current_job_name = None diff --git a/src/sagemaker/tuner.py b/src/sagemaker/tuner.py index 02f7bd8e79..571f84761f 100644 --- a/src/sagemaker/tuner.py +++ b/src/sagemaker/tuner.py @@ -52,6 +52,8 @@ base_name_from_image, name_from_base, to_string, + format_tags, + Tags, ) AMAZON_ESTIMATOR_MODULE = "sagemaker" @@ -603,7 +605,7 @@ def __init__( max_jobs: Union[int, PipelineVariable] = None, max_parallel_jobs: Union[int, PipelineVariable] = 1, max_runtime_in_seconds: Optional[Union[int, PipelineVariable]] = None, - tags: Optional[List[Dict[str, Union[str, PipelineVariable]]]] = None, + tags: Optional[Tags] = None, base_tuning_job_name: Optional[str] = None, warm_start_config: Optional[WarmStartConfig] = None, strategy_config: Optional[StrategyConfig] = None, @@ -651,9 +653,8 @@ def __init__( start (default: 1). max_runtime_in_seconds (int or PipelineVariable): The maximum time in seconds that a hyperparameter tuning job can run. - tags (list[dict[str, str] or list[dict[str, PipelineVariable]]): List of tags for - labeling the tuning job (default: None). For more, see - https://docs.aws.amazon.com/sagemaker/latest/dg/API_Tag.html. + tags (Optional[Tags]): Tags for labeling the tuning job (default: None). + For more, see https://docs.aws.amazon.com/sagemaker/latest/dg/API_Tag.html. 
base_tuning_job_name (str): Prefix for the hyperparameter tuning job name when the :meth:`~sagemaker.tuner.HyperparameterTuner.fit` method launches. If not specified, a default job name is @@ -746,7 +747,7 @@ def __init__( self.max_parallel_jobs = max_parallel_jobs self.max_runtime_in_seconds = max_runtime_in_seconds - self.tags = tags + self.tags = format_tags(tags) self.base_tuning_job_name = base_tuning_job_name self._current_job_name = None self.latest_tuning_job = None @@ -1924,7 +1925,8 @@ def create( (default: 1). max_runtime_in_seconds (int): The maximum time in seconds that a hyperparameter tuning job can run. - tags (list[dict]): List of tags for labeling the tuning job (default: None). For more, + tags (Optional[Tags]): List of tags for labeling the tuning job (default: None). + For more, see https://docs.aws.amazon.com/sagemaker/latest/dg/API_Tag.html. warm_start_config (sagemaker.tuner.WarmStartConfig): A ``WarmStartConfig`` object that has been initialized with the configuration defining the nature of warm start @@ -1988,7 +1990,7 @@ def create( max_jobs=max_jobs, max_parallel_jobs=max_parallel_jobs, max_runtime_in_seconds=max_runtime_in_seconds, - tags=tags, + tags=format_tags(tags), warm_start_config=warm_start_config, early_stopping_type=early_stopping_type, random_seed=random_seed, diff --git a/src/sagemaker/utils.py b/src/sagemaker/utils.py index 31850a290e..e203693f84 100644 --- a/src/sagemaker/utils.py +++ b/src/sagemaker/utils.py @@ -25,7 +25,7 @@ import tarfile import tempfile import time -from typing import Any, List, Optional, Dict +from typing import Union, Any, List, Optional, Dict import json import abc import uuid @@ -44,6 +44,7 @@ ) from sagemaker.session_settings import SessionSettings from sagemaker.workflow import is_pipeline_variable, is_pipeline_parameter_string +from sagemaker.workflow.entities import PipelineVariable ECR_URI_PATTERN = r"^(\d+)(\.)dkr(\.)ecr(\.)(.+)(\.)(.*)(/)(.*:.*)$" MAX_BUCKET_PATHS_COUNT = 5 @@ -57,6 +58,9 @@ logger = logging.getLogger(__name__) +TagsDict = Dict[str, Union[str, PipelineVariable]] +Tags = Union[List[TagsDict], TagsDict] + # Use the base name of the image as the job name if the user doesn't give us one def name_from_image(image, max_length=63): @@ -1477,3 +1481,11 @@ def create_paginator_config(max_items: int = None, page_size: int = None) -> Dic "MaxItems": max_items if max_items else MAX_ITEMS, "PageSize": page_size if page_size else PAGE_SIZE, } + + +def format_tags(tags: Tags) -> List[TagsDict]: + """Process tags to turn them into the expected format for Sagemaker.""" + if isinstance(tags, dict): + return [{"Key": str(k), "Value": str(v)} for k, v in tags.items()] + + return tags diff --git a/src/sagemaker/workflow/_utils.py b/src/sagemaker/workflow/_utils.py index 8177f6eed4..4993493513 100644 --- a/src/sagemaker/workflow/_utils.py +++ b/src/sagemaker/workflow/_utils.py @@ -32,7 +32,7 @@ Step, ConfigurableRetryStep, ) -from sagemaker.utils import _save_model, download_file_from_url +from sagemaker.utils import _save_model, download_file_from_url, format_tags from sagemaker.workflow.retry import RetryPolicy from sagemaker.workflow.utilities import trim_request_dict @@ -359,7 +359,7 @@ def __init__( depends on (default: None). retry_policies (List[RetryPolicy]): The list of retry policies for the current step (default: None). 
- tags (List[dict[str, str]]): A list of dictionaries containing key-value pairs used to + tags (Optional[Tags]): A list of dictionaries containing key-value pairs used to configure the create model package request (default: None). container_def_list (list): A list of container definitions (default: None). drift_check_baselines (DriftCheckBaselines): DriftCheckBaselines object (default: None). @@ -395,7 +395,7 @@ def __init__( self.inference_instances = inference_instances self.transform_instances = transform_instances self.model_package_group_name = model_package_group_name - self.tags = tags + self.tags = format_tags(tags) self.model_metrics = model_metrics self.drift_check_baselines = drift_check_baselines self.customer_metadata_properties = customer_metadata_properties @@ -407,7 +407,6 @@ def __init__( self.image_uri = image_uri self.compile_model_family = compile_model_family self.description = description - self.tags = tags self.kwargs = kwargs self.container_def_list = container_def_list self.skip_model_validation = skip_model_validation diff --git a/src/sagemaker/workflow/airflow.py b/src/sagemaker/workflow/airflow.py index cb4951d6e4..793849ff93 100644 --- a/src/sagemaker/workflow/airflow.py +++ b/src/sagemaker/workflow/airflow.py @@ -22,6 +22,7 @@ from sagemaker.tensorflow import TensorFlow from sagemaker.estimator import EstimatorBase from sagemaker.processing import Processor +from sagemaker.utils import format_tags def prepare_framework(estimator, s3_operations): @@ -898,7 +899,7 @@ def transform_config_from_estimator( be made to each individual transform container at one time. max_payload (int): Maximum size of the payload in a single HTTP request to the container in MB. - tags (list[dict]): List of tags for labeling a transform job. If none + tags (Optional[Tags]): List of tags for labeling a transform job. If none specified, then the tags used for the training job are used for the transform job. role (str): The ``ExecutionRoleArn`` IAM Role ARN for the ``Model``, @@ -969,7 +970,7 @@ def transform_config_from_estimator( env, max_concurrent_transforms, max_payload, - tags, + format_tags(tags), role, model_server_workers, volume_kms_key, @@ -986,7 +987,7 @@ def transform_config_from_estimator( env, max_concurrent_transforms, max_payload, - tags, + format_tags(tags), role, volume_kms_key, ) diff --git a/src/sagemaker/workflow/check_job_config.py b/src/sagemaker/workflow/check_job_config.py index eaba149823..a8e4082c8e 100644 --- a/src/sagemaker/workflow/check_job_config.py +++ b/src/sagemaker/workflow/check_job_config.py @@ -24,6 +24,7 @@ ModelBiasMonitor, ModelExplainabilityMonitor, ) +from sagemaker.utils import format_tags class CheckJobConfig: @@ -66,7 +67,7 @@ def __init__( AWS services needed (default: None). If not specified, one is created using the default AWS configuration chain. env (dict): Environment variables to be passed to the job (default: None). - tags ([dict]): List of tags to be passed to the job (default: None). + tags (Optional[Tags]): List of tags to be passed to the job (default: None). network_config (sagemaker.network.NetworkConfig): A NetworkConfig object that configures network isolation, encryption of inter-container traffic, security group IDs, and subnets (default: None). 
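CheckJobConfig is updated the same way (the assignment to self.tags follows in the next hunk), so pipeline quality- and drift-check steps accept either tag form. A sketch under placeholder role and instance settings:

    from sagemaker.workflow.check_job_config import CheckJobConfig

    check_job_config = CheckJobConfig(
        role="arn:aws:iam::111122223333:role/SageMakerRole",  # placeholder role ARN
        instance_count=1,
        instance_type="ml.c5.xlarge",
        volume_size_in_gb=30,
        tags={"pipeline": "quality-checks"},  # stored as [{"Key": "pipeline", "Value": "quality-checks"}]
    )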
@@ -82,7 +83,7 @@ def __init__( self.base_job_name = base_job_name self.sagemaker_session = sagemaker_session or Session() self.env = env - self.tags = tags + self.tags = format_tags(tags) self.network_config = network_config def _generate_model_monitor(self, mm_type: str) -> Optional[ModelMonitor]: diff --git a/src/sagemaker/workflow/function_step.py b/src/sagemaker/workflow/function_step.py index 4fee8ef269..a55955b4eb 100644 --- a/src/sagemaker/workflow/function_step.py +++ b/src/sagemaker/workflow/function_step.py @@ -41,7 +41,7 @@ from sagemaker.workflow.utilities import trim_request_dict, load_step_compilation_context from sagemaker.s3_utils import s3_path_join -from sagemaker.utils import unique_name_from_base_uuid4 +from sagemaker.utils import unique_name_from_base_uuid4, format_tags, Tags if TYPE_CHECKING: from sagemaker.remote_function.spark_config import SparkConfig @@ -374,7 +374,7 @@ def step( role: str = None, security_group_ids: Optional[List[Union[str, PipelineVariable]]] = None, subnets: Optional[List[Union[str, PipelineVariable]]] = None, - tags: Optional[List[Dict[str, Union[str, PipelineVariable]]]] = None, + tags: Optional[Tags] = None, volume_kms_key: Optional[Union[str, PipelineVariable]] = None, volume_size: Union[int, PipelineVariable] = 30, encrypt_inter_container_traffic: Optional[Union[bool, PipelineVariable]] = None, @@ -513,8 +513,8 @@ def step( subnets (List[str, PipelineVariable]): A list of subnet IDs. Defaults to ``None`` and the job is created without a VPC config. - tags (list[dict[str, str] or list[dict[str, PipelineVariable]]): A list of tags attached - to the job. Defaults to ``None`` and the training job is created without tags. + tags (Optional[Tags]): Tags attached to the job. Defaults to ``None`` + and the training job is created without tags. volume_kms_key (str, PipelineVariable): An Amazon Key Management Service (KMS) key used to encrypt an Amazon Elastic Block Storage (EBS) volume attached to the training instance. @@ -598,7 +598,7 @@ def wrapper(*args, **kwargs): role=role, security_group_ids=security_group_ids, subnets=subnets, - tags=tags, + tags=format_tags(tags), volume_kms_key=volume_kms_key, volume_size=volume_size, encrypt_inter_container_traffic=encrypt_inter_container_traffic, diff --git a/src/sagemaker/workflow/notebook_job_step.py b/src/sagemaker/workflow/notebook_job_step.py index e535457db6..8a1dd6bc53 100644 --- a/src/sagemaker/workflow/notebook_job_step.py +++ b/src/sagemaker/workflow/notebook_job_step.py @@ -45,7 +45,7 @@ from sagemaker.s3_utils import s3_path_join from sagemaker.s3 import S3Uploader -from sagemaker.utils import _tmpdir, name_from_base, resolve_value_from_config +from sagemaker.utils import _tmpdir, name_from_base, resolve_value_from_config, format_tags, Tags from sagemaker import vpc_utils from sagemaker.config.config_schema import ( @@ -93,7 +93,7 @@ def __init__( subnets: Optional[List[Union[str, PipelineVariable]]] = None, max_retry_attempts: int = 1, max_runtime_in_seconds: int = 2 * 24 * 60 * 60, - tags: Optional[Dict[str, Union[str, PipelineVariable]]] = None, + tags: Optional[Tags] = None, additional_dependencies: Optional[List[str]] = None, # pylint: enable=W0613 retry_policies: Optional[List[RetryPolicy]] = None, @@ -187,10 +187,9 @@ def __init__( time and max retry attempts, the run time applies to each retry. If a job does not complete in this time, its status is set to ``Failed``. Defaults to ``172800 seconds(2 days)``. 
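The @step decorator change above means remote-function pipeline steps can also take dict-style tags. A minimal sketch, assuming the decorated step is later added to a Pipeline with a configured session; the name, instance type and tag values are placeholders:

    from sagemaker.workflow.function_step import step

    @step(
        name="preprocess",
        instance_type="ml.m5.xlarge",                         # placeholder instance type
        tags={"pipeline": "demo", "owner": "data-science"},   # normalized via format_tags
    )
    def preprocess(value: int) -> int:
        return value * 2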
- tags (dict[str, str] or dict[str, PipelineVariable]): A list of tags attached to the - job. Defaults to ``None`` and the training job is created without tags. Your tags - control how the Studio UI captures and displays the job created by - the pipeline in the following ways: + tags (Optional[Tags]): Tags attached to the job. Defaults to ``None`` and the training + job is created without tags. Your tags control how the Studio UI captures and + displays the job created by the pipeline in the following ways: * If you only attach the domain tag, then the notebook job is displayed to all user profiles and spaces. @@ -359,7 +358,7 @@ def _prepare_tags(self): This function converts the custom tags into training API required format and also attach the system tags. """ - custom_tags = [{"Key": k, "Value": v} for k, v in self.tags.items()] if self.tags else [] + custom_tags = format_tags(self.tags) or [] system_tags = [ {"Key": "sagemaker:name", "Value": self.notebook_job_name}, {"Key": "sagemaker:notebook-name", "Value": os.path.basename(self.input_notebook)}, diff --git a/src/sagemaker/workflow/pipeline.py b/src/sagemaker/workflow/pipeline.py index e65a2f5e05..0645e58386 100644 --- a/src/sagemaker/workflow/pipeline.py +++ b/src/sagemaker/workflow/pipeline.py @@ -33,7 +33,7 @@ from sagemaker.remote_function.job import JOBS_CONTAINER_ENTRYPOINT from sagemaker.s3_utils import s3_path_join from sagemaker.session import Session -from sagemaker.utils import resolve_value_from_config, retry_with_backoff +from sagemaker.utils import resolve_value_from_config, retry_with_backoff, format_tags, Tags from sagemaker.workflow.callback_step import CallbackOutput, CallbackStep from sagemaker.workflow._event_bridge_client_helper import ( EventBridgeSchedulerHelper, @@ -130,7 +130,7 @@ def create( self, role_arn: str = None, description: str = None, - tags: List[Dict[str, str]] = None, + tags: Optional[Tags] = None, parallelism_config: ParallelismConfiguration = None, ) -> Dict[str, Any]: """Creates a Pipeline in the Pipelines service. @@ -138,8 +138,7 @@ def create( Args: role_arn (str): The role arn that is assumed by the pipeline to create step artifacts. description (str): A description of the pipeline. - tags (List[Dict[str, str]]): A list of {"Key": "string", "Value": "string"} dicts as - tags. + tags (Optional[Tags]): Tags to be passed to the pipeline. parallelism_config (Optional[ParallelismConfiguration]): Parallelism configuration that is applied to each of the executions of the pipeline. It takes precedence over the parallelism configuration of the parent pipeline. @@ -160,6 +159,7 @@ def create( if parallelism_config: logger.warning("Pipeline parallelism config is not supported in the local mode.") return self.sagemaker_session.sagemaker_client.create_pipeline(self, description) + tags = format_tags(tags) tags = _append_project_tags(tags) tags = self.sagemaker_session._append_sagemaker_config_tags(tags, PIPELINE_TAGS_PATH) kwargs = self._create_args(role_arn, description, parallelism_config) @@ -264,7 +264,7 @@ def upsert( self, role_arn: str = None, description: str = None, - tags: List[Dict[str, str]] = None, + tags: Optional[Tags] = None, parallelism_config: ParallelismConfiguration = None, ) -> Dict[str, Any]: """Creates a pipeline or updates it, if it already exists. @@ -272,8 +272,7 @@ def upsert( Args: role_arn (str): The role arn that is assumed by workflow to create step artifacts. description (str): A description of the pipeline. 
- tags (List[Dict[str, str]]): A list of {"Key": "string", "Value": "string"} dicts as - tags. + tags (Optional[Tags]): Tags to be passed. parallelism_config (Optional[Config for parallel steps, Parallelism configuration that is applied to each of the executions @@ -283,6 +282,7 @@ def upsert( role_arn = resolve_value_from_config( role_arn, PIPELINE_ROLE_ARN_PATH, sagemaker_session=self.sagemaker_session ) + tags = format_tags(tags) if not role_arn: # Originally IAM role was a required parameter. # Now we marked that as Optional because we can fetch it from SageMakerConfig diff --git a/src/sagemaker/workflow/step_collections.py b/src/sagemaker/workflow/step_collections.py index 5afac7b519..d48bf7c307 100644 --- a/src/sagemaker/workflow/step_collections.py +++ b/src/sagemaker/workflow/step_collections.py @@ -28,7 +28,7 @@ from sagemaker.workflow.steps import Step, CreateModelStep, TransformStep from sagemaker.workflow._utils import _RegisterModelStep, _RepackModelStep from sagemaker.workflow.retry import RetryPolicy -from sagemaker.utils import update_container_with_inference_params +from sagemaker.utils import update_container_with_inference_params, format_tags @attr.s @@ -128,7 +128,7 @@ def __init__( compile_model_family (str): The instance family for the compiled model. If specified, a compiled model is used (default: None). description (str): Model Package description (default: None). - tags (List[dict[str, str]]): The list of tags to attach to the model package group. Note + tags (Optional[Tags]): The list of tags to attach to the model package group. Note that tags will only be applied to newly created model package groups; if the name of an existing group is passed to "model_package_group_name", tags will not be applied. @@ -163,6 +163,7 @@ def __init__( self.container_def_list = None subnets = None security_group_ids = None + tags = format_tags(tags) if estimator is not None: subnets = estimator.subnets @@ -390,6 +391,7 @@ def __init__( """ super().__init__(name=name, depends_on=depends_on) steps = [] + tags = format_tags(tags) if "entry_point" in kwargs: entry_point = kwargs.get("entry_point", None) source_dir = kwargs.get("source_dir", None) diff --git a/src/sagemaker/wrangler/processing.py b/src/sagemaker/wrangler/processing.py index fe38b670a0..3853fe8ef9 100644 --- a/src/sagemaker/wrangler/processing.py +++ b/src/sagemaker/wrangler/processing.py @@ -14,7 +14,7 @@ from __future__ import absolute_import -from typing import Dict, List +from typing import Dict, Optional from sagemaker.network import NetworkConfig from sagemaker.processing import ( @@ -23,6 +23,7 @@ ) from sagemaker import image_uris from sagemaker.session import Session +from sagemaker.utils import format_tags, Tags class DataWranglerProcessor(Processor): @@ -41,7 +42,7 @@ def __init__( base_job_name: str = None, sagemaker_session: Session = None, env: Dict[str, str] = None, - tags: List[dict] = None, + tags: Optional[Tags] = None, network_config: NetworkConfig = None, ): """Initializes a ``Processor`` instance. @@ -78,7 +79,7 @@ def __init__( one using the default AWS configuration chain. env (dict[str, str]): Environment variables to be passed to the processing jobs (default: None). - tags (list[dict]): List of tags to be passed to the processing job + tags (Optional[Tags]): Tags to be passed to the processing job (default: None). For more, see https://docs.aws.amazon.com/sagemaker/latest/dg/API_Tag.html. 
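Pipeline.create()/upsert() and the DataWranglerProcessor above take the same flexible tag input. A sketch of tagging a pipeline on upsert; the pipeline is left empty here, the role ARN is a placeholder, and credentials are assumed to be configured:

    from sagemaker.workflow.pipeline import Pipeline

    pipeline = Pipeline(name="my-pipeline", steps=[])  # placeholder; add real steps here
    pipeline.upsert(
        role_arn="arn:aws:iam::111122223333:role/SageMakerRole",  # placeholder role ARN
        tags={"cost-center": "1234"},  # converted by format_tags, then project tags appended
    )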
network_config (:class:`~sagemaker.network.NetworkConfig`): @@ -103,7 +104,7 @@ def __init__( base_job_name=base_job_name, sagemaker_session=sagemaker_session, env=env, - tags=tags, + tags=format_tags(tags), network_config=network_config, ) diff --git a/src/sagemaker/xgboost/processing.py b/src/sagemaker/xgboost/processing.py index d840bfd960..1df32df37a 100644 --- a/src/sagemaker/xgboost/processing.py +++ b/src/sagemaker/xgboost/processing.py @@ -24,6 +24,7 @@ from sagemaker.processing import FrameworkProcessor from sagemaker.xgboost.estimator import XGBoost from sagemaker.workflow.entities import PipelineVariable +from sagemaker.utils import format_tags, Tags class XGBoostProcessor(FrameworkProcessor): @@ -48,7 +49,7 @@ def __init__( base_job_name: Optional[str] = None, sagemaker_session: Optional[Session] = None, env: Optional[Dict[str, Union[str, PipelineVariable]]] = None, - tags: Optional[List[Dict[str, Union[str, PipelineVariable]]]] = None, + tags: Optional[Tags] = None, network_config: Optional[NetworkConfig] = None, ): """This processor executes a Python script in an XGBoost execution environment. @@ -81,6 +82,6 @@ def __init__( base_job_name, sagemaker_session, env, - tags, + format_tags(tags), network_config, ) diff --git a/tests/unit/sagemaker/model/test_model.py b/tests/unit/sagemaker/model/test_model.py index bfd5af977d..de86fcf99a 100644 --- a/tests/unit/sagemaker/model/test_model.py +++ b/tests/unit/sagemaker/model/test_model.py @@ -361,7 +361,7 @@ def test_create_sagemaker_model_tags(prepare_container_def, sagemaker_session): model = Model(MODEL_IMAGE, MODEL_DATA, name=MODEL_NAME, sagemaker_session=sagemaker_session) - tags = {"Key": "foo", "Value": "bar"} + tags = [{"Key": "foo", "Value": "bar"}] model._create_sagemaker_model(INSTANCE_TYPE, tags=tags) sagemaker_session.create_model.assert_called_with( diff --git a/tests/unit/sagemaker/model/test_model_package.py b/tests/unit/sagemaker/model/test_model_package.py index 8be561030e..def7ddf5e3 100644 --- a/tests/unit/sagemaker/model/test_model_package.py +++ b/tests/unit/sagemaker/model/test_model_package.py @@ -201,7 +201,7 @@ def test_create_sagemaker_model_include_tags(sagemaker_session): env_key = "env_key" env_value = "env_value" environment = {env_key: env_value} - tags = {"Key": "foo", "Value": "bar"} + tags = [{"Key": "foo", "Value": "bar"}] model_package = ModelPackage( role="role", @@ -314,7 +314,7 @@ def test_model_package_create_transformer_with_product_id(sagemaker_session): @patch("sagemaker.model.ModelPackage.update_approval_status") def test_model_package_auto_approve_on_deploy(update_approval_status, sagemaker_session): - tags = {"Key": "foo", "Value": "bar"} + tags = [{"Key": "foo", "Value": "bar"}] model_package = ModelPackage( role="role", model_package_arn=MODEL_PACKAGE_VERSIONED_ARN, diff --git a/tests/unit/sagemaker/tensorflow/test_estimator.py b/tests/unit/sagemaker/tensorflow/test_estimator.py index 6654a04202..d6eaf74012 100644 --- a/tests/unit/sagemaker/tensorflow/test_estimator.py +++ b/tests/unit/sagemaker/tensorflow/test_estimator.py @@ -324,7 +324,7 @@ def test_transformer_creation_with_optional_args( env = {"foo": "bar"} max_concurrent_transforms = 3 max_payload = 100 - tags = {"Key": "foo", "Value": "bar"} + tags = [{"Key": "foo", "Value": "bar"}] new_role = "role" vpc_config = {"Subnets": ["1234"], "SecurityGroupIds": ["5678"]} model_name = "model-name" diff --git a/tests/unit/test_predictor.py b/tests/unit/test_predictor.py index 003c57ac04..1ee9babdf7 100644 --- 
a/tests/unit/test_predictor.py +++ b/tests/unit/test_predictor.py @@ -307,7 +307,7 @@ def test_update_endpoint_all_args(name_from_base, production_variant): new_instance_type = "ml.c4.xlarge" new_accelerator_type = "ml.eia1.medium" new_model_name = "new-model" - new_tags = {"Key": "foo", "Value": "bar"} + new_tags = [{"Key": "foo", "Value": "bar"}] new_kms_key = "new-key" new_data_capture_config_dict = {} diff --git a/tests/unit/test_predictor_async.py b/tests/unit/test_predictor_async.py index 1af21a36ff..fa2d6da6c7 100644 --- a/tests/unit/test_predictor_async.py +++ b/tests/unit/test_predictor_async.py @@ -404,7 +404,7 @@ def test_update_endpoint_all_args(): new_instance_type = "ml.c4.xlarge" new_accelerator_type = "ml.eia1.medium" new_model_name = "new-model" - new_tags = {"Key": "foo", "Value": "bar"} + new_tags = [{"Key": "foo", "Value": "bar"}] new_kms_key = "new-key" new_data_capture_config_dict = {} diff --git a/tests/unit/test_session.py b/tests/unit/test_session.py index d08a155c7c..c51dcaaea5 100644 --- a/tests/unit/test_session.py +++ b/tests/unit/test_session.py @@ -3653,7 +3653,7 @@ def test_endpoint_from_production_variants_with_sagemaker_config_injection_no_km ) -def test_create_endpoint_config_with_tags(sagemaker_session): +def test_create_endpoint_config_with_tags_list(sagemaker_session): tags = [{"Key": "TagtestKey", "Value": "TagtestValue"}] sagemaker_session.create_endpoint_config( @@ -3669,6 +3669,23 @@ def test_create_endpoint_config_with_tags(sagemaker_session): ) +def test_create_endpoint_config_with_tags_dict(sagemaker_session): + tags = {"TagtestKey": "TagtestValue"} + call_tags = [{"Key": "TagtestKey", "Value": "TagtestValue"}] + + sagemaker_session.create_endpoint_config( + name="endpoint-test", + initial_instance_count=1, + instance_type="local", + model_name="simple-model", + tags=tags, + ) + + sagemaker_session.sagemaker_client.create_endpoint_config.assert_called_with( + EndpointConfigName="endpoint-test", ProductionVariants=ANY, Tags=call_tags + ) + + def test_create_endpoint_config_with_explainer_config(sagemaker_session): explainer_config = ExplainerConfig diff --git a/tests/unit/test_transformer.py b/tests/unit/test_transformer.py index 138cc3e171..8497bc7ea0 100644 --- a/tests/unit/test_transformer.py +++ b/tests/unit/test_transformer.py @@ -249,7 +249,7 @@ def test_transformer_init_optional_params(sagemaker_session): accept = "text/csv" max_concurrent_transforms = 100 max_payload = 100 - tags = {"Key": "foo", "Value": "bar"} + tags = [{"Key": "foo", "Value": "bar"}] env = {"FOO": "BAR"} transformer = Transformer( @@ -573,7 +573,7 @@ def test_start_new(prepare_data_processing, load_config, sagemaker_session): strategy = "MultiRecord" max_concurrent_transforms = 100 max_payload = 100 - tags = {"Key": "foo", "Value": "bar"} + tags = [{"Key": "foo", "Value": "bar"}] env = {"FOO": "BAR"} transformer = Transformer( From fee50e5adc2c92d087ec8507ab325aa0757a42be Mon Sep 17 00:00:00 2001 From: ruiliann666 <141953824+ruiliann666@users.noreply.github.com> Date: Fri, 22 Dec 2023 13:27:27 -0800 Subject: [PATCH 29/76] Raise Exception for debug (#4344) Co-authored-by: Ruilian Gao --- tests/integ/test_inference_recommender.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/integ/test_inference_recommender.py b/tests/integ/test_inference_recommender.py index ede9aa8b66..b64439f860 100644 --- a/tests/integ/test_inference_recommender.py +++ b/tests/integ/test_inference_recommender.py @@ -165,13 +165,14 @@ def 
advanced_right_sized_model(sagemaker_session, cpu_instance_type): ), model_package_group_name, ) - except Exception: + except Exception as e: sagemaker_session.sagemaker_client.delete_model_package( ModelPackageName=sklearn_model_package.model_package_arn ) sagemaker_session.sagemaker_client.delete_model_package_group( ModelPackageGroupName=model_package_group_name ) + raise e @pytest.fixture(scope="module") From 80858e7d61dd1f6e13dff3775063287b3fac55f6 Mon Sep 17 00:00:00 2001 From: martinRenou Date: Fri, 22 Dec 2023 22:30:00 +0100 Subject: [PATCH 30/76] Change: Allow extra_args to be passed to uploader (#4338) * Change: Allow extra_args to be passed to uploader * Fix tests * Black * Fix test --- src/sagemaker/experiments/_helper.py | 13 +++++++++++-- src/sagemaker/experiments/run.py | 9 +++++++-- tests/unit/sagemaker/experiments/test_helper.py | 2 +- tests/unit/sagemaker/experiments/test_run.py | 16 ++++++++++------ 4 files changed, 29 insertions(+), 11 deletions(-) diff --git a/src/sagemaker/experiments/_helper.py b/src/sagemaker/experiments/_helper.py index c87bc66e42..5c4e0bce6b 100644 --- a/src/sagemaker/experiments/_helper.py +++ b/src/sagemaker/experiments/_helper.py @@ -59,11 +59,15 @@ def __init__( self.artifact_prefix = artifact_prefix self._s3_client = self.sagemaker_session.boto_session.client("s3") - def upload_artifact(self, file_path): + def upload_artifact(self, file_path, extra_args=None): """Upload an artifact file to S3. Args: file_path (str): the file path of the artifact + extra_args (dict): Optional extra arguments that may be passed to the upload operation. + Similar to ExtraArgs parameter in S3 upload_file function. Please refer to the + ExtraArgs parameter documentation here: + https://boto3.amazonaws.com/v1/documentation/api/latest/guide/s3-uploading-files.html#the-extraargs-parameter Returns: (str, str): The s3 URI of the uploaded file and the etag of the file. @@ -91,7 +95,12 @@ def upload_artifact(self, file_path): artifact_s3_key = "{}/{}/{}".format( self.artifact_prefix, self.trial_component_name, artifact_name ) - self._s3_client.upload_file(file_path, self.artifact_bucket, artifact_s3_key) + self._s3_client.upload_file( + file_path, + self.artifact_bucket, + artifact_s3_key, + ExtraArgs=extra_args, + ) etag = self._try_get_etag(artifact_s3_key) return "s3://{}/{}".format(self.artifact_bucket, artifact_s3_key), etag diff --git a/src/sagemaker/experiments/run.py b/src/sagemaker/experiments/run.py index 6068880844..fdd8f9cdbc 100644 --- a/src/sagemaker/experiments/run.py +++ b/src/sagemaker/experiments/run.py @@ -508,7 +508,8 @@ def log_file( file_path: str, name: Optional[str] = None, media_type: Optional[str] = None, - is_output: bool = True, + is_output: Optional[bool] = True, + extra_args: Optional[dict] = None, ): """Upload a file to s3 and store it as an input/output artifact in this run. @@ -521,11 +522,15 @@ def log_file( is_output (bool): Determines direction of association to the run. Defaults to True (output artifact). If set to False then represented as input association. + extra_args (dict): Optional extra arguments that may be passed to the upload operation. + Similar to ExtraArgs parameter in S3 upload_file function. 
Please refer to the + ExtraArgs parameter documentation here: + https://boto3.amazonaws.com/v1/documentation/api/latest/guide/s3-uploading-files.html#the-extraargs-parameter """ self._verify_trial_component_artifacts_length(is_output) media_type = media_type or guess_media_type(file_path) name = name or resolve_artifact_name(file_path) - s3_uri, _ = self._artifact_uploader.upload_artifact(file_path) + s3_uri, _ = self._artifact_uploader.upload_artifact(file_path, extra_args=extra_args) if is_output: self._trial_component.output_artifacts[name] = TrialComponentArtifact( value=s3_uri, media_type=media_type diff --git a/tests/unit/sagemaker/experiments/test_helper.py b/tests/unit/sagemaker/experiments/test_helper.py index 7fb49d4feb..314f798f5f 100644 --- a/tests/unit/sagemaker/experiments/test_helper.py +++ b/tests/unit/sagemaker/experiments/test_helper.py @@ -171,7 +171,7 @@ def test_artifact_uploader_upload_artifact(tempdir, artifact_uploader): ) artifact_uploader._s3_client.upload_file.assert_called_with( - path, artifact_uploader.artifact_bucket, expected_key + path, artifact_uploader.artifact_bucket, expected_key, ExtraArgs=None ) expected_uri = "s3://{}/{}".format(artifact_uploader.artifact_bucket, expected_key) diff --git a/tests/unit/sagemaker/experiments/test_run.py b/tests/unit/sagemaker/experiments/test_run.py index 35b1ac42ec..68326a19af 100644 --- a/tests/unit/sagemaker/experiments/test_run.py +++ b/tests/unit/sagemaker/experiments/test_run.py @@ -592,11 +592,11 @@ def test_log_output_artifact(run_obj): run_obj._artifact_uploader.upload_artifact.return_value = ("s3uri_value", "etag_value") with run_obj: run_obj.log_file("foo.txt", "name", "whizz/bang") - run_obj._artifact_uploader.upload_artifact.assert_called_with("foo.txt") + run_obj._artifact_uploader.upload_artifact.assert_called_with("foo.txt", extra_args=None) assert "whizz/bang" == run_obj._trial_component.output_artifacts["name"].media_type run_obj.log_file("foo.txt") - run_obj._artifact_uploader.upload_artifact.assert_called_with("foo.txt") + run_obj._artifact_uploader.upload_artifact.assert_called_with("foo.txt", extra_args=None) assert "foo.txt" in run_obj._trial_component.output_artifacts assert "text/plain" == run_obj._trial_component.output_artifacts["foo.txt"].media_type @@ -611,11 +611,11 @@ def test_log_input_artifact(run_obj): run_obj._artifact_uploader.upload_artifact.return_value = ("s3uri_value", "etag_value") with run_obj: run_obj.log_file("foo.txt", "name", "whizz/bang", is_output=False) - run_obj._artifact_uploader.upload_artifact.assert_called_with("foo.txt") + run_obj._artifact_uploader.upload_artifact.assert_called_with("foo.txt", extra_args=None) assert "whizz/bang" == run_obj._trial_component.input_artifacts["name"].media_type run_obj.log_file("foo.txt", is_output=False) - run_obj._artifact_uploader.upload_artifact.assert_called_with("foo.txt") + run_obj._artifact_uploader.upload_artifact.assert_called_with("foo.txt", extra_args=None) assert "foo.txt" in run_obj._trial_component.input_artifacts assert "text/plain" == run_obj._trial_component.input_artifacts["foo.txt"].media_type @@ -655,7 +655,9 @@ def test_log_multiple_input_artifacts(run_obj): run_obj.log_file( file_path, "name" + str(index), "whizz/bang" + str(index), is_output=False ) - run_obj._artifact_uploader.upload_artifact.assert_called_with(file_path) + run_obj._artifact_uploader.upload_artifact.assert_called_with( + file_path, extra_args=None + ) run_obj._artifact_uploader.upload_artifact.return_value = ( "s3uri_value", @@ -680,7 +682,9 
@@ def test_log_multiple_output_artifacts(run_obj): "etag_value" + str(index), ) run_obj.log_file(file_path, "name" + str(index), "whizz/bang" + str(index)) - run_obj._artifact_uploader.upload_artifact.assert_called_with(file_path) + run_obj._artifact_uploader.upload_artifact.assert_called_with( + file_path, extra_args=None + ) run_obj._artifact_uploader.upload_artifact.return_value = ( "s3uri_value", From db3e28c5c88ee71a9abb6eac970561e61f9dd327 Mon Sep 17 00:00:00 2001 From: martinRenou Date: Sat, 23 Dec 2023 18:34:07 +0100 Subject: [PATCH 31/76] Change: Drop py2 tag from the wheel as we don't support Python 2 (#4343) --- setup.cfg | 3 --- 1 file changed, 3 deletions(-) diff --git a/setup.cfg b/setup.cfg index bf718e48ac..80eaced105 100644 --- a/setup.cfg +++ b/setup.cfg @@ -10,6 +10,3 @@ test=pytest [metadata] description_file = README.rst license_files = LICENSE.txt - -[wheel] -universal = 1 From c72796c09fe94894d0c11e04bbcabe86a79c6e17 Mon Sep 17 00:00:00 2001 From: ruiliann666 <141953824+ruiliann666@users.noreply.github.com> Date: Sat, 23 Dec 2023 09:40:08 -0800 Subject: [PATCH 32/76] Disable failed test in IR (#4345) * Disable failed test in IR * Fix format --------- Co-authored-by: Ruilian Gao --- tests/integ/test_inference_recommender.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/integ/test_inference_recommender.py b/tests/integ/test_inference_recommender.py index b64439f860..4337ceb5da 100644 --- a/tests/integ/test_inference_recommender.py +++ b/tests/integ/test_inference_recommender.py @@ -420,6 +420,7 @@ def test_advanced_right_size_and_deploy_unregistered_model_sklearn( predictor.delete_endpoint() +@pytest.mark.skip(reason="Skipping this test class for now") @pytest.mark.slow_test @pytest.mark.flaky(reruns=3, reruns_delay=2) def test_advanced_right_size_and_deploy_registered_model_sklearn( From dc2235343613e6d1684cab8217a8c76531000f5f Mon Sep 17 00:00:00 2001 From: sagemaker-bot Date: Mon, 25 Dec 2023 14:17:33 +0000 Subject: [PATCH 33/76] change: update image_uri_configs 12-25-2023 06:17:33 PST --- src/sagemaker/image_uri_config/model-monitor.json | 1 + 1 file changed, 1 insertion(+) diff --git a/src/sagemaker/image_uri_config/model-monitor.json b/src/sagemaker/image_uri_config/model-monitor.json index 744f4d5acb..f0d50acf7a 100644 --- a/src/sagemaker/image_uri_config/model-monitor.json +++ b/src/sagemaker/image_uri_config/model-monitor.json @@ -20,6 +20,7 @@ "eu-central-1": "048819808253", "eu-north-1": "895015795356", "eu-south-1": "933208885752", + "eu-south-2": "437450045455", "eu-west-1": "468650794304", "eu-west-2": "749857270468", "eu-west-3": "680080141114", From eb5e97eedb00b5f95e6370d0408d621b6fcce110 Mon Sep 17 00:00:00 2001 From: ananth102 Date: Tue, 26 Dec 2023 11:40:16 -0800 Subject: [PATCH 34/76] feat: Supporting tbac in load_run (#4039) --- src/sagemaker/experiments/run.py | 9 +- tests/unit/sagemaker/experiments/test_run.py | 88 +++++++++++++------- 2 files changed, 67 insertions(+), 30 deletions(-) diff --git a/src/sagemaker/experiments/run.py b/src/sagemaker/experiments/run.py index fdd8f9cdbc..8fbffe3667 100644 --- a/src/sagemaker/experiments/run.py +++ b/src/sagemaker/experiments/run.py @@ -210,7 +210,9 @@ def __init__( ) if not _TrialComponent._trial_component_is_associated_to_trial( - self._trial_component.trial_component_name, self._trial.trial_name, sagemaker_session + self._trial_component.trial_component_name, + self._trial.trial_name, + sagemaker_session, ): self._trial.add_trial_component(self._trial_component) @@ -781,6 +783,7 @@ def 
load_run( sagemaker_session: Optional["Session"] = None, artifact_bucket: Optional[str] = None, artifact_prefix: Optional[str] = None, + tags: Optional[List[Dict[str, str]]] = None, ) -> Run: """Load an existing run. @@ -849,6 +852,8 @@ def load_run( will be used. artifact_prefix (str): The S3 key prefix used to generate the S3 path to upload the artifact to (default: "trial-component-artifacts"). + tags (List[Dict[str, str]]): A list of tags to be used for all create calls, + e.g. to create an experiment, a run group, etc. (default: None). Returns: Run: The loaded Run object. @@ -870,6 +875,7 @@ def load_run( sagemaker_session=sagemaker_session or _utils.default_session(), artifact_bucket=artifact_bucket, artifact_prefix=artifact_prefix, + tags=tags, ) elif _RunContext.get_current_run(): run_instance = _RunContext.get_current_run() @@ -889,6 +895,7 @@ def load_run( sagemaker_session=sagemaker_session or _utils.default_session(), artifact_bucket=artifact_bucket, artifact_prefix=artifact_prefix, + tags=tags, ) else: raise RuntimeError( diff --git a/tests/unit/sagemaker/experiments/test_run.py b/tests/unit/sagemaker/experiments/test_run.py index 68326a19af..2bebbe3d9c 100644 --- a/tests/unit/sagemaker/experiments/test_run.py +++ b/tests/unit/sagemaker/experiments/test_run.py @@ -55,6 +55,7 @@ TEST_RUN_DISPLAY_NAME, TEST_ARTIFACT_BUCKET, TEST_ARTIFACT_PREFIX, + TEST_TAGS, ) @@ -155,24 +156,22 @@ def test_run_init_name_length_exceed_limit(sagemaker_session): @pytest.mark.parametrize( - ("kwargs", "expected_artifact_bucket", "expected_artifact_prefix"), + ("kwargs", "expected_artifact_bucket", "expected_artifact_prefix", "expected_tags"), [ - ({}, None, _DEFAULT_ARTIFACT_PREFIX), + ({}, None, _DEFAULT_ARTIFACT_PREFIX, None), ( { "artifact_bucket": TEST_ARTIFACT_BUCKET, "artifact_prefix": TEST_ARTIFACT_PREFIX, + "tags": TEST_TAGS, }, TEST_ARTIFACT_BUCKET, TEST_ARTIFACT_PREFIX, + TEST_TAGS, ), ], ) @patch.object(_TrialComponent, "save", MagicMock(return_value=None)) -@patch( - "sagemaker.experiments.run.Experiment._load_or_create", - MagicMock(return_value=Experiment(experiment_name=TEST_EXP_NAME)), -) @patch( "sagemaker.experiments.run._Trial._load_or_create", MagicMock(side_effect=mock_trial_load_or_create_func), @@ -189,6 +188,7 @@ def test_run_load_no_run_name_and_in_train_job( kwargs, expected_artifact_bucket, expected_artifact_prefix, + expected_tags, ): client = sagemaker_session.sagemaker_client job_name = "my-train-job" @@ -213,26 +213,32 @@ def test_run_load_no_run_name_and_in_train_job( { "TrialComponent": { "Parents": [ - {"ExperimentName": TEST_EXP_NAME, "TrialName": exp_config[TRIAL_NAME]} + { + "ExperimentName": TEST_EXP_NAME, + "TrialName": exp_config[TRIAL_NAME], + } ], "TrialComponentName": expected_tc_name, } } ] } - with load_run(sagemaker_session=sagemaker_session, **kwargs) as run_obj: - assert run_obj._in_load - assert not run_obj._inside_init_context - assert run_obj._inside_load_context - assert run_obj.run_name == TEST_RUN_NAME - assert run_obj._trial_component.trial_component_name == expected_tc_name - assert run_obj.run_group_name == Run._generate_trial_name(TEST_EXP_NAME) - assert run_obj._trial - assert run_obj.experiment_name == TEST_EXP_NAME - assert run_obj._experiment - assert run_obj.experiment_config == exp_config - assert run_obj._artifact_uploader.artifact_bucket == expected_artifact_bucket - assert run_obj._artifact_uploader.artifact_prefix == expected_artifact_prefix + expmock = MagicMock(return_value=Experiment(experiment_name=TEST_EXP_NAME, 
tags=expected_tags)) + with patch("sagemaker.experiments.run.Experiment._load_or_create", expmock): + with load_run(sagemaker_session=sagemaker_session, **kwargs) as run_obj: + assert run_obj._in_load + assert not run_obj._inside_init_context + assert run_obj._inside_load_context + assert run_obj.run_name == TEST_RUN_NAME + assert run_obj._trial_component.trial_component_name == expected_tc_name + assert run_obj.run_group_name == Run._generate_trial_name(TEST_EXP_NAME) + assert run_obj._trial + assert run_obj.experiment_name == TEST_EXP_NAME + assert run_obj._experiment + assert run_obj.experiment_config == exp_config + assert run_obj._artifact_uploader.artifact_bucket == expected_artifact_bucket + assert run_obj._artifact_uploader.artifact_prefix == expected_artifact_prefix + assert run_obj._experiment.tags == expected_tags client.describe_training_job.assert_called_once_with(TrainingJobName=job_name) run_obj._trial.add_trial_component.assert_not_called() @@ -265,7 +271,9 @@ def test_run_load_no_run_name_and_not_in_train_job(run_obj, sagemaker_session): assert run_obj == run -def test_run_load_no_run_name_and_not_in_train_job_but_no_obj_in_context(sagemaker_session): +def test_run_load_no_run_name_and_not_in_train_job_but_no_obj_in_context( + sagemaker_session, +): with pytest.raises(RuntimeError) as err: with load_run(sagemaker_session=sagemaker_session): pass @@ -388,7 +396,10 @@ def test_run_load_in_sm_processing_job(mock_run_env, sagemaker_session): { "TrialComponent": { "Parents": [ - {"ExperimentName": TEST_EXP_NAME, "TrialName": exp_config[TRIAL_NAME]} + { + "ExperimentName": TEST_EXP_NAME, + "TrialName": exp_config[TRIAL_NAME], + } ], "TrialComponentName": expected_tc_name, } @@ -442,7 +453,10 @@ def test_run_load_in_sm_transform_job(mock_run_env, sagemaker_session): { "TrialComponent": { "Parents": [ - {"ExperimentName": TEST_EXP_NAME, "TrialName": exp_config[TRIAL_NAME]} + { + "ExperimentName": TEST_EXP_NAME, + "TrialName": exp_config[TRIAL_NAME], + } ], "TrialComponentName": expected_tc_name, } @@ -589,7 +603,10 @@ def test_log_output_artifact_outside_run_context(run_obj): def test_log_output_artifact(run_obj): - run_obj._artifact_uploader.upload_artifact.return_value = ("s3uri_value", "etag_value") + run_obj._artifact_uploader.upload_artifact.return_value = ( + "s3uri_value", + "etag_value", + ) with run_obj: run_obj.log_file("foo.txt", "name", "whizz/bang") run_obj._artifact_uploader.upload_artifact.assert_called_with("foo.txt", extra_args=None) @@ -608,7 +625,10 @@ def test_log_input_artifact_outside_run_context(run_obj): def test_log_input_artifact(run_obj): - run_obj._artifact_uploader.upload_artifact.return_value = ("s3uri_value", "etag_value") + run_obj._artifact_uploader.upload_artifact.return_value = ( + "s3uri_value", + "etag_value", + ) with run_obj: run_obj.log_file("foo.txt", "name", "whizz/bang", is_output=False) run_obj._artifact_uploader.upload_artifact.assert_called_with("foo.txt", extra_args=None) @@ -653,7 +673,10 @@ def test_log_multiple_input_artifacts(run_obj): "etag_value" + str(index), ) run_obj.log_file( - file_path, "name" + str(index), "whizz/bang" + str(index), is_output=False + file_path, + "name" + str(index), + "whizz/bang" + str(index), + is_output=False, ) run_obj._artifact_uploader.upload_artifact.assert_called_with( file_path, extra_args=None @@ -757,7 +780,12 @@ def test_log_precision_recall_invalid_input(run_obj): with run_obj: with pytest.raises(ValueError) as error: run_obj.log_precision_recall( - y_true, y_scores, 0, 
title="TestPrecisionRecall", no_skill=no_skill, is_output=False + y_true, + y_scores, + 0, + title="TestPrecisionRecall", + no_skill=no_skill, + is_output=False, ) assert "Lengths mismatch between true labels and predicted probabilities" in str(error) @@ -905,7 +933,8 @@ def test_list(mock_tc_search, mock_tc_list, mock_tc_load, run_obj, sagemaker_ses display_name="C" + str(i), source_arn="D" + str(i), status=TrialComponentStatus( - primary_status=_TrialComponentStatusType.InProgress.value, message="E" + str(i) + primary_status=_TrialComponentStatusType.InProgress.value, + message="E" + str(i), ), start_time=start_time + datetime.timedelta(hours=i), end_time=end_time + datetime.timedelta(hours=i), @@ -925,7 +954,8 @@ def test_list(mock_tc_search, mock_tc_list, mock_tc_load, run_obj, sagemaker_ses display_name="C" + str(i), source_arn="D" + str(i), status=TrialComponentStatus( - primary_status=_TrialComponentStatusType.InProgress.value, message="E" + str(i) + primary_status=_TrialComponentStatusType.InProgress.value, + message="E" + str(i), ), start_time=start_time + datetime.timedelta(hours=i), end_time=end_time + datetime.timedelta(hours=i), From 485fe78a18b05908dc6267f957fbe194bfff23ad Mon Sep 17 00:00:00 2001 From: Mufaddal Rohawala <89424143+mufaddal-rohawala@users.noreply.github.com> Date: Thu, 28 Dec 2023 15:53:38 -0800 Subject: [PATCH 35/76] feature: support local mode in SageMaker Studio (#1300) (#4347) * feature: support local mode in SageMaker Studio * chore: fix typo * chore: fix formatting * chore: revert changes for docker compose logs * chore: black-format * change: Use predtermined dns-allow-listed-hostname for Studio Local Support * add support for CodeEditor and JupyterLabs --------- Co-authored-by: Erick Benitez-Ramos <141277478+benieric@users.noreply.github.com> Co-authored-by: Mufaddal Rohawala --- src/sagemaker/local/image.py | 81 ++++--- src/sagemaker/local/utils.py | 28 +++ .../unit/sagemaker/local/test_local_image.py | 207 +++++++++++++++++- .../unit/sagemaker/local/test_local_utils.py | 27 ++- 4 files changed, 313 insertions(+), 30 deletions(-) diff --git a/src/sagemaker/local/image.py b/src/sagemaker/local/image.py index 60f28d3b0c..f38bc1fbe5 100644 --- a/src/sagemaker/local/image.py +++ b/src/sagemaker/local/image.py @@ -42,6 +42,7 @@ import sagemaker.utils CONTAINER_PREFIX = "algo" +STUDIO_HOST_NAME = "sagemaker-local" DOCKER_COMPOSE_FILENAME = "docker-compose.yaml" DOCKER_COMPOSE_HTTP_TIMEOUT_ENV = "COMPOSE_HTTP_TIMEOUT" DOCKER_COMPOSE_HTTP_TIMEOUT = "120" @@ -50,6 +51,7 @@ REGION_ENV_NAME = "AWS_REGION" TRAINING_JOB_NAME_ENV_NAME = "TRAINING_JOB_NAME" S3_ENDPOINT_URL_ENV_NAME = "S3_ENDPOINT_URL" +SM_STUDIO_LOCAL_MODE = "SM_STUDIO_LOCAL_MODE" # SELinux Enabled SELINUX_ENABLED = os.environ.get("SAGEMAKER_LOCAL_SELINUX_ENABLED", "False").lower() in [ @@ -107,10 +109,30 @@ def __init__( # Since we are using a single docker network, Generate a random suffix to attach to the # container names. This way multiple jobs can run in parallel. suffix = "".join(random.choice(string.ascii_lowercase + string.digits) for _ in range(5)) - self.hosts = [ - "{}-{}-{}".format(CONTAINER_PREFIX, i, suffix) - for i in range(1, self.instance_count + 1) - ] + self.is_studio = sagemaker.local.utils.check_for_studio() + if self.is_studio: + if self.instance_count > 1: + raise NotImplementedError( + "Multi instance Local Mode execution is " + "currently not supported in SageMaker Studio." 
+ ) + # For studio use-case, directories need to be created in `~/tmp`, rather than /tmp + home = os.path.expanduser("~") + root_dir = os.path.join(home, "tmp") + if not os.path.isdir(root_dir): + os.mkdir(root_dir) + if self.sagemaker_session.config: + self.sagemaker_session.config["local"]["container_root"] = root_dir + else: + self.sagemaker_session.config = {"local": {"container_root": root_dir}} + # Studio only supports single instance run + self.hosts = [STUDIO_HOST_NAME] + else: + self.hosts = [ + "{}-{}-{}".format(CONTAINER_PREFIX, i, suffix) + for i in range(1, self.instance_count + 1) + ] + self.container_root = None self.container = None @@ -201,22 +223,17 @@ def process( self._generate_compose_file( "process", additional_volumes=volumes, additional_env_vars=environment ) - compose_command = self._compose() if _ecr_login_if_needed(self.sagemaker_session.boto_session, self.image): _pull_image(self.image) + compose_command = self._compose() process = subprocess.Popen( compose_command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT ) try: _stream_output(process) - except RuntimeError as e: - # _stream_output() doesn't have the command line. We will handle the exception - # which contains the exit code and append the command line to it. - msg = f"Failed to run: {compose_command}" - raise RuntimeError(msg) from e finally: # Uploading processing outputs back to Amazon S3. self._upload_processing_outputs(data_dir, processing_output_config) @@ -283,22 +300,17 @@ def train(self, input_data_config, output_data_config, hyperparameters, environm compose_data = self._generate_compose_file( "train", additional_volumes=volumes, additional_env_vars=training_env_vars ) - compose_command = self._compose() if _ecr_login_if_needed(self.sagemaker_session.boto_session, self.image): _pull_image(self.image) + compose_command = self._compose() process = subprocess.Popen( compose_command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT ) try: _stream_output(process) - except RuntimeError as e: - # _stream_output() doesn't have the command line. We will handle the exception - # which contains the exit code and append the command line to it. 
- msg = "Failed to run: %s, %s" % (compose_command, str(e)) - raise RuntimeError(msg) finally: artifacts = self.retrieve_artifacts(compose_data, output_data_config, job_name) @@ -347,6 +359,7 @@ def serve(self, model_dir, environment): self._generate_compose_file( "serve", additional_env_vars=environment, additional_volumes=volumes ) + compose_command = self._compose() self.container = _HostingContainer(compose_command) @@ -710,6 +723,9 @@ def _generate_compose_file(self, command, additional_volumes=None, additional_en additional_env_var_list = ["{}={}".format(k, v) for k, v in additional_env_vars.items()] environment.extend(additional_env_var_list) + if self.is_studio: + environment.extend([f"{SM_STUDIO_LOCAL_MODE}=True"]) + if os.environ.get(DOCKER_COMPOSE_HTTP_TIMEOUT_ENV) is None: os.environ[DOCKER_COMPOSE_HTTP_TIMEOUT_ENV] = DOCKER_COMPOSE_HTTP_TIMEOUT @@ -723,12 +739,19 @@ def _generate_compose_file(self, command, additional_volumes=None, additional_en for h in self.hosts } - content = { - # Use version 2.3 as a minimum so that we can specify the runtime - "version": "2.3", - "services": services, - "networks": {"sagemaker-local": {"name": "sagemaker-local"}}, - } + if self.is_studio: + content = { + # Use version 2.3 as a minimum so that we can specify the runtime + "version": "2.3", + "services": services, + } + else: + content = { + # Use version 2.3 as a minimum so that we can specify the runtime + "version": "2.3", + "services": services, + "networks": {"sagemaker-local": {"name": "sagemaker-local"}}, + } docker_compose_path = os.path.join(self.container_root, DOCKER_COMPOSE_FILENAME) @@ -810,7 +833,6 @@ def _create_docker_host( "tty": True, "volumes": [v.map for v in optml_volumes], "environment": environment, - "networks": {"sagemaker-local": {"aliases": [host]}}, } is_train_with_entrypoint = False @@ -827,6 +849,11 @@ def _create_docker_host( if self.container_arguments: host_config["entrypoint"] = host_config["entrypoint"] + self.container_arguments + if self.is_studio: + host_config["network_mode"] = "sagemaker" + else: + host_config["networks"] = {"sagemaker-local": {"aliases": [host]}} + # for GPU support pass in nvidia as the runtime, this is equivalent # to setting --runtime=nvidia in the docker commandline. if self.instance_type == "local_gpu": @@ -834,7 +861,7 @@ def _create_docker_host( "resources": {"reservations": {"devices": [{"capabilities": ["gpu"]}]}} } - if command == "serve": + if not self.is_studio and command == "serve": serving_port = ( sagemaker.utils.get_config_value( "local.serving_port", self.sagemaker_session.config @@ -910,7 +937,7 @@ def __init__(self, command): """Creates a new threaded hosting container. Args: - command: + command (dict): docker compose command """ Thread.__init__(self) self.command = command @@ -987,8 +1014,8 @@ def _stream_output(process): sys.stdout.write(stdout) exit_code = process.poll() - if exit_code != 0: - raise RuntimeError("Process exited with code: %s" % exit_code) + if exit_code not in [0, 130]: + raise RuntimeError(f"Failed to run: {process.args}. 
Process exited with code: {exit_code}") return exit_code diff --git a/src/sagemaker/local/utils.py b/src/sagemaker/local/utils.py index 16375de7d4..950d0974db 100644 --- a/src/sagemaker/local/utils.py +++ b/src/sagemaker/local/utils.py @@ -28,6 +28,8 @@ logger = logging.getLogger(__name__) +STUDIO_APP_TYPES = ["KernelGateway", "CodeEditor", "JupyterLab"] + def copy_directory_structure(destination_directory, relative_path): """Creates intermediate directory structure for relative_path. @@ -216,3 +218,29 @@ def get_using_dot_notation(dictionary, keys): return get_using_dot_notation(inner_dict, rest) except (KeyError, IndexError, TypeError): raise ValueError(f"{keys} does not exist in input dictionary.") + + +def check_for_studio(): + """Helper function to determine if the run environment is studio. + + Returns (bool): Returns True if valid Studio request. + + Raises: + NotImplementedError: + if run environment = Studio and AppType not in STUDIO_APP_TYPES + """ + is_studio = False + if os.path.exists("/opt/ml/metadata/resource-metadata.json"): + with open("/opt/ml/metadata/resource-metadata.json", "r") as handle: + metadata = json.load(handle) + app_type = metadata.get("AppType") + if app_type: + # check if the execution is triggered from Studio KernelGateway App + if app_type in STUDIO_APP_TYPES: + is_studio = True + else: + raise NotImplementedError( + f"AppType {app_type} in Studio does not support Local Mode." + ) + # if no apptype, case of classic notebooks + return is_studio diff --git a/tests/unit/sagemaker/local/test_local_image.py b/tests/unit/sagemaker/local/test_local_image.py index 22a565b306..ebca91a9f9 100644 --- a/tests/unit/sagemaker/local/test_local_image.py +++ b/tests/unit/sagemaker/local/test_local_image.py @@ -27,9 +27,10 @@ import pytest import yaml -from mock import patch, Mock, MagicMock +from mock import patch, Mock, MagicMock, mock_open, call import sagemaker from sagemaker.local.image import _SageMakerContainer, _Volume, _aws_credentials +from sagemaker.local.utils import STUDIO_APP_TYPES REGION = "us-west-2" BUCKET_NAME = "mybucket" @@ -75,6 +76,22 @@ ENVIRONMENT = {"MYVAR": "HELLO_WORLD"} +LOCAL_STUDIO_METADATA_BASE = '{{"AppType":"{app_type}","DomainId":"d-1234567890","UserProfileName": \ + "dummy-profile","ResourceArn":"arn:aws:sagemaker:us-west-2:123456789012:app/arn", \ + "ResourceName":"datascience-1-0-ml-t3-medium-1234567890"}}' + +LOCAL_STUDIO_METADATA_WITH_SPACE = '{"AppType":"KernelGateway","DomainId":"d-1234567890","SpaceName": \ + "dummy-space","ResourceArn":"arn:aws:sagemaker:us-west-2:123456789012:app/arn", \ + "ResourceName":"datascience-1-0-ml-t3-medium-1234567890"}' + +DUMMY_APPTYPE_METADATA = '{"AppType":"DUMMY"}' + +LOCAL_STUDIO_INCOMPLETE_METADATA = '{"AppType":"KernelGateway"}' + +CLASSIC_METADATA = '{"ResourceArn": \ + "arn:aws:sagemaker:us-west-2:616250812882:notebook-instance/test", \ + "ResourceName": "test"}' + @pytest.fixture() def sagemaker_session(): @@ -90,6 +107,49 @@ def sagemaker_session(): return sms +@patch("os.path.exists", return_value=True) +def test_check_for_studio(patch_os_exists, sagemaker_session): + for app_type in STUDIO_APP_TYPES: + metadata = LOCAL_STUDIO_METADATA_BASE.format(app_type=app_type) + print(metadata) + with patch("sagemaker.local.utils.open", mock_open(read_data=metadata)): + with pytest.raises( + NotImplementedError, + match="Multi instance Local Mode execution is currently not supported in SageMaker Studio.", + ): + _SageMakerContainer("local", 2, "my-image", sagemaker_session=sagemaker_session) + + 
sagemaker_container = _SageMakerContainer( + "local", 1, "my-image", sagemaker_session=sagemaker_session + ) + assert sagemaker_container.is_studio + + with patch("sagemaker.local.utils.open", mock_open(read_data=LOCAL_STUDIO_METADATA_WITH_SPACE)): + with pytest.raises( + NotImplementedError, + match="Multi instance Local Mode execution is currently not supported in SageMaker Studio.", + ): + _SageMakerContainer("local", 2, "my-image", sagemaker_session=sagemaker_session) + + sagemaker_container = _SageMakerContainer( + "local", 1, "my-image", sagemaker_session=sagemaker_session + ) + assert sagemaker_container.is_studio + + with patch("sagemaker.local.utils.open", mock_open(read_data=CLASSIC_METADATA)): + sagemaker_container = _SageMakerContainer( + "local", 1, "my-image", sagemaker_session=sagemaker_session + ) + assert not sagemaker_container.is_studio + + with patch("sagemaker.local.utils.open", mock_open(read_data=DUMMY_APPTYPE_METADATA)): + with pytest.raises( + NotImplementedError, + match="AppType DUMMY in Studio does not support Local Mode.", + ): + _SageMakerContainer("local", 2, "my-image", sagemaker_session=sagemaker_session) + + @patch("subprocess.check_output", Mock(return_value="Docker Compose version v2.0.0-rc.3")) def test_get_compose_cmd_prefix_with_docker_cli(): compose_cmd_prefix = _SageMakerContainer._get_compose_cmd_prefix() @@ -432,6 +492,87 @@ def test_train( assert "[Masked]" in caplog.text +@patch("sagemaker.local.local_session.LocalSession", Mock()) +@patch("sagemaker.local.image._stream_output", Mock()) +@patch("sagemaker.local.image._SageMakerContainer._cleanup") +@patch("sagemaker.local.image._SageMakerContainer.retrieve_artifacts") +@patch( + "sagemaker.local.image._SageMakerContainer._get_compose_cmd_prefix", + Mock(return_value=["docker-compose"]), +) +@patch("sagemaker.local.data.get_data_source_instance") +@patch("subprocess.Popen") +def test_train_for_studio( + popen, get_data_source_instance, retrieve_artifacts, cleanup, tmpdir, sagemaker_session, caplog +): + data_source = Mock() + data_source.get_root_dir.return_value = "foo" + get_data_source_instance.return_value = data_source + + caplog.set_level(logging.INFO) + + directories = [str(tmpdir.mkdir("container-root")), str(tmpdir.mkdir("data"))] + with patch( + "sagemaker.local.image._SageMakerContainer._create_tmp_folder", side_effect=directories + ): + instance_count = 1 + image = "my-image" + metadata = LOCAL_STUDIO_METADATA_BASE.format(app_type="KernelGateway") + with patch("sagemaker.local.utils.open", mock_open(read_data=metadata)): + with patch("os.path.exists", return_value=True): + sagemaker_container = _SageMakerContainer( + "local", instance_count, image, sagemaker_session=sagemaker_session + ) + + sagemaker_container.train( + INPUT_DATA_CONFIG, + OUTPUT_DATA_CONFIG, + HYPERPARAMETERS, + ENVIRONMENT, + TRAINING_JOB_NAME, + ) + + docker_compose_file = os.path.join( + sagemaker_container.container_root, "docker-compose.yaml" + ) + + expected_up_cmd = [ + "docker-compose", + "-f", + docker_compose_file, + "up", + "--build", + "--abort-on-container-exit", + ] + + popen.assert_has_calls( + [ + call(expected_up_cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT), + ] + ) + + with open(docker_compose_file, "r") as f: + config = yaml.load(f, Loader=yaml.SafeLoader) + assert len(config["services"]) == instance_count + for h in sagemaker_container.hosts: + assert config["services"][h]["image"] == image + assert config["services"][h]["command"] == "train" + assert ( + 
"TRAINING_JOB_NAME={}".format(TRAINING_JOB_NAME) + in config["services"][h]["environment"] + ) + assert "SM_STUDIO_LOCAL_MODE=True" in config["services"][h]["environment"] + assert config["services"][h]["network_mode"] == "sagemaker" + + # assert that expected by sagemaker container output directories exist + assert os.path.exists(os.path.join(sagemaker_container.container_root, "output")) + assert os.path.exists(os.path.join(sagemaker_container.container_root, "output/data")) + + retrieve_artifacts.assert_called_once() + cleanup.assert_called_once() + assert "[Masked]" in caplog.text + + @patch("sagemaker.local.local_session.LocalSession", Mock()) @patch("sagemaker.local.image._stream_output", Mock()) @patch("sagemaker.local.image._SageMakerContainer._cleanup") @@ -563,7 +704,12 @@ def test_train_with_hyperparameters_without_job_name( @patch("sagemaker.local.data.get_data_source_instance") @patch("subprocess.Popen", Mock()) def test_train_error( - get_data_source_instance, retrieve_artifacts, cleanup, _stream_output, tmpdir, sagemaker_session + get_data_source_instance, + retrieve_artifacts, + cleanup, + _stream_output, + tmpdir, + sagemaker_session, ): data_source = Mock() data_source.get_root_dir.return_value = "foo" @@ -787,6 +933,63 @@ def test_serve(tmpdir, sagemaker_session, caplog): assert "[Masked]" in caplog.text +@patch("sagemaker.local.image._stream_output", Mock()) +@patch("sagemaker.local.image._SageMakerContainer._prepare_serving_volumes", Mock(return_value=[])) +@patch("shutil.copy", Mock()) +@patch("shutil.copytree", Mock()) +@patch( + "sagemaker.local.image._SageMakerContainer._get_compose_cmd_prefix", + Mock(return_value=["docker-compose"]), +) +@patch("subprocess.Popen") +def test_serve_for_studio(popen, tmpdir, sagemaker_session, caplog): + caplog.set_level(logging.INFO) + with patch( + "sagemaker.local.image._SageMakerContainer._create_tmp_folder", + return_value=str(tmpdir.mkdir("container-root")), + ): + instance_count = 1 + image = "my-image" + metadata = LOCAL_STUDIO_METADATA_BASE.format(app_type="KernelGateway") + with patch("sagemaker.local.utils.open", mock_open(read_data=metadata)): + with patch("os.path.exists", return_value=True): + sagemaker_container = _SageMakerContainer( + "local", instance_count, image, sagemaker_session=sagemaker_session + ) + + environment = {"env1": 1, "env2": "b", "SAGEMAKER_SUBMIT_DIRECTORY": "s3://some/path"} + + sagemaker_container.serve("/some/model/path", environment) + docker_compose_file = os.path.join( + sagemaker_container.container_root, "docker-compose.yaml" + ) + + expected_up_cmd = [ + "docker-compose", + "-f", + docker_compose_file, + "up", + "--build", + "--abort-on-container-exit", + ] + + popen.assert_has_calls( + [ + call(expected_up_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE), + ] + ) + + with open(docker_compose_file, "r") as f: + config = yaml.load(f, Loader=yaml.SafeLoader) + + for h in sagemaker_container.hosts: + assert config["services"][h]["image"] == image + assert config["services"][h]["command"] == "serve" + assert "SM_STUDIO_LOCAL_MODE=True" in config["services"][h]["environment"] + assert config["services"][h]["network_mode"] == "sagemaker" + assert "[Masked]" in caplog.text + + @patch("sagemaker.local.image._HostingContainer.run", Mock()) @patch("sagemaker.local.image._SageMakerContainer._prepare_serving_volumes", Mock(return_value=[])) @patch("shutil.copy", Mock()) diff --git a/tests/unit/sagemaker/local/test_local_utils.py b/tests/unit/sagemaker/local/test_local_utils.py index 
39b9e2b392..42710b2495 100644 --- a/tests/unit/sagemaker/local/test_local_utils.py +++ b/tests/unit/sagemaker/local/test_local_utils.py @@ -15,7 +15,8 @@ import os import errno import pytest -from mock import patch, Mock +import json +from mock import patch, Mock, mock_open import sagemaker.local.utils from sagemaker.session_settings import SessionSettings @@ -198,3 +199,27 @@ def test_move_to_destination_local_root_failure(recursive_copy, mock_rmtree): recursive_copy.assert_called_with( "/tmp/data", os.path.abspath(os.path.join(os.sep, "target", "dir")) ) + + +def test_check_for_studio_with_valid_request(): + metadata = {"AppType": "KernelGateway"} + with patch("builtins.open", mock_open(read_data=json.dumps(metadata))): + with patch("os.path.exists", return_value=True): + is_studio = sagemaker.local.utils.check_for_studio() + assert is_studio is True + + +def test_check_for_studio_with_invalid_request(): + metadata = {"AppType": "DUMMY"} + with patch("builtins.open", mock_open(read_data=json.dumps(metadata))): + with patch("os.path.exists", return_value=True): + with pytest.raises(NotImplementedError): + sagemaker.local.utils.check_for_studio() + + +def test_check_for_studio_without_app_type(): + metadata = {} + with patch("builtins.open", mock_open(read_data=json.dumps(metadata))): + with patch("os.path.exists", return_value=True): + is_studio = sagemaker.local.utils.check_for_studio() + assert is_studio is False From 4a46a636d37ba29083f4bbc9eff9f61263b8ef59 Mon Sep 17 00:00:00 2001 From: ci Date: Thu, 28 Dec 2023 23:57:43 +0000 Subject: [PATCH 36/76] prepare release v2.203.0 --- CHANGELOG.md | 14 ++++++++++++++ VERSION | 2 +- 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ad5477d6db..688bd848f5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,19 @@ # Changelog +## v2.203.0 (2023-12-28) + +### Features + + * support local mode in SageMaker Studio (#1300) + * Supporting tbac in load_run + +### Bug Fixes and Other Changes + + * update image_uri_configs 12-25-2023 06:17:33 PST + * Disable failed test in IR + * Raise Exception for debug + * create role if needed in `get_execution_role` + ## v2.202.1 (2023-12-22) ### Bug Fixes and Other Changes diff --git a/VERSION b/VERSION index 8a14f9bf50..377a571709 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -2.202.2.dev0 +2.203.0 From 48fede14b566bbe308ccfb9661ff307975a6b863 Mon Sep 17 00:00:00 2001 From: ci Date: Thu, 28 Dec 2023 23:57:45 +0000 Subject: [PATCH 37/76] update development version to v2.203.1.dev0 --- VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VERSION b/VERSION index 377a571709..5a385e596d 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -2.203.0 +2.203.1.dev0 From 263734747b8fd18bb53d1887f0c0a14fe7444ea6 Mon Sep 17 00:00:00 2001 From: sagemaker-bot Date: Fri, 29 Dec 2023 14:17:34 +0000 Subject: [PATCH 38/76] change: update image_uri_configs 12-29-2023 06:17:34 PST --- src/sagemaker/image_uri_config/sagemaker-base-python.json | 1 + 1 file changed, 1 insertion(+) diff --git a/src/sagemaker/image_uri_config/sagemaker-base-python.json b/src/sagemaker/image_uri_config/sagemaker-base-python.json index 15c4c78eb2..b1eaccf204 100644 --- a/src/sagemaker/image_uri_config/sagemaker-base-python.json +++ b/src/sagemaker/image_uri_config/sagemaker-base-python.json @@ -17,6 +17,7 @@ "eu-central-1": "936697816551", "eu-north-1": "243637512696", "eu-south-1": "592751261982", + "eu-south-2": "127363102723", "eu-west-1": "470317259841", "eu-west-2": "712779665605", 
"eu-west-3": "615547856133", From 4fecf6a8fa08171ba52ea70e92bd1176dda4e5db Mon Sep 17 00:00:00 2001 From: Gary Wang <38331932+gwang111@users.noreply.github.com> Date: Tue, 2 Jan 2024 13:22:57 -0800 Subject: [PATCH 39/76] query hf api for model md (#4346) Co-authored-by: EC2 Default User --- src/sagemaker/huggingface/llm_utils.py | 46 +++++++++++ .../sagemaker/huggingface/test_llm_utils.py | 76 +++++++++++++++++++ 2 files changed, 122 insertions(+) create mode 100644 tests/unit/sagemaker/huggingface/test_llm_utils.py diff --git a/src/sagemaker/huggingface/llm_utils.py b/src/sagemaker/huggingface/llm_utils.py index aef5e5e585..65befe41b0 100644 --- a/src/sagemaker/huggingface/llm_utils.py +++ b/src/sagemaker/huggingface/llm_utils.py @@ -15,9 +15,16 @@ from typing import Optional +import urllib.request +from urllib.error import HTTPError, URLError +import json +from json import JSONDecodeError +import logging from sagemaker import image_uris from sagemaker.session import Session +logger = logging.getLogger(__name__) + def get_huggingface_llm_image_uri( backend: str, @@ -54,3 +61,42 @@ def get_huggingface_llm_image_uri( version = version or "0.24.0" return image_uris.retrieve(framework="djl-deepspeed", region=region, version=version) raise ValueError("Unsupported backend: %s" % backend) + + +def get_huggingface_model_metadata(model_id: str, hf_hub_token: Optional[str] = None) -> dict: + """Retrieves the json metadata of the HuggingFace Model via HuggingFace API. + + Args: + model_id (str): The HuggingFace Model ID + hf_hub_token (str): The HuggingFace Hub Token needed for Private/Gated HuggingFace Models + + Returns: + dict: The model metadata retrieved with the HuggingFace API + """ + + hf_model_metadata_url = f"https://huggingface.co/api/models/{model_id}" + hf_model_metadata_json = None + try: + if hf_hub_token: + hf_model_metadata_url = urllib.request.Request( + hf_model_metadata_url, None, {"Authorization": "Bearer " + hf_hub_token} + ) + with urllib.request.urlopen(hf_model_metadata_url) as response: + hf_model_metadata_json = json.load(response) + except (HTTPError, URLError, TimeoutError, JSONDecodeError) as e: + if "HTTP Error 401: Unauthorized" in str(e): + raise ValueError( + "Trying to access a gated/private HuggingFace model without valid credentials. " + "Please provide a HUGGING_FACE_HUB_TOKEN in env_vars" + ) + logger.warning( + "Exception encountered while trying to retrieve HuggingFace model metadata %s. " + "Details: %s", + hf_model_metadata_url, + e, + ) + if not hf_model_metadata_json: + raise ValueError( + "Did not find model metadata for the following HuggingFace Model ID %s" % model_id + ) + return hf_model_metadata_json diff --git a/tests/unit/sagemaker/huggingface/test_llm_utils.py b/tests/unit/sagemaker/huggingface/test_llm_utils.py new file mode 100644 index 0000000000..3c4cdde3f6 --- /dev/null +++ b/tests/unit/sagemaker/huggingface/test_llm_utils.py @@ -0,0 +1,76 @@ +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You +# may not use this file except in compliance with the License. A copy of +# the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. This file is +# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific +# language governing permissions and limitations under the License. 
+from __future__ import absolute_import + +from unittest import TestCase +from urllib.error import HTTPError +from unittest.mock import Mock, patch +from sagemaker.huggingface.llm_utils import get_huggingface_model_metadata + +MOCK_HF_ID = "mock_hf_id" +MOCK_HF_HUB_TOKEN = "mock_hf_hub_token" +MOCK_HF_MODEL_METADATA_JSON = {"mock_key": "mock_value"} + + +class LlmUtilsTests(TestCase): + @patch("sagemaker.huggingface.llm_utils.urllib") + @patch("sagemaker.huggingface.llm_utils.json") + def test_huggingface_model_metadata_success(self, mock_json, mock_urllib): + mock_json.load.return_value = MOCK_HF_MODEL_METADATA_JSON + ret_json = get_huggingface_model_metadata(MOCK_HF_ID) + + mock_urllib.request.urlopen.assert_called_once_with( + f"https://huggingface.co/api/models/{MOCK_HF_ID}" + ) + self.assertEqual(ret_json["mock_key"], "mock_value") + + @patch("sagemaker.huggingface.llm_utils.urllib") + @patch("sagemaker.huggingface.llm_utils.json") + def test_huggingface_model_metadata_gated_success(self, mock_json, mock_urllib): + mock_json.load.return_value = MOCK_HF_MODEL_METADATA_JSON + mock_hf_model_metadata_url = Mock() + mock_urllib.request.Request.side_effect = mock_hf_model_metadata_url + + ret_json = get_huggingface_model_metadata(MOCK_HF_ID, MOCK_HF_HUB_TOKEN) + + mock_urllib.request.Request.assert_called_once_with( + f"https://huggingface.co/api/models/{MOCK_HF_ID}", + None, + {"Authorization": "Bearer " + MOCK_HF_HUB_TOKEN}, + ) + self.assertEqual(ret_json["mock_key"], "mock_value") + + @patch("sagemaker.huggingface.llm_utils.urllib") + def test_huggingface_model_metadata_unauthorized_exception(self, mock_urllib): + mock_urllib.request.urlopen.side_effect = HTTPError( + code=401, msg="Unauthorized", url=None, hdrs=None, fp=None + ) + with self.assertRaises(ValueError) as context: + get_huggingface_model_metadata(MOCK_HF_ID) + + expected_error_msg = ( + "Trying to access a gated/private HuggingFace model without valid credentials. 
" + "Please provide a HUGGING_FACE_HUB_TOKEN in env_vars" + ) + self.assertEquals(expected_error_msg, str(context.exception)) + + @patch("sagemaker.huggingface.llm_utils.urllib") + def test_huggingface_model_metadata_general_exception(self, mock_urllib): + mock_urllib.request.urlopen.side_effect = TimeoutError("timed out") + with self.assertRaises(ValueError) as context: + get_huggingface_model_metadata(MOCK_HF_ID) + + expected_error_msg = ( + f"Did not find model metadata for the following HuggingFace Model ID {MOCK_HF_ID}" + ) + self.assertEquals(expected_error_msg, str(context.exception)) From 86e95b12a86406c98fa5ff26463667347953f5e1 Mon Sep 17 00:00:00 2001 From: Mufaddal Rohawala <89424143+mufaddal-rohawala@users.noreply.github.com> Date: Fri, 5 Jan 2024 10:50:19 -0800 Subject: [PATCH 40/76] fix: skip failing integs (#4348) Co-authored-by: Mufaddal Rohawala --- .../workflow/test_notebook_job_step.py | 4 ++++ ...test_inference_component_based_endpoint.py | 19 ++++++------------- 2 files changed, 10 insertions(+), 13 deletions(-) diff --git a/tests/integ/sagemaker/workflow/test_notebook_job_step.py b/tests/integ/sagemaker/workflow/test_notebook_job_step.py index b79b4898b2..c5fbb73e24 100644 --- a/tests/integ/sagemaker/workflow/test_notebook_job_step.py +++ b/tests/integ/sagemaker/workflow/test_notebook_job_step.py @@ -17,6 +17,7 @@ import tarfile import logging import nbformat as nbf +import pytest from sagemaker import get_execution_role from sagemaker.s3 import S3Downloader @@ -125,6 +126,9 @@ def verify_notebook_for_happy_case(cells): logging.error(error) +@pytest.mark.skip( + reason="This test is skipped temporarily due to failures. Need to re-enable later after fix." +) def test_notebook_job_with_more_configuration(sagemaker_session): """This test case is for more complex job configuration. 1. a parent notebook file with %run magic to execute 'subfolder/sub.ipynb' and the diff --git a/tests/integ/test_inference_component_based_endpoint.py b/tests/integ/test_inference_component_based_endpoint.py index ba725f5fa5..f2a2d7bb3b 100644 --- a/tests/integ/test_inference_component_based_endpoint.py +++ b/tests/integ/test_inference_component_based_endpoint.py @@ -15,7 +15,6 @@ import os import sagemaker.predictor import sagemaker.utils -import tests.integ import pytest from sagemaker import image_uris @@ -114,10 +113,8 @@ def xgboost_model(sagemaker_session, resources, model_update_to_name): return xgb_model -@pytest.mark.release -@pytest.mark.skipif( - tests.integ.test_region() not in tests.integ.INFERENCE_COMPONENT_SUPPORTED_REGIONS, - reason="inference component based endpoint is not supported in certain regions", +@pytest.mark.skip( + reason="This test is skipped temporarily due to failures. Need to re-enable later after fix." ) def test_deploy_single_model_with_endpoint_name(tfs_model, resources): endpoint_name = sagemaker.utils.unique_name_from_base("sagemaker-tensorflow-serving") @@ -145,10 +142,8 @@ def test_deploy_single_model_with_endpoint_name(tfs_model, resources): predictor.delete_endpoint() -@pytest.mark.release -@pytest.mark.skipif( - tests.integ.test_region() not in tests.integ.INFERENCE_COMPONENT_SUPPORTED_REGIONS, - reason="inference component based endpoint is not supported in certain regions", +@pytest.mark.skip( + reason="This test is skipped temporarily due to failures. Need to re-enable later after fix." 
) def test_deploy_update_predictor_with_other_model( tfs_model, @@ -206,10 +201,8 @@ def test_deploy_update_predictor_with_other_model( predictor_to_update.delete_endpoint() -@pytest.mark.release -@pytest.mark.skipif( - tests.integ.test_region() not in tests.integ.INFERENCE_COMPONENT_SUPPORTED_REGIONS, - reason="inference component based endpoint is not supported in certain regions", +@pytest.mark.skip( + reason="This test is skipped temporarily due to failures. Need to re-enable later after fix." ) def test_deploy_multi_models_without_endpoint_name(tfs_model, resources): input_data = {"instances": [1.0, 2.0, 5.0]} From 906d61f3ec9020f0b4434471a721e259f099dbe8 Mon Sep 17 00:00:00 2001 From: amzn-choeric <105388439+amzn-choeric@users.noreply.github.com> Date: Mon, 8 Jan 2024 15:58:45 -0500 Subject: [PATCH 41/76] change: TGI 1.3.3 (#4335) --- .../image_uri_config/huggingface-llm.json | 51 ++++++++++++++++++- .../image_uris/test_huggingface_llm.py | 1 + 2 files changed, 50 insertions(+), 2 deletions(-) diff --git a/src/sagemaker/image_uri_config/huggingface-llm.json b/src/sagemaker/image_uri_config/huggingface-llm.json index 352ea3587f..637e831c50 100644 --- a/src/sagemaker/image_uri_config/huggingface-llm.json +++ b/src/sagemaker/image_uri_config/huggingface-llm.json @@ -10,7 +10,7 @@ "1.0": "1.0.3", "1.1": "1.1.0", "1.2": "1.2.0", - "1.3": "1.3.1" + "1.3": "1.3.3" }, "versions": { "0.6.0": { @@ -341,7 +341,54 @@ "container_version": { "gpu": "cu121-ubuntu20.04" } + }, + "1.3.3": { + "py_versions": [ + "py310" + ], + "registries": { + "af-south-1": "626614931356", + "il-central-1": "780543022126", + "ap-east-1": "871362719292", + "ap-northeast-1": "763104351884", + "ap-northeast-2": "763104351884", + "ap-northeast-3": "364406365360", + "ap-south-1": "763104351884", + "ap-south-2": "772153158452", + "ap-southeast-1": "763104351884", + "ap-southeast-2": "763104351884", + "ap-southeast-3": "907027046896", + "ap-southeast-4": "457447274322", + "ca-central-1": "763104351884", + "cn-north-1": "727897471807", + "cn-northwest-1": "727897471807", + "eu-central-1": "763104351884", + "eu-central-2": "380420809688", + "eu-north-1": "763104351884", + "eu-west-1": "763104351884", + "eu-west-2": "763104351884", + "eu-west-3": "763104351884", + "eu-south-1": "692866216735", + "eu-south-2": "503227376785", + "me-south-1": "217643126080", + "me-central-1": "914824155844", + "sa-east-1": "763104351884", + "us-east-1": "763104351884", + "us-east-2": "763104351884", + "us-gov-east-1": "446045086412", + "us-gov-west-1": "442386744353", + "us-iso-east-1": "886529160074", + "us-isob-east-1": "094389454867", + "us-west-1": "763104351884", + "us-west-2": "763104351884", + "ca-west-1": "204538143572" + }, + "tag_prefix": "2.1.1-tgi1.3.3", + "repository": "huggingface-pytorch-tgi-inference", + "container_version": { + "gpu": "cu121-ubuntu20.04" + } } } } -} \ No newline at end of file +} diff --git a/tests/unit/sagemaker/image_uris/test_huggingface_llm.py b/tests/unit/sagemaker/image_uris/test_huggingface_llm.py index c3fade6896..e4d7ab9947 100644 --- a/tests/unit/sagemaker/image_uris/test_huggingface_llm.py +++ b/tests/unit/sagemaker/image_uris/test_huggingface_llm.py @@ -26,6 +26,7 @@ "1.1.0": "2.0.1-tgi1.1.0-gpu-py39-cu118-ubuntu20.04", "1.2.0": "2.1.1-tgi1.2.0-gpu-py310-cu121-ubuntu20.04", "1.3.1": "2.1.1-tgi1.3.1-gpu-py310-cu121-ubuntu20.04", + "1.3.3": "2.1.1-tgi1.3.3-gpu-py310-cu121-ubuntu20.04", } From cef604f410ba78f71adcf5908084ed3e213f831b Mon Sep 17 00:00:00 2001 From: ci Date: Tue, 9 Jan 2024 17:53:30 
+0000 Subject: [PATCH 42/76] prepare release v2.203.1 --- CHANGELOG.md | 9 +++++++++ VERSION | 2 +- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 688bd848f5..7d7649d124 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,14 @@ # Changelog +## v2.203.1 (2024-01-09) + +### Bug Fixes and Other Changes + + * TGI 1.3.3 + * skip failing integs + * query hf api for model md + * update image_uri_configs 12-29-2023 06:17:34 PST + ## v2.203.0 (2023-12-28) ### Features diff --git a/VERSION b/VERSION index 5a385e596d..3b1ae904ec 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -2.203.1.dev0 +2.203.1 From da2697c573e641a477e49a35d005267e2a4a265f Mon Sep 17 00:00:00 2001 From: ci Date: Tue, 9 Jan 2024 17:53:32 +0000 Subject: [PATCH 43/76] update development version to v2.203.2.dev0 --- VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VERSION b/VERSION index 3b1ae904ec..36ee2817e2 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -2.203.1 +2.203.2.dev0 From 020aed2a87a00055c533dc418bd0a5a8e1e3b526 Mon Sep 17 00:00:00 2001 From: evakravi <69981223+evakravi@users.noreply.github.com> Date: Wed, 10 Jan 2024 09:38:36 -0500 Subject: [PATCH 44/76] feat: parallelize notebook search utils, add new operators (#4342) * feat: parallelize notebook search utils * chore: raise exception in notebook utils if thread has error * chore: improve variable name * fix: not passing region to get jumpstart bucket * chore: add sagemaker session to notebook utils * chore: address PR comments * feat: add support for includes, begins with, ends with * fix: pylint * feat: private util for model eula key * fix: unit tests, use verify_model_region_and_return_specs in notebook utils * Revert "feat: private util for model eula key" This reverts commit e2daefc839acb1e7bb1fe1c59347c4acb1595e33. 
* chore: add search keywords to header --- src/sagemaker/jumpstart/filters.py | 164 ++++++++-- src/sagemaker/jumpstart/notebook_utils.py | 284 ++++++++++-------- src/sagemaker/jumpstart/types.py | 9 +- .../unit/sagemaker/jumpstart/test_filters.py | 46 +++ .../jumpstart/test_notebook_utils.py | 225 ++++++++------ 5 files changed, 464 insertions(+), 264 deletions(-) diff --git a/src/sagemaker/jumpstart/filters.py b/src/sagemaker/jumpstart/filters.py index 56ef12a148..b045435ed0 100644 --- a/src/sagemaker/jumpstart/filters.py +++ b/src/sagemaker/jumpstart/filters.py @@ -14,7 +14,7 @@ from __future__ import absolute_import from ast import literal_eval from enum import Enum -from typing import Dict, List, Union, Any +from typing import Dict, List, Optional, Union, Any from sagemaker.jumpstart.types import JumpStartDataHolderType @@ -38,6 +38,10 @@ class FilterOperators(str, Enum): NOT_EQUALS = "not_equals" IN = "in" NOT_IN = "not_in" + INCLUDES = "includes" + NOT_INCLUDES = "not_includes" + BEGINS_WITH = "begins_with" + ENDS_WITH = "ends_with" class SpecialSupportedFilterKeys(str, Enum): @@ -52,6 +56,10 @@ class SpecialSupportedFilterKeys(str, Enum): FilterOperators.NOT_EQUALS: ["!==", "!=", "not equals", "is not"], FilterOperators.IN: ["in"], FilterOperators.NOT_IN: ["not in"], + FilterOperators.INCLUDES: ["includes", "contains"], + FilterOperators.NOT_INCLUDES: ["not includes", "not contains"], + FilterOperators.BEGINS_WITH: ["begins with", "starts with"], + FilterOperators.ENDS_WITH: ["ends with"], } @@ -62,7 +70,19 @@ class SpecialSupportedFilterKeys(str, Enum): ) ACCEPTABLE_OPERATORS_IN_PARSE_ORDER = ( - list(map(_PAD_ALPHABETIC_OPERATOR, FILTER_OPERATOR_STRING_MAPPINGS[FilterOperators.NOT_EQUALS])) + list( + map(_PAD_ALPHABETIC_OPERATOR, FILTER_OPERATOR_STRING_MAPPINGS[FilterOperators.BEGINS_WITH]) + ) + + list( + map(_PAD_ALPHABETIC_OPERATOR, FILTER_OPERATOR_STRING_MAPPINGS[FilterOperators.ENDS_WITH]) + ) + + list( + map(_PAD_ALPHABETIC_OPERATOR, FILTER_OPERATOR_STRING_MAPPINGS[FilterOperators.NOT_INCLUDES]) + ) + + list(map(_PAD_ALPHABETIC_OPERATOR, FILTER_OPERATOR_STRING_MAPPINGS[FilterOperators.INCLUDES])) + + list( + map(_PAD_ALPHABETIC_OPERATOR, FILTER_OPERATOR_STRING_MAPPINGS[FilterOperators.NOT_EQUALS]) + ) + list(map(_PAD_ALPHABETIC_OPERATOR, FILTER_OPERATOR_STRING_MAPPINGS[FilterOperators.NOT_IN])) + list(map(_PAD_ALPHABETIC_OPERATOR, FILTER_OPERATOR_STRING_MAPPINGS[FilterOperators.EQUALS])) + list(map(_PAD_ALPHABETIC_OPERATOR, FILTER_OPERATOR_STRING_MAPPINGS[FilterOperators.IN])) @@ -428,9 +448,96 @@ def parse_filter_string(filter_string: str) -> ModelFilter: raise ValueError(f"Cannot parse filter string: {filter_string}") +def _negate_boolean(boolean: BooleanValues) -> BooleanValues: + """Negates boolean expression (False -> True, True -> False).""" + if boolean == BooleanValues.TRUE: + return BooleanValues.FALSE + if boolean == BooleanValues.FALSE: + return BooleanValues.TRUE + return boolean + + +def _evaluate_filter_expression_equals( + model_filter: ModelFilter, + cached_model_value: Optional[Union[str, bool, int, float, Dict[str, Any], List[Any]]], +) -> BooleanValues: + """Evaluates filter expressions for equals.""" + if cached_model_value is None: + return BooleanValues.FALSE + model_filter_value = model_filter.value + if isinstance(cached_model_value, bool): + cached_model_value = str(cached_model_value).lower() + model_filter_value = model_filter.value.lower() + if str(model_filter_value) == str(cached_model_value): + return BooleanValues.TRUE + return 
BooleanValues.FALSE + + +def _evaluate_filter_expression_in( + model_filter: ModelFilter, + cached_model_value: Optional[Union[str, bool, int, float, Dict[str, Any], List[Any]]], +) -> BooleanValues: + """Evaluates filter expressions for string/list in.""" + if cached_model_value is None: + return BooleanValues.FALSE + py_obj = model_filter.value + try: + py_obj = literal_eval(py_obj) + try: + iter(py_obj) + except TypeError: + return BooleanValues.FALSE + except Exception: # pylint: disable=W0703 + pass + if isinstance(cached_model_value, list): + return BooleanValues.FALSE + if cached_model_value in py_obj: + return BooleanValues.TRUE + return BooleanValues.FALSE + + +def _evaluate_filter_expression_includes( + model_filter: ModelFilter, + cached_model_value: Optional[Union[str, bool, int, float, Dict[str, Any], List[Any]]], +) -> BooleanValues: + """Evaluates filter expressions for string includes.""" + if cached_model_value is None: + return BooleanValues.FALSE + filter_value = str(model_filter.value) + if filter_value in cached_model_value: + return BooleanValues.TRUE + return BooleanValues.FALSE + + +def _evaluate_filter_expression_begins_with( + model_filter: ModelFilter, + cached_model_value: Optional[Union[str, bool, int, float, Dict[str, Any], List[Any]]], +) -> BooleanValues: + """Evaluates filter expressions for string begins with.""" + if cached_model_value is None: + return BooleanValues.FALSE + filter_value = str(model_filter.value) + if cached_model_value.startswith(filter_value): + return BooleanValues.TRUE + return BooleanValues.FALSE + + +def _evaluate_filter_expression_ends_with( + model_filter: ModelFilter, + cached_model_value: Optional[Union[str, bool, int, float, Dict[str, Any], List[Any]]], +) -> BooleanValues: + """Evaluates filter expressions for string ends with.""" + if cached_model_value is None: + return BooleanValues.FALSE + filter_value = str(model_filter.value) + if cached_model_value.endswith(filter_value): + return BooleanValues.TRUE + return BooleanValues.FALSE + + def evaluate_filter_expression( # pylint: disable=too-many-return-statements model_filter: ModelFilter, - cached_model_value: Union[str, bool, int, float, Dict[str, Any], List[Any]], + cached_model_value: Optional[Union[str, bool, int, float, Dict[str, Any], List[Any]]], ) -> BooleanValues: """Evaluates model filter with cached model spec value, returns boolean. @@ -440,36 +547,29 @@ def evaluate_filter_expression( # pylint: disable=too-many-return-statements evaluate the filter. 
""" if model_filter.operator in FILTER_OPERATOR_STRING_MAPPINGS[FilterOperators.EQUALS]: - model_filter_value = model_filter.value - if isinstance(cached_model_value, bool): - cached_model_value = str(cached_model_value).lower() - model_filter_value = model_filter.value.lower() - if str(model_filter_value) == str(cached_model_value): - return BooleanValues.TRUE - return BooleanValues.FALSE + return _evaluate_filter_expression_equals(model_filter, cached_model_value) + if model_filter.operator in FILTER_OPERATOR_STRING_MAPPINGS[FilterOperators.NOT_EQUALS]: - if isinstance(cached_model_value, bool): - cached_model_value = str(cached_model_value).lower() - model_filter.value = model_filter.value.lower() - if str(model_filter.value) == str(cached_model_value): - return BooleanValues.FALSE - return BooleanValues.TRUE + return _negate_boolean(_evaluate_filter_expression_equals(model_filter, cached_model_value)) + if model_filter.operator in FILTER_OPERATOR_STRING_MAPPINGS[FilterOperators.IN]: - py_obj = literal_eval(model_filter.value) - try: - iter(py_obj) - except TypeError: - return BooleanValues.FALSE - if cached_model_value in py_obj: - return BooleanValues.TRUE - return BooleanValues.FALSE + return _evaluate_filter_expression_in(model_filter, cached_model_value) + if model_filter.operator in FILTER_OPERATOR_STRING_MAPPINGS[FilterOperators.NOT_IN]: - py_obj = literal_eval(model_filter.value) - try: - iter(py_obj) - except TypeError: - return BooleanValues.TRUE - if cached_model_value in py_obj: - return BooleanValues.FALSE - return BooleanValues.TRUE + return _negate_boolean(_evaluate_filter_expression_in(model_filter, cached_model_value)) + + if model_filter.operator in FILTER_OPERATOR_STRING_MAPPINGS[FilterOperators.INCLUDES]: + return _evaluate_filter_expression_includes(model_filter, cached_model_value) + + if model_filter.operator in FILTER_OPERATOR_STRING_MAPPINGS[FilterOperators.NOT_INCLUDES]: + return _negate_boolean( + _evaluate_filter_expression_includes(model_filter, cached_model_value) + ) + + if model_filter.operator in FILTER_OPERATOR_STRING_MAPPINGS[FilterOperators.BEGINS_WITH]: + return _evaluate_filter_expression_begins_with(model_filter, cached_model_value) + + if model_filter.operator in FILTER_OPERATOR_STRING_MAPPINGS[FilterOperators.ENDS_WITH]: + return _evaluate_filter_expression_ends_with(model_filter, cached_model_value) + raise RuntimeError(f"Bad operator: {model_filter.operator}") diff --git a/src/sagemaker/jumpstart/notebook_utils.py b/src/sagemaker/jumpstart/notebook_utils.py index 732dbf4b83..1554025995 100644 --- a/src/sagemaker/jumpstart/notebook_utils.py +++ b/src/sagemaker/jumpstart/notebook_utils.py @@ -14,13 +14,15 @@ from __future__ import absolute_import import copy +from concurrent.futures import ThreadPoolExecutor, as_completed + from functools import cmp_to_key -import os +import json from typing import Any, Generator, List, Optional, Tuple, Union, Set, Dict from packaging.version import Version from sagemaker.jumpstart import accessors from sagemaker.jumpstart.constants import ( - ENV_VARIABLE_DISABLE_JUMPSTART_LOGGING, + DEFAULT_JUMPSTART_SAGEMAKER_SESSION, JUMPSTART_DEFAULT_REGION_NAME, ) from sagemaker.jumpstart.enums import JumpStartScriptScope @@ -31,7 +33,15 @@ SpecialSupportedFilterKeys, ) from sagemaker.jumpstart.filters import Constant, ModelFilter, Operator, evaluate_filter_expression -from sagemaker.jumpstart.utils import get_sagemaker_version +from sagemaker.jumpstart.types import JumpStartModelHeader, JumpStartModelSpecs +from 
sagemaker.jumpstart.utils import ( + get_jumpstart_content_bucket, + get_sagemaker_version, + verify_model_region_and_return_specs, +) +from sagemaker.session import Session + +MAX_SEARCH_WORKERS = int(100 * 1e6 / 25 * 1e3) # max 100MB total memory, 25kB per thread) def _compare_model_version_tuples( # pylint: disable=too-many-return-statements @@ -134,6 +144,7 @@ def extract_framework_task_model(model_id: str) -> Tuple[str, str, str]: def list_jumpstart_tasks( # pylint: disable=redefined-builtin filter: Union[Operator, str] = Constant(BooleanValues.TRUE), region: str = JUMPSTART_DEFAULT_REGION_NAME, + sagemaker_session: Session = DEFAULT_JUMPSTART_SAGEMAKER_SESSION, ) -> List[str]: """List tasks for JumpStart, and optionally apply filters to result. @@ -145,10 +156,14 @@ def list_jumpstart_tasks( # pylint: disable=redefined-builtin (Default: Constant(BooleanValues.TRUE)). region (str): Optional. The AWS region from which to retrieve JumpStart metadata regarding models. (Default: JUMPSTART_DEFAULT_REGION_NAME). + sagemaker_session (sagemaker.session.Session): Optional. The SageMaker Session to + use to perform the model search. (Default: DEFAULT_JUMPSTART_SAGEMAKER_SESSION). """ tasks: Set[str] = set() - for model_id, _ in _generate_jumpstart_model_versions(filter=filter, region=region): + for model_id, _ in _generate_jumpstart_model_versions( + filter=filter, region=region, sagemaker_session=sagemaker_session + ): _, task, _ = extract_framework_task_model(model_id) tasks.add(task) return sorted(list(tasks)) @@ -157,6 +172,7 @@ def list_jumpstart_tasks( # pylint: disable=redefined-builtin def list_jumpstart_frameworks( # pylint: disable=redefined-builtin filter: Union[Operator, str] = Constant(BooleanValues.TRUE), region: str = JUMPSTART_DEFAULT_REGION_NAME, + sagemaker_session: Session = DEFAULT_JUMPSTART_SAGEMAKER_SESSION, ) -> List[str]: """List frameworks for JumpStart, and optionally apply filters to result. @@ -168,10 +184,14 @@ def list_jumpstart_frameworks( # pylint: disable=redefined-builtin (Default: Constant(BooleanValues.TRUE)). region (str): Optional. The AWS region from which to retrieve JumpStart metadata regarding models. (Default: JUMPSTART_DEFAULT_REGION_NAME). + sagemaker_session (sagemaker.session.Session): Optional. The SageMaker Session + to use to perform the model search. (Default: DEFAULT_JUMPSTART_SAGEMAKER_SESSION). """ frameworks: Set[str] = set() - for model_id, _ in _generate_jumpstart_model_versions(filter=filter, region=region): + for model_id, _ in _generate_jumpstart_model_versions( + filter=filter, region=region, sagemaker_session=sagemaker_session + ): framework, _, _ = extract_framework_task_model(model_id) frameworks.add(framework) return sorted(list(frameworks)) @@ -180,6 +200,7 @@ def list_jumpstart_frameworks( # pylint: disable=redefined-builtin def list_jumpstart_scripts( # pylint: disable=redefined-builtin filter: Union[Operator, str] = Constant(BooleanValues.TRUE), region: str = JUMPSTART_DEFAULT_REGION_NAME, + sagemaker_session: Session = DEFAULT_JUMPSTART_SAGEMAKER_SESSION, ) -> List[str]: """List scripts for JumpStart, and optionally apply filters to result. @@ -191,6 +212,8 @@ def list_jumpstart_scripts( # pylint: disable=redefined-builtin (Default: Constant(BooleanValues.TRUE)). region (str): Optional. The AWS region from which to retrieve JumpStart metadata regarding models. (Default: JUMPSTART_DEFAULT_REGION_NAME). + sagemaker_session (sagemaker.session.Session): Optional. The SageMaker Session to + use to perform the model search. 
(Default: DEFAULT_JUMPSTART_SAGEMAKER_SESSION). """ if (isinstance(filter, Constant) and filter.resolved_value == BooleanValues.TRUE) or ( isinstance(filter, str) and filter.lower() == BooleanValues.TRUE.lower() @@ -198,12 +221,16 @@ def list_jumpstart_scripts( # pylint: disable=redefined-builtin return sorted([e.value for e in JumpStartScriptScope]) scripts: Set[str] = set() - for model_id, version in _generate_jumpstart_model_versions(filter=filter, region=region): + for model_id, version in _generate_jumpstart_model_versions( + filter=filter, region=region, sagemaker_session=sagemaker_session + ): scripts.add(JumpStartScriptScope.INFERENCE) - model_specs = accessors.JumpStartModelsAccessor.get_model_specs( + model_specs = verify_model_region_and_return_specs( region=region, model_id=model_id, version=version, + sagemaker_session=sagemaker_session, + scope=JumpStartScriptScope.INFERENCE, ) if model_specs.training_supported: scripts.add(JumpStartScriptScope.TRAINING) @@ -219,6 +246,7 @@ def list_jumpstart_models( # pylint: disable=redefined-builtin list_incomplete_models: bool = False, list_old_models: bool = False, list_versions: bool = False, + sagemaker_session: Session = DEFAULT_JUMPSTART_SAGEMAKER_SESSION, ) -> List[Union[Tuple[str], Tuple[str, str]]]: """List models for JumpStart, and optionally apply filters to result. @@ -238,11 +266,16 @@ def list_jumpstart_models( # pylint: disable=redefined-builtin versions should be included in the returned result. (Default: False). list_versions (bool): Optional. True if versions for models should be returned in addition to the id of the model. (Default: False). + sagemaker_session (sagemaker.session.Session): Optional. The SageMaker Session to use + to perform the model search. (Default: DEFAULT_JUMPSTART_SAGEMAKER_SESSION). """ model_id_version_dict: Dict[str, List[str]] = dict() for model_id, version in _generate_jumpstart_model_versions( - filter=filter, region=region, list_incomplete_models=list_incomplete_models + filter=filter, + region=region, + list_incomplete_models=list_incomplete_models, + sagemaker_session=sagemaker_session, ): if model_id not in model_id_version_dict: model_id_version_dict[model_id] = list() @@ -268,6 +301,7 @@ def _generate_jumpstart_model_versions( # pylint: disable=redefined-builtin filter: Union[Operator, str] = Constant(BooleanValues.TRUE), region: str = JUMPSTART_DEFAULT_REGION_NAME, list_incomplete_models: bool = False, + sagemaker_session: Session = DEFAULT_JUMPSTART_SAGEMAKER_SESSION, ) -> Generator: """Generate models for JumpStart, and optionally apply filters to result. @@ -283,166 +317,144 @@ def _generate_jumpstart_model_versions( # pylint: disable=redefined-builtin requested by the filter, and the filter cannot be resolved to a include/not include, whether the model should be included. By default, these models are omitted from results. (Default: False). + sagemaker_session (sagemaker.session.Session): Optional. The SageMaker Session + to use to perform the model search. (Default: DEFAULT_JUMPSTART_SAGEMAKER_SESSION). """ - class _ModelSearchContext: - """Context manager for conducting model searches.""" - - def __init__(self): - """Initialize context manager.""" - - self.old_disable_js_logging_env_var_value = os.environ.get( - ENV_VARIABLE_DISABLE_JUMPSTART_LOGGING - ) - - def __enter__(self, *args, **kwargs): - """Enter context. - - Disable JumpStart logs to avoid excessive logging. 
- """ - - os.environ[ENV_VARIABLE_DISABLE_JUMPSTART_LOGGING] = "true" - - def __exit__(self, *args, **kwargs): - """Exit context. + models_manifest_list = accessors.JumpStartModelsAccessor._get_manifest( + region=region, s3_client=sagemaker_session.s3_client + ) - Restore JumpStart logging settings, and reset cache so - new logs would appear for models previously searched. - """ + if isinstance(filter, str): + filter = Identity(filter) - if self.old_disable_js_logging_env_var_value: - os.environ[ - ENV_VARIABLE_DISABLE_JUMPSTART_LOGGING - ] = self.old_disable_js_logging_env_var_value - else: - os.environ.pop(ENV_VARIABLE_DISABLE_JUMPSTART_LOGGING, None) - accessors.JumpStartModelsAccessor.reset_cache() + manifest_keys = set(models_manifest_list[0].__slots__) - with _ModelSearchContext(): + all_keys: Set[str] = set() - if isinstance(filter, str): - filter = Identity(filter) + model_filters: Set[ModelFilter] = set() - models_manifest_list = accessors.JumpStartModelsAccessor._get_manifest(region=region) - manifest_keys = set(models_manifest_list[0].__slots__) + for operator in _model_filter_in_operator_generator(filter): + model_filter = operator.unresolved_value + key = model_filter.key + all_keys.add(key) + model_filters.add(model_filter) - all_keys: Set[str] = set() + for key in all_keys: + if "." in key: + raise NotImplementedError(f"No support for multiple level metadata indexing ('{key}').") - model_filters: Set[ModelFilter] = set() + metadata_filter_keys = all_keys - SPECIAL_SUPPORTED_FILTER_KEYS - for operator in _model_filter_in_operator_generator(filter): - model_filter = operator.unresolved_value - key = model_filter.key - all_keys.add(key) - model_filters.add(model_filter) + required_manifest_keys = manifest_keys.intersection(metadata_filter_keys) + possible_spec_keys = metadata_filter_keys - manifest_keys - for key in all_keys: - if "." in key: - raise NotImplementedError( - f"No support for multiple level metadata indexing ('{key}')." 
- ) + is_task_filter = SpecialSupportedFilterKeys.TASK in all_keys + is_framework_filter = SpecialSupportedFilterKeys.FRAMEWORK in all_keys - metadata_filter_keys = all_keys - SPECIAL_SUPPORTED_FILTER_KEYS + def evaluate_model(model_manifest: JumpStartModelHeader) -> Optional[Tuple[str, str]]: - required_manifest_keys = manifest_keys.intersection(metadata_filter_keys) - possible_spec_keys = metadata_filter_keys - manifest_keys + copied_filter = copy.deepcopy(filter) - unrecognized_keys: Set[str] = set() + manifest_specs_cached_values: Dict[str, Union[bool, int, float, str, dict, list]] = {} - is_task_filter = SpecialSupportedFilterKeys.TASK in all_keys - is_framework_filter = SpecialSupportedFilterKeys.FRAMEWORK in all_keys + model_filters_to_resolved_values: Dict[ModelFilter, BooleanValues] = {} - for model_manifest in models_manifest_list: + for val in required_manifest_keys: + manifest_specs_cached_values[val] = getattr(model_manifest, val) - copied_filter = copy.deepcopy(filter) + if is_task_filter: + manifest_specs_cached_values[ + SpecialSupportedFilterKeys.TASK + ] = extract_framework_task_model(model_manifest.model_id)[1] - manifest_specs_cached_values: Dict[str, Union[bool, int, float, str, dict, list]] = {} + if is_framework_filter: + manifest_specs_cached_values[ + SpecialSupportedFilterKeys.FRAMEWORK + ] = extract_framework_task_model(model_manifest.model_id)[0] - model_filters_to_resolved_values: Dict[ModelFilter, BooleanValues] = {} + if Version(model_manifest.min_version) > Version(get_sagemaker_version()): + return None - for val in required_manifest_keys: - manifest_specs_cached_values[val] = getattr(model_manifest, val) + _populate_model_filters_to_resolved_values( + manifest_specs_cached_values, + model_filters_to_resolved_values, + model_filters, + ) - if is_task_filter: - manifest_specs_cached_values[ - SpecialSupportedFilterKeys.TASK - ] = extract_framework_task_model(model_manifest.model_id)[1] + _put_resolved_booleans_into_filter(copied_filter, model_filters_to_resolved_values) - if is_framework_filter: - manifest_specs_cached_values[ - SpecialSupportedFilterKeys.FRAMEWORK - ] = extract_framework_task_model(model_manifest.model_id)[0] + copied_filter.eval() - if Version(model_manifest.min_version) > Version(get_sagemaker_version()): - continue + if copied_filter.resolved_value in [BooleanValues.TRUE, BooleanValues.FALSE]: + if copied_filter.resolved_value == BooleanValues.TRUE: + return (model_manifest.model_id, model_manifest.version) + return None - _populate_model_filters_to_resolved_values( - manifest_specs_cached_values, - model_filters_to_resolved_values, - model_filters, + if copied_filter.resolved_value == BooleanValues.UNEVALUATED: + raise RuntimeError( + "Filter expression in unevaluated state after using " + "values from model manifest. Model ID and version that " + f"is failing: {(model_manifest.model_id, model_manifest.version)}." ) - - _put_resolved_booleans_into_filter(copied_filter, model_filters_to_resolved_values) - - copied_filter.eval() - - if copied_filter.resolved_value in [BooleanValues.TRUE, BooleanValues.FALSE]: - if copied_filter.resolved_value == BooleanValues.TRUE: - yield (model_manifest.model_id, model_manifest.version) - continue - - if copied_filter.resolved_value == BooleanValues.UNEVALUATED: - raise RuntimeError( - "Filter expression in unevaluated state after using " - "values from model manifest. Model ID and version that " - f"is failing: {(model_manifest.model_id, model_manifest.version)}." 
+ copied_filter_2 = copy.deepcopy(filter) + + # spec is downloaded to thread's memory. since each thread + # accesses a unique s3 spec, there is no need to use the JS caching utils. + # spec only stays in memory for lifecycle of thread. + model_specs = JumpStartModelSpecs( + json.loads( + sagemaker_session.read_s3_file( + get_jumpstart_content_bucket(region), model_manifest.spec_key ) - copied_filter_2 = copy.deepcopy(filter) - - model_specs = accessors.JumpStartModelsAccessor.get_model_specs( - region=region, - model_id=model_manifest.model_id, - version=model_manifest.version, ) + ) - model_specs_keys = set(model_specs.__slots__) - - unrecognized_keys -= model_specs_keys - unrecognized_keys_for_single_spec = possible_spec_keys - model_specs_keys - unrecognized_keys.update(unrecognized_keys_for_single_spec) + for val in possible_spec_keys: + if hasattr(model_specs, val): + manifest_specs_cached_values[val] = getattr(model_specs, val) - for val in possible_spec_keys: - if hasattr(model_specs, val): - manifest_specs_cached_values[val] = getattr(model_specs, val) + _populate_model_filters_to_resolved_values( + manifest_specs_cached_values, + model_filters_to_resolved_values, + model_filters, + ) + _put_resolved_booleans_into_filter(copied_filter_2, model_filters_to_resolved_values) - _populate_model_filters_to_resolved_values( - manifest_specs_cached_values, - model_filters_to_resolved_values, - model_filters, - ) - _put_resolved_booleans_into_filter(copied_filter_2, model_filters_to_resolved_values) + copied_filter_2.eval() - copied_filter_2.eval() + if copied_filter_2.resolved_value != BooleanValues.UNEVALUATED: + if copied_filter_2.resolved_value == BooleanValues.TRUE or ( + BooleanValues.UNKNOWN and list_incomplete_models + ): + return (model_manifest.model_id, model_manifest.version) + return None - if copied_filter_2.resolved_value != BooleanValues.UNEVALUATED: - if copied_filter_2.resolved_value == BooleanValues.TRUE or ( - BooleanValues.UNKNOWN and list_incomplete_models - ): - yield (model_manifest.model_id, model_manifest.version) - continue + raise RuntimeError( + "Filter expression in unevaluated state after using values from model specs. " + "Model ID and version that is failing: " + f"{(model_manifest.model_id, model_manifest.version)}." + ) - raise RuntimeError( - "Filter expression in unevaluated state after using values from model specs. " - "Model ID and version that is failing: " - f"{(model_manifest.model_id, model_manifest.version)}." - ) + with ThreadPoolExecutor(max_workers=MAX_SEARCH_WORKERS) as executor: + futures = [] + for header in models_manifest_list: + futures.append(executor.submit(evaluate_model, header)) - if len(unrecognized_keys) > 0: - raise RuntimeError(f"Unrecognized keys: {str(unrecognized_keys)}") + for future in as_completed(futures): + error = future.exception() + if error: + raise error + result = future.result() + if result: + yield result def get_model_url( - model_id: str, model_version: str, region: str = JUMPSTART_DEFAULT_REGION_NAME + model_id: str, + model_version: str, + region: str = JUMPSTART_DEFAULT_REGION_NAME, + sagemaker_session: Session = DEFAULT_JUMPSTART_SAGEMAKER_SESSION, ) -> str: """Retrieve web url describing pretrained model. @@ -451,9 +463,15 @@ def get_model_url( model_version (str): The model version for which to retrieve the url. region (str): Optional. The region from which to retrieve metadata. (Default: JUMPSTART_DEFAULT_REGION_NAME) + sagemaker_session (sagemaker.session.Session): Optional. 
The SageMaker Session to use + to retrieve the model url. """ - model_specs = accessors.JumpStartModelsAccessor.get_model_specs( - region=region, model_id=model_id, version=model_version + model_specs = verify_model_region_and_return_specs( + region=region, + model_id=model_id, + version=model_version, + sagemaker_session=sagemaker_session, + scope=JumpStartScriptScope.INFERENCE, ) return model_specs.url diff --git a/src/sagemaker/jumpstart/types.py b/src/sagemaker/jumpstart/types.py index 21b624d7a4..49d3e295c5 100644 --- a/src/sagemaker/jumpstart/types.py +++ b/src/sagemaker/jumpstart/types.py @@ -130,7 +130,7 @@ def __init__( class JumpStartModelHeader(JumpStartDataHolderType): """Data class JumpStart model header.""" - __slots__ = ["model_id", "version", "min_version", "spec_key"] + __slots__ = ["model_id", "version", "min_version", "spec_key", "search_keywords"] def __init__(self, header: Dict[str, str]): """Initializes a JumpStartModelHeader object from its json representation. @@ -142,7 +142,11 @@ def __init__(self, header: Dict[str, str]): def to_json(self) -> Dict[str, str]: """Returns json representation of JumpStartModelHeader object.""" - json_obj = {att: getattr(self, att) for att in self.__slots__ if hasattr(self, att)} + json_obj = { + att: getattr(self, att) + for att in self.__slots__ + if getattr(self, att, None) is not None + } return json_obj def from_json(self, json_obj: Dict[str, str]) -> None: @@ -155,6 +159,7 @@ def from_json(self, json_obj: Dict[str, str]) -> None: self.version: str = json_obj["version"] self.min_version: str = json_obj["min_version"] self.spec_key: str = json_obj["spec_key"] + self.search_keywords: Optional[List[str]] = json_obj.get("search_keywords") class JumpStartECRSpecs(JumpStartDataHolderType): diff --git a/tests/unit/sagemaker/jumpstart/test_filters.py b/tests/unit/sagemaker/jumpstart/test_filters.py index 31055745b1..a984509f9b 100644 --- a/tests/unit/sagemaker/jumpstart/test_filters.py +++ b/tests/unit/sagemaker/jumpstart/test_filters.py @@ -143,6 +143,10 @@ def test_not_equals(self): def test_in(self): + assert BooleanValues.TRUE == evaluate_filter_expression( + ModelFilter(key="hello", operator="in", value="daddy"), "dad" + ) + assert BooleanValues.TRUE == evaluate_filter_expression( ModelFilter(key="hello", operator="in", value='["mom", "dad"]'), "dad" ) @@ -169,6 +173,10 @@ def test_in(self): def test_not_in(self): + assert BooleanValues.FALSE == evaluate_filter_expression( + ModelFilter(key="hello", operator="not in", value="daddy"), "dad" + ) + assert BooleanValues.FALSE == evaluate_filter_expression( ModelFilter(key="hello", operator="not in", value='["mom", "dad"]'), "dad" ) @@ -193,6 +201,44 @@ def test_not_in(self): ModelFilter(key="hello", operator="not in", value='["mom", "fsdfdsfsd"]'), False ) + def test_includes(self): + + assert BooleanValues.TRUE == evaluate_filter_expression( + ModelFilter(key="hello", operator="includes", value="dad"), "daddy" + ) + + assert BooleanValues.TRUE == evaluate_filter_expression( + ModelFilter(key="hello", operator="includes", value="dad"), ["dad"] + ) + + def test_not_includes(self): + + assert BooleanValues.FALSE == evaluate_filter_expression( + ModelFilter(key="hello", operator="not includes", value="dad"), "daddy" + ) + + assert BooleanValues.FALSE == evaluate_filter_expression( + ModelFilter(key="hello", operator="not includes", value="dad"), ["dad"] + ) + + def test_begins_with(self): + assert BooleanValues.TRUE == evaluate_filter_expression( + ModelFilter(key="hello", operator="begins 
with", value="dad"), "daddy" + ) + + assert BooleanValues.FALSE == evaluate_filter_expression( + ModelFilter(key="hello", operator="begins with", value="mm"), "mommy" + ) + + def test_ends_with(self): + assert BooleanValues.TRUE == evaluate_filter_expression( + ModelFilter(key="hello", operator="ends with", value="car"), "racecar" + ) + + assert BooleanValues.FALSE == evaluate_filter_expression( + ModelFilter(key="hello", operator="begins with", value="ace"), "racecar" + ) + def test_parse_filter_string(): diff --git a/tests/unit/sagemaker/jumpstart/test_notebook_utils.py b/tests/unit/sagemaker/jumpstart/test_notebook_utils.py index 181310a507..8aae4c36a8 100644 --- a/tests/unit/sagemaker/jumpstart/test_notebook_utils.py +++ b/tests/unit/sagemaker/jumpstart/test_notebook_utils.py @@ -1,10 +1,14 @@ from __future__ import absolute_import +import json from unittest import TestCase from unittest.mock import Mock, patch import pytest -from sagemaker.jumpstart.constants import JUMPSTART_DEFAULT_REGION_NAME +from sagemaker.jumpstart.constants import ( + DEFAULT_JUMPSTART_SAGEMAKER_SESSION, + JUMPSTART_DEFAULT_REGION_NAME, +) from sagemaker.jumpstart.filters import And, Identity, Not, Or from tests.unit.sagemaker.jumpstart.constants import PROTOTYPICAL_MODEL_SPECS_DICT from tests.unit.sagemaker.jumpstart.utils import ( @@ -22,6 +26,7 @@ ) +@patch("sagemaker.jumpstart.notebook_utils.DEFAULT_JUMPSTART_SAGEMAKER_SESSION.read_s3_file") @patch("sagemaker.jumpstart.accessors.JumpStartModelsAccessor._get_manifest") @patch("sagemaker.jumpstart.notebook_utils.accessors.JumpStartModelsAccessor.get_model_specs") @patch("sagemaker.jumpstart.notebook_utils._generate_jumpstart_model_versions") @@ -29,10 +34,16 @@ def test_list_jumpstart_scripts( patched_generate_jumpstart_models: Mock, patched_get_model_specs: Mock, patched_get_manifest: Mock, + patched_read_s3_file: Mock, ): patched_get_model_specs.side_effect = get_prototype_model_spec - patched_get_manifest.side_effect = get_prototype_manifest + patched_get_manifest.side_effect = lambda region, *args, **kwargs: get_prototype_manifest( + region + ) patched_generate_jumpstart_models.side_effect = _generate_jumpstart_model_versions + patched_read_s3_file.side_effect = lambda *args, **kwargs: json.dumps( + get_prototype_model_spec(None, "pytorch-eqa-bert-base-cased").to_json() + ) assert list_jumpstart_scripts() == sorted(["inference", "training"]) patched_get_model_specs.assert_not_called() @@ -48,7 +59,9 @@ def test_list_jumpstart_scripts( "region": "sa-east-1", } assert list_jumpstart_scripts(**kwargs) == sorted(["inference", "training"]) - patched_generate_jumpstart_models.assert_called_once_with(**kwargs) + patched_generate_jumpstart_models.assert_called_once_with( + **kwargs, sagemaker_session=DEFAULT_JUMPSTART_SAGEMAKER_SESSION + ) patched_get_manifest.assert_called_once() assert patched_get_model_specs.call_count == 1 @@ -61,9 +74,11 @@ def test_list_jumpstart_scripts( "region": "sa-east-1", } assert list_jumpstart_scripts(**kwargs) == [] - patched_generate_jumpstart_models.assert_called_once_with(**kwargs) + patched_generate_jumpstart_models.assert_called_once_with( + **kwargs, sagemaker_session=DEFAULT_JUMPSTART_SAGEMAKER_SESSION + ) patched_get_manifest.assert_called_once() - assert patched_get_model_specs.call_count == len(PROTOTYPICAL_MODEL_SPECS_DICT) + assert patched_read_s3_file.call_count == len(PROTOTYPICAL_MODEL_SPECS_DICT) @patch("sagemaker.jumpstart.accessors.JumpStartModelsAccessor._get_manifest") @@ -75,7 +90,9 @@ def 
test_list_jumpstart_tasks( patched_get_manifest: Mock, ): patched_get_model_specs.side_effect = get_prototype_model_spec - patched_get_manifest.side_effect = get_prototype_manifest + patched_get_manifest.side_effect = lambda region, *args, **kwargs: get_prototype_manifest( + region + ) patched_generate_jumpstart_models.side_effect = _generate_jumpstart_model_versions assert list_jumpstart_tasks() == sorted( @@ -101,7 +118,9 @@ def test_list_jumpstart_tasks( "region": "sa-east-1", } assert list_jumpstart_tasks(**kwargs) == ["ic"] - patched_generate_jumpstart_models.assert_called_once_with(**kwargs) + patched_generate_jumpstart_models.assert_called_once_with( + **kwargs, sagemaker_session=DEFAULT_JUMPSTART_SAGEMAKER_SESSION + ) patched_get_manifest.assert_called_once() patched_get_model_specs.assert_not_called() @@ -115,7 +134,9 @@ def test_list_jumpstart_frameworks( patched_get_manifest: Mock, ): patched_get_model_specs.side_effect = get_prototype_model_spec - patched_get_manifest.side_effect = get_prototype_manifest + patched_get_manifest.side_effect = lambda region, *args, **kwargs: get_prototype_manifest( + region + ) patched_generate_jumpstart_models.side_effect = _generate_jumpstart_model_versions assert list_jumpstart_frameworks() == sorted( @@ -155,7 +176,9 @@ def test_list_jumpstart_frameworks( ] ) - patched_generate_jumpstart_models.assert_called_once_with(**kwargs) + patched_generate_jumpstart_models.assert_called_once_with( + **kwargs, sagemaker_session=DEFAULT_JUMPSTART_SAGEMAKER_SESSION + ) patched_get_manifest.assert_called_once() patched_get_model_specs.assert_not_called() @@ -167,7 +190,9 @@ def test_list_jumpstart_models_simple_case( self, patched_get_model_specs: Mock, patched_get_manifest: Mock ): patched_get_model_specs.side_effect = get_prototype_model_spec - patched_get_manifest.side_effect = get_prototype_manifest + patched_get_manifest.side_effect = lambda region, *args, **kwargs: get_prototype_manifest( + region + ) assert list_jumpstart_models(list_versions=True) == [ ("catboost-classification-model", "1.0.0"), ("huggingface-spc-bert-base-cased", "1.0.0"), @@ -183,31 +208,35 @@ def test_list_jumpstart_models_simple_case( patched_get_model_specs.assert_not_called() @patch("sagemaker.jumpstart.accessors.JumpStartModelsAccessor._get_manifest") - @patch("sagemaker.jumpstart.accessors.JumpStartModelsAccessor.get_model_specs") + @patch("sagemaker.jumpstart.notebook_utils.DEFAULT_JUMPSTART_SAGEMAKER_SESSION.read_s3_file") def test_list_jumpstart_models_script_filter( - self, patched_get_model_specs: Mock, patched_get_manifest: Mock + self, patched_read_s3_file: Mock, patched_get_manifest: Mock ): - patched_get_model_specs.side_effect = get_prototype_model_spec - patched_get_manifest.side_effect = get_prototype_manifest + patched_read_s3_file.side_effect = lambda *args, **kwargs: json.dumps( + get_prototype_model_spec(None, "pytorch-eqa-bert-base-cased").to_json() + ) + patched_get_manifest.side_effect = lambda region, *args, **kwargs: get_prototype_manifest( + region + ) manifest_length = len(get_prototype_manifest()) vals = [True, False] for val in vals: kwargs = {"filter": f"training_supported == {val}"} list_jumpstart_models(**kwargs) - assert patched_get_model_specs.call_count == manifest_length + assert patched_read_s3_file.call_count == manifest_length patched_get_manifest.assert_called_once() patched_get_manifest.reset_mock() - patched_get_model_specs.reset_mock() + patched_read_s3_file.reset_mock() kwargs = {"filter": f"training_supported != {val}"} 
list_jumpstart_models(**kwargs) - assert patched_get_model_specs.call_count == manifest_length + assert patched_read_s3_file.call_count == manifest_length patched_get_manifest.assert_called_once() patched_get_manifest.reset_mock() - patched_get_model_specs.reset_mock() + patched_read_s3_file.reset_mock() kwargs = {"filter": f"training_supported in {vals}", "list_versions": True} assert list_jumpstart_models(**kwargs) == [ @@ -220,16 +249,16 @@ def test_list_jumpstart_models_script_filter( ("tensorflow-ic-bit-m-r101x1-ilsvrc2012-classification-1", "1.0.0"), ("xgboost-classification-model", "1.0.0"), ] - assert patched_get_model_specs.call_count == manifest_length + assert patched_read_s3_file.call_count == manifest_length patched_get_manifest.assert_called_once() patched_get_manifest.reset_mock() - patched_get_model_specs.reset_mock() + patched_read_s3_file.reset_mock() kwargs = {"filter": f"training_supported not in {vals}"} models = list_jumpstart_models(**kwargs) assert [] == models - assert patched_get_model_specs.call_count == manifest_length + assert patched_read_s3_file.call_count == manifest_length patched_get_manifest.assert_called_once() @patch("sagemaker.jumpstart.accessors.JumpStartModelsAccessor._get_manifest") @@ -238,7 +267,9 @@ def test_list_jumpstart_models_task_filter( self, patched_get_model_specs: Mock, patched_get_manifest: Mock ): patched_get_model_specs.side_effect = get_prototype_model_spec - patched_get_manifest.side_effect = get_prototype_manifest + patched_get_manifest.side_effect = lambda region, *args, **kwargs: get_prototype_manifest( + region + ) vals = [ "classification", @@ -288,12 +319,16 @@ def test_list_jumpstart_models_task_filter( patched_get_manifest.assert_called_once() @patch("sagemaker.jumpstart.accessors.JumpStartModelsAccessor._get_manifest") - @patch("sagemaker.jumpstart.accessors.JumpStartModelsAccessor.get_model_specs") + @patch("sagemaker.jumpstart.notebook_utils.DEFAULT_JUMPSTART_SAGEMAKER_SESSION.read_s3_file") def test_list_jumpstart_models_framework_filter( - self, patched_get_model_specs: Mock, patched_get_manifest: Mock + self, patched_read_s3_file: Mock, patched_get_manifest: Mock ): - patched_get_model_specs.side_effect = get_prototype_model_spec - patched_get_manifest.side_effect = get_prototype_manifest + patched_read_s3_file.side_effect = lambda *args, **kwargs: json.dumps( + get_prototype_model_spec(None, "pytorch-eqa-bert-base-cased").to_json() + ) + patched_get_manifest.side_effect = lambda region, *args, **kwargs: get_prototype_manifest( + region + ) vals = [ "catboost", @@ -307,19 +342,19 @@ def test_list_jumpstart_models_framework_filter( for val in vals: kwargs = {"filter": f"framework == {val}"} list_jumpstart_models(**kwargs) - patched_get_model_specs.assert_not_called() + patched_read_s3_file.assert_not_called() patched_get_manifest.assert_called_once() patched_get_manifest.reset_mock() - patched_get_model_specs.reset_mock() + patched_read_s3_file.reset_mock() kwargs = {"filter": f"framework != {val}"} list_jumpstart_models(**kwargs) - patched_get_model_specs.assert_not_called() + patched_read_s3_file.assert_not_called() patched_get_manifest.assert_called_once() patched_get_manifest.reset_mock() - patched_get_model_specs.reset_mock() + patched_read_s3_file.reset_mock() kwargs = {"filter": f"framework in {vals}", "list_versions": True} assert list_jumpstart_models(**kwargs) == [ @@ -331,18 +366,18 @@ def test_list_jumpstart_models_framework_filter( ("sklearn-classification-linear", "1.0.0"), ("xgboost-classification-model", 
"1.0.0"), ] - patched_get_model_specs.assert_not_called() + patched_read_s3_file.assert_not_called() patched_get_manifest.assert_called_once() patched_get_manifest.reset_mock() - patched_get_model_specs.reset_mock() + patched_read_s3_file.reset_mock() kwargs = {"filter": f"framework not in {vals}", "list_versions": True} models = list_jumpstart_models(**kwargs) assert [("tensorflow-ic-bit-m-r101x1-ilsvrc2012-classification-1", "1.0.0")] == models patched_get_manifest.reset_mock() - patched_get_model_specs.reset_mock() + patched_read_s3_file.reset_mock() kwargs = { "filter": And(f"framework not in {vals}", "training_supported is True"), @@ -350,11 +385,11 @@ def test_list_jumpstart_models_framework_filter( } models = list_jumpstart_models(**kwargs) assert [("tensorflow-ic-bit-m-r101x1-ilsvrc2012-classification-1", "1.0.0")] == models - patched_get_model_specs.assert_called_once() + patched_read_s3_file.assert_called_once() patched_get_manifest.assert_called_once() patched_get_manifest.reset_mock() - patched_get_model_specs.reset_mock() + patched_read_s3_file.reset_mock() kwargs = { "filter": And( @@ -364,7 +399,7 @@ def test_list_jumpstart_models_framework_filter( } models = list_jumpstart_models(**kwargs) assert [] == models - patched_get_model_specs.assert_not_called() + patched_read_s3_file.assert_not_called() patched_get_manifest.assert_called_once() @patch("sagemaker.jumpstart.accessors.JumpStartModelsAccessor._get_manifest") @@ -374,36 +409,15 @@ def test_list_jumpstart_models_region( ): patched_get_model_specs.side_effect = get_prototype_model_spec - patched_get_manifest.side_effect = lambda region: get_prototype_manifest(region="us-west-2") + patched_get_manifest.side_effect = lambda region, *args, **kwargs: get_prototype_manifest( + region="us-west-2" + ) list_jumpstart_models(region="some-region") - patched_get_manifest.assert_called_once_with(region="some-region") - - @patch("sagemaker.jumpstart.accessors.JumpStartModelsAccessor._get_manifest") - @patch("sagemaker.jumpstart.accessors.JumpStartModelsAccessor.get_model_specs") - @patch("sagemaker.jumpstart.notebook_utils.get_sagemaker_version") - @patch("sagemaker.jumpstart.notebook_utils.accessors.JumpStartModelsAccessor.reset_cache") - @patch.dict("os.environ", {}) - @patch("logging.StreamHandler.emit") - @patch("sagemaker.jumpstart.constants.JUMPSTART_LOGGER.propagate", False) - def test_list_jumpstart_models_disables_logging_resets_cache( - self, - patched_emit: Mock, - patched_reset_cache: Mock, - patched_get_sagemaker_version: Mock, - patched_get_model_specs: Mock, - patched_get_manifest: Mock, - ): - patched_get_model_specs.side_effect = get_prototype_model_spec - patched_get_manifest.side_effect = get_prototype_manifest - - patched_get_sagemaker_version.return_value = "3.0.0" - - list_jumpstart_models("deprecate_warn_message is blah") - - patched_emit.assert_not_called() - patched_reset_cache.assert_called_once() + patched_get_manifest.assert_called_once_with( + region="some-region", s3_client=DEFAULT_JUMPSTART_SAGEMAKER_SESSION.s3_client + ) @patch("sagemaker.jumpstart.accessors.JumpStartModelsAccessor._get_manifest") @patch("sagemaker.jumpstart.accessors.JumpStartModelsAccessor.get_model_specs") @@ -419,7 +433,9 @@ def get_manifest_more_versions(region: str = JUMPSTART_DEFAULT_REGION_NAME): for version in ["2.400.0", "1.4.0", "2.5.1", "1.300.0"] ] - patched_get_manifest.side_effect = get_manifest_more_versions + patched_get_manifest.side_effect = ( + lambda region, *args, **kwargs: get_manifest_more_versions(region) + ) 
assert [ ("catboost-classification-model", "2.400.0"), @@ -477,83 +493,87 @@ def get_manifest_more_versions(region: str = JUMPSTART_DEFAULT_REGION_NAME): ) == list_jumpstart_models(list_versions=True) @patch("sagemaker.jumpstart.accessors.JumpStartModelsAccessor._get_manifest") - @patch("sagemaker.jumpstart.accessors.JumpStartModelsAccessor.get_model_specs") + @patch("sagemaker.jumpstart.notebook_utils.DEFAULT_JUMPSTART_SAGEMAKER_SESSION.read_s3_file") def test_list_jumpstart_models_vulnerable_models( self, - patched_get_model_specs: Mock, + patched_read_s3_file: Mock, patched_get_manifest: Mock, ): - patched_get_manifest.side_effect = get_prototype_manifest + patched_get_manifest.side_effect = lambda region, *args, **kwargs: get_prototype_manifest( + region + ) - def vulnerable_inference_model_spec(*args, **kwargs): - spec = get_prototype_model_spec(*args, **kwargs) + def vulnerable_inference_model_spec(bucket, key, *args, **kwargs) -> str: + spec = get_prototype_model_spec(None, "pytorch-eqa-bert-base-cased") spec.inference_vulnerable = True - return spec + return json.dumps(spec.to_json()) - def vulnerable_training_model_spec(*args, **kwargs): - spec = get_prototype_model_spec(*args, **kwargs) + def vulnerable_training_model_spec(bucket, key, *args, **kwargs): + spec = get_prototype_model_spec(None, "pytorch-eqa-bert-base-cased") spec.training_vulnerable = True - return spec + return json.dumps(spec.to_json()) - patched_get_model_specs.side_effect = vulnerable_inference_model_spec + patched_read_s3_file.side_effect = vulnerable_inference_model_spec num_specs = len(PROTOTYPICAL_MODEL_SPECS_DICT) assert [] == list_jumpstart_models( And("inference_vulnerable is false", "training_vulnerable is false") ) - assert patched_get_model_specs.call_count == num_specs + assert patched_read_s3_file.call_count == num_specs patched_get_manifest.assert_called_once() patched_get_manifest.reset_mock() - patched_get_model_specs.reset_mock() + patched_read_s3_file.reset_mock() - patched_get_model_specs.side_effect = vulnerable_training_model_spec + patched_read_s3_file.side_effect = vulnerable_training_model_spec assert [] == list_jumpstart_models( And("inference_vulnerable is false", "training_vulnerable is false") ) - assert patched_get_model_specs.call_count == num_specs + assert patched_read_s3_file.call_count == num_specs patched_get_manifest.assert_called_once() patched_get_manifest.reset_mock() - patched_get_model_specs.reset_mock() + patched_read_s3_file.reset_mock() assert [] != list_jumpstart_models() - assert patched_get_model_specs.call_count == 0 + assert patched_read_s3_file.call_count == 0 @patch("sagemaker.jumpstart.accessors.JumpStartModelsAccessor._get_manifest") - @patch("sagemaker.jumpstart.accessors.JumpStartModelsAccessor.get_model_specs") + @patch("sagemaker.jumpstart.notebook_utils.DEFAULT_JUMPSTART_SAGEMAKER_SESSION.read_s3_file") def test_list_jumpstart_models_deprecated_models( self, - patched_get_model_specs: Mock, + patched_read_s3_file: Mock, patched_get_manifest: Mock, ): - patched_get_manifest.side_effect = get_prototype_manifest + patched_get_manifest.side_effect = lambda region, *args, **kwargs: get_prototype_manifest( + region + ) - def deprecated_model_spec(*args, **kwargs): - spec = get_prototype_model_spec(*args, **kwargs) + def deprecated_model_spec(bucket, key, *args, **kwargs) -> str: + spec = get_prototype_model_spec(None, "pytorch-eqa-bert-base-cased") spec.deprecated = True - return spec + return json.dumps(spec.to_json()) - patched_get_model_specs.side_effect = 
deprecated_model_spec + patched_read_s3_file.side_effect = deprecated_model_spec num_specs = len(PROTOTYPICAL_MODEL_SPECS_DICT) assert [] == list_jumpstart_models("deprecated equals false") - assert patched_get_model_specs.call_count == num_specs + assert patched_read_s3_file.call_count == num_specs patched_get_manifest.assert_called_once() patched_get_manifest.reset_mock() - patched_get_model_specs.reset_mock() + patched_read_s3_file.reset_mock() assert [] != list_jumpstart_models() - assert patched_get_model_specs.call_count == 0 + assert patched_read_s3_file.call_count == 0 @patch("sagemaker.jumpstart.accessors.JumpStartModelsAccessor._get_manifest") @patch("sagemaker.jumpstart.accessors.JumpStartModelsAccessor.get_model_specs") @@ -563,7 +583,9 @@ def test_list_jumpstart_models_no_versions( patched_get_manifest: Mock, ): patched_get_model_specs.side_effect = get_prototype_model_spec - patched_get_manifest.side_effect = get_prototype_manifest + patched_get_manifest.side_effect = lambda region, *args, **kwargs: get_prototype_manifest( + region + ) all_model_ids = [ "catboost-classification-model", @@ -581,14 +603,18 @@ def test_list_jumpstart_models_no_versions( assert list_jumpstart_models(list_versions=False) == all_model_ids @patch("sagemaker.jumpstart.accessors.JumpStartModelsAccessor._get_manifest") - @patch("sagemaker.jumpstart.accessors.JumpStartModelsAccessor.get_model_specs") + @patch("sagemaker.jumpstart.notebook_utils.DEFAULT_JUMPSTART_SAGEMAKER_SESSION.read_s3_file") def test_list_jumpstart_models_complex_queries( self, - patched_get_model_specs: Mock, + patched_read_s3_file: Mock, patched_get_manifest: Mock, ): - patched_get_model_specs.side_effect = get_prototype_model_spec - patched_get_manifest.side_effect = get_prototype_manifest + patched_read_s3_file.side_effect = lambda *args, **kwargs: json.dumps( + get_prototype_model_spec(None, "pytorch-eqa-bert-base-cased").to_json() + ) + patched_get_manifest.side_effect = lambda region, *args, **kwargs: get_prototype_manifest( + region + ) assert list_jumpstart_models( Or( @@ -631,7 +657,9 @@ def test_list_jumpstart_models_multiple_level_index( patched_get_manifest: Mock, ): patched_get_model_specs.side_effect = get_prototype_model_spec - patched_get_manifest.side_effect = get_prototype_manifest + patched_get_manifest.side_effect = lambda region, *args, **kwargs: get_prototype_manifest( + region + ) with pytest.raises(NotImplementedError): list_jumpstart_models("hosting_ecr_specs.py_version == py3") @@ -665,5 +693,8 @@ def test_get_model_url( get_model_url(model_id, version, region=region) patched_get_model_specs.assert_called_once_with( - model_id=model_id, version=version, region=region + model_id=model_id, + version=version, + region=region, + s3_client=DEFAULT_JUMPSTART_SAGEMAKER_SESSION.s3_client, ) From b6227f602dd37fea71e51857defd1e88d498f60f Mon Sep 17 00:00:00 2001 From: qidewenwhen <32910701+qidewenwhen@users.noreply.github.com> Date: Wed, 10 Jan 2024 09:41:00 -0800 Subject: [PATCH 45/76] fix: change ConditionNot incorrect property Expression to Condition (#4351) --- src/sagemaker/local/pipeline.py | 2 +- src/sagemaker/workflow/conditions.py | 2 +- tests/integ/sagemaker/workflow/test_fail_steps.py | 7 ++++--- tests/unit/sagemaker/workflow/test_condition_step.py | 10 +++++----- tests/unit/sagemaker/workflow/test_conditions.py | 4 ++-- 5 files changed, 13 insertions(+), 12 deletions(-) diff --git a/src/sagemaker/local/pipeline.py b/src/sagemaker/local/pipeline.py index c1d049de4d..4b9209fc0b 100644 --- 
a/src/sagemaker/local/pipeline.py +++ b/src/sagemaker/local/pipeline.py @@ -444,7 +444,7 @@ def _resolve_not_condition(self, not_condition: dict): True if given ConditionNot evaluated as true, False otherwise. """ - return not self._resolve_condition(not_condition["Expression"]) + return not self._resolve_condition(not_condition["Condition"]) def _resolve_or_condition(self, or_condition: dict): """Resolve given ConditionOr. diff --git a/src/sagemaker/workflow/conditions.py b/src/sagemaker/workflow/conditions.py index 4cdec057f4..4b4996a7fa 100644 --- a/src/sagemaker/workflow/conditions.py +++ b/src/sagemaker/workflow/conditions.py @@ -259,7 +259,7 @@ def __init__(self, expression: Condition): def to_request(self) -> RequestType: """Get the request structure for workflow service calls.""" - return {"Type": self.condition_type.value, "Expression": self.expression.to_request()} + return {"Type": self.condition_type.value, "Condition": self.expression.to_request()} @property def _referenced_steps(self) -> List[str]: diff --git a/tests/integ/sagemaker/workflow/test_fail_steps.py b/tests/integ/sagemaker/workflow/test_fail_steps.py index 5f8c1e04ab..af9ed5368e 100644 --- a/tests/integ/sagemaker/workflow/test_fail_steps.py +++ b/tests/integ/sagemaker/workflow/test_fail_steps.py @@ -17,7 +17,7 @@ from tests.integ.sagemaker.workflow.helpers import wait_pipeline_execution from sagemaker import get_execution_role, utils from sagemaker.workflow.condition_step import ConditionStep -from sagemaker.workflow.conditions import ConditionEquals +from sagemaker.workflow.conditions import ConditionEquals, ConditionNot from sagemaker.workflow.fail_step import FailStep from sagemaker.workflow.functions import Join @@ -37,14 +37,15 @@ def pipeline_name(): def test_two_step_fail_pipeline_with_str_err_msg(sagemaker_session, role, pipeline_name): param = ParameterInteger(name="MyInt", default_value=2) - cond = ConditionEquals(left=param, right=1) + cond_equal = ConditionEquals(left=param, right=2) + cond_not_equal = ConditionNot(cond_equal) step_fail = FailStep( name="FailStep", error_message="Failed due to hitting in else branch", ) step_cond = ConditionStep( name="CondStep", - conditions=[cond], + conditions=[cond_not_equal], if_steps=[], else_steps=[step_fail], ) diff --git a/tests/unit/sagemaker/workflow/test_condition_step.py b/tests/unit/sagemaker/workflow/test_condition_step.py index 7ac335fbc3..315d549cce 100644 --- a/tests/unit/sagemaker/workflow/test_condition_step.py +++ b/tests/unit/sagemaker/workflow/test_condition_step.py @@ -202,7 +202,7 @@ def test_pipeline_condition_step_interpolated(sagemaker_session): }, { "Type": "Not", - "Expression": { + "Condition": { "Type": "Equals", "LeftValue": {"Get": "Parameters.MyInt1"}, "RightValue": {"Get": "Parameters.MyInt2"}, @@ -210,7 +210,7 @@ def test_pipeline_condition_step_interpolated(sagemaker_session): }, { "Type": "Not", - "Expression": { + "Condition": { "Type": "In", "QueryValue": {"Get": "Parameters.MyStr"}, "Values": ["abc", "def"], @@ -533,9 +533,9 @@ def func2(): assert len(step_dsl["Arguments"]["Conditions"]) == 1 condition_dsl = step_dsl["Arguments"]["Conditions"][0] assert condition_dsl["Type"] == "Not" - cond_expr_dsl = condition_dsl["Expression"] + cond_expr_dsl = condition_dsl["Condition"] assert cond_expr_dsl["Type"] == "Not" - cond_inner_expr_dsl = cond_expr_dsl["Expression"] + cond_inner_expr_dsl = cond_expr_dsl["Condition"] assert cond_inner_expr_dsl["Type"] == "Or" assert len(cond_inner_expr_dsl["Conditions"]) == 2 assert 
cond_inner_expr_dsl["Conditions"][0]["LeftValue"] == _get_expected_jsonget_expr( @@ -602,7 +602,7 @@ def func4(): assert len(step_dsl["Arguments"]["Conditions"]) == 1 condition_dsl = step_dsl["Arguments"]["Conditions"][0] assert condition_dsl["Type"] == "Not" - cond_expr_dsl = condition_dsl["Expression"] + cond_expr_dsl = condition_dsl["Condition"] assert cond_expr_dsl["Type"] == "In" assert cond_expr_dsl["QueryValue"] == _get_expected_jsonget_expr( step_name=step_output3._step.name, path="Result" diff --git a/tests/unit/sagemaker/workflow/test_conditions.py b/tests/unit/sagemaker/workflow/test_conditions.py index a7ec9c0c11..941a191856 100644 --- a/tests/unit/sagemaker/workflow/test_conditions.py +++ b/tests/unit/sagemaker/workflow/test_conditions.py @@ -122,7 +122,7 @@ def test_condition_not(): cond_not = ConditionNot(expression=cond_eq) assert cond_not.to_request() == { "Type": "Not", - "Expression": { + "Condition": { "Type": "Equals", "LeftValue": param, "RightValue": "foo", @@ -136,7 +136,7 @@ def test_condition_not_in(): cond_not = ConditionNot(expression=cond_in) assert cond_not.to_request() == { "Type": "Not", - "Expression": { + "Condition": { "Type": "In", "QueryValue": param, "Values": ["abc", "def"], From afc6b72ece0124a6340a8871142e0fa2c8ca7390 Mon Sep 17 00:00:00 2001 From: Mufaddal Rohawala <89424143+mufaddal-rohawala@users.noreply.github.com> Date: Thu, 11 Jan 2024 10:51:04 -0800 Subject: [PATCH 46/76] fix: Huggingface glue failing tests (#4367) * fix: Huggingface glue failing tests * fix: Sphinx doc build failure * fix: Huggingface glue failing tests * fix: failing sphinx tests * fix: failing sphinx tests * fix: failing black check * fix: sphinx doc errors * fix: sphinx doc errors * sphinx * black-format * sphinx * sphinx * sphinx --------- Co-authored-by: Mufaddal Rohawala Co-authored-by: Erick Benitez-Ramos --- doc/conf.py | 2 +- doc/requirements.txt | 2 +- .../feature_processor/feature_processor.py | 2 +- src/sagemaker/jumpstart/estimator.py | 8 ++++---- src/sagemaker/session.py | 12 +++++------- tests/data/huggingface/requirements.txt | 1 + tests/data/huggingface_byoc/requirements.txt | 2 +- tests/integ/test_huggingface.py | 3 ++- tests/integ/test_huggingface_torch_distributed.py | 4 ++-- 9 files changed, 18 insertions(+), 18 deletions(-) create mode 100644 tests/data/huggingface/requirements.txt diff --git a/doc/conf.py b/doc/conf.py index d1ce73cb90..94a5c4d9c6 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -94,7 +94,7 @@ } # Example configuration for intersphinx: refer to the Python standard library. -intersphinx_mapping = {"http://docs.python.org/": None} +intersphinx_mapping = {"python": ("http://docs.python.org/", None)} # -- Options for autodoc ---------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/extensions/autodoc.html#configuration diff --git a/doc/requirements.txt b/doc/requirements.txt index 365a7c1272..62541ef4e1 100644 --- a/doc/requirements.txt +++ b/doc/requirements.txt @@ -1,4 +1,4 @@ -sphinx==3.1.2 +sphinx==3.4.3 sphinx-rtd-theme==0.5.0 docutils==0.15.2 packaging==20.9 diff --git a/src/sagemaker/feature_store/feature_processor/feature_processor.py b/src/sagemaker/feature_store/feature_processor/feature_processor.py index e957dbd0ea..95e1dd297c 100644 --- a/src/sagemaker/feature_store/feature_processor/feature_processor.py +++ b/src/sagemaker/feature_store/feature_processor/feature_processor.py @@ -79,7 +79,7 @@ def transform(input_feature_group, input_csv): return ... 
Args: - inputs (Sequence[Union[FeatureGroupDataSource, CSVDataSource, ParquetDataSource, + inputs (Sequence[Union[FeatureGroupDataSource, CSVDataSource, ParquetDataSource,\ BaseDataSource]]): A list of data sources. output (str): A Feature Group ARN to write results of this function to. target_stores (Optional[list[str]], optional): A list containing at least one of diff --git a/src/sagemaker/jumpstart/estimator.py b/src/sagemaker/jumpstart/estimator.py index 36a188ed55..6c374b7e09 100644 --- a/src/sagemaker/jumpstart/estimator.py +++ b/src/sagemaker/jumpstart/estimator.py @@ -251,8 +251,8 @@ def __init__( (Default: None). model_channel_name (Optional[Union[str, PipelineVariable]]): Name of the channel where 'model_uri' will be downloaded. (Default: None). - metric_definitions (Optional[Union[list[dict[str, str], list[dict[str, - PipelineVariable]]]]): A list of dictionaries that defines the metric(s) + metric_definitions (Optional[list[dict[str, Union[str, PipelineVariable]]]]): + A list of dictionaries that defines the metric(s) used to evaluate the training jobs. Each dictionary contains two keys: 'Name' for the name of the metric, and 'Regex' for the regular expression used to extract the metric from the logs. This should be defined only for jobs that @@ -292,8 +292,8 @@ def __init__( SageMaker Debugger rules for real-time analysis (Default: None). For more information, see `Continuous analyses through rules - `_. + `_. (Default: None). debugger_hook_config (Optional[Union[DebuggerHookConfig, bool]]): Configuration for how debugging information is emitted with diff --git a/src/sagemaker/session.py b/src/sagemaker/session.py index 2cf7e78f41..ac1bf6e343 100644 --- a/src/sagemaker/session.py +++ b/src/sagemaker/session.py @@ -4565,20 +4565,18 @@ def update_inference_component( Args: inference_component_name (str): Name of the Amazon SageMaker ``InferenceComponent``. specification ([dict[str,int]]): Resource configuration. Optional. - Example: { + Example: { "MinMemoryRequiredInMb": 1024, "NumberOfCpuCoresRequired": 1, "NumberOfAcceleratorDevicesRequired": 1, "MaxMemoryRequiredInMb": 4096, - }, - + }, runtime_config ([dict[str,int]]): Number of copies. Optional. - Default: { + Default: { "copyCount": 1 - } - + } wait: Wait for inference component to be created before return. Optional. Default is - True. + True. 
Return: str: inference component name diff --git a/tests/data/huggingface/requirements.txt b/tests/data/huggingface/requirements.txt new file mode 100644 index 0000000000..d40b9acb97 --- /dev/null +++ b/tests/data/huggingface/requirements.txt @@ -0,0 +1 @@ +datasets==2.16.1 diff --git a/tests/data/huggingface_byoc/requirements.txt b/tests/data/huggingface_byoc/requirements.txt index fed4662285..6845f50254 100644 --- a/tests/data/huggingface_byoc/requirements.txt +++ b/tests/data/huggingface_byoc/requirements.txt @@ -1,2 +1,2 @@ transformers<=4.28.1 -datasets<=2.12.0 +datasets==2.16.1 diff --git a/tests/integ/test_huggingface.py b/tests/integ/test_huggingface.py index c77ade62ee..a8be54c4d4 100644 --- a/tests/integ/test_huggingface.py +++ b/tests/integ/test_huggingface.py @@ -71,7 +71,8 @@ def test_huggingface_training( hf = HuggingFace( py_version=huggingface_pytorch_latest_training_py_version, - entry_point=os.path.join(data_path, "run_glue.py"), + source_dir=data_path, + entry_point="run_glue.py", role="SageMakerRole", transformers_version=huggingface_training_latest_version, pytorch_version=huggingface_training_pytorch_latest_version, diff --git a/tests/integ/test_huggingface_torch_distributed.py b/tests/integ/test_huggingface_torch_distributed.py index 0f78154ff8..733f59494c 100644 --- a/tests/integ/test_huggingface_torch_distributed.py +++ b/tests/integ/test_huggingface_torch_distributed.py @@ -24,10 +24,10 @@ def test_huggingface_torch_distributed_g5_glue( huggingface_pytorch_latest_training_py_version, ): with timeout.timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES): - data_path = os.path.join(DATA_DIR, "huggingface") estimator = HuggingFace( py_version=huggingface_pytorch_latest_training_py_version, - entry_point=os.path.join(data_path, "run_glue.py"), + source_dir=os.path.join(DATA_DIR, "huggingface"), + entry_point="run_glue.py", role="SageMakerRole", transformers_version=huggingface_training_latest_version, pytorch_version=huggingface_training_pytorch_latest_version, From 9deefd72f5b310a777c9de1583e17ef08cf06866 Mon Sep 17 00:00:00 2001 From: Sirut Buasai <73297481+sirutBuasai@users.noreply.github.com> Date: Mon, 15 Jan 2024 10:49:38 -0800 Subject: [PATCH 47/76] fix: Add PyTorch 2.1.0 SM Training DLC to UNSUPPORTED_DLC_IMAGE_FOR_SM_PARALLELISM list (#4356) * add 2.1 unsupported smddp * formatting --- src/sagemaker/estimator.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/sagemaker/estimator.py b/src/sagemaker/estimator.py index f899570775..9cadebf3d6 100644 --- a/src/sagemaker/estimator.py +++ b/src/sagemaker/estimator.py @@ -3282,7 +3282,11 @@ class Framework(EstimatorBase): """ _framework_name = None - UNSUPPORTED_DLC_IMAGE_FOR_SM_PARALLELISM = ("2.0.1-gpu-py310-cu121", "2.0-gpu-py310-cu121") + UNSUPPORTED_DLC_IMAGE_FOR_SM_PARALLELISM = ( + "2.0.1-gpu-py310-cu121", + "2.0-gpu-py310-cu121", + "2.1.0-gpu-py310", + ) def __init__( self, From 107ba5890b04cd237db171512881ef7541dbc5ce Mon Sep 17 00:00:00 2001 From: qidewenwhen <32910701+qidewenwhen@users.noreply.github.com> Date: Tue, 16 Jan 2024 11:47:55 -0800 Subject: [PATCH 48/76] feat: Support custom repack model settings (#4328) --- src/sagemaker/workflow/_utils.py | 4 +- src/sagemaker/workflow/model_step.py | 84 +++- .../sagemaker/workflow/test_model_step.py | 397 ++++++++++++------ 3 files changed, 353 insertions(+), 132 deletions(-) diff --git a/src/sagemaker/workflow/_utils.py b/src/sagemaker/workflow/_utils.py index 4993493513..9c4fa114ab 100644 --- a/src/sagemaker/workflow/_utils.py +++ 
b/src/sagemaker/workflow/_utils.py @@ -172,8 +172,8 @@ def __init__( # the real estimator and inputs repacker = SKLearn( - framework_version=FRAMEWORK_VERSION, - instance_type=INSTANCE_TYPE, + framework_version=kwargs.pop("framework_version", None) or FRAMEWORK_VERSION, + instance_type=kwargs.pop("instance_type", None) or INSTANCE_TYPE, entry_point=REPACK_SCRIPT_LAUNCHER, source_dir=self._source_dir, dependencies=self._dependencies, diff --git a/src/sagemaker/workflow/model_step.py b/src/sagemaker/workflow/model_step.py index d0872d5d9d..0ef77a5cd0 100644 --- a/src/sagemaker/workflow/model_step.py +++ b/src/sagemaker/workflow/model_step.py @@ -29,6 +29,9 @@ _REGISTER_MODEL_NAME_BASE = "RegisterModel" _CREATE_MODEL_NAME_BASE = "CreateModel" _REPACK_MODEL_NAME_BASE = "RepackModel" +_IGNORED_REPACK_PARAM_LIST = ["entry_point", "source_dir", "hyperparameters", "dependencies"] + +logger = logging.getLogger(__name__) class ModelStep(StepCollection): @@ -42,6 +45,7 @@ def __init__( retry_policies: Optional[Union[List[RetryPolicy], Dict[str, List[RetryPolicy]]]] = None, display_name: Optional[str] = None, description: Optional[str] = None, + repack_model_step_settings: Optional[Dict[str, any]] = None, ): """Constructs a `ModelStep`. @@ -115,6 +119,15 @@ def __init__( display_name (str): The display name of the `ModelStep`. The display name provides better UI readability. (default: None). description (str): The description of the `ModelStep` (default: None). + repack_model_step_settings (Dict[str, any]): The kwargs passed to the _RepackModelStep + to customize the configuration of the underlying repack model job (default: None). + Notes: + 1. If the _RepackModelStep is unnecessary, the settings will be ignored. + 2. If the _RepackModelStep is added, the repack_model_step_settings + is honored if set. + 3. In repack_model_step_settings, the arguments with misspelled keys will be + ignored. Please refer to the expected parameters of repack model job in + :class:`~sagemaker.sklearn.estimator.SKLearn` and its base classes. """ from sagemaker.workflow.utilities import validate_step_args_input @@ -148,6 +161,9 @@ def __init__( self.display_name = display_name self.description = description self.steps: List[Step] = [] + self._repack_model_step_settings = ( + dict(repack_model_step_settings) if repack_model_step_settings else {} + ) self._model = step_args.model self._create_model_args = self.step_args.create_model_request self._register_model_args = self.step_args.create_model_package_request @@ -157,6 +173,12 @@ def __init__( if self._need_runtime_repack: self._append_repack_model_step() + elif self._repack_model_step_settings: + logger.warning( + "Non-empty repack_model_step_settings is supplied but no repack model " + "step is needed. Ignoring the repack_model_step_settings." 
+ ) + if self._register_model_args: self._append_register_model_step() else: @@ -235,14 +257,12 @@ def _append_repack_model_step(self): elif isinstance(self._model, Model): model_list = [self._model] else: - logging.warning("No models to repack") + logger.warning("No models to repack") return - security_group_ids = None - subnets = None - if self._model.vpc_config: - security_group_ids = self._model.vpc_config.get("SecurityGroupIds", None) - subnets = self._model.vpc_config.get("Subnets", None) + self._pop_out_non_configurable_repack_model_step_args() + + security_group_ids, subnets = self._resolve_repack_model_step_vpc_configs() for i, model in enumerate(model_list): runtime_repack_flg = ( @@ -252,8 +272,16 @@ def _append_repack_model_step(self): name_base = model.name or i repack_model_step = _RepackModelStep( name="{}-{}-{}".format(self.name, _REPACK_MODEL_NAME_BASE, name_base), - sagemaker_session=self._model.sagemaker_session or model.sagemaker_session, - role=self._model.role or model.role, + sagemaker_session=( + self._repack_model_step_settings.pop("sagemaker_session", None) + or self._model.sagemaker_session + or model.sagemaker_session + ), + role=( + self._repack_model_step_settings.pop("role", None) + or self._model.role + or model.role + ), model_data=model.model_data, entry_point=model.entry_point, source_dir=model.source_dir, @@ -266,8 +294,15 @@ def _append_repack_model_step(self): ), depends_on=self.depends_on, retry_policies=self._repack_model_retry_policies, - output_path=self._runtime_repack_output_prefix, - output_kms_key=model.model_kms_key, + output_path=( + self._repack_model_step_settings.pop("output_path", None) + or self._runtime_repack_output_prefix + ), + output_kms_key=( + self._repack_model_step_settings.pop("output_kms_key", None) + or model.model_kms_key + ), + **self._repack_model_step_settings ) self.steps.append(repack_model_step) @@ -282,3 +317,32 @@ def _append_repack_model_step(self): "InferenceSpecification" ]["Containers"][i] container["ModelDataUrl"] = repacked_model_data + + def _pop_out_non_configurable_repack_model_step_args(self): + """Pop out non-configurable args from _repack_model_step_settings""" + if not self._repack_model_step_settings: + return + for ignored_param in _IGNORED_REPACK_PARAM_LIST: + if self._repack_model_step_settings.pop(ignored_param, None): + logger.warning( + "The repack model step parameter - %s is not configurable. 
Ignoring it.", + ignored_param, + ) + + def _resolve_repack_model_step_vpc_configs(self): + """Resolve vpc configs for repack model step""" + # Note: the EstimatorBase constructor ensures that: + # "When setting up custom VPC, both subnets and security_group_ids must be set" + if self._repack_model_step_settings.get( + "security_group_ids", None + ) or self._repack_model_step_settings.get("subnets", None): + security_group_ids = self._repack_model_step_settings.pop("security_group_ids", None) + subnets = self._repack_model_step_settings.pop("subnets", None) + return security_group_ids, subnets + + if self._model.vpc_config: + security_group_ids = self._model.vpc_config.get("SecurityGroupIds", None) + subnets = self._model.vpc_config.get("Subnets", None) + return security_group_ids, subnets + + return None, None diff --git a/tests/unit/sagemaker/workflow/test_model_step.py b/tests/unit/sagemaker/workflow/test_model_step.py index f5f82e8a9f..4c8e9e0311 100644 --- a/tests/unit/sagemaker/workflow/test_model_step.py +++ b/tests/unit/sagemaker/workflow/test_model_step.py @@ -16,10 +16,12 @@ import os import tempfile import shutil +from typing import Union from mock import patch import pytest +from mock.mock import MagicMock from sagemaker import Model, PipelineModel, Session, Processor from sagemaker.chainer import ChainerModel @@ -34,9 +36,11 @@ from sagemaker.tensorflow import TensorFlowModel from sagemaker.transformer import Transformer from sagemaker.tuner import HyperparameterTuner +from sagemaker.workflow import _utils, is_pipeline_variable from sagemaker.workflow._utils import REPACK_SCRIPT_LAUNCHER from sagemaker.workflow.condition_step import ConditionStep from sagemaker.workflow.conditions import ConditionGreaterThanOrEqualTo +from sagemaker.workflow.entities import PipelineVariable from sagemaker.workflow.model_step import ( ModelStep, _REGISTER_MODEL_NAME_BASE, @@ -68,6 +72,11 @@ _REPACK_OUTPUT_KEY_PREFIX = "code-output" _MODEL_CODE_LOCATION = f"s3://{BUCKET}/{_REPACK_OUTPUT_KEY_PREFIX}" _MODEL_CODE_LOCATION_TRAILING_SLASH = _MODEL_CODE_LOCATION + "/" +_MODEL_KMS_KEY = "model-kms-key" +_MODEL_VPC_CONFIG = { + "SecurityGroupIds": ["model-security-group-1", "model-security-group-2"], + "Subnets": ["model-subnet-1", "model-subnet-2"], +} @pytest.fixture @@ -150,13 +159,15 @@ def test_register_model_with_runtime_repack(pipeline_session, model_data_param, assert arguments["InputDataConfig"][0]["DataSource"]["S3DataSource"]["S3Uri"] == { "Get": "Parameters.ModelData" } - assert arguments["HyperParameters"]["inference_script"] == '"inference.py"' - assert arguments["HyperParameters"]["model_archive"] == {"Get": "Parameters.ModelData"} - assert ( - arguments["HyperParameters"]["sagemaker_program"] == f'"{REPACK_SCRIPT_LAUNCHER}"' + + _validate_repack_job_non_configurable_args( + arguments=arguments, + expected_script_name=f'"{_SCRIPT_NAME}"', + expected_model_archive={"Get": "Parameters.ModelData"}, + expected_entry_point=f'"{REPACK_SCRIPT_LAUNCHER}"', + expected_dependencies="null", ) assert "s3://" in arguments["HyperParameters"]["sagemaker_submit_directory"] - assert arguments["HyperParameters"]["dependencies"] == "null" assert step["RetryPolicies"] == [ { "BackoffRate": 2.0, @@ -166,6 +177,10 @@ def test_register_model_with_runtime_repack(pipeline_session, model_data_param, } ] assert "repack a model with customer scripts" in step["Description"] + _validate_repack_job_default_cfgs_for_configurable_args( + arguments=arguments, model=model + ) + elif step["Type"] == "RegisterModel": assert 
step["Name"] == f"MyModelStep-{_REGISTER_MODEL_NAME_BASE}" assert not step.get("DependsOn", None) @@ -189,6 +204,7 @@ def test_register_model_with_runtime_repack(pipeline_session, model_data_param, } ] assert "my model step description" in step["Description"] + else: raise Exception("A step exists in the collection of an invalid type.") @@ -233,13 +249,15 @@ def test_create_model_with_runtime_repack(pipeline_session, model_data_param, mo assert arguments["InputDataConfig"][0]["DataSource"]["S3DataSource"]["S3Uri"] == { "Get": "Parameters.ModelData" } - assert arguments["HyperParameters"]["inference_script"] == '"inference.py"' - assert arguments["HyperParameters"]["model_archive"] == {"Get": "Parameters.ModelData"} - assert ( - arguments["HyperParameters"]["sagemaker_program"] == f'"{REPACK_SCRIPT_LAUNCHER}"' + + _validate_repack_job_non_configurable_args( + arguments=arguments, + expected_script_name=f'"{_SCRIPT_NAME}"', + expected_model_archive={"Get": "Parameters.ModelData"}, + expected_entry_point=f'"{REPACK_SCRIPT_LAUNCHER}"', + expected_dependencies="null", ) assert "s3://" in arguments["HyperParameters"]["sagemaker_submit_directory"] - assert arguments["HyperParameters"]["dependencies"] == "null" assert "repack a model with customer scripts" in step["Description"] assert step["RetryPolicies"] == [ { @@ -249,6 +267,10 @@ def test_create_model_with_runtime_repack(pipeline_session, model_data_param, mo "ExceptionType": ["Step.THROTTLING"], } ] + _validate_repack_job_default_cfgs_for_configurable_args( + arguments=arguments, model=model + ) + elif step["Type"] == "Model": assert step["Name"] == f"MyModelStep-{_CREATE_MODEL_NAME_BASE}" arguments = step["Arguments"] @@ -284,7 +306,7 @@ def test_create_pipeline_model_with_runtime_repack(pipeline_session, model_data_ sparkml_model = SparkMLModel( name="MySparkMLModel", model_data=model_data_param, - role=ROLE, + role="AnotherRoleWontBeHonored", sagemaker_session=pipeline_session, env={"SAGEMAKER_DEFAULT_INVOCATIONS_ACCEPT": "text/csv"}, ) @@ -326,13 +348,15 @@ def test_create_pipeline_model_with_runtime_repack(pipeline_session, model_data_ assert arguments["InputDataConfig"][0]["DataSource"]["S3DataSource"]["S3Uri"] == { "Get": "Parameters.ModelData" } - assert arguments["HyperParameters"]["inference_script"] == f'"{_SCRIPT_NAME}"' - assert arguments["HyperParameters"]["model_archive"] == {"Get": "Parameters.ModelData"} - assert ( - arguments["HyperParameters"]["sagemaker_program"] == f'"{REPACK_SCRIPT_LAUNCHER}"' + + _validate_repack_job_non_configurable_args( + arguments=arguments, + expected_script_name=f'"{_SCRIPT_NAME}"', + expected_model_archive={"Get": "Parameters.ModelData"}, + expected_entry_point=f'"{REPACK_SCRIPT_LAUNCHER}"', + expected_dependencies="null", ) assert "s3://" in arguments["HyperParameters"]["sagemaker_submit_directory"] - assert arguments["HyperParameters"]["dependencies"] == "null" assert step["RetryPolicies"] == [ { "BackoffRate": 2.0, @@ -341,6 +365,10 @@ def test_create_pipeline_model_with_runtime_repack(pipeline_session, model_data_ "ExceptionType": ["SageMaker.CAPACITY_ERROR"], } ] + _validate_repack_job_default_cfgs_for_configurable_args( + arguments=arguments, model=model + ) + elif step["Type"] == "Model": assert step["Name"] == f"MyModelStep-{_CREATE_MODEL_NAME_BASE}" arguments = step["Arguments"] @@ -364,6 +392,7 @@ def test_create_pipeline_model_with_runtime_repack(pipeline_session, model_data_ "ExceptionType": ["Step.THROTTLING"], } ] + else: raise Exception("A step exists in the collection of an 
invalid type.") adjacency_list = PipelineGraph.from_pipeline(pipeline).adjacency_list @@ -432,13 +461,19 @@ def test_register_pipeline_model_with_runtime_repack( assert arguments["InputDataConfig"][0]["DataSource"]["S3DataSource"]["S3Uri"] == { "Get": "Parameters.ModelData" } - assert arguments["HyperParameters"]["inference_script"] == f'"{_SCRIPT_NAME}"' - assert arguments["HyperParameters"]["model_archive"] == {"Get": "Parameters.ModelData"} - assert ( - arguments["HyperParameters"]["sagemaker_program"] == f'"{REPACK_SCRIPT_LAUNCHER}"' + + _validate_repack_job_non_configurable_args( + arguments=arguments, + expected_script_name=f'"{_SCRIPT_NAME}"', + expected_model_archive={"Get": "Parameters.ModelData"}, + expected_entry_point=f'"{REPACK_SCRIPT_LAUNCHER}"', + expected_dependencies="null", ) assert "s3://" in arguments["HyperParameters"]["sagemaker_submit_directory"] - assert arguments["HyperParameters"]["dependencies"] == "null" + _validate_repack_job_default_cfgs_for_configurable_args( + arguments=arguments, model=model + ) + elif step["Type"] == "RegisterModel": assert step["Name"] == f"MyModelStep-{_REGISTER_MODEL_NAME_BASE}" arguments = step["Arguments"] @@ -519,8 +554,8 @@ def test_register_model_without_repack(pipeline_session, source_dir): assert ordered(adjacency_list) == ordered({"MyModelStep-RegisterModel": []}) -@patch("sagemaker.utils.repack_model") -def test_create_model_with_compile_time_repack(mock_repack, pipeline_session, source_dir): +@patch("sagemaker.utils.repack_model", MagicMock()) +def test_create_model_with_compile_time_repack(pipeline_session, source_dir): custom_step = CustomStep("TestStep") model_name = "MyModel" model = Model( @@ -658,118 +693,193 @@ def test_conditional_model_create_and_regis( ) -@pytest.mark.parametrize( - "test_input", - [ - ( - SKLearnModel( - name="MySKModel", - model_data="dummy_model_data", - image_uri=IMAGE_URI, - entry_point=f"{DATA_DIR}/workflow/{_SCRIPT_NAME}", - role=ROLE, - enable_network_isolation=True, - code_location=_MODEL_CODE_LOCATION_TRAILING_SLASH, - ), - 2, +_REPACK_STEP_SETTING_INPUT_PARAM = [ + (None, False), + ( + { + # args below are unconfigurable so will be ignored + "entry_point": "invalid-entry-point", + "source_dir": "invalid-source-dir", + "hyperparameters": None, + "dependencies": None, + # args below are misspelled so will be ignored + "myInstanceType": "invalid-instance-type", + "roleArn": "invalid-role", + }, + False, + ), + ( + { + "output_path": f"s3://{BUCKET}/repack-model-output", + "code_location": f"s3://{BUCKET}/repack-model-code-location", + "instance_type": "ml.m4.xlarge", + "role": "arn:aws:iam::123412341234:role/new-role", + "volume_size": 50, + "output_kms_key": "repack-model-output-kms-key", + "security_group_ids": ["repack-model-security-group-a"], + "subnets": ["repack-model-subnet-a", "repack-model-subnet-b"], + }, + True, + ), +] +_MODEL_INPUT_PARAM = [ + ( + SKLearnModel( + name="MySKModel", + model_data="dummy_model_data", + image_uri=IMAGE_URI, + entry_point=f"{DATA_DIR}/workflow/{_SCRIPT_NAME}", + role=ROLE, + enable_network_isolation=True, + code_location=_MODEL_CODE_LOCATION_TRAILING_SLASH, + vpc_config=_MODEL_VPC_CONFIG, ), - ( - XGBoostModel( - name="MYXGBoostModel", - model_data="dummy_model_data", - framework_version="1.11.0", - image_uri=IMAGE_URI, - entry_point=f"{DATA_DIR}/workflow/{_SCRIPT_NAME}", - role=ROLE, - enable_network_isolation=False, - ), - 1, + 2, + ), + ( + XGBoostModel( + name="MYXGBoostModel", + model_data="dummy_model_data", + framework_version="1.11.0", + 
image_uri=IMAGE_URI, + entry_point=f"{DATA_DIR}/workflow/{_SCRIPT_NAME}", + role=ROLE, + enable_network_isolation=False, ), - ( - PyTorchModel( - name="MyPyTorchModel", - model_data="dummy_model_data", - image_uri=IMAGE_URI, - entry_point=f"{DATA_DIR}/workflow/{_SCRIPT_NAME}", - role=ROLE, - framework_version="1.5.0", - code_location=_MODEL_CODE_LOCATION_TRAILING_SLASH, - ), - 2, + 1, + ), + ( + PyTorchModel( + name="MyPyTorchModel", + model_data="dummy_model_data", + image_uri=IMAGE_URI, + entry_point=f"{DATA_DIR}/workflow/{_SCRIPT_NAME}", + role=ROLE, + framework_version="1.5.0", + code_location=_MODEL_CODE_LOCATION_TRAILING_SLASH, ), - ( - MXNetModel( - name="MyMXNetModel", - model_data="dummy_model_data", - image_uri=IMAGE_URI, - entry_point=f"{DATA_DIR}/workflow/{_SCRIPT_NAME}", - role=ROLE, - framework_version="1.2.0", - ), - 1, + 2, + ), + ( + MXNetModel( + name="MyMXNetModel", + model_data="dummy_model_data", + image_uri=IMAGE_URI, + entry_point=f"{DATA_DIR}/workflow/{_SCRIPT_NAME}", + role=ROLE, + framework_version="1.2.0", ), - ( - HuggingFaceModel( - name="MyHuggingFaceModel", - model_data="dummy_model_data", - image_uri=IMAGE_URI, - entry_point=f"{DATA_DIR}/workflow/{_SCRIPT_NAME}", - role=ROLE, - ), - 2, + 1, + ), + ( + HuggingFaceModel( + name="MyHuggingFaceModel", + model_data="dummy_model_data", + image_uri=IMAGE_URI, + entry_point=f"{DATA_DIR}/workflow/{_SCRIPT_NAME}", + role=ROLE, + model_kms_key=_MODEL_KMS_KEY, + vpc_config=_MODEL_VPC_CONFIG, ), - ( - TensorFlowModel( - name="MyTensorFlowModel", - model_data="dummy_model_data", - image_uri=IMAGE_URI, - entry_point=f"{DATA_DIR}/workflow/{_SCRIPT_NAME}", - role=ROLE, - code_location=_MODEL_CODE_LOCATION_TRAILING_SLASH, - ), - 2, + 2, + ), + ( + TensorFlowModel( + name="MyTensorFlowModel", + model_data="dummy_model_data", + image_uri=IMAGE_URI, + entry_point=f"{DATA_DIR}/workflow/{_SCRIPT_NAME}", + role=ROLE, + code_location=_MODEL_CODE_LOCATION_TRAILING_SLASH, + model_kms_key=_MODEL_KMS_KEY, ), - ( - ChainerModel( - name="MyChainerModel", - model_data="dummy_model_data", - image_uri=IMAGE_URI, - entry_point=f"{DATA_DIR}/workflow/{_SCRIPT_NAME}", - role=ROLE, - ), - 1, + 2, + ), + ( + ChainerModel( + name="MyChainerModel", + model_data="dummy_model_data", + image_uri=IMAGE_URI, + entry_point=f"{DATA_DIR}/workflow/{_SCRIPT_NAME}", + role=ROLE, ), - ], -) -def test_create_model_among_different_model_types(test_input, pipeline_session, model_data_param): - def assert_test_result(steps: list): + 1, + ), +] + + +@pytest.mark.parametrize("repack_step_setting_input", _REPACK_STEP_SETTING_INPUT_PARAM) +@pytest.mark.parametrize("model_input", _MODEL_INPUT_PARAM) +def test_model_step_among_different_model_types( + model_input, repack_step_setting_input, pipeline_session, model_data_param +): + def assert_test_result(model_step: ModelStep): + steps = model_step.request_dicts() # If expected_step_num is 2, it means a runtime repack step is appended # If expected_step_num is 1, it means no runtime repack is needed assert len(steps) == expected_step_num - if expected_step_num == 2: - assert steps[0]["Type"] == "Training" - if model.key_prefix is not None and model.key_prefix.startswith( - _REPACK_OUTPUT_KEY_PREFIX - ): - assert steps[0]["Arguments"]["OutputDataConfig"]["S3OutputPath"] == ( - f"{_MODEL_CODE_LOCATION}/{model.name}" - ) - else: - assert steps[0]["Arguments"]["OutputDataConfig"]["S3OutputPath"] == ( - f"s3://{BUCKET}/{model.name}" - ) + if expected_step_num != 2: + return + + assert steps[0]["Type"] == "Training" + arguments 
= steps[0]["Arguments"] + + _validate_repack_job_non_configurable_args( + arguments=arguments, + expected_script_name=f'"{_SCRIPT_NAME}"', + expected_model_archive={"Get": "Parameters.ModelData"}, + expected_entry_point=f'"{REPACK_SCRIPT_LAUNCHER}"', + expected_dependencies="null", + ) - model, expected_step_num = test_input + if valid_repack_args: + # validate repack job configurable args + assert ( + arguments["ResourceConfig"]["InstanceType"] + == repack_model_step_settings["instance_type"] + ) + assert ( + arguments["ResourceConfig"]["VolumeSizeInGB"] + == repack_model_step_settings["volume_size"] + ) + assert ( + arguments["OutputDataConfig"]["KmsKeyId"] + == repack_model_step_settings["output_kms_key"] + ) + assert arguments["RoleArn"] == repack_model_step_settings["role"] + assert ( + arguments["OutputDataConfig"]["S3OutputPath"] + == repack_model_step_settings["output_path"] + ) + assert ( + repack_model_step_settings["code_location"] + in arguments["HyperParameters"]["sagemaker_submit_directory"] + ) + assert ( + arguments["VpcConfig"]["SecurityGroupIds"] + == repack_model_step_settings["security_group_ids"] + ) + assert arguments["VpcConfig"]["Subnets"] == repack_model_step_settings["subnets"] + return + + _validate_repack_job_default_cfgs_for_configurable_args( + arguments=arguments, + model=model, + ) + + repack_model_step_settings, valid_repack_args = repack_step_setting_input + model, expected_step_num = model_input model.sagemaker_session = pipeline_session model.model_data = model_data_param create_model_step_args = model.create( instance_type="c4.4xlarge", ) - create_model_steps = ModelStep( + create_model_step = ModelStep( name="MyModelStep", step_args=create_model_step_args, + repack_model_step_settings=repack_model_step_settings, ) - assert_test_result(create_model_steps.request_dicts()) + assert_test_result(create_model_step) register_model_step_args = model.register( content_types=["text/csv"], @@ -778,11 +888,12 @@ def assert_test_result(steps: list): transform_instances=["ml.m5.xlarge"], model_package_group_name="MyModelPackageGroup", ) - register_model_steps = ModelStep( + register_model_step = ModelStep( name="MyModelStep", step_args=register_model_step_args, + repack_model_step_settings=repack_model_step_settings, ) - assert_test_result(register_model_steps.request_dicts()) + assert_test_result(register_model_step) @pytest.mark.parametrize( @@ -870,9 +981,9 @@ def assert_test_result(steps: list): ), ], ) -@patch("sagemaker.utils.repack_model") +@patch("sagemaker.utils.repack_model", MagicMock()) def test_request_compare_of_register_model_under_different_sessions( - mock_repack, test_input, pipeline_session, sagemaker_session, model_data_param + test_input, pipeline_session, sagemaker_session, model_data_param ): model_package_group_name = "TestModelPackageGroup" model, expect = test_input @@ -1171,3 +1282,49 @@ def test_create_model_step_using_custom_model_name(pipeline_session): steps = json.loads(pipeline.definition())["Steps"] assert len(steps) == 1 assert "ModelName" not in steps[0]["Arguments"] + + +def _validate_repack_job_default_cfgs_for_configurable_args( + arguments: dict, model: Union[Model, PipelineModel] +): + assert arguments["ResourceConfig"]["InstanceType"] == _utils.INSTANCE_TYPE + assert arguments["ResourceConfig"]["VolumeSizeInGB"] == 30 + assert arguments["RoleArn"] == ROLE + + if isinstance(model, Model) and model.model_kms_key: + assert arguments["OutputDataConfig"]["KmsKeyId"] == model.model_kms_key + else: + assert not 
arguments["OutputDataConfig"].get("KmsKeyId", None) + + if isinstance(model, Model) and model.vpc_config: + assert arguments["VpcConfig"]["SecurityGroupIds"] == model.vpc_config["SecurityGroupIds"] + assert arguments["VpcConfig"]["Subnets"] == model.vpc_config["Subnets"] + else: + assert not arguments.get("VpcConfig", None) + + if not model.name: + return + if model.key_prefix is not None and model.key_prefix.startswith(_REPACK_OUTPUT_KEY_PREFIX): + assert arguments["OutputDataConfig"]["S3OutputPath"] == ( + f"{_MODEL_CODE_LOCATION}/{model.name}" + ) + else: + assert arguments["OutputDataConfig"]["S3OutputPath"] == f"s3://{BUCKET}/{model.name}" + + +def _validate_repack_job_non_configurable_args( + arguments: dict, + expected_script_name: str, + expected_model_archive: Union[str, dict, PipelineVariable], + expected_entry_point: str, + expected_dependencies: str, +): + assert arguments["HyperParameters"]["inference_script"] == expected_script_name + assert arguments["HyperParameters"]["sagemaker_program"] == expected_entry_point + assert arguments["HyperParameters"]["dependencies"] == expected_dependencies + + model_archive = arguments["HyperParameters"]["model_archive"] + if is_pipeline_variable(model_archive): + assert model_archive.expr == expected_model_archive + else: + assert model_archive == expected_model_archive From bea480f8643abc4f07d727acfec24e5cb0db9817 Mon Sep 17 00:00:00 2001 From: Erick Benitez-Ramos <141277478+benieric@users.noreply.github.com> Date: Thu, 18 Jan 2024 11:30:00 -0800 Subject: [PATCH 49/76] change: update sphinx version (#4377) * change: update sphinx version * Update sphinx --- doc/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/requirements.txt b/doc/requirements.txt index 62541ef4e1..640a85ca08 100644 --- a/doc/requirements.txt +++ b/doc/requirements.txt @@ -1,4 +1,4 @@ -sphinx==3.4.3 +sphinx==5.1.1 sphinx-rtd-theme==0.5.0 docutils==0.15.2 packaging==20.9 From 8c4137fbacc74ea5e2131a46f7cb463281358d5a Mon Sep 17 00:00:00 2001 From: Sindhu Somasundaram <56774226+sindhuvahinis@users.noreply.github.com> Date: Thu, 18 Jan 2024 14:01:30 -0800 Subject: [PATCH 50/76] change: Updates for DJL 0.26.0 release (#4366) --- .../image_uri_config/djl-deepspeed.json | 32 +++++++++++++++++++ .../image_uri_config/djl-neuronx.json | 18 +++++++++++ .../image_uri_config/djl-tensorrtllm.json | 32 +++++++++++++++++++ 3 files changed, 82 insertions(+) diff --git a/src/sagemaker/image_uri_config/djl-deepspeed.json b/src/sagemaker/image_uri_config/djl-deepspeed.json index 1612d85ff7..fa41e9ed35 100644 --- a/src/sagemaker/image_uri_config/djl-deepspeed.json +++ b/src/sagemaker/image_uri_config/djl-deepspeed.json @@ -3,6 +3,38 @@ "inference" ], "versions": { + "0.26.0": { + "registries": { + "af-south-1": "626614931356", + "il-central-1": "780543022126", + "ap-east-1": "871362719292", + "ap-northeast-1": "763104351884", + "ap-northeast-2": "763104351884", + "ap-northeast-3": "364406365360", + "ap-south-1": "763104351884", + "ap-southeast-1": "763104351884", + "ap-southeast-2": "763104351884", + "ap-southeast-3": "907027046896", + "ca-central-1": "763104351884", + "cn-north-1": "727897471807", + "cn-northwest-1": "727897471807", + "eu-central-1": "763104351884", + "eu-north-1": "763104351884", + "eu-west-1": "763104351884", + "eu-west-2": "763104351884", + "eu-west-3": "763104351884", + "eu-south-1": "692866216735", + "me-south-1": "217643126080", + "sa-east-1": "763104351884", + "us-east-1": "763104351884", + "us-east-2": "763104351884", + 
"us-west-1": "763104351884", + "us-west-2": "763104351884", + "ca-west-1": "204538143572" + }, + "repository": "djl-inference", + "tag_prefix": "0.26.0-deepspeed0.12.6-cu121" + }, "0.25.0": { "registries": { "af-south-1": "626614931356", diff --git a/src/sagemaker/image_uri_config/djl-neuronx.json b/src/sagemaker/image_uri_config/djl-neuronx.json index b8c0f2be1a..d81622d9f9 100644 --- a/src/sagemaker/image_uri_config/djl-neuronx.json +++ b/src/sagemaker/image_uri_config/djl-neuronx.json @@ -3,6 +3,24 @@ "inference" ], "versions": { + "0.26.0": { + "registries": { + "ap-northeast-1": "763104351884", + "ap-south-1": "763104351884", + "ap-southeast-1": "763104351884", + "ap-southeast-2": "763104351884", + "eu-central-1": "763104351884", + "eu-west-1": "763104351884", + "eu-west-3": "763104351884", + "sa-east-1": "763104351884", + "us-east-1": "763104351884", + "us-east-2": "763104351884", + "us-west-2": "763104351884", + "ca-west-1": "204538143572" + }, + "repository": "djl-inference", + "tag_prefix": "0.26.0-neuronx-sdk2.16.0" + }, "0.25.0": { "registries": { "ap-northeast-1": "763104351884", diff --git a/src/sagemaker/image_uri_config/djl-tensorrtllm.json b/src/sagemaker/image_uri_config/djl-tensorrtllm.json index 545e49f630..dcafeefc31 100644 --- a/src/sagemaker/image_uri_config/djl-tensorrtllm.json +++ b/src/sagemaker/image_uri_config/djl-tensorrtllm.json @@ -3,6 +3,38 @@ "inference" ], "versions": { + "0.26.0": { + "registries": { + "af-south-1": "626614931356", + "il-central-1": "780543022126", + "ap-east-1": "871362719292", + "ap-northeast-1": "763104351884", + "ap-northeast-2": "763104351884", + "ap-northeast-3": "364406365360", + "ap-south-1": "763104351884", + "ap-southeast-1": "763104351884", + "ap-southeast-2": "763104351884", + "ap-southeast-3": "907027046896", + "ca-central-1": "763104351884", + "cn-north-1": "727897471807", + "cn-northwest-1": "727897471807", + "eu-central-1": "763104351884", + "eu-north-1": "763104351884", + "eu-west-1": "763104351884", + "eu-west-2": "763104351884", + "eu-west-3": "763104351884", + "eu-south-1": "692866216735", + "me-south-1": "217643126080", + "sa-east-1": "763104351884", + "us-east-1": "763104351884", + "us-east-2": "763104351884", + "us-west-1": "763104351884", + "us-west-2": "763104351884", + "ca-west-1": "204538143572" + }, + "repository": "djl-inference", + "tag_prefix": "0.26.0-tensorrtllm0.7.1-cu122" + }, "0.25.0": { "registries": { "af-south-1": "626614931356", From ceb33cdd9d08909dd338f2c38a07b9c44661b0d9 Mon Sep 17 00:00:00 2001 From: amzn-choeric <105388439+amzn-choeric@users.noreply.github.com> Date: Fri, 19 Jan 2024 13:04:58 -0500 Subject: [PATCH 51/76] change: TGI NeuronX (#4375) * TGI NeuronX * Update * Update --- src/sagemaker/huggingface/llm_utils.py | 8 ++++ .../huggingface-llm-neuronx.json | 41 +++++++++++++++++++ src/sagemaker/image_uris.py | 2 + .../image_uris/test_huggingface_llm.py | 31 +++++++++----- 4 files changed, 71 insertions(+), 11 deletions(-) create mode 100644 src/sagemaker/image_uri_config/huggingface-llm-neuronx.json diff --git a/src/sagemaker/huggingface/llm_utils.py b/src/sagemaker/huggingface/llm_utils.py index 65befe41b0..1a2abfb2e4 100644 --- a/src/sagemaker/huggingface/llm_utils.py +++ b/src/sagemaker/huggingface/llm_utils.py @@ -57,6 +57,14 @@ def get_huggingface_llm_image_uri( version=version, image_scope="inference", ) + if backend == "huggingface-neuronx": + return image_uris.retrieve( + "huggingface-llm-neuronx", + region=region, + version=version, + image_scope="inference", + 
inference_tool="neuronx", + ) if backend == "lmi": version = version or "0.24.0" return image_uris.retrieve(framework="djl-deepspeed", region=region, version=version) diff --git a/src/sagemaker/image_uri_config/huggingface-llm-neuronx.json b/src/sagemaker/image_uri_config/huggingface-llm-neuronx.json new file mode 100644 index 0000000000..a13336fb79 --- /dev/null +++ b/src/sagemaker/image_uri_config/huggingface-llm-neuronx.json @@ -0,0 +1,41 @@ +{ + "inference": { + "processors": [ + "inf2" + ], + "version_aliases": { + "0.0": "0.0.16" + }, + "versions": { + "0.0.16": { + "py_versions": [ + "py310" + ], + "registries": { + "ap-northeast-1": "763104351884", + "ap-south-1": "763104351884", + "ap-south-2": "772153158452", + "ap-southeast-1": "763104351884", + "ap-southeast-2": "763104351884", + "ap-southeast-4": "457447274322", + "eu-central-1": "763104351884", + "eu-central-2": "380420809688", + "eu-south-2": "503227376785", + "eu-west-1": "763104351884", + "eu-west-3": "763104351884", + "il-central-1": "780543022126", + "sa-east-1": "763104351884", + "us-east-1": "763104351884", + "us-east-2": "763104351884", + "us-west-2": "763104351884", + "ca-west-1": "204538143572" + }, + "tag_prefix": "1.13.1-optimum0.0.16", + "repository": "huggingface-pytorch-tgi-inference", + "container_version": { + "inf2": "ubuntu22.04" + } + } + } + } +} diff --git a/src/sagemaker/image_uris.py b/src/sagemaker/image_uris.py index 56e4bf346f..efcdc68b22 100644 --- a/src/sagemaker/image_uris.py +++ b/src/sagemaker/image_uris.py @@ -37,6 +37,7 @@ ECR_URI_TEMPLATE = "{registry}.dkr.{hostname}/{repository}" HUGGING_FACE_FRAMEWORK = "huggingface" HUGGING_FACE_LLM_FRAMEWORK = "huggingface-llm" +HUGGING_FACE_LLM_NEURONX_FRAMEWORK = "huggingface-llm-neuronx" XGBOOST_FRAMEWORK = "xgboost" SKLEARN_FRAMEWORK = "sklearn" TRAINIUM_ALLOWED_FRAMEWORKS = "pytorch" @@ -470,6 +471,7 @@ def _validate_version_and_set_if_needed(version, config, framework): if version is None and framework in [ DATA_WRANGLER_FRAMEWORK, HUGGING_FACE_LLM_FRAMEWORK, + HUGGING_FACE_LLM_NEURONX_FRAMEWORK, STABILITYAI_FRAMEWORK, ]: version = _get_latest_versions(available_versions) diff --git a/tests/unit/sagemaker/image_uris/test_huggingface_llm.py b/tests/unit/sagemaker/image_uris/test_huggingface_llm.py index e4d7ab9947..b02fe36e99 100644 --- a/tests/unit/sagemaker/image_uris/test_huggingface_llm.py +++ b/tests/unit/sagemaker/image_uris/test_huggingface_llm.py @@ -19,29 +19,38 @@ LMI_VERSIONS = ["0.24.0"] HF_VERSIONS_MAPPING = { - "0.6.0": "2.0.0-tgi0.6.0-gpu-py39-cu118-ubuntu20.04", - "0.8.2": "2.0.0-tgi0.8.2-gpu-py39-cu118-ubuntu20.04", - "0.9.3": "2.0.1-tgi0.9.3-gpu-py39-cu118-ubuntu20.04", - "1.0.3": "2.0.1-tgi1.0.3-gpu-py39-cu118-ubuntu20.04", - "1.1.0": "2.0.1-tgi1.1.0-gpu-py39-cu118-ubuntu20.04", - "1.2.0": "2.1.1-tgi1.2.0-gpu-py310-cu121-ubuntu20.04", - "1.3.1": "2.1.1-tgi1.3.1-gpu-py310-cu121-ubuntu20.04", - "1.3.3": "2.1.1-tgi1.3.3-gpu-py310-cu121-ubuntu20.04", + "gpu": { + "0.6.0": "2.0.0-tgi0.6.0-gpu-py39-cu118-ubuntu20.04", + "0.8.2": "2.0.0-tgi0.8.2-gpu-py39-cu118-ubuntu20.04", + "0.9.3": "2.0.1-tgi0.9.3-gpu-py39-cu118-ubuntu20.04", + "1.0.3": "2.0.1-tgi1.0.3-gpu-py39-cu118-ubuntu20.04", + "1.1.0": "2.0.1-tgi1.1.0-gpu-py39-cu118-ubuntu20.04", + "1.2.0": "2.1.1-tgi1.2.0-gpu-py310-cu121-ubuntu20.04", + "1.3.1": "2.1.1-tgi1.3.1-gpu-py310-cu121-ubuntu20.04", + "1.3.3": "2.1.1-tgi1.3.3-gpu-py310-cu121-ubuntu20.04", + }, + "inf2": { + "0.0.16": "1.13.1-optimum0.0.16-neuronx-py310-ubuntu22.04", + }, } -@pytest.mark.parametrize("load_config", 
["huggingface-llm.json"], indirect=True) +@pytest.mark.parametrize( + "load_config", ["huggingface-llm.json", "huggingface-llm-neuronx.json"], indirect=True +) def test_huggingface_uris(load_config): VERSIONS = load_config["inference"]["versions"] + device = load_config["inference"]["processors"][0] + backend = "huggingface-neuronx" if device == "inf2" else "huggingface" for version in VERSIONS: ACCOUNTS = load_config["inference"]["versions"][version]["registries"] for region in ACCOUNTS.keys(): - uri = get_huggingface_llm_image_uri("huggingface", region=region, version=version) + uri = get_huggingface_llm_image_uri(backend, region=region, version=version) expected = expected_uris.huggingface_llm_framework_uri( "huggingface-pytorch-tgi-inference", ACCOUNTS[region], version, - HF_VERSIONS_MAPPING[version], + HF_VERSIONS_MAPPING[device][version], region=region, ) assert expected == uri From 097db73efc11d685f246d557626af08caded9db2 Mon Sep 17 00:00:00 2001 From: Stephen Via <51342648+svia3@users.noreply.github.com> Date: Fri, 19 Jan 2024 14:23:44 -0800 Subject: [PATCH 52/76] fix: add warning message for job-prefixed pipeline steps when no job name is provided (#4371) Co-authored-by: svia3 --- src/sagemaker/workflow/utilities.py | 10 +++++++ .../sagemaker/workflow/test_model_step.py | 28 +++++++++++++++++++ 2 files changed, 38 insertions(+) diff --git a/src/sagemaker/workflow/utilities.py b/src/sagemaker/workflow/utilities.py index d2fd169b13..4ef5ad5dd2 100644 --- a/src/sagemaker/workflow/utilities.py +++ b/src/sagemaker/workflow/utilities.py @@ -41,6 +41,14 @@ "if desired." ) +JOB_KEY_NONE_WARN_MSG_TEMPLATE = ( + "Invalid input: use_custom_job_prefix flag is set but the name field [{}] has not been " + "specified. Please refer to the AWS Docs to identify which field should be set to enable the " + "custom-prefixing feature for jobs created via a pipeline execution. " + "https://docs.aws.amazon.com/sagemaker/latest/dg/" + "build-and-manage-access.html#build-and-manage-step-permissions-prefix" +) + if TYPE_CHECKING: from sagemaker.workflow.step_collections import StepCollection @@ -458,6 +466,8 @@ def trim_request_dict(request_dict, job_key, config): request_dict.pop(job_key, None) # safely return null in case of KeyError else: if job_key in request_dict: + if request_dict[job_key] is None or len(request_dict[job_key]) == 0: + raise ValueError(JOB_KEY_NONE_WARN_MSG_TEMPLATE.format(job_key)) request_dict[job_key] = base_from_name(request_dict[job_key]) # trim timestamp return request_dict diff --git a/tests/unit/sagemaker/workflow/test_model_step.py b/tests/unit/sagemaker/workflow/test_model_step.py index 4c8e9e0311..106de119e1 100644 --- a/tests/unit/sagemaker/workflow/test_model_step.py +++ b/tests/unit/sagemaker/workflow/test_model_step.py @@ -1328,3 +1328,31 @@ def _validate_repack_job_non_configurable_args( assert model_archive.expr == expected_model_archive else: assert model_archive == expected_model_archive + + +def test_create_model_step_using_custom_model_name_set_to_none(pipeline_session): + # Name of the model not specified, will resolve to None. + model = Model( + image_uri="my-image", + sagemaker_session=pipeline_session, + model_data="s3://", + role=ROLE, + ) + step_create_model = ModelStep(name="MyModelStep", step_args=model.create()) + + # 1. Toggle on custom-prefixing model name set to None. 
+ config = PipelineDefinitionConfig(use_custom_job_prefix=True) + + with pytest.raises(ValueError) as error: + pipeline = Pipeline( + name="MyPipeline", + steps=[step_create_model], + sagemaker_session=pipeline_session, + pipeline_definition_config=config, + ) + pipeline.definition() + + assert ( + "Invalid input: use_custom_job_prefix flag is set but the name field " + "[ModelName] has not been specified." in str(error.value) + ) From 4cf749432583a3b43f9a5a3cd3ddf7cd95a061e7 Mon Sep 17 00:00:00 2001 From: Haixin Wang <98612668+haixiw@users.noreply.github.com> Date: Fri, 19 Jan 2024 17:20:46 -0800 Subject: [PATCH 53/76] change: JumpStart - TLV region launch (#4379) --- src/sagemaker/jumpstart/constants.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/sagemaker/jumpstart/constants.py b/src/sagemaker/jumpstart/constants.py index daa9e0e30a..10e5e32c56 100644 --- a/src/sagemaker/jumpstart/constants.py +++ b/src/sagemaker/jumpstart/constants.py @@ -142,6 +142,11 @@ region_name="cn-north-1", content_bucket="jumpstart-cache-prod-cn-north-1", ), + JumpStartLaunchedRegionInfo( + region_name="il-central-1", + content_bucket="jumpstart-cache-prod-il-central-1", + gated_content_bucket="jumpstart-private-cache-prod-il-central-1", + ), ] ) From 2d5e3997ebe00b5bd875c2fdc866a5aa4b5825d7 Mon Sep 17 00:00:00 2001 From: Nilesh PS Date: Tue, 23 Jan 2024 00:13:00 +0530 Subject: [PATCH 54/76] feat: add throughput management support for feature group (#4359) * feat: add throughput management support for feature group * documentation: add doc for feature group throughput config --------- Co-authored-by: Nilesh PS --- doc/api/prep_data/feature_store.rst | 12 +++ src/sagemaker/feature_store/feature_group.py | 17 +++- src/sagemaker/feature_store/inputs.py | 76 ++++++++++++++++ src/sagemaker/session.py | 28 +++--- tests/integ/test_feature_store.py | 75 ++++++++++++++++ .../feature_store/test_feature_group.py | 90 +++++++++++++++++++ tests/unit/test_session.py | 28 +++++- 7 files changed, 313 insertions(+), 13 deletions(-) diff --git a/doc/api/prep_data/feature_store.rst b/doc/api/prep_data/feature_store.rst index ee6350b8da..278574e400 100644 --- a/doc/api/prep_data/feature_store.rst +++ b/doc/api/prep_data/feature_store.rst @@ -75,6 +75,14 @@ Inputs :members: :show-inheritance: +.. autoclass:: sagemaker.feature_store.inputs.ThroughputConfig + :members: + :show-inheritance: + +.. autoclass:: sagemaker.feature_store.inputs.ThroughputConfigUpdate + :members: + :show-inheritance: + .. autoclass:: sagemaker.feature_store.inputs.OnlineStoreConfig :members: :show-inheritance: @@ -99,6 +107,10 @@ Inputs :members: :show-inheritance: +.. autoclass:: sagemaker.feature_store.inputs.ThroughputModeEnum + :members: + :show-inheritance: + .. 
autoclass:: sagemaker.feature_store.inputs.ResourceEnum :members: :show-inheritance: diff --git a/src/sagemaker/feature_store/feature_group.py b/src/sagemaker/feature_store/feature_group.py index 0e503e192d..9ffb0ea9da 100644 --- a/src/sagemaker/feature_store/feature_group.py +++ b/src/sagemaker/feature_store/feature_group.py @@ -64,6 +64,8 @@ TtlDuration, OnlineStoreConfigUpdate, OnlineStoreStorageTypeEnum, + ThroughputConfig, + ThroughputConfigUpdate, ) from sagemaker.utils import resolve_value_from_config, format_tags, Tags @@ -541,6 +543,7 @@ def create( tags: Optional[Tags] = None, table_format: TableFormatEnum = None, online_store_storage_type: OnlineStoreStorageTypeEnum = None, + throughput_config: ThroughputConfig = None, ) -> Dict[str, Any]: """Create a SageMaker FeatureStore FeatureGroup. @@ -570,6 +573,8 @@ def create( table_format (TableFormatEnum): format of the offline store table (default: None). online_store_storage_type (OnlineStoreStorageTypeEnum): storage type for the online store (default: None). + throughput_config (ThroughputConfig): throughput configuration of the + feature group (default: None). Returns: Response dict from service. @@ -618,6 +623,9 @@ def create( ) create_feature_store_args.update({"online_store_config": online_store_config.to_dict()}) + if throughput_config: + create_feature_store_args.update({"throughput_config": throughput_config.to_dict()}) + # offline store configuration if s3_uri: s3_storage_config = S3StorageConfig(s3_uri=s3_uri) @@ -656,17 +664,17 @@ def update( self, feature_additions: Sequence[FeatureDefinition] = None, online_store_config: OnlineStoreConfigUpdate = None, + throughput_config: ThroughputConfigUpdate = None, ) -> Dict[str, Any]: """Update a FeatureGroup and add new features from the given feature definitions. Args: feature_additions (Sequence[Dict[str, str]): list of feature definitions to be updated. online_store_config (OnlineStoreConfigUpdate): online store config to be updated. - + throughput_config (ThroughputConfigUpdate): target throughput configuration Returns: Response dict from service. """ - if feature_additions is None: feature_additions_parameter = None else: @@ -679,10 +687,15 @@ def update( else: online_store_config_parameter = online_store_config.to_dict() + throughput_config_parameter = ( + None if throughput_config is None else throughput_config.to_dict() + ) + return self.sagemaker_session.update_feature_group( feature_group_name=self.name, feature_additions=feature_additions_parameter, online_store_config=online_store_config_parameter, + throughput_config=throughput_config_parameter, ) def update_feature_metadata( diff --git a/src/sagemaker/feature_store/inputs.py b/src/sagemaker/feature_store/inputs.py index ed61117ead..aaff977d3c 100644 --- a/src/sagemaker/feature_store/inputs.py +++ b/src/sagemaker/feature_store/inputs.py @@ -453,3 +453,79 @@ class ExpirationTimeResponseEnum(Enum): DISABLED = "Disabled" ENABLED = "Enabled" + + +class ThroughputModeEnum(Enum): + """Enum of throughput modes supported by feature group. + + Throughput mode of feature group can be ON_DEMAND or PROVISIONED. + """ + + ON_DEMAND = "OnDemand" + PROVISIONED = "Provisioned" + + +@attr.s +class ThroughputConfig(Config): + """Throughput configuration of the feature group. + + Throughput configuration can be ON_DEMAND, or PROVISIONED with valid values for + read and write capacity units. ON_DEMAND works best for less predictable traffic, + while PROVISIONED works best for consistent and predictable traffic. 
+ + Attributes: + mode (ThroughputModeEnum): Throughput mode + provisioned_read_capacity_units (int): For provisioned feature groups, this indicates + the read throughput you are billed for and can consume without throttling. + provisioned_write_capacity_units (int): For provisioned feature groups, this indicates + the write throughput you are billed for and can consume without throttling. + """ + + mode: ThroughputModeEnum = attr.ib(default=None) + provisioned_read_capacity_units: int = attr.ib(default=None) + provisioned_write_capacity_units: int = attr.ib(default=None) + + def to_dict(self) -> Dict[str, Any]: + """Construct a dictionary based on the attributes provided. + + Returns: + dict represents the attributes. + """ + return Config.construct_dict( + ThroughputMode=self.mode.value if self.mode else None, + ProvisionedReadCapacityUnits=self.provisioned_read_capacity_units, + ProvisionedWriteCapacityUnits=self.provisioned_write_capacity_units, + ) + + +@attr.s +class ThroughputConfigUpdate(Config): + """Target throughput configuration for the feature group. + + Target throughput configuration can be ON_DEMAND, or PROVISIONED with valid values for + read and write capacity units. ON_DEMAND works best for less predictable traffic, + while PROVISIONED works best for consistent and predictable traffic. + + Attributes: + mode (ThroughputModeEnum): Target throughput mode + provisioned_read_capacity_units (int): For provisioned feature groups, this indicates + the read throughput you are billed for and can consume without throttling. + provisioned_write_capacity_units (int): For provisioned feature groups, this indicates + the write throughput you are billed for and can consume without throttling. + """ + + mode: ThroughputModeEnum = attr.ib(default=None) + provisioned_read_capacity_units: int = attr.ib(default=None) + provisioned_write_capacity_units: int = attr.ib(default=None) + + def to_dict(self) -> Dict[str, Any]: + """Construct a dictionary based on the attributes provided. + + Returns: + dict represents the attributes. + """ + return Config.construct_dict( + ThroughputMode=self.mode.value if self.mode else None, + ProvisionedReadCapacityUnits=self.provisioned_read_capacity_units, + ProvisionedWriteCapacityUnits=self.provisioned_write_capacity_units, + ) diff --git a/src/sagemaker/session.py b/src/sagemaker/session.py index ac1bf6e343..8f2753a7cf 100644 --- a/src/sagemaker/session.py +++ b/src/sagemaker/session.py @@ -5679,6 +5679,7 @@ def create_feature_group( role_arn: str = None, online_store_config: Dict[str, str] = None, offline_store_config: Dict[str, str] = None, + throughput_config: Dict[str, Any] = None, description: str = None, tags: Optional[Tags] = None, ) -> Dict[str, Any]: @@ -5694,6 +5695,8 @@ def create_feature_group( feature online store. offline_store_config (Dict[str, str]): dict contains configuration of the feature offline store. + throughput_config (Dict[str, str]): dict contains throughput configuration + for the feature group. description (str): description of the FeatureGroup. tags (Optional[Tags]): tags for labeling a FeatureGroup. 
@@ -5729,6 +5732,7 @@ def create_feature_group( kwargs, OnlineStoreConfig=inferred_online_store_from_config, OfflineStoreConfig=inferred_offline_store_from_config, + ThroughputConfig=throughput_config, Description=description, Tags=tags, ) @@ -5757,28 +5761,32 @@ def update_feature_group( feature_group_name: str, feature_additions: Sequence[Dict[str, str]] = None, online_store_config: Dict[str, any] = None, + throughput_config: Dict[str, Any] = None, ) -> Dict[str, Any]: """Update a FeatureGroup - either adding new features from the given feature definitions - or updating online store config + Supports modifications like adding new features from the given feature definitions, + updating online store and throughput configurations. Args: feature_group_name (str): name of the FeatureGroup to update. feature_additions (Sequence[Dict[str, str]): list of feature definitions to be updated. + online_store_config (Dict[str, Any]): updates to online store config + throughput_config (Dict[str, Any]): target throughput configuration of the feature group Returns: Response dict from service. """ + update_req = {"FeatureGroupName": feature_group_name} + if online_store_config is not None: + update_req["OnlineStoreConfig"] = online_store_config - if feature_additions is None: - return self.sagemaker_client.update_feature_group( - FeatureGroupName=feature_group_name, - OnlineStoreConfig=online_store_config, - ) + if throughput_config is not None: + update_req["ThroughputConfig"] = throughput_config - return self.sagemaker_client.update_feature_group( - FeatureGroupName=feature_group_name, FeatureAdditions=feature_additions - ) + if feature_additions is not None: + update_req["FeatureAdditions"] = feature_additions + + return self.sagemaker_client.update_feature_group(**update_req) def list_feature_groups( self, diff --git a/tests/integ/test_feature_store.py b/tests/integ/test_feature_store.py index f7190d2122..319d492e83 100644 --- a/tests/integ/test_feature_store.py +++ b/tests/integ/test_feature_store.py @@ -43,6 +43,9 @@ TtlDuration, OnlineStoreConfigUpdate, OnlineStoreStorageTypeEnum, + ThroughputConfig, + ThroughputModeEnum, + ThroughputConfigUpdate, ) from sagemaker.feature_store.dataset_builder import ( JoinTypeEnum, @@ -410,6 +413,78 @@ def test_create_feature_group_standard_storage_type( assert storage_type == "Standard" +def test_throughput_create_as_provisioned_and_update_to_ondemand( + feature_store_session, + role, + feature_group_name, + pandas_data_frame, +): + feature_group = FeatureGroup(name=feature_group_name, sagemaker_session=feature_store_session) + feature_group.load_feature_definitions(data_frame=pandas_data_frame) + with cleanup_feature_group(feature_group): + feature_group.create( + s3_uri=False, + record_identifier_name="feature1", + event_time_feature_name="feature3", + role_arn=role, + enable_online_store=True, + throughput_config=ThroughputConfig(ThroughputModeEnum.PROVISIONED, 4000, 4000), + ) + _wait_for_feature_group_create(feature_group) + + tp_config = feature_group.describe().get("ThroughputConfig") + mode = tp_config.get("ThroughputMode") + rcu = tp_config.get("ProvisionedReadCapacityUnits") + wcu = tp_config.get("ProvisionedWriteCapacityUnits") + assert mode == ThroughputModeEnum.PROVISIONED.value + assert rcu == 4000 + assert wcu == 4000 + + feature_group.update(throughput_config=ThroughputConfigUpdate(ThroughputModeEnum.ON_DEMAND)) + _wait_for_feature_group_update(feature_group) + + tp_config = feature_group.describe().get("ThroughputConfig") + mode = 
tp_config.get("ThroughputMode") + assert mode == ThroughputModeEnum.ON_DEMAND.value + + +def test_throughput_create_as_ondemand_and_update_to_provisioned( + feature_store_session, + role, + feature_group_name, + pandas_data_frame, +): + feature_group = FeatureGroup(name=feature_group_name, sagemaker_session=feature_store_session) + feature_group.load_feature_definitions(data_frame=pandas_data_frame) + with cleanup_feature_group(feature_group): + feature_group.create( + s3_uri=False, + record_identifier_name="feature1", + event_time_feature_name="feature3", + role_arn=role, + enable_online_store=True, + throughput_config=ThroughputConfig(ThroughputModeEnum.ON_DEMAND), + ) + _wait_for_feature_group_create(feature_group) + + tp_config = feature_group.describe().get("ThroughputConfig") + mode = tp_config.get("ThroughputMode") + assert mode == ThroughputModeEnum.ON_DEMAND.value + + feature_group.update( + throughput_config=ThroughputConfigUpdate(ThroughputModeEnum.PROVISIONED, 100, 200) + ) + _wait_for_feature_group_update(feature_group) + + tp_config = feature_group.describe().get("ThroughputConfig") + mode = tp_config.get("ThroughputMode") + rcu = tp_config.get("ProvisionedReadCapacityUnits") + wcu = tp_config.get("ProvisionedWriteCapacityUnits") + assert mode == ThroughputModeEnum.PROVISIONED.value + assert rcu == 100 + assert wcu == 200 + + def test_ttl_duration( feature_store_session, role, diff --git a/tests/unit/sagemaker/feature_store/test_feature_group.py b/tests/unit/sagemaker/feature_store/test_feature_group.py index c3499e3f51..394ecb25b3 100644 --- a/tests/unit/sagemaker/feature_store/test_feature_group.py +++ b/tests/unit/sagemaker/feature_store/test_feature_group.py @@ -40,6 +40,9 @@ TtlDuration, OnlineStoreConfigUpdate, OnlineStoreStorageTypeEnum, + ThroughputModeEnum, + ThroughputConfig, + ThroughputConfigUpdate, ) from tests.unit import SAGEMAKER_CONFIG_FEATURE_GROUP @@ -305,6 +308,63 @@ def test_feature_store_create_with_in_memory_collection_types( ) +def test_feature_store_create_in_provisioned_throughput_mode( + sagemaker_session_mock, role_arn, feature_group_dummy_definitions +): + feature_group = FeatureGroup(name="MyFeatureGroup", sagemaker_session=sagemaker_session_mock) + feature_group.feature_definitions = feature_group_dummy_definitions + feature_group.create( + s3_uri=False, + record_identifier_name="feature1", + event_time_feature_name="feature2", + role_arn=role_arn, + enable_online_store=True, + throughput_config=ThroughputConfig(ThroughputModeEnum.PROVISIONED, 1000, 2000), + ) + sagemaker_session_mock.create_feature_group.assert_called_with( + feature_group_name="MyFeatureGroup", + record_identifier_name="feature1", + event_time_feature_name="feature2", + feature_definitions=[fd.to_dict() for fd in feature_group_dummy_definitions], + role_arn=role_arn, + description=None, + tags=None, + online_store_config={"EnableOnlineStore": True}, + throughput_config={ + "ThroughputMode": "Provisioned", + "ProvisionedReadCapacityUnits": 1000, + "ProvisionedWriteCapacityUnits": 2000, + }, + ) + + +def test_feature_store_create_in_ondemand_throughput_mode( + sagemaker_session_mock, role_arn, feature_group_dummy_definitions +): + feature_group = FeatureGroup(name="MyFeatureGroup", sagemaker_session=sagemaker_session_mock) + feature_group.feature_definitions = feature_group_dummy_definitions + feature_group.create( + s3_uri=False, + record_identifier_name="feature1", + event_time_feature_name="feature2", + role_arn=role_arn, + enable_online_store=True, + 
throughput_config=ThroughputConfig(ThroughputModeEnum.ON_DEMAND), + ) + + sagemaker_session_mock.create_feature_group.assert_called_with( + feature_group_name="MyFeatureGroup", + record_identifier_name="feature1", + event_time_feature_name="feature2", + feature_definitions=[fd.to_dict() for fd in feature_group_dummy_definitions], + role_arn=role_arn, + description=None, + tags=None, + online_store_config={"EnableOnlineStore": True}, + throughput_config={"ThroughputMode": "OnDemand"}, + ) + + def test_feature_store_delete(sagemaker_session_mock): feature_group = FeatureGroup(name="MyFeatureGroup", sagemaker_session=sagemaker_session_mock) feature_group.delete() @@ -327,6 +387,35 @@ def test_feature_store_update(sagemaker_session_mock, feature_group_dummy_defini sagemaker_session_mock.update_feature_group.assert_called_with( feature_group_name="MyFeatureGroup", feature_additions=[fd.to_dict() for fd in feature_group_dummy_definitions], + throughput_config=None, + online_store_config=None, + ) + + +def test_feature_store_throughput_update_to_provisioned(sagemaker_session_mock): + feature_group = FeatureGroup(name="MyFeatureGroup", sagemaker_session=sagemaker_session_mock) + feature_group.update( + throughput_config=ThroughputConfigUpdate(ThroughputModeEnum.PROVISIONED, 999, 777) + ) + sagemaker_session_mock.update_feature_group.assert_called_with( + feature_group_name="MyFeatureGroup", + feature_additions=None, + throughput_config={ + "ThroughputMode": "Provisioned", + "ProvisionedReadCapacityUnits": 999, + "ProvisionedWriteCapacityUnits": 777, + }, + online_store_config=None, + ) + + +def test_feature_store_throughput_update_to_ondemand(sagemaker_session_mock): + feature_group = FeatureGroup(name="MyFeatureGroup", sagemaker_session=sagemaker_session_mock) + feature_group.update(throughput_config=ThroughputConfigUpdate(ThroughputModeEnum.ON_DEMAND)) + sagemaker_session_mock.update_feature_group.assert_called_with( + feature_group_name="MyFeatureGroup", + feature_additions=None, + throughput_config={"ThroughputMode": "OnDemand"}, online_store_config=None, ) @@ -341,6 +430,7 @@ def test_feature_store_update_with_ttl_duration(sagemaker_session_mock): feature_group_name="MyFeatureGroup", feature_additions=None, online_store_config=online_store_config.to_dict(), + throughput_config=None, ) diff --git a/tests/unit/test_session.py b/tests/unit/test_session.py index c51dcaaea5..6ee2cc9af5 100644 --- a/tests/unit/test_session.py +++ b/tests/unit/test_session.py @@ -5134,7 +5134,7 @@ def test_feature_group_describe(sagemaker_session): ) -def test_feature_group_update(sagemaker_session, feature_group_dummy_definitions): +def test_feature_group_feature_additions_update(sagemaker_session, feature_group_dummy_definitions): sagemaker_session.update_feature_group( feature_group_name="MyFeatureGroup", feature_additions=feature_group_dummy_definitions, @@ -5145,6 +5145,32 @@ def test_feature_group_update(sagemaker_session, feature_group_dummy_definitions ) +def test_feature_group_online_store_config_update(sagemaker_session): + os_conf_update = {"TtlDuration": {"Unit": "Seconds", "Value": 123}} + sagemaker_session.update_feature_group( + feature_group_name="MyFeatureGroup", + online_store_config=os_conf_update, + ) + assert sagemaker_session.sagemaker_client.update_feature_group.called_with( + FeatureGroupName="MyFeatureGroup", OnlineStoreConfig=os_conf_update + ) + + +def test_feature_group_throughput_config_update(sagemaker_session): + tp_update = { + "ThroughputMode": "Provisioned", + 
"ProvisionedReadCapacityUnits": 123, + "ProvisionedWriteCapacityUnits": 456, + } + sagemaker_session.update_feature_group( + feature_group_name="MyFeatureGroup", + throughput_config=tp_update, + ) + assert sagemaker_session.sagemaker_client.update_feature_group.called_with( + FeatureGroupName="MyFeatureGroup", ThroughputConfig=tp_update + ) + + def test_feature_metadata_update(sagemaker_session): parameter_additions = [ { From 230954cbe68a63306a1fdd77a2245049430f889b Mon Sep 17 00:00:00 2001 From: jiapinw <95885824+jiapinw@users.noreply.github.com> Date: Mon, 22 Jan 2024 15:37:17 -0800 Subject: [PATCH 55/76] change: Enable galactus integ tests (#4376) * feat: Enable galactus integ tests * fix flake8 * fix doc8 * trying to see if it works with slow tests * small fixes in import error * fix missing import * try to remove some dependencies from requirement to see if pr test can be fixed * fix flake8 * Enable more tests * Add rerun annotation and further remove dependencies * comment out 2 integ tests * Remove local mode test for now * fix flake8 --- tests/integ/sagemaker/serve/constants.py | 27 +- .../sagemaker/serve/test_serve_js_happy.py | 5 +- .../sagemaker/serve/test_serve_pt_happy.py | 345 ++++++++++-------- 3 files changed, 206 insertions(+), 171 deletions(-) diff --git a/tests/integ/sagemaker/serve/constants.py b/tests/integ/sagemaker/serve/constants.py index 6b27ff2db6..cf4c6919aa 100644 --- a/tests/integ/sagemaker/serve/constants.py +++ b/tests/integ/sagemaker/serve/constants.py @@ -12,24 +12,23 @@ # language governing permissions and limitations under the License. from __future__ import absolute_import -# import os +import os import platform -# from tests.integ import DATA_DIR +from tests.integ import DATA_DIR -# SERVE_IN_PROCESS_TIMEOUT = 5 -# SERVE_MODEL_PACKAGE_TIMEOUT = 10 -# SERVE_LOCAL_CONTAINER_TIMEOUT = 10 +SERVE_IN_PROCESS_TIMEOUT = 5 +SERVE_MODEL_PACKAGE_TIMEOUT = 10 +SERVE_LOCAL_CONTAINER_TIMEOUT = 10 SERVE_SAGEMAKER_ENDPOINT_TIMEOUT = 15 -# SERVE_SAVE_TIMEOUT = 2 +SERVE_SAVE_TIMEOUT = 2 -# NOT_RUNNING_ON_PY38 = platform.python_version_tuple()[1] != "8" -NOT_RUNNING_ON_PY310 = platform.python_version_tuple()[1] != "10" -# NOT_RUNNING_ON_INF_EXP_DEV_PIPELINE = os.getenv("TEST_OWNER") != "INF_EXP_DEV" +PYTHON_VERSION_IS_NOT_38 = platform.python_version_tuple()[1] != "8" +PYTHON_VERSION_IS_NOT_310 = platform.python_version_tuple()[1] != "10" -# XGB_RESOURCE_DIR = os.path.join(DATA_DIR, "serve_resources", "xgboost") -# PYTORCH_SQUEEZENET_RESOURCE_DIR = os.path.join(DATA_DIR, "serve_resources", "pytorch") -# TF_EFFICIENT_RESOURCE_DIR = os.path.join(DATA_DIR, "serve_resources", "tensorflow") -# HF_DIR = os.path.join(DATA_DIR, "serve_resources", "hf") +XGB_RESOURCE_DIR = os.path.join(DATA_DIR, "serve_resources", "xgboost") +PYTORCH_SQUEEZENET_RESOURCE_DIR = os.path.join(DATA_DIR, "serve_resources", "pytorch") +TF_EFFICIENT_RESOURCE_DIR = os.path.join(DATA_DIR, "serve_resources", "tensorflow") +HF_DIR = os.path.join(DATA_DIR, "serve_resources", "hf") -# BYOC_IMAGE_URI_TEMPLATE = "661407751302.dkr.ecr.{}.amazonaws.com/byoc-integ-test-images:{}" +BYOC_IMAGE_URI_TEMPLATE = "661407751302.dkr.ecr.{}.amazonaws.com/byoc-integ-test-images:{}" diff --git a/tests/integ/sagemaker/serve/test_serve_js_happy.py b/tests/integ/sagemaker/serve/test_serve_js_happy.py index 66341c471a..1050be3bb1 100644 --- a/tests/integ/sagemaker/serve/test_serve_js_happy.py +++ b/tests/integ/sagemaker/serve/test_serve_js_happy.py @@ -17,7 +17,7 @@ from sagemaker.serve.builder.schema_builder import SchemaBuilder 
from tests.integ.sagemaker.serve.constants import ( SERVE_SAGEMAKER_ENDPOINT_TIMEOUT, - NOT_RUNNING_ON_PY310, + PYTHON_VERSION_IS_NOT_310, ) from tests.integ.timeout import timeout @@ -47,9 +47,10 @@ def happy_model_builder(sagemaker_session): @pytest.mark.skipif( - NOT_RUNNING_ON_PY310, + PYTHON_VERSION_IS_NOT_310, reason="The goal of these test are to test the serving components of our feature", ) +@pytest.mark.slow_test def test_happy_tgi_sagemaker_endpoint(happy_model_builder, gpu_instance_type): logger.info("Running in SAGEMAKER_ENDPOINT mode...") caught_ex = None diff --git a/tests/integ/sagemaker/serve/test_serve_pt_happy.py b/tests/integ/sagemaker/serve/test_serve_pt_happy.py index 37233da859..67ac9dc7fd 100644 --- a/tests/integ/sagemaker/serve/test_serve_pt_happy.py +++ b/tests/integ/sagemaker/serve/test_serve_pt_happy.py @@ -10,127 +10,159 @@ # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific # language governing permissions and limitations under the License. -# from __future__ import absolute_import - -# import pytest -# import torch -# from PIL import Image -# import os - -# from sagemaker.serve.builder.model_builder import ModelBuilder, Mode -# from sagemaker.serve.builder.schema_builder import SchemaBuilder -# from sagemaker.serve.spec.inference_spec import InferenceSpec -# from torchvision.transforms import transforms -# from torchvision.models.squeezenet import squeezenet1_1 - -# from tests.integ.sagemaker.serve.constants import ( -# PYTORCH_SQUEEZENET_RESOURCE_DIR, -# SERVE_SAGEMAKER_ENDPOINT_TIMEOUT, -# NOT_RUNNING_ON_INF_EXP_DEV_PIPELINE, -# NOT_RUNNING_ON_PY310, -# ) -# from tests.integ.timeout import timeout -# from tests.integ.utils import cleanup_model_resources -# import logging - -# logger = logging.getLogger(__name__) - -# ROLE_NAME = "Admin" - -# GH_USER_NAME = os.getenv("GH_USER_NAME") -# GH_ACCESS_TOKEN = os.getenv("GH_ACCESS_TOKEN") - - -# @pytest.fixture -# def pt_dependencies(): -# return { -# "auto": True, -# "custom": [ -# "boto3==1.26.*", -# "botocore==1.29.*", -# "s3transfer==0.6.*", -# ( -# f"git+https://{GH_USER_NAME}:{GH_ACCESS_TOKEN}@github.com" -# "/aws/sagemaker-python-sdk-staging.git@inference-experience-dev" -# ), -# ], -# } - - -# @pytest.fixture -# def test_image(): -# return Image.open(str(os.path.join(PYTORCH_SQUEEZENET_RESOURCE_DIR, "zidane.jpeg"))) - - -# @pytest.fixture -# def squeezenet_inference_spec(): -# class MySqueezeNetModel(InferenceSpec): -# def __init__(self) -> None: -# super().__init__() -# self.transform = transforms.Compose( -# [ -# transforms.Resize(256), -# transforms.CenterCrop(224), -# transforms.ToTensor(), -# transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), -# ] -# ) - -# def invoke(self, input_object: object, model: object): -# # transform -# image_tensor = self.transform(input_object) -# input_batch = image_tensor.unsqueeze(0) -# # invoke -# with torch.no_grad(): -# output = model(input_batch) -# return output - -# def load(self, model_dir: str): -# model = squeezenet1_1() -# model.load_state_dict(torch.load(model_dir + "/model.pth")) -# model.eval() -# return model - -# return MySqueezeNetModel() - - -# @pytest.fixture -# def squeezenet_schema(): -# input_image = Image.open(os.path.join(PYTORCH_SQUEEZENET_RESOURCE_DIR, "zidane.jpeg")) -# output_tensor = torch.rand(3, 4) -# return SchemaBuilder(sample_input=input_image, sample_output=output_tensor) - +from __future__ import absolute_import + +import 
pytest +import torch +from PIL import Image +import os +import io +import numpy as np + +from sagemaker.serve.builder.model_builder import ModelBuilder, Mode +from sagemaker.serve.builder.schema_builder import SchemaBuilder, CustomPayloadTranslator +from sagemaker.serve.spec.inference_spec import InferenceSpec +from torchvision.transforms import transforms +from torchvision.models.squeezenet import squeezenet1_1 + +from tests.integ.sagemaker.serve.constants import ( + PYTORCH_SQUEEZENET_RESOURCE_DIR, + SERVE_SAGEMAKER_ENDPOINT_TIMEOUT, + PYTHON_VERSION_IS_NOT_310, +) +from tests.integ.timeout import timeout +from tests.integ.utils import cleanup_model_resources +import logging + +logger = logging.getLogger(__name__) + +ROLE_NAME = "SageMakerRole" + + +@pytest.fixture +def test_image(): + return Image.open(str(os.path.join(PYTORCH_SQUEEZENET_RESOURCE_DIR, "zidane.jpeg"))) + + +@pytest.fixture +def squeezenet_inference_spec(): + class MySqueezeNetModel(InferenceSpec): + def __init__(self) -> None: + super().__init__() + + def invoke(self, input_object: object, model: object): + with torch.no_grad(): + output = model(input_object) + return output + + def load(self, model_dir: str): + model = squeezenet1_1() + model.load_state_dict(torch.load(model_dir + "/model.pth")) + model.eval() + return model + + return MySqueezeNetModel() + + +@pytest.fixture +def custom_request_translator(): + # request translator + class MyRequestTranslator(CustomPayloadTranslator): + def __init__(self): + super().__init__() + # Define image transformation + self.transform = transforms.Compose( + [ + transforms.Resize(256), + transforms.CenterCrop(224), + transforms.ToTensor(), + transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), + ] + ) + + # This function converts the payload to bytes - happens on client side + def serialize_payload_to_bytes(self, payload: object) -> bytes: + # converts an image to bytes + image_tensor = self.transform(payload) + input_batch = image_tensor.unsqueeze(0) + input_ndarray = input_batch.numpy() + return self._convert_numpy_to_bytes(input_ndarray) + + # This function converts the bytes to payload - happens on server side + def deserialize_payload_from_stream(self, stream) -> torch.Tensor: + # convert payload back to torch.Tensor + np_array = np.load(io.BytesIO(stream.read())) + return torch.from_numpy(np_array) + + def _convert_numpy_to_bytes(self, np_array: np.ndarray) -> bytes: + buffer = io.BytesIO() + np.save(buffer, np_array) + return buffer.getvalue() + + return MyRequestTranslator() + + +@pytest.fixture +def custom_response_translator(): + # response translator + class MyResponseTranslator(CustomPayloadTranslator): + # This function converts the payload to bytes - happens on server side + def serialize_payload_to_bytes(self, payload: torch.Tensor) -> bytes: + return self._convert_numpy_to_bytes(payload.numpy()) + + # This function converts the bytes to payload - happens on client side + def deserialize_payload_from_stream(self, stream) -> object: + return torch.from_numpy(np.load(io.BytesIO(stream.read()))) + + def _convert_numpy_to_bytes(self, np_array: np.ndarray) -> bytes: + buffer = io.BytesIO() + np.save(buffer, np_array) + return buffer.getvalue() + + return MyResponseTranslator() + + +@pytest.fixture +def squeezenet_schema(custom_request_translator, custom_response_translator): + input_image = Image.open(os.path.join(PYTORCH_SQUEEZENET_RESOURCE_DIR, "zidane.jpeg")) + output_tensor = torch.rand(3, 4) + return SchemaBuilder( + 
sample_input=input_image, + sample_output=output_tensor, + input_translator=custom_request_translator, + output_translator=custom_response_translator, + ) + -# @pytest.fixture -# def model_builder_inference_spec_schema_builder( -# squeezenet_inference_spec, squeezenet_schema, pt_dependencies -# ): -# return ModelBuilder( -# model_path=PYTORCH_SQUEEZENET_RESOURCE_DIR, -# inference_spec=squeezenet_inference_spec, -# schema_builder=squeezenet_schema, -# dependencies=pt_dependencies, -# ) - - -# @pytest.fixture -# def model_builder(request): -# return request.getfixturevalue(request.param) +@pytest.fixture +def model_builder_inference_spec_schema_builder(squeezenet_inference_spec, squeezenet_schema): + return ModelBuilder( + model_path=PYTORCH_SQUEEZENET_RESOURCE_DIR, + inference_spec=squeezenet_inference_spec, + schema_builder=squeezenet_schema, + ) + + +@pytest.fixture +def model_builder(request): + return request.getfixturevalue(request.param) # @pytest.mark.skipif( -# NOT_RUNNING_ON_INF_EXP_DEV_PIPELINE or NOT_RUNNING_ON_PY310, +# PYTHON_VERSION_IS_NOT_310, # reason="The goal of these test are to test the serving components of our feature", # ) # @pytest.mark.parametrize( # "model_builder", ["model_builder_inference_spec_schema_builder"], indirect=True # ) +# @pytest.mark.slow_test +# @pytest.mark.flaky(reruns=5, reruns_delay=2) # def test_happy_pytorch_local_container(sagemaker_session, model_builder, test_image): # logger.info("Running in LOCAL_CONTAINER mode...") # caught_ex = None - +# # model = model_builder.build(mode=Mode.LOCAL_CONTAINER, sagemaker_session=sagemaker_session) - +# # with timeout(minutes=SERVE_LOCAL_CONTAINER_TIMEOUT): # try: # logger.info("Deploying and predicting in LOCAL_CONTAINER mode...") @@ -149,68 +181,70 @@ # ), f"{caught_ex} was thrown when running pytorch squeezenet local container test" -# @pytest.mark.skipif( -# NOT_RUNNING_ON_INF_EXP_DEV_PIPELINE or NOT_RUNNING_ON_PY310, -# reason="The goal of these test are to test the serving components of our feature", -# ) -# @pytest.mark.parametrize( -# "model_builder", ["model_builder_inference_spec_schema_builder"], indirect=True -# ) -# def test_happy_pytorch_sagemaker_endpoint( -# sagemaker_session, model_builder, cpu_instance_type, test_image -# ): -# logger.info("Running in SAGEMAKER_ENDPOINT mode...") -# caught_ex = None - -# iam_client = sagemaker_session.boto_session.client("iam") -# role_arn = iam_client.get_role(RoleName=ROLE_NAME)["Role"]["Arn"] - -# model = model_builder.build( -# mode=Mode.SAGEMAKER_ENDPOINT, role_arn=role_arn, sagemaker_session=sagemaker_session -# ) - -# with timeout(minutes=SERVE_SAGEMAKER_ENDPOINT_TIMEOUT): -# try: -# logger.info("Deploying and predicting in SAGEMAKER_ENDPOINT mode...") -# predictor = model.deploy(instance_type=cpu_instance_type, initial_instance_count=1) -# logger.info("Endpoint successfully deployed.") -# predictor.predict(test_image) -# except Exception as e: -# caught_ex = e -# finally: -# cleanup_model_resources( -# sagemaker_session=model_builder.sagemaker_session, -# model_name=model.name, -# endpoint_name=model.endpoint_name, -# ) -# if caught_ex: -# logger.exception(caught_ex) -# assert ( -# False -# ), f"{caught_ex} was thrown when running pytorch squeezenet sagemaker endpoint test" +@pytest.mark.skipif( + PYTHON_VERSION_IS_NOT_310, # or NOT_RUNNING_ON_INF_EXP_DEV_PIPELINE, + reason="The goal of these test are to test the serving components of our feature", +) +@pytest.mark.parametrize( + "model_builder", ["model_builder_inference_spec_schema_builder"], 
indirect=True +) +@pytest.mark.slow_test +def test_happy_pytorch_sagemaker_endpoint( + sagemaker_session, model_builder, cpu_instance_type, test_image +): + logger.info("Running in SAGEMAKER_ENDPOINT mode...") + caught_ex = None + + iam_client = sagemaker_session.boto_session.client("iam") + role_arn = iam_client.get_role(RoleName=ROLE_NAME)["Role"]["Arn"] + + model = model_builder.build( + mode=Mode.SAGEMAKER_ENDPOINT, role_arn=role_arn, sagemaker_session=sagemaker_session + ) + + with timeout(minutes=SERVE_SAGEMAKER_ENDPOINT_TIMEOUT): + try: + logger.info("Deploying and predicting in SAGEMAKER_ENDPOINT mode...") + predictor = model.deploy(instance_type=cpu_instance_type, initial_instance_count=1) + logger.info("Endpoint successfully deployed.") + predictor.predict(test_image) + except Exception as e: + caught_ex = e + finally: + cleanup_model_resources( + sagemaker_session=model_builder.sagemaker_session, + model_name=model.name, + endpoint_name=model.endpoint_name, + ) + if caught_ex: + logger.exception(caught_ex) + assert ( + False + ), f"{caught_ex} was thrown when running pytorch squeezenet sagemaker endpoint test" # @pytest.mark.skipif( -# NOT_RUNNING_ON_INF_EXP_DEV_PIPELINE or NOT_RUNNING_ON_PY310, +# PYTHON_VERSION_IS_NOT_310, # reason="The goal of these test are to test the serving components of our feature", # ) # @pytest.mark.parametrize( # "model_builder", ["model_builder_inference_spec_schema_builder"], indirect=True # ) +# @pytest.mark.slow_test # def test_happy_pytorch_local_container_overwrite_to_sagemaker_endpoint( # sagemaker_session, model_builder, cpu_instance_type, test_image # ): # logger.info("Building model in LOCAL_CONTAINER mode...") # caught_ex = None - +# # iam_client = sagemaker_session.boto_session.client("iam") # role_arn = iam_client.get_role(RoleName=ROLE_NAME)["Role"]["Arn"] # logger.debug("Role arn: %s", role_arn) - +# # model = model_builder.build( # mode=Mode.LOCAL_CONTAINER, role_arn=role_arn, sagemaker_session=sagemaker_session # ) - +# # with timeout(minutes=SERVE_SAGEMAKER_ENDPOINT_TIMEOUT): # try: # logger.info("Deploying and predicting in SAGEMAKER_ENDPOINT mode...") @@ -237,25 +271,26 @@ # @pytest.mark.skipif( -# NOT_RUNNING_ON_INF_EXP_DEV_PIPELINE or NOT_RUNNING_ON_PY310, +# PYTHON_VERSION_IS_NOT_310, # reason="The goal of these test are to test the serving components of our feature", # ) # @pytest.mark.parametrize( # "model_builder", ["model_builder_inference_spec_schema_builder"], indirect=True # ) +# @pytest.mark.slow_test # def test_happy_pytorch_sagemaker_endpoint_overwrite_to_local_container( # sagemaker_session, model_builder, test_image # ): # logger.info("Building model in SAGEMAKER_ENDPOINT mode...") # caught_ex = None - +# # iam_client = sagemaker_session.boto_session.client("iam") # role_arn = iam_client.get_role(RoleName=ROLE_NAME)["Role"]["Arn"] - +# # model = model_builder.build( # mode=Mode.SAGEMAKER_ENDPOINT, role_arn=role_arn, sagemaker_session=sagemaker_session # ) - +# # with timeout(minutes=SERVE_LOCAL_CONTAINER_TIMEOUT): # try: # logger.info("Deploying and predicting in LOCAL_CONTAINER mode...") From a15491c1b16a19e984f1f41a71ba64635bb86235 Mon Sep 17 00:00:00 2001 From: ci Date: Tue, 23 Jan 2024 20:32:18 +0000 Subject: [PATCH 56/76] prepare release v2.204.0 --- CHANGELOG.md | 20 ++++++++++++++++++++ VERSION | 2 +- 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7d7649d124..28984dd77d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,25 @@ # Changelog +## v2.204.0 
(2024-01-23) + +### Features + + * add throughput management support for feature group + * Support custom repack model settings + * parallelize notebook search utils, add new operators + +### Bug Fixes and Other Changes + + * Enable galactus integ tests + * JumpStart - TLV region launch + * add warning message for job-prefixed pipeline steps when no job name is provided + * TGI NeuronX + * Updates for DJL 0.26.0 release + * update sphinx version + * Add PyTorch 2.1.0 SM Training DLC to UNSUPPORTED_DLC_IMAGE_FOR_SM_PARALLELISM list + * Huggingface glue failing tests + * change ConditionNot incorrect property Expression to Condition + ## v2.203.1 (2024-01-09) ### Bug Fixes and Other Changes diff --git a/VERSION b/VERSION index 36ee2817e2..c8ecfe2616 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -2.203.2.dev0 +2.204.0 From 857499ff7d177521d49139130b3cdae0964b8133 Mon Sep 17 00:00:00 2001 From: ci Date: Tue, 23 Jan 2024 20:32:20 +0000 Subject: [PATCH 57/76] update development version to v2.204.1.dev0 --- VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VERSION b/VERSION index c8ecfe2616..3f1ba2fb4c 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -2.204.0 +2.204.1.dev0 From 160af3bac4ed86be5a1f99ed9018f09d3d9945a7 Mon Sep 17 00:00:00 2001 From: qidewenwhen <32910701+qidewenwhen@users.noreply.github.com> Date: Tue, 23 Jan 2024 14:30:02 -0800 Subject: [PATCH 58/76] fix: Add validation for empty ParameterString value in start local pipeline (#4354) --- src/sagemaker/local/entities.py | 6 +++++ tests/integ/test_local_mode.py | 2 ++ .../sagemaker/local/test_local_entities.py | 26 +++++++++++++++++++ 3 files changed, 34 insertions(+) diff --git a/src/sagemaker/local/entities.py b/src/sagemaker/local/entities.py index 8431d8154a..02ade9bd9a 100644 --- a/src/sagemaker/local/entities.py +++ b/src/sagemaker/local/entities.py @@ -845,6 +845,12 @@ def _initialize_and_validate_parameters(self, overridden_parameters): "{}.".format(param_name, parameter_type.python_type, type(param_value)) ) raise ClientError(error_msg, "start_pipeline_execution") + if param_value == "": + error_msg = self._construct_validation_exception_message( + 'Parameter {} value "" is too short (length: 0, ' + "required minimum: 1).".format(param_name) + ) + raise ClientError(error_msg, "start_pipeline_execution") merged_parameters[param_name] = param_value for param_name, default_parameter in default_parameters.items(): if param_name not in merged_parameters: diff --git a/tests/integ/test_local_mode.py b/tests/integ/test_local_mode.py index a143e05012..f86d2fea93 100644 --- a/tests/integ/test_local_mode.py +++ b/tests/integ/test_local_mode.py @@ -908,6 +908,7 @@ def else_step(): assert exe_step_result["Metadata"]["Condition"]["Outcome"] is True +@pytest.mark.local_mode def test_local_pipeline_with_step_decorator_data_referenced_by_other_steps( local_pipeline_session, dummy_container, @@ -987,6 +988,7 @@ def func(var: int): assert exe_step_result["Metadata"]["Condition"]["Outcome"] is True +@pytest.mark.local_mode def test_local_remote_function_with_additional_dependencies( local_pipeline_session, dummy_container ): diff --git a/tests/unit/sagemaker/local/test_local_entities.py b/tests/unit/sagemaker/local/test_local_entities.py index 6a2d7ee6f2..6a026c316b 100644 --- a/tests/unit/sagemaker/local/test_local_entities.py +++ b/tests/unit/sagemaker/local/test_local_entities.py @@ -20,6 +20,7 @@ from botocore.exceptions import ClientError import sagemaker.local +from sagemaker.workflow.fail_step import FailStep from 
sagemaker.workflow.parameters import ParameterString from sagemaker.workflow.pipeline import Pipeline from sagemaker.workflow.lambda_step import LambdaStep @@ -293,3 +294,28 @@ def test_start_local_pipeline_with_wrong_parameter_type(sagemaker_local_session) f"Unexpected type for parameter '{parameter.name}'. Expected " f"{parameter.parameter_type.python_type} but found {type(True)}." in str(error.value) ) + + +def test_start_local_pipeline_with_empty_parameter_string_value( + local_pipeline_session, +): + param_str_name = "MyParameterString" + param_str = ParameterString(name=param_str_name, default_value="default") + fail_step = FailStep( + name="MyFailStep", + error_message=param_str, + ) + + pipeline = Pipeline( + name="MyPipeline", + steps=[fail_step], + sagemaker_session=local_pipeline_session, + parameters=[param_str], + ) + + local_pipeline = sagemaker.local.entities._LocalPipeline(pipeline) + with pytest.raises(ClientError) as error: + local_pipeline.start(PipelineParameters={param_str_name: ""}) + assert ( + f'Parameter {param_str_name} value "" is too short (length: 0, required minimum: 1).' + ) in str(error) From f7bba26a2d1a56c985bc149c34a6a40058d0cbe7 Mon Sep 17 00:00:00 2001 From: qidewenwhen <32910701+qidewenwhen@users.noreply.github.com> Date: Tue, 23 Jan 2024 19:57:42 -0800 Subject: [PATCH 59/76] feat: Support selective pipeline execution for function step (#4372) --- .../core/pipeline_variables.py | 27 +-- src/sagemaker/remote_function/job.py | 23 +- .../runtime_environment_manager.py | 4 +- src/sagemaker/workflow/function_step.py | 7 + src/sagemaker/workflow/pipeline.py | 10 +- tests/integ/sagemaker/workflow/helpers.py | 34 ++- .../workflow/test_selective_execution.py | 201 ++++++++++++++++++ .../sagemaker/workflow/test_step_decorator.py | 5 +- .../core/test_pipeline_variables.py | 98 ++++++--- .../core/test_stored_function.py | 12 +- .../test_runtime_environment_manager.py | 6 +- .../sagemaker/remote_function/test_job.py | 36 +++- .../unit/sagemaker/workflow/test_pipeline.py | 63 ++++-- 13 files changed, 439 insertions(+), 87 deletions(-) create mode 100644 tests/integ/sagemaker/workflow/test_selective_execution.py diff --git a/src/sagemaker/remote_function/core/pipeline_variables.py b/src/sagemaker/remote_function/core/pipeline_variables.py index 269ce94113..952cccdb07 100644 --- a/src/sagemaker/remote_function/core/pipeline_variables.py +++ b/src/sagemaker/remote_function/core/pipeline_variables.py @@ -19,6 +19,7 @@ from sagemaker.s3 import s3_path_join from sagemaker.remote_function.core.serialization import deserialize_obj_from_s3 +from sagemaker.workflow.step_outputs import get_step @dataclass @@ -92,7 +93,7 @@ class _S3BaseUriIdentifier: class _DelayedReturn: """Delayed return from a function.""" - uri: List[Union[str, _Parameter, _ExecutionVariable]] + uri: Union[_Properties, List[Union[str, _Parameter, _ExecutionVariable]]] reference_path: Tuple = field(default_factory=tuple) @@ -164,6 +165,7 @@ def __init__( self, delayed_returns: List[_DelayedReturn], hmac_key: str, + properties_resolver: _PropertiesResolver, parameter_resolver: _ParameterResolver, execution_variable_resolver: _ExecutionVariableResolver, s3_base_uri: str, @@ -174,6 +176,7 @@ def __init__( Args: delayed_returns: list of delayed returns to resolve. hmac_key: key used to encrypt serialized and deserialized function and arguments. + properties_resolver: resolver used to resolve step properties. parameter_resolver: resolver used to pipeline parameters. 
execution_variable_resolver: resolver used to resolve execution variables. s3_base_uri (str): the s3 base uri of the function step that @@ -184,6 +187,7 @@ def __init__( self._s3_base_uri = s3_base_uri self._parameter_resolver = parameter_resolver self._execution_variable_resolver = execution_variable_resolver + self._properties_resolver = properties_resolver # different delayed returns can have the same uri, so we need to dedupe uris = { self._resolve_delayed_return_uri(delayed_return) for delayed_return in delayed_returns @@ -214,7 +218,10 @@ def resolve(self, delayed_return: _DelayedReturn) -> Any: def _resolve_delayed_return_uri(self, delayed_return: _DelayedReturn): """Resolve the s3 uri of the delayed return.""" + if isinstance(delayed_return.uri, _Properties): + return self._properties_resolver.resolve(delayed_return.uri) + # Keep the following old resolution logics to keep backward compatible uri = [] for component in delayed_return.uri: if isinstance(component, _Parameter): @@ -274,6 +281,7 @@ def resolve_pipeline_variables( delayed_return_resolver = _DelayedReturnResolver( delayed_returns=delayed_returns, hmac_key=hmac_key, + properties_resolver=properties_resolver, parameter_resolver=parameter_resolver, execution_variable_resolver=execution_variable_resolver, s3_base_uri=s3_base_uri, @@ -325,27 +333,12 @@ def convert_pipeline_variables_to_pickleable(func_args: Tuple, func_kwargs: Dict from sagemaker.workflow.entities import PipelineVariable - from sagemaker.workflow.execution_variables import ExecutionVariables - from sagemaker.workflow.function_step import DelayedReturn - # Notes: - # 1. The s3_base_uri = s3_root_uri + pipeline_name, but the two may be unknown - # when defining function steps. After step-level arg serialization, - # it's hard to update the s3_base_uri in pipeline compile time. - # Thus set a placeholder: _S3BaseUriIdentifier, and let the runtime job to resolve it. - # 2. For saying s3_root_uri is unknown, it's because when defining function steps, - # the pipeline's sagemaker_session is not passed in, but the default s3_root_uri - # should be retrieved from the pipeline's sagemaker_session. 
def convert(arg): if isinstance(arg, DelayedReturn): return _DelayedReturn( - uri=[ - _S3BaseUriIdentifier(), - ExecutionVariables.PIPELINE_EXECUTION_ID._pickleable, - arg._step.name, - "results", - ], + uri=get_step(arg)._properties.OutputDataConfig.S3OutputPath._pickleable, reference_path=arg._reference_path, ) diff --git a/src/sagemaker/remote_function/job.py b/src/sagemaker/remote_function/job.py index 205a2adf41..71530ac4dd 100644 --- a/src/sagemaker/remote_function/job.py +++ b/src/sagemaker/remote_function/job.py @@ -60,6 +60,7 @@ from sagemaker import vpc_utils from sagemaker.remote_function.core.stored_function import StoredFunction, _SerializedData from sagemaker.remote_function.core.pipeline_variables import Context + from sagemaker.remote_function.runtime_environment.runtime_environment_manager import ( RuntimeEnvironmentManager, _DependencySettings, @@ -72,6 +73,8 @@ copy_workdir, resolve_custom_file_filter_from_config_file, ) +from sagemaker.workflow.function_step import DelayedReturn +from sagemaker.workflow.step_outputs import get_step if TYPE_CHECKING: from sagemaker.workflow.entities import PipelineVariable @@ -701,6 +704,7 @@ def compile( """Build the artifacts and generate the training job request.""" from sagemaker.workflow.properties import Properties from sagemaker.workflow.parameters import Parameter + from sagemaker.workflow.functions import Join from sagemaker.workflow.execution_variables import ExecutionVariables, ExecutionVariable from sagemaker.workflow.utilities import load_step_compilation_context @@ -760,7 +764,19 @@ def compile( job_settings=job_settings, s3_base_uri=s3_base_uri ) - output_config = {"S3OutputPath": s3_base_uri} + if step_compilation_context: + s3_output_path = Join( + on="/", + values=[ + s3_base_uri, + ExecutionVariables.PIPELINE_EXECUTION_ID, + step_compilation_context.step_name, + "results", + ], + ) + output_config = {"S3OutputPath": s3_output_path} + else: + output_config = {"S3OutputPath": s3_base_uri} if job_settings.s3_kms_key is not None: output_config["KmsKeyId"] = job_settings.s3_kms_key request_dict["OutputDataConfig"] = output_config @@ -804,6 +820,11 @@ def compile( if isinstance(arg, (Parameter, ExecutionVariable, Properties)): container_args.extend([arg.expr["Get"], arg.to_string()]) + if isinstance(arg, DelayedReturn): + # The uri is a Properties object + uri = get_step(arg)._properties.OutputDataConfig.S3OutputPath + container_args.extend([uri.expr["Get"], uri.to_string()]) + if run_info is not None: container_args.extend(["--run_in_context", json.dumps(dataclasses.asdict(run_info))]) elif _RunContext.get_current_run() is not None: diff --git a/src/sagemaker/remote_function/runtime_environment/runtime_environment_manager.py b/src/sagemaker/remote_function/runtime_environment/runtime_environment_manager.py index 0affa9beac..97ca4f08e4 100644 --- a/src/sagemaker/remote_function/runtime_environment/runtime_environment_manager.py +++ b/src/sagemaker/remote_function/runtime_environment/runtime_environment_manager.py @@ -252,7 +252,7 @@ def _is_file_exists(self, dependencies): def _install_requirements_txt(self, local_path, python_executable): """Install requirements.txt file""" - cmd = f"{python_executable} -m pip install -r {local_path}" + cmd = f"{python_executable} -m pip install -r {local_path} -U" logger.info("Running command: '%s' in the dir: '%s' ", cmd, os.getcwd()) _run_shell_cmd(cmd) logger.info("Command %s ran successfully", cmd) @@ -268,7 +268,7 @@ def _create_conda_env(self, env_name, local_path): def 
_install_req_txt_in_conda_env(self, env_name, local_path): """Install requirements.txt in the given conda environment""" - cmd = f"{self._get_conda_exe()} run -n {env_name} pip install -r {local_path}" + cmd = f"{self._get_conda_exe()} run -n {env_name} pip install -r {local_path} -U" logger.info("Activating conda env and installing requirements: %s", cmd) _run_shell_cmd(cmd) logger.info("Requirements installed successfully in conda env %s", env_name) diff --git a/src/sagemaker/workflow/function_step.py b/src/sagemaker/workflow/function_step.py index a55955b4eb..55e7eac90c 100644 --- a/src/sagemaker/workflow/function_step.py +++ b/src/sagemaker/workflow/function_step.py @@ -34,6 +34,7 @@ ) from sagemaker.workflow.execution_variables import ExecutionVariables +from sagemaker.workflow.properties import Properties from sagemaker.workflow.retry import RetryPolicy from sagemaker.workflow.steps import Step, ConfigurableRetryStep, StepTypeEnum from sagemaker.workflow.step_collections import StepCollection @@ -101,6 +102,12 @@ def __init__( self.__job_settings = None + # It's for internal usage to retrieve execution id from the properties. + # However, we won't expose the properties of function step to customers. + self._properties = Properties( + step_name=name, step=self, shape_name="DescribeTrainingJobResponse" + ) + ( self._converted_func_args, self._converted_func_kwargs, diff --git a/src/sagemaker/workflow/pipeline.py b/src/sagemaker/workflow/pipeline.py index 0645e58386..6800f2a3ac 100644 --- a/src/sagemaker/workflow/pipeline.py +++ b/src/sagemaker/workflow/pipeline.py @@ -1039,11 +1039,19 @@ def get_function_step_result( raise ValueError(_ERROR_MSG_OF_WRONG_STEP_TYPE) s3_output_path = describe_training_job_response["OutputDataConfig"]["S3OutputPath"] + s3_uri_suffix = s3_path_join(execution_id, step_name, RESULTS_FOLDER) + if s3_output_path.endswith(s3_uri_suffix) or s3_output_path[0:-1].endswith(s3_uri_suffix): + s3_uri = s3_output_path + else: + # This is the obsoleted version of s3_output_path + # Keeping it for backward compatible + s3_uri = s3_path_join(s3_output_path, s3_uri_suffix) + job_status = describe_training_job_response["TrainingJobStatus"] if job_status == "Completed": return deserialize_obj_from_s3( sagemaker_session=sagemaker_session, - s3_uri=s3_path_join(s3_output_path, execution_id, step_name, RESULTS_FOLDER), + s3_uri=s3_uri, hmac_key=describe_training_job_response["Environment"]["REMOTE_FUNCTION_SECRET_KEY"], ) diff --git a/tests/integ/sagemaker/workflow/helpers.py b/tests/integ/sagemaker/workflow/helpers.py index 40681b9ac7..48e1e95734 100644 --- a/tests/integ/sagemaker/workflow/helpers.py +++ b/tests/integ/sagemaker/workflow/helpers.py @@ -39,18 +39,24 @@ def create_and_execute_pipeline( step_result_type=None, step_result_value=None, wait_duration=400, # seconds + selective_execution_config=None, ): - response = pipeline.create(role) - - create_arn = response["PipelineArn"] - assert re.match( - rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}", - create_arn, + create_arn = None + if not selective_execution_config: + response = pipeline.create(role) + create_arn = response["PipelineArn"] + assert re.match( + rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}", + create_arn, + ) + + execution = pipeline.start( + parameters=execution_parameters, selective_execution_config=selective_execution_config ) - execution = pipeline.start(parameters=execution_parameters) - response = execution.describe() - assert response["PipelineArn"] == 
create_arn + if create_arn: + response = execution.describe() + assert response["PipelineArn"] == create_arn wait_pipeline_execution(execution=execution, delay=20, max_attempts=int(wait_duration / 20)) @@ -71,6 +77,16 @@ def create_and_execute_pipeline( if step_result_value: result = execution.result(execution_steps[0]["StepName"]) assert result == step_result_value, f"Expected {step_result_value}, instead found {result}" + + if selective_execution_config: + for exe_step in execution_steps: + if exe_step["StepName"] in selective_execution_config.selected_steps: + continue + assert ( + exe_step["SelectiveExecutionResult"]["SourcePipelineExecutionArn"] + == selective_execution_config.source_pipeline_execution_arn + ) + return execution, execution_steps diff --git a/tests/integ/sagemaker/workflow/test_selective_execution.py b/tests/integ/sagemaker/workflow/test_selective_execution.py new file mode 100644 index 0000000000..a2c0286c6a --- /dev/null +++ b/tests/integ/sagemaker/workflow/test_selective_execution.py @@ -0,0 +1,201 @@ +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You +# may not use this file except in compliance with the License. A copy of +# the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. This file is +# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific +# language governing permissions and limitations under the License. +from __future__ import absolute_import + +import os + +import pytest + +from tests.integ import DATA_DIR +from sagemaker.sklearn import SKLearnProcessor +from sagemaker.workflow.step_outputs import get_step + +from sagemaker.workflow.selective_execution_config import SelectiveExecutionConfig + +from tests.integ.sagemaker.workflow.helpers import create_and_execute_pipeline +from sagemaker import utils, get_execution_role +from sagemaker.workflow.function_step import step +from sagemaker.workflow.pipeline import Pipeline +from sagemaker.workflow.steps import ProcessingStep + +INSTANCE_TYPE = "ml.m5.large" + + +@pytest.fixture +def role(sagemaker_session): + return get_execution_role(sagemaker_session) + + +@pytest.fixture +def region_name(sagemaker_session): + return sagemaker_session.boto_session.region_name + + +@pytest.fixture +def pipeline_name(): + return utils.unique_name_from_base("Selective-Pipeline") + + +def test_selective_execution_among_pure_function_steps( + sagemaker_session, role, pipeline_name, region_name, dummy_container_without_error +): + # Test Selective Pipeline Execution on function step1 -> [select: function step2] + os.environ["AWS_DEFAULT_REGION"] = region_name + + step_settings = dict( + role=role, + instance_type=INSTANCE_TYPE, + image_uri=dummy_container_without_error, + keep_alive_period_in_seconds=60, + ) + + @step(**step_settings) + def generator() -> tuple: + return 3, 4 + + @step(**step_settings) + def sum(a, b): + """adds two numbers""" + return a + b + + step_output_a = generator() + step_output_b = sum(step_output_a[0], step_output_a[1]) + + pipeline = Pipeline( + name=pipeline_name, + steps=[step_output_b], + sagemaker_session=sagemaker_session, + ) + + try: + execution, _ = create_and_execute_pipeline( + pipeline=pipeline, + pipeline_name=pipeline_name, + region_name=region_name, + role=role, + no_of_steps=2, + last_step_name="sum", + execution_parameters=dict(), + 
step_status="Succeeded", + step_result_type=int, + step_result_value=7, + ) + + create_and_execute_pipeline( + pipeline=pipeline, + pipeline_name=pipeline_name, + region_name=region_name, + role=role, + no_of_steps=2, + last_step_name="sum", + execution_parameters=dict(), + step_status="Succeeded", + step_result_type=int, + step_result_value=7, + selective_execution_config=SelectiveExecutionConfig( + source_pipeline_execution_arn=execution.arn, + selected_steps=[get_step(step_output_b).name], + ), + ) + + finally: + try: + pipeline.delete() + except Exception: + pass + + +def test_selective_execution_of_regular_step_depended_by_function_step( + sagemaker_session, + role, + pipeline_name, + region_name, + dummy_container_without_error, + sklearn_latest_version, +): + # Test Selective Pipeline Execution on regular step -> [select: function step] + os.environ["AWS_DEFAULT_REGION"] = region_name + + script_path = os.path.join(DATA_DIR, "dummy_script.py") + + sklearn_processor = SKLearnProcessor( + framework_version=sklearn_latest_version, + role=role, + instance_type=INSTANCE_TYPE, + instance_count=1, + command=["python3"], + sagemaker_session=sagemaker_session, + base_job_name="test-sklearn", + ) + + step_sklearn = ProcessingStep( + name="sklearn-process", + processor=sklearn_processor, + code=script_path, + ) + + @step( + role=role, + instance_type=INSTANCE_TYPE, + image_uri=dummy_container_without_error, + keep_alive_period_in_seconds=60, + ) + def func_2(arg): + return arg + + final_output = func_2(step_sklearn.properties.ProcessingJobStatus) + + pipeline = Pipeline( + name=pipeline_name, + steps=[final_output], + sagemaker_session=sagemaker_session, + ) + + try: + execution, _ = create_and_execute_pipeline( + pipeline=pipeline, + pipeline_name=pipeline_name, + region_name=region_name, + role=role, + no_of_steps=2, + last_step_name="func", + execution_parameters=dict(), + step_status="Succeeded", + step_result_type=str, + step_result_value="Completed", + wait_duration=600, + ) + + create_and_execute_pipeline( + pipeline=pipeline, + pipeline_name=pipeline_name, + region_name=region_name, + role=role, + no_of_steps=2, + last_step_name="func", + execution_parameters=dict(), + step_status="Succeeded", + step_result_type=str, + step_result_value="Completed", + wait_duration=600, + selective_execution_config=SelectiveExecutionConfig( + source_pipeline_execution_arn=execution.arn, + selected_steps=[get_step(final_output).name], + ), + ) + + finally: + try: + pipeline.delete() + except Exception: + pass diff --git a/tests/integ/sagemaker/workflow/test_step_decorator.py b/tests/integ/sagemaker/workflow/test_step_decorator.py index 66f59956c3..bdd18a16f2 100644 --- a/tests/integ/sagemaker/workflow/test_step_decorator.py +++ b/tests/integ/sagemaker/workflow/test_step_decorator.py @@ -256,6 +256,7 @@ def sum(a, b): execution_parameters=dict(), step_status="Succeeded", step_result_type=int, + step_result_value=7, ) finally: try: @@ -784,12 +785,10 @@ def updated_func(x): pipeline.create(role) pipeline_definition = json.loads(pipeline.describe()["PipelineDefinition"]) - s3_base_uri = pipeline_definition["Steps"][0]["Arguments"]["OutputDataConfig"][ - "S3OutputPath" - ] step_container_args = pipeline_definition["Steps"][0]["Arguments"][ "AlgorithmSpecification" ]["ContainerArguments"] + s3_base_uri = step_container_args[step_container_args.index("--s3_base_uri") + 1] build_time = step_container_args[step_container_args.index("--func_step_s3_dir") + 1] # some other user updates the pickled function code 
diff --git a/tests/unit/sagemaker/remote_function/core/test_pipeline_variables.py b/tests/unit/sagemaker/remote_function/core/test_pipeline_variables.py index ebe26653b8..422d1949af 100644 --- a/tests/unit/sagemaker/remote_function/core/test_pipeline_variables.py +++ b/tests/unit/sagemaker/remote_function/core/test_pipeline_variables.py @@ -28,6 +28,7 @@ _DelayedReturnResolver, resolve_pipeline_variables, convert_pipeline_variables_to_pickleable, + _PropertiesResolver, _S3BaseUriIdentifier, ) @@ -47,31 +48,46 @@ def test_resolve_delayed_returns(mock_deserializer): delayed_returns = [ _DelayedReturn( - uri=["s3://my-bucket/", "sub-folder-1/"], reference_path=(("__getitem__", 0),) + uri=_Properties("Steps.func1.OutputDataConfig.S3OutputPath"), + reference_path=(("__getitem__", 0),), ), _DelayedReturn( - uri=["s3://my-bucket/", "sub-folder-1/"], reference_path=(("__getitem__", 1),) + uri=_Properties("Steps.func1.OutputDataConfig.S3OutputPath"), + reference_path=(("__getitem__", 1),), ), _DelayedReturn( - uri=["s3://my-bucket/", "sub-folder-1/"], reference_path=(("__getitem__", 2),) + uri=_Properties("Steps.func1.OutputDataConfig.S3OutputPath"), + reference_path=(("__getitem__", 2),), ), _DelayedReturn( - uri=["s3://my-bucket/", "sub-folder-1/"], + uri=_Properties("Steps.func1.OutputDataConfig.S3OutputPath"), reference_path=(("__getitem__", 2), ("__getitem__", "key")), ), # index out of bounds _DelayedReturn( - uri=["s3://my-bucket/", "sub-folder-1/"], reference_path=(("__getitem__", 3),) + uri=_Properties("Steps.func1.OutputDataConfig.S3OutputPath"), + reference_path=(("__getitem__", 3),), + ), + _DelayedReturn(uri=_Properties("Steps.func2.OutputDataConfig.S3OutputPath")), + # the obsoleted uri schema in old SDK version + _DelayedReturn( + uri=["s3://my-bucket/", "sub-folder-1/"], reference_path=(("__getitem__", 0),) ), - _DelayedReturn(uri=["s3://my-bucket/", "sub-folder-2/"]), ] mock_deserializer.return_value = (1, 2, {"key": 3}) + context = Context( + property_references={ + "Steps.func1.OutputDataConfig.S3OutputPath": "s3://my_bucket/exe_id/sub_folder1", + "Steps.func2.OutputDataConfig.S3OutputPath": "s3://my_bucket/exe_id/sub_folder2", + } + ) resolver = _DelayedReturnResolver( delayed_returns, "1234", - _ParameterResolver(Context()), - _ExecutionVariableResolver(Context()), + properties_resolver=_PropertiesResolver(context), + parameter_resolver=_ParameterResolver(context), + execution_variable_resolver=_ExecutionVariableResolver(context), sagemaker_session=None, s3_base_uri=f"s3://my-bucket/{PIPELINE_NAME}", ) @@ -83,25 +99,34 @@ def test_resolve_delayed_returns(mock_deserializer): with pytest.raises(IndexError): resolver.resolve(delayed_returns[4]) assert resolver.resolve(delayed_returns[5]) == (1, 2, {"key": 3}) - assert mock_deserializer.call_count == 2 + assert resolver.resolve(delayed_returns[6]) == 1 + assert mock_deserializer.call_count == 3 @patch("sagemaker.remote_function.core.pipeline_variables.deserialize_obj_from_s3") def test_deserializer_fails(mock_deserializer): delayed_returns = [ _DelayedReturn( - uri=["s3://my-bucket/", "sub-folder-1/"], reference_path=(("__getitem__", 0),) + uri=_Properties("Steps.func1.OutputDataConfig.S3OutputPath"), + reference_path=(("__getitem__", 0),), ), - _DelayedReturn(uri=["s3://my-bucket/", "sub-folder-2/"]), + _DelayedReturn(uri=_Properties("Steps.func2.OutputDataConfig.S3OutputPath")), ] mock_deserializer.side_effect = Exception("Something went wrong") + context = Context( + property_references={ + 
"Steps.func1.OutputDataConfig.S3OutputPath": "s3://my_bucket/exe_id/sub_folder1", + "Steps.func2.OutputDataConfig.S3OutputPath": "s3://my_bucket/exe_id/sub_folder2", + } + ) with pytest.raises(Exception, match="Something went wrong"): _DelayedReturnResolver( delayed_returns, "1234", - _ParameterResolver(Context()), - _ExecutionVariableResolver(Context()), + properties_resolver=_PropertiesResolver(context), + parameter_resolver=_ParameterResolver(context), + execution_variable_resolver=_ExecutionVariableResolver(context), sagemaker_session=None, s3_base_uri=f"s3://my-bucket/{PIPELINE_NAME}", ) @@ -142,6 +167,15 @@ def test_no_pipeline_variables_to_resolve(mock_deserializer, func_args, func_kwa _ParameterString("parameter_3"), _ParameterFloat("parameter_2"), _ParameterBoolean("parameter_4"), + _DelayedReturn( + uri=_Properties("Steps.func1.OutputDataConfig.S3OutputPath"), + reference_path=(("__getitem__", 0),), + ), + _DelayedReturn( + uri=_Properties("Steps.func1.OutputDataConfig.S3OutputPath"), + reference_path=(("__getitem__", 1),), + ), + # obsolete uri schema in old SDK version _DelayedReturn( uri=[ _S3BaseUriIdentifier(), @@ -161,7 +195,7 @@ def test_no_pipeline_variables_to_resolve(mock_deserializer, func_args, func_kwa _Properties("Steps.step_name.TrainingJobName"), ), {}, - (1, "string", 2.0, True, 1.0, 2.0, "a-cool-name"), + (1, "string", 2.0, True, 1.0, 2.0, 1.0, 2.0, "a-cool-name"), {}, ), ( @@ -172,6 +206,16 @@ def test_no_pipeline_variables_to_resolve(mock_deserializer, func_args, func_kwa "c": _ParameterFloat("parameter_2"), "d": _ParameterBoolean("parameter_4"), "e": _DelayedReturn( + uri=_Properties("Steps.func1.OutputDataConfig.S3OutputPath"), + reference_path=(("__getitem__", 0),), + ), + "f": _DelayedReturn( + uri=_Properties("Steps.func1.OutputDataConfig.S3OutputPath"), + reference_path=(("__getitem__", 1),), + ), + "g": _Properties("Steps.step_name.TrainingJobName"), + # obsolete uri schema in old SDK version + "h": _DelayedReturn( uri=[ _S3BaseUriIdentifier(), _ExecutionVariable("ExecutionId"), @@ -179,7 +223,7 @@ def test_no_pipeline_variables_to_resolve(mock_deserializer, func_args, func_kwa ], reference_path=(("__getitem__", 0),), ), - "f": _DelayedReturn( + "i": _DelayedReturn( uri=[ _S3BaseUriIdentifier(), _ExecutionVariable("ExecutionId"), @@ -187,7 +231,6 @@ def test_no_pipeline_variables_to_resolve(mock_deserializer, func_args, func_kwa ], reference_path=(("__getitem__", 1),), ), - "g": _Properties("Steps.step_name.TrainingJobName"), }, (), { @@ -198,6 +241,8 @@ def test_no_pipeline_variables_to_resolve(mock_deserializer, func_args, func_kwa "e": 1.0, "f": 2.0, "g": "a-cool-name", + "h": 1.0, + "i": 2.0, }, ), ], @@ -211,6 +256,7 @@ def test_resolve_pipeline_variables( expected_resolved_kwargs, ): s3_base_uri = f"s3://my-bucket/{PIPELINE_NAME}" + s3_results_uri = f"{s3_base_uri}/execution-id/sub-folder-1" context = Context( property_references={ "Parameters.parameter_1": "1", @@ -218,6 +264,7 @@ def test_resolve_pipeline_variables( "Parameters.parameter_3": "string", "Parameters.parameter_4": "true", "Execution.ExecutionId": "execution-id", + "Steps.func1.OutputDataConfig.S3OutputPath": s3_results_uri, "Steps.step_name.TrainingJobName": "a-cool-name", }, ) @@ -237,7 +284,7 @@ def test_resolve_pipeline_variables( assert resolved_kwargs == expected_resolved_kwargs mock_deserializer.assert_called_once_with( sagemaker_session=None, - s3_uri=f"{s3_base_uri}/execution-id/sub-folder-1", + s3_uri=s3_results_uri, hmac_key="1234", ) @@ -245,6 +292,9 @@ def 
test_resolve_pipeline_variables( def test_convert_pipeline_variables_to_pickleable(): function_step = Mock() function_step.name = "parent_step" + function_step._properties.OutputDataConfig.S3OutputPath = Properties( + step_name=function_step.name, path="OutputDataConfig.S3OutputPath" + ) func_args = ( DelayedReturn(function_step, reference_path=("__getitem__", 0)), ParameterBoolean("parameter_1"), @@ -274,12 +324,7 @@ def test_convert_pipeline_variables_to_pickleable(): assert converted_args == ( _DelayedReturn( - uri=[ - _S3BaseUriIdentifier(), - _ExecutionVariable(name="PipelineExecutionId"), - "parent_step", - "results", - ], + uri=_Properties(f"Steps.{function_step.name}.OutputDataConfig.S3OutputPath"), reference_path=("__getitem__", 0), ), _ParameterBoolean(name="parameter_1"), @@ -293,12 +338,7 @@ def test_convert_pipeline_variables_to_pickleable(): assert converted_kwargs == { "a": _DelayedReturn( - uri=[ - _S3BaseUriIdentifier(), - _ExecutionVariable(name="PipelineExecutionId"), - "parent_step", - "results", - ], + uri=_Properties(f"Steps.{function_step.name}.OutputDataConfig.S3OutputPath"), reference_path=("__getitem__", 1), ), "b": _ParameterBoolean(name="parameter_1"), diff --git a/tests/unit/sagemaker/remote_function/core/test_stored_function.py b/tests/unit/sagemaker/remote_function/core/test_stored_function.py index bcc09cb585..b263682641 100644 --- a/tests/unit/sagemaker/remote_function/core/test_stored_function.py +++ b/tests/unit/sagemaker/remote_function/core/test_stored_function.py @@ -315,12 +315,11 @@ def test_load_and_invoke_json_serialization( def test_save_and_load_with_pipeline_variable(monkeypatch): session = Mock() s3_base_uri = random_s3_uri() + func1_result_path = f"{s3_base_uri}/execution-id/func_1/results" function_step = _FunctionStep(name="func_1", display_name=None, description=None) x = DelayedReturn(function_step=function_step) - serialize_obj_to_s3( - 3.0, session, f"{s3_base_uri}/execution-id/func_1/results", HMAC_KEY, KMS_KEY - ) + serialize_obj_to_s3(3.0, session, func1_result_path, HMAC_KEY, KMS_KEY) stored_function = StoredFunction( sagemaker_session=session, @@ -332,8 +331,10 @@ def test_save_and_load_with_pipeline_variable(monkeypatch): "Parameters.a": "1.0", "Parameters.b": "2.0", "Parameters.c": "3.0", - "Execution.PipelineExecutionId": "execution-id", + "Steps.func_1.OutputDataConfig.S3OutputPath": func1_result_path, }, + execution_id="execution-id", + step_name="func_2", ), ) @@ -354,8 +355,9 @@ def test_save_and_load_with_pipeline_variable(monkeypatch): stored_function.save_pipeline_step_function(test_serialized_data) stored_function.load_and_invoke() + func2_result_path = f"{s3_base_uri}/execution-id/func_2/results" assert deserialize_obj_from_s3( - session, s3_uri=f"{s3_base_uri}/results", hmac_key=HMAC_KEY + session, s3_uri=func2_result_path, hmac_key=HMAC_KEY ) == quadratic(3.0, a=1.0, b=2.0, c=3.0) diff --git a/tests/unit/sagemaker/remote_function/runtime_environment/test_runtime_environment_manager.py b/tests/unit/sagemaker/remote_function/runtime_environment/test_runtime_environment_manager.py index afbcfb1ec5..45198f3388 100644 --- a/tests/unit/sagemaker/remote_function/runtime_environment/test_runtime_environment_manager.py +++ b/tests/unit/sagemaker/remote_function/runtime_environment/test_runtime_environment_manager.py @@ -208,7 +208,7 @@ def test_bootstrap_req_txt(): call_args = popen.call_args[0][0] assert call_args is not None - expected_cmd = "{} -m pip install -r {}".format(python_exe, TEST_REQUIREMENTS_TXT) + expected_cmd = 
"{} -m pip install -r {} -U".format(python_exe, TEST_REQUIREMENTS_TXT) assert call_args == expected_cmd @@ -229,7 +229,7 @@ def test_bootstrap_req_txt_error(): call_args = popen.call_args[0][0] assert call_args is not None - expected_cmd = "{} -m pip install -r {}".format(python_exe, TEST_REQUIREMENTS_TXT) + expected_cmd = "{} -m pip install -r {} -U".format(python_exe, TEST_REQUIREMENTS_TXT) assert call_args == expected_cmd @@ -260,7 +260,7 @@ def test_bootstrap_req_txt_with_conda_env(mock_conda_exe): call_args = popen.call_args[0][0] assert call_args is not None - expected_cmd = f"{mock_conda_exe.return_value} run -n conda_env pip install -r usr/local/requirements.txt" + expected_cmd = f"{mock_conda_exe.return_value} run -n conda_env pip install -r usr/local/requirements.txt -U" assert call_args == expected_cmd diff --git a/tests/unit/sagemaker/remote_function/test_job.py b/tests/unit/sagemaker/remote_function/test_job.py index 1884486f8b..ac321d4de0 100644 --- a/tests/unit/sagemaker/remote_function/test_job.py +++ b/tests/unit/sagemaker/remote_function/test_job.py @@ -27,10 +27,13 @@ from sagemaker.remote_function.spark_config import SparkConfig from sagemaker.remote_function.custom_file_filter import CustomFileFilter from sagemaker.remote_function.core.pipeline_variables import Context +from sagemaker.workflow.function_step import DelayedReturn +from sagemaker.workflow.functions import Join from sagemaker.workflow.pipeline_context import _PipelineConfig from sagemaker.workflow.pipeline_definition_config import PipelineDefinitionConfig from sagemaker.workflow.execution_variables import ExecutionVariables from sagemaker.utils import sagemaker_timestamp +from sagemaker.workflow.properties import Properties from tests.unit import DATA_DIR from sagemaker.remote_function.job import ( @@ -763,6 +766,13 @@ def test_get_train_args_under_pipeline_context( mock_stored_function = Mock() mock_stored_function_ctr.return_value = mock_stored_function + function_step = Mock() + function_step.name = "parent_step" + func_step_s3_output_prop = Properties( + step_name=function_step.name, path="OutputDataConfig.S3OutputPath" + ) + function_step._properties.OutputDataConfig.S3OutputPath = func_step_s3_output_prop + job_settings = _JobSettings( dependencies="path/to/dependencies/req.txt", pre_execution_script="path/to/script.sh", @@ -787,8 +797,16 @@ def test_get_train_args_under_pipeline_context( job_name=TEST_JOB_NAME, s3_base_uri=s3_base_uri, func=job_function, - func_args=(1, ParameterInteger(name="b", default_value=2)), - func_kwargs={"c": 3, "d": ParameterInteger(name="d", default_value=4)}, + func_args=( + 1, + ParameterInteger(name="b", default_value=2), + DelayedReturn(function_step, reference_path=("__getitem__", 0)), + ), + func_kwargs={ + "c": 3, + "d": ParameterInteger(name="d", default_value=4), + "e": DelayedReturn(function_step, reference_path=("__getitem__", 1)), + }, serialized_data=mocked_serialized_data, ) @@ -862,7 +880,15 @@ def test_get_train_args_under_pipeline_context( ), ], OutputDataConfig={ - "S3OutputPath": f"{S3_URI}/{TEST_PIPELINE_NAME}", + "S3OutputPath": Join( + on="/", + values=[ + "s3://my-s3-bucket/keyprefix/my-pipeline", + ExecutionVariables.PIPELINE_EXECUTION_ID, + "test-function-step", + "results", + ], + ), "KmsKeyId": KMS_KEY_ARN, }, AlgorithmSpecification=dict( @@ -896,8 +922,12 @@ def test_get_train_args_under_pipeline_context( ExecutionVariables.PIPELINE_EXECUTION_ID, "Parameters.b", ParameterInteger(name="b", default_value=2).to_string(), + 
"Steps.parent_step.OutputDataConfig.S3OutputPath", + func_step_s3_output_prop.to_string(), "Parameters.d", ParameterInteger(name="d", default_value=4).to_string(), + "Steps.parent_step.OutputDataConfig.S3OutputPath", + func_step_s3_output_prop.to_string(), ], ), ResourceConfig=dict( diff --git a/tests/unit/sagemaker/workflow/test_pipeline.py b/tests/unit/sagemaker/workflow/test_pipeline.py index 136b85fc49..d658455d62 100644 --- a/tests/unit/sagemaker/workflow/test_pipeline.py +++ b/tests/unit/sagemaker/workflow/test_pipeline.py @@ -19,6 +19,7 @@ import pytest from mock import Mock, call, patch +from mock.mock import MagicMock from sagemaker import s3 from sagemaker.remote_function.job import _JobSettings @@ -198,12 +199,7 @@ def test_large_pipeline_create(sagemaker_session_mock, role_arn): ) -@patch("botocore.waiter.create_waiter_with_client") -@patch("sagemaker.workflow.pipeline.deserialize_obj_from_s3") -@patch("sagemaker.s3.S3Downloader") -def test_pipeline_update( - waiter_mock, deserializer_mock, s3_downloader_mock, sagemaker_session_mock, role_arn -): +def test_pipeline_update(sagemaker_session_mock, role_arn): sagemaker_session_mock.sagemaker_config = {} pipeline = Pipeline( name="MyPipeline", @@ -262,6 +258,46 @@ def test_pipeline_update( PipelineName="MyPipeline", PipelineDefinition=pipeline.definition(), RoleArn=role_arn ) + +@pytest.mark.parametrize( + "s3_output_path, is_complete_path", + [ + ("s3:/my-bucket/my-key", False), + ("s3:/my-bucket/my-key/myexecution/stepA/results", True), + ("s3:/my-bucket/my-key/myexecution/stepA/results/", True), + ], +) +@patch("botocore.waiter.create_waiter_with_client", MagicMock()) +@patch("sagemaker.workflow.pipeline.deserialize_obj_from_s3") +def test_pipeline_execution_result( + mock_deserialize, s3_output_path, is_complete_path, sagemaker_session_mock, role_arn +): + sagemaker_session_mock.sagemaker_config = {} + + step1 = CustomStep(name="MyStep1") + dr_step_2 = DelayedReturn( + function_step=CustomFunctionStep( + func_args=(1, 2), + func=lambda x, y: x + y + 3, + job_settings=_JobSettings( + image_uri="image", + instance_type="ml.m4.xlarge", + sagemaker_session=sagemaker_session_mock, + ), + name="stepA", + description="", + display_name="", + depends_on=[step1], + ) + ) + pipeline = Pipeline( + name="MyPipeline", + parameters=[], + steps=[dr_step_2], + sagemaker_session=sagemaker_session_mock, + ) + pipeline.create(role_arn=role_arn) + sagemaker_session_mock.sagemaker_client.start_pipeline_execution.return_value = { "PipelineExecutionArn": "arn:aws:sagemaker:us-west-2:111111111111:pipeline/mypipeline/execution/myexecution", } @@ -270,7 +306,7 @@ def test_pipeline_update( sagemaker_session_mock.sagemaker_client.list_pipeline_execution_steps.return_value = { "PipelineExecutionSteps": [ { - "StepName": "stepC", + "StepName": "stepA", "Metadata": { "TrainingJob": { "Arn": "arn:aws:sagemaker:us-west-2:111111111111:training-job/foo" @@ -288,16 +324,15 @@ def test_pipeline_update( ] }, "TrainingJobStatus": "Completed", - "OutputDataConfig": {"S3OutputPath": "s3:/my-bucket/my-key"}, + "OutputDataConfig": {"S3OutputPath": s3_output_path}, "Environment": {"REMOTE_FUNCTION_SECRET_KEY": "abcdefg"}, } - execution.result("stepC") - assert s3_downloader_mock.read_bytes( - "s3:/my-bucket/my-key/myexecution/stepC/results/metadata.json" - ) - assert s3_downloader_mock.read_bytes( - "s3:/my-bucket/my-key/myexecution/stepC/results/payload.pkl" + execution.result("stepA") + + expected_s3_uri = ( + s3_output_path if is_complete_path else 
f"{s3_output_path}/myexecution/stepA/results" ) + assert mock_deserialize.call_args.kwargs["s3_uri"] == expected_s3_uri def test_pipeline_update_with_parallelism_config(sagemaker_session_mock, role_arn): From 55df3fcf149f30edc85b66e9b1ac297ae9922cac Mon Sep 17 00:00:00 2001 From: sagemaker-bot Date: Wed, 24 Jan 2024 14:17:33 +0000 Subject: [PATCH 60/76] change: update image_uri_configs 01-24-2024 06:17:33 PST --- src/sagemaker/image_uri_config/model-monitor.json | 1 + 1 file changed, 1 insertion(+) diff --git a/src/sagemaker/image_uri_config/model-monitor.json b/src/sagemaker/image_uri_config/model-monitor.json index f0d50acf7a..117dbecd84 100644 --- a/src/sagemaker/image_uri_config/model-monitor.json +++ b/src/sagemaker/image_uri_config/model-monitor.json @@ -25,6 +25,7 @@ "eu-west-2": "749857270468", "eu-west-3": "680080141114", "il-central-1": "843974653677", + "me-central-1": "588750061953", "me-south-1": "607024016150", "sa-east-1": "539772159869", "us-east-1": "156813124566", From 5806f8b901798951f324a014cf392381c8675dcc Mon Sep 17 00:00:00 2001 From: Jay Goyani <135654128+jgoyani1@users.noreply.github.com> Date: Wed, 24 Jan 2024 12:01:53 -0800 Subject: [PATCH 61/76] fix: update get_execution_role_arn from metadata file if present (#4388) --- src/sagemaker/session.py | 7 ++++++- tests/unit/test_session.py | 19 +++++++++++++++++++ 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/src/sagemaker/session.py b/src/sagemaker/session.py index 8f2753a7cf..b1342eb381 100644 --- a/src/sagemaker/session.py +++ b/src/sagemaker/session.py @@ -5412,6 +5412,7 @@ def get_caller_identity_arn(self): domain_id = metadata.get("DomainId") user_profile_name = metadata.get("UserProfileName") space_name = metadata.get("SpaceName") + execution_role_arn = metadata.get("ExecutionRoleArn") try: if domain_id is None: instance_desc = self.sagemaker_client.describe_notebook_instance( @@ -5419,7 +5420,11 @@ def get_caller_identity_arn(self): ) return instance_desc["RoleArn"] - # In Space app, find execution role from DefaultSpaceSettings on domain level + # find execution role from the metadata file if present + if execution_role_arn is not None: + return execution_role_arn + + # In Shared Space app, find execution role from DefaultSpaceSettings on domain level if space_name is not None: domain_desc = self.sagemaker_client.describe_domain(DomainId=domain_id) return domain_desc["DefaultSpaceSettings"]["ExecutionRole"] diff --git a/tests/unit/test_session.py b/tests/unit/test_session.py index 6ee2cc9af5..93828d882f 100644 --- a/tests/unit/test_session.py +++ b/tests/unit/test_session.py @@ -698,6 +698,25 @@ def test_fallback_to_domain_if_role_unavailable_in_user_settings(boto_session): sess.sagemaker_client.describe_domain.assert_called_once_with(DomainId="d-kbnw5yk6tg8j") +@patch( + "six.moves.builtins.open", + mock_open( + read_data='{"ResourceName": "SageMakerInstance", ' + '"DomainId": "d-kbnw5yk6tg8j", ' + '"ExecutionRoleArn": "arn:aws:iam::369233609183:role/service-role/SageMakerRole-20171129T072388", ' + '"SpaceName": "space_name"}' + ), +) +@patch("os.path.exists", side_effect=mock_exists(NOTEBOOK_METADATA_FILE, True)) +def test_get_caller_identity_arn_from_metadata_file_for_space(boto_session): + sess = Session(boto_session) + expected_role = "arn:aws:iam::369233609183:role/service-role/SageMakerRole-20171129T072388" + + actual = sess.get_caller_identity_arn() + + assert actual == expected_role + + @patch( "six.moves.builtins.open", mock_open( From 6affb2d0b55e3d7e39abc590e5714527ac28980a Mon Sep 
17 00:00:00 2001 From: qidewenwhen <32910701+qidewenwhen@users.noreply.github.com> Date: Wed, 24 Jan 2024 12:02:41 -0800 Subject: [PATCH 62/76] fix: Support using PipelineDefinitionConfig in local mode (#4352) --- src/sagemaker/local/pipeline.py | 12 ++- .../sagemaker/local/test_local_pipeline.py | 95 ++++++++++++++++++- 2 files changed, 102 insertions(+), 5 deletions(-) diff --git a/src/sagemaker/local/pipeline.py b/src/sagemaker/local/pipeline.py index 4b9209fc0b..9e97dd2059 100644 --- a/src/sagemaker/local/pipeline.py +++ b/src/sagemaker/local/pipeline.py @@ -273,8 +273,10 @@ class _TrainingStepExecutor(_StepExecutor): """Executor class to execute TrainingStep locally""" def execute(self): - job_name = unique_name_from_base(self.step.name) step_arguments = self.pipline_executor.evaluate_step_arguments(self.step) + job_name = step_arguments.pop("TrainingJobName", None) or unique_name_from_base( + self.step.name + ) try: self.pipline_executor.local_sagemaker_client.create_training_job( job_name, **step_arguments @@ -290,8 +292,10 @@ class _ProcessingStepExecutor(_StepExecutor): """Executor class to execute ProcessingStep locally""" def execute(self): - job_name = unique_name_from_base(self.step.name) step_arguments = self.pipline_executor.evaluate_step_arguments(self.step) + job_name = step_arguments.pop("ProcessingJobName", None) or unique_name_from_base( + self.step.name + ) try: self.pipline_executor.local_sagemaker_client.create_processing_job( job_name, **step_arguments @@ -482,8 +486,10 @@ class _TransformStepExecutor(_StepExecutor): """Executor class to execute TransformStep locally""" def execute(self): - job_name = unique_name_from_base(self.step.name) step_arguments = self.pipline_executor.evaluate_step_arguments(self.step) + job_name = step_arguments.pop("TransformJobName", None) or unique_name_from_base( + self.step.name + ) try: self.pipline_executor.local_sagemaker_client.create_transform_job( job_name, **step_arguments diff --git a/tests/unit/sagemaker/local/test_local_pipeline.py b/tests/unit/sagemaker/local/test_local_pipeline.py index 8fbadb9c05..548a70027b 100644 --- a/tests/unit/sagemaker/local/test_local_pipeline.py +++ b/tests/unit/sagemaker/local/test_local_pipeline.py @@ -47,6 +47,7 @@ from sagemaker.workflow.parameters import ParameterInteger, ParameterString from sagemaker.workflow.pipeline import Pipeline from sagemaker.workflow.pipeline_context import PipelineSession +from sagemaker.workflow.pipeline_definition_config import PipelineDefinitionConfig from sagemaker.workflow.step_outputs import get_step from sagemaker.workflow.steps import ( ProcessingStep, @@ -111,6 +112,7 @@ "Result": [1, 2, 3], "Exception": null } """ +TEST_JOB_NAME = "test-job-name" @pytest.fixture @@ -188,6 +190,8 @@ def training_step(pipeline_session): sagemaker_session=pipeline_session, output_path="s3://a/b", use_spot_instances=False, + # base_job_name would be popped out if no pipeline_definition_config configured + base_job_name=TEST_JOB_NAME, ) training_input = TrainingInput(s3_data=f"s3://{BUCKET}/train_manifest") step_args = estimator.fit(inputs=training_input) @@ -207,6 +211,8 @@ def processing_step(pipeline_session): instance_count=1, instance_type=INSTANCE_TYPE, sagemaker_session=pipeline_session, + # base_job_name would be popped out if no pipeline_definition_config configured + base_job_name=TEST_JOB_NAME, ) processing_input = [ ProcessingInput( @@ -239,6 +245,8 @@ def transform_step(pipeline_session): instance_count=1, output_path="s3://my-bucket/my-output-path", 
sagemaker_session=pipeline_session, + # base_transform_job_name would be popped out if no pipeline_definition_config configured + base_transform_job_name=TEST_JOB_NAME, ) transform_inputs = TransformInput(data="s3://my-bucket/my-data") step_args = transformer.transform( @@ -871,8 +879,8 @@ def depends_step(): ) -@patch("sagemaker.local.image._SageMakerContainer.process") -def test_execute_pipeline_processing_step(process, local_sagemaker_session, processing_step): +@patch("sagemaker.local.image._SageMakerContainer.process", MagicMock()) +def test_execute_pipeline_processing_step(local_sagemaker_session, processing_step): pipeline = Pipeline( name="MyPipeline2", steps=[processing_step], @@ -1362,3 +1370,86 @@ def test_execute_pipeline_step_create_transform_job_fail( step_execution = execution.step_execution assert step_execution[transform_step.name].status == _LocalExecutionStatus.FAILED.value assert "Dummy RuntimeError" in step_execution[transform_step.name].failure_reason + + +@patch( + "sagemaker.local.image._SageMakerContainer.train", + MagicMock(return_value="/some/path/to/model"), +) +@patch("sagemaker.local.image._SageMakerContainer.process", MagicMock()) +def test_pipeline_definition_config_in_local_mode_for_train_process_steps( + processing_step, + training_step, + local_sagemaker_session, +): + exe_steps = [processing_step, training_step] + + def _verify_execution(exe_step_name, execution, with_custom_job_prefix): + assert not execution.failure_reason + assert execution.status == _LocalExecutionStatus.SUCCEEDED.value + + step_execution = execution.step_execution[exe_step_name] + assert step_execution.status == _LocalExecutionStatus.SUCCEEDED.value + + if step_execution.type == StepTypeEnum.PROCESSING: + job_name_field = "ProcessingJobName" + elif step_execution.type == StepTypeEnum.TRAINING: + job_name_field = "TrainingJobName" + + if with_custom_job_prefix: + assert step_execution.properties[job_name_field] == TEST_JOB_NAME + else: + assert step_execution.properties[job_name_field].startswith(step_execution.name) + + for exe_step in exe_steps: + pipeline = Pipeline( + name="MyPipelineX-" + exe_step.name, + steps=[exe_step], + sagemaker_session=local_sagemaker_session, + parameters=[INSTANCE_COUNT_PIPELINE_PARAMETER], + ) + + execution = LocalPipelineExecutor( + _LocalPipelineExecution("my-execution-x-" + exe_step.name, pipeline), + local_sagemaker_session, + ).execute() + + _verify_execution( + exe_step_name=exe_step.name, execution=execution, with_custom_job_prefix=False + ) + + pipeline.pipeline_definition_config = PipelineDefinitionConfig(use_custom_job_prefix=True) + execution = LocalPipelineExecutor( + _LocalPipelineExecution("my-execution-x-" + exe_step.name, pipeline), + local_sagemaker_session, + ).execute() + + _verify_execution( + exe_step_name=exe_step.name, execution=execution, with_custom_job_prefix=True + ) + + +@patch("sagemaker.local.local_session.LocalSagemakerClient.create_transform_job") +def test_pipeline_definition_config_in_local_mode_for_transform_step( + create_transform_job, local_sagemaker_session, transform_step +): + pipeline = Pipeline( + name="MyPipelineX-" + transform_step.name, + steps=[transform_step], + sagemaker_session=local_sagemaker_session, + ) + LocalPipelineExecutor( + _LocalPipelineExecution("my-execution-x-" + transform_step.name, pipeline), + local_sagemaker_session, + ).execute() + + assert create_transform_job.call_args.args[0].startswith(transform_step.name) + + pipeline.pipeline_definition_config = 
PipelineDefinitionConfig(use_custom_job_prefix=True) + + LocalPipelineExecutor( + _LocalPipelineExecution("my-execution-x-" + transform_step.name, pipeline), + local_sagemaker_session, + ).execute() + + assert create_transform_job.call_args.args[0] == TEST_JOB_NAME From a97d24f7ae0463fc67b16c0980a42da3abad83e8 Mon Sep 17 00:00:00 2001 From: Justin Date: Wed, 24 Jan 2024 14:04:26 -0600 Subject: [PATCH 63/76] fix: remove fastapi and uvicorn dependencies (#4365) They are not used in the codebase. Closes #4361 #4295 --- setup.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/setup.py b/setup.py index ff833b7a5c..22cc8e2dc3 100644 --- a/setup.py +++ b/setup.py @@ -65,8 +65,6 @@ def read_requirements(filename): "platformdirs", "tblib>=1.7.0,<3", "urllib3<1.27", - "uvicorn==0.22.0", - "fastapi==0.95.2", "requests", "docker", "tqdm", From 97735244af0fcff63dcd8884fab3ac97a93eb011 Mon Sep 17 00:00:00 2001 From: ci Date: Thu, 25 Jan 2024 17:59:18 +0000 Subject: [PATCH 64/76] prepare release v2.205.0 --- CHANGELOG.md | 14 ++++++++++++++ VERSION | 2 +- 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 28984dd77d..7485f34081 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,19 @@ # Changelog +## v2.205.0 (2024-01-25) + +### Features + + * Support selective pipeline execution for function step + +### Bug Fixes and Other Changes + + * remove fastapi and uvicorn dependencies + * Support using PipelineDefinitionConfig in local mode + * update get_execution_role_arn from metadata file if present + * update image_uri_configs 01-24-2024 06:17:33 PST + * Add validation for empty ParameterString value in start local pipeline + ## v2.204.0 (2024-01-23) ### Features diff --git a/VERSION b/VERSION index 3f1ba2fb4c..b0436fe4e2 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -2.204.1.dev0 +2.205.0 From 2fda21e7c437de44b7eb08e3fcc8e4c77f2e7be2 Mon Sep 17 00:00:00 2001 From: ci Date: Thu, 25 Jan 2024 17:59:20 +0000 Subject: [PATCH 65/76] update development version to v2.205.1.dev0 --- VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VERSION b/VERSION index b0436fe4e2..99cb13b210 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -2.205.0 +2.205.1.dev0 From a6073d9c9373b4440ef82af46175bcd92ff0f9eb Mon Sep 17 00:00:00 2001 From: amzn-choeric <105388439+amzn-choeric@users.noreply.github.com> Date: Fri, 26 Jan 2024 11:10:16 -0500 Subject: [PATCH 66/76] change: TGI NeuronX 0.0.17 (#4390) --- .../huggingface-llm-neuronx.json | 29 +++++++++++++++++++ .../image_uris/test_huggingface_llm.py | 1 + 2 files changed, 30 insertions(+) diff --git a/src/sagemaker/image_uri_config/huggingface-llm-neuronx.json b/src/sagemaker/image_uri_config/huggingface-llm-neuronx.json index a13336fb79..d35a3c30a0 100644 --- a/src/sagemaker/image_uri_config/huggingface-llm-neuronx.json +++ b/src/sagemaker/image_uri_config/huggingface-llm-neuronx.json @@ -35,6 +35,35 @@ "container_version": { "inf2": "ubuntu22.04" } + }, + "0.0.17": { + "py_versions": [ + "py310" + ], + "registries": { + "ap-northeast-1": "763104351884", + "ap-south-1": "763104351884", + "ap-south-2": "772153158452", + "ap-southeast-1": "763104351884", + "ap-southeast-2": "763104351884", + "ap-southeast-4": "457447274322", + "eu-central-1": "763104351884", + "eu-central-2": "380420809688", + "eu-south-2": "503227376785", + "eu-west-1": "763104351884", + "eu-west-3": "763104351884", + "il-central-1": "780543022126", + "sa-east-1": "763104351884", + "us-east-1": "763104351884", + "us-east-2": "763104351884", + 
"us-west-2": "763104351884", + "ca-west-1": "204538143572" + }, + "tag_prefix": "1.13.1-optimum0.0.17", + "repository": "huggingface-pytorch-tgi-inference", + "container_version": { + "inf2": "ubuntu22.04" + } } } } diff --git a/tests/unit/sagemaker/image_uris/test_huggingface_llm.py b/tests/unit/sagemaker/image_uris/test_huggingface_llm.py index b02fe36e99..9a441bbcf9 100644 --- a/tests/unit/sagemaker/image_uris/test_huggingface_llm.py +++ b/tests/unit/sagemaker/image_uris/test_huggingface_llm.py @@ -31,6 +31,7 @@ }, "inf2": { "0.0.16": "1.13.1-optimum0.0.16-neuronx-py310-ubuntu22.04", + "0.0.17": "1.13.1-optimum0.0.17-neuronx-py310-ubuntu22.04", }, } From 2a525724e6dad7a71c88e300a47f9b9d89e36694 Mon Sep 17 00:00:00 2001 From: qidewenwhen <32910701+qidewenwhen@users.noreply.github.com> Date: Fri, 26 Jan 2024 08:10:56 -0800 Subject: [PATCH 67/76] fix: Support PipelineVariable for ModelQualityCheckConfig attributes (#4353) --- src/sagemaker/workflow/quality_check_step.py | 52 ++++++++++++------ .../workflow/test_quality_check_step.py | 53 ++++++++++++++++--- 2 files changed, 82 insertions(+), 23 deletions(-) diff --git a/src/sagemaker/workflow/quality_check_step.py b/src/sagemaker/workflow/quality_check_step.py index 2cae687770..8257ed3844 100644 --- a/src/sagemaker/workflow/quality_check_step.py +++ b/src/sagemaker/workflow/quality_check_step.py @@ -13,6 +13,7 @@ """The step definitions for workflow.""" from __future__ import absolute_import +import logging from abc import ABC from typing import List, Union, Optional import os @@ -24,7 +25,8 @@ from sagemaker.processing import ProcessingOutput, ProcessingJob, Processor, ProcessingInput from sagemaker.workflow import is_pipeline_variable -from sagemaker.workflow.entities import RequestType, PipelineVariable +from sagemaker.workflow.entities import RequestType, PipelineVariable, PrimitiveType +from sagemaker.workflow.parameters import Parameter, ParameterString from sagemaker.workflow.properties import ( Properties, ) @@ -47,6 +49,9 @@ _DATA_QUALITY_TYPE = "DATA_QUALITY" +logger = logging.getLogger(__name__) + + @attr.s class QualityCheckConfig(ABC): """Quality Check Config. 
@@ -407,25 +412,19 @@ def _generate_baseline_processor( post_processor_script_container_path=post_processor_script_container_path, ) else: - inference_attribute = ( - str(quality_check_cfg.inference_attribute) - if quality_check_cfg.inference_attribute is not None - else None + inference_attribute = _format_env_variable_value( + var_value=quality_check_cfg.inference_attribute, var_name="inference_attribute" ) - probability_attribute = ( - str(quality_check_cfg.probability_attribute) - if quality_check_cfg.probability_attribute is not None - else None + probability_attribute = _format_env_variable_value( + var_value=quality_check_cfg.probability_attribute, var_name="probability_attribute" ) - ground_truth_attribute = ( - str(quality_check_cfg.ground_truth_attribute) - if quality_check_cfg.ground_truth_attribute is not None - else None + ground_truth_attribute = _format_env_variable_value( + var_value=quality_check_cfg.ground_truth_attribute, + var_name="ground_truth_attribute", ) - probability_threshold_attr = ( - str(quality_check_cfg.probability_threshold_attribute) - if quality_check_cfg.probability_threshold_attribute is not None - else None + probability_threshold_attr = _format_env_variable_value( + var_value=quality_check_cfg.probability_threshold_attribute, + var_name="probability_threshold_attr", ) normalized_env = ModelMonitor._generate_env_map( env=self._model_monitor.env, @@ -458,3 +457,22 @@ def _generate_baseline_processor( tags=self._model_monitor.tags, network_config=self._model_monitor.network_config, ) + + +def _format_env_variable_value(var_value: Union[PrimitiveType, PipelineVariable], var_name: str): + """Helper function to format the variable values passed to env var + + Args: + var_value (PrimitiveType or PipelineVariable): The value of the variable. + var_name (str): The name of the variable. 
+ """ + if var_value is None: + return None + + if is_pipeline_variable(var_value): + if isinstance(var_value, Parameter) and not isinstance(var_value, ParameterString): + raise ValueError(f"{var_name} cannot be Parameter types other than ParameterString.") + logger.warning("%s's runtime value must be the string type.", var_name) + return var_value + + return str(var_value) diff --git a/tests/unit/sagemaker/workflow/test_quality_check_step.py b/tests/unit/sagemaker/workflow/test_quality_check_step.py index 07dc37bafd..88125b714c 100644 --- a/tests/unit/sagemaker/workflow/test_quality_check_step.py +++ b/tests/unit/sagemaker/workflow/test_quality_check_step.py @@ -16,7 +16,8 @@ import pytest from sagemaker.model_monitor import DatasetFormat -from sagemaker.workflow.parameters import ParameterString +from sagemaker.workflow.execution_variables import ExecutionVariable +from sagemaker.workflow.parameters import ParameterString, ParameterInteger from sagemaker.workflow.pipeline import Pipeline from sagemaker.workflow.pipeline import PipelineDefinitionConfig from sagemaker.workflow.quality_check_step import ( @@ -178,8 +179,6 @@ "dataset_source": "/opt/ml/processing/input/baseline_dataset_input", "analysis_type": "MODEL_QUALITY", "problem_type": "BinaryClassification", - "probability_attribute": "0", - "probability_threshold_attribute": "0.5", }, "StoppingCondition": {"MaxRuntimeInSeconds": 1800}, }, @@ -269,23 +268,54 @@ def test_data_quality_check_step( assert step_definition == _expected_data_quality_dsl +@pytest.mark.parametrize( + "quality_cfg_attr_value, expected_value_in_dsl", + [ + (0, "0"), + ("attr", "attr"), + (None, None), + (ParameterString(name="ParamStringEnvVar"), {"Get": "Parameters.ParamStringEnvVar"}), + (ExecutionVariable("PipelineArn"), {"Get": "Execution.PipelineArn"}), + (ParameterInteger(name="ParamIntEnvVar"), "Error"), + ], +) def test_model_quality_check_step( sagemaker_session, check_job_config, model_package_group_name, supplied_baseline_statistics_uri, supplied_baseline_constraints_uri, + quality_cfg_attr_value, + expected_value_in_dsl, ): model_quality_check_config = ModelQualityCheckConfig( baseline_dataset="baseline_dataset_s3_url", dataset_format=DatasetFormat.csv(header=True), problem_type="BinaryClassification", - probability_attribute=0, # the integer should be converted to str by SDK - ground_truth_attribute=None, - probability_threshold_attribute=0.5, # the float should be converted to str by SDK + inference_attribute=quality_cfg_attr_value, + probability_attribute=quality_cfg_attr_value, + ground_truth_attribute=quality_cfg_attr_value, + probability_threshold_attribute=quality_cfg_attr_value, post_analytics_processor_script="s3://my_bucket/data_quality/postprocessor.py", output_s3_uri="", ) + + if expected_value_in_dsl == "Error": + with pytest.raises(ValueError) as err: + QualityCheckStep( + name="ModelQualityCheckStep", + register_new_baseline=False, + skip_check=False, + fail_on_violation=True, + quality_check_config=model_quality_check_config, + check_job_config=check_job_config, + model_package_group_name=model_package_group_name, + supplied_baseline_statistics=supplied_baseline_statistics_uri, + supplied_baseline_constraints=supplied_baseline_constraints_uri, + ) + assert "cannot be Parameter types other than ParameterString" in str(err) + return + model_quality_check_step = QualityCheckStep( name="ModelQualityCheckStep", register_new_baseline=False, @@ -297,6 +327,7 @@ def test_model_quality_check_step( 
supplied_baseline_statistics=supplied_baseline_statistics_uri, supplied_baseline_constraints=supplied_baseline_constraints_uri, ) + pipeline = Pipeline( name="MyPipeline", parameters=[ @@ -310,6 +341,16 @@ def test_model_quality_check_step( step_definition = _get_step_definition_for_test(pipeline) + step_def_env = step_definition["Arguments"]["Environment"] + for var in [ + "inference_attribute", + "probability_attribute", + "ground_truth_attribute", + "probability_threshold_attribute", + ]: + env_var_dsl = step_def_env.pop(var, None) + assert env_var_dsl == expected_value_in_dsl + assert step_definition == _expected_model_quality_dsl From 14e6a0385002938bb0013f4bd54562c1a0e4b555 Mon Sep 17 00:00:00 2001 From: Jonathan Makunga <54963715+makungaj1@users.noreply.github.com> Date: Tue, 30 Jan 2024 11:38:03 -0800 Subject: [PATCH 68/76] feat: Logic to detect hardware GPU count and aggregate GPU memory size in MiB (#4389) * Add logic to detect hardware GPU count and aggregate GPU memory size in MiB * Fix all formatting * Addressed PR review comments * Addressed PR Review messages * Addressed PR Review Messages * Addressed PR Review comments * Addressed PR Review Comments * Add integration tests * Add config * Fix integration tests * Include Instance Types GPU infor Config files * Addressed PR review comments * Fix unit tests * Fix unit test: 'Mock' object is not subscriptable --------- Co-authored-by: Jonathan Makunga --- .../image_uri_config/instance_gpu_info.json | 782 ++++++++++++++++++ src/sagemaker/instance_types_gpu_info.py | 43 + .../serve/utils/hardware_detector.py | 110 +++ .../serve/utils/test_hardware_detector.py | 44 + .../serve/utils/test_hardware_detector.py | 98 +++ .../sagemaker/test_instance_types_gpu_info.py | 30 + 6 files changed, 1107 insertions(+) create mode 100644 src/sagemaker/image_uri_config/instance_gpu_info.json create mode 100644 src/sagemaker/instance_types_gpu_info.py create mode 100644 src/sagemaker/serve/utils/hardware_detector.py create mode 100644 tests/integ/sagemaker/serve/utils/test_hardware_detector.py create mode 100644 tests/unit/sagemaker/serve/utils/test_hardware_detector.py create mode 100644 tests/unit/sagemaker/test_instance_types_gpu_info.py diff --git a/src/sagemaker/image_uri_config/instance_gpu_info.json b/src/sagemaker/image_uri_config/instance_gpu_info.json new file mode 100644 index 0000000000..9fc005bc47 --- /dev/null +++ b/src/sagemaker/image_uri_config/instance_gpu_info.json @@ -0,0 +1,782 @@ +{ + "af-south-1": { + "ml.p5.48xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 655360}, + "ml.p4d.24xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 327680}, + "ml.p4de.24xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 655360}, + "ml.p3.2xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.p3.8xlarge": {"Count": 4, "TotalGpuMemoryInMiB": 65536}, + "ml.p3.16xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 131072}, + "ml.p3dn.24xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 262144}, + "ml.p2.xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 12288}, + "ml.p2.8xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 98304}, + "ml.p2.16xlarge": {"Count": 16, "TotalGpuMemoryInMiB": 196608}, + "ml.g4dn.xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.2xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.4xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.8xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.16xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.12xlarge": {"Count": 4, "TotalGpuMemoryInMiB": 65536}, + 
"ml.g5n.xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.2xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.4xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.8xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.16xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.12xlarge": {"Count": 4, "TotalGpuMemoryInMiB": 98304}, + "ml.g5.24xlarge": {"Count": 4, "TotalGpuMemoryInMiB": 98304}, + "ml.g5.48xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 196608} + }, + "ap-east-1": { + "ml.p5.48xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 655360}, + "ml.p4d.24xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 327680}, + "ml.p4de.24xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 655360}, + "ml.p3.2xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.p3.8xlarge": {"Count": 4, "TotalGpuMemoryInMiB": 65536}, + "ml.p3.16xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 131072}, + "ml.p3dn.24xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 262144}, + "ml.p2.xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 12288}, + "ml.p2.8xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 98304}, + "ml.p2.16xlarge": {"Count": 16, "TotalGpuMemoryInMiB": 196608}, + "ml.g4dn.xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.2xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.4xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.8xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.16xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.12xlarge": {"Count": 4, "TotalGpuMemoryInMiB": 65536}, + "ml.g5n.xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.2xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.4xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.8xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.16xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.12xlarge": {"Count": 4, "TotalGpuMemoryInMiB": 98304}, + "ml.g5.24xlarge": {"Count": 4, "TotalGpuMemoryInMiB": 98304}, + "ml.g5.48xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 196608} + }, + "ap-northeast-1": { + "ml.p5.48xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 655360}, + "ml.p4d.24xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 327680}, + "ml.p4de.24xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 655360}, + "ml.p3.2xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.p3.8xlarge": {"Count": 4, "TotalGpuMemoryInMiB": 65536}, + "ml.p3.16xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 131072}, + "ml.p3dn.24xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 262144}, + "ml.p2.xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 12288}, + "ml.p2.8xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 98304}, + "ml.p2.16xlarge": {"Count": 16, "TotalGpuMemoryInMiB": 196608}, + "ml.g4dn.xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.2xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.4xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.8xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.16xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.12xlarge": {"Count": 4, "TotalGpuMemoryInMiB": 65536}, + "ml.g5n.xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.2xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.4xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.8xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.16xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.12xlarge": {"Count": 4, "TotalGpuMemoryInMiB": 98304}, + "ml.g5.24xlarge": {"Count": 4, 
"TotalGpuMemoryInMiB": 98304}, + "ml.g5.48xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 196608} + }, + "ap-northeast-2": { + "ml.p5.48xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 655360}, + "ml.p4d.24xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 327680}, + "ml.p4de.24xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 655360}, + "ml.p3.2xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.p3.8xlarge": {"Count": 4, "TotalGpuMemoryInMiB": 65536}, + "ml.p3.16xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 131072}, + "ml.p3dn.24xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 262144}, + "ml.p2.xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 12288}, + "ml.p2.8xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 98304}, + "ml.p2.16xlarge": {"Count": 16, "TotalGpuMemoryInMiB": 196608}, + "ml.g4dn.xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.2xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.4xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.8xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.16xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.12xlarge": {"Count": 4, "TotalGpuMemoryInMiB": 65536}, + "ml.g5n.xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.2xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.4xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.8xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.16xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.12xlarge": {"Count": 4, "TotalGpuMemoryInMiB": 98304}, + "ml.g5.24xlarge": {"Count": 4, "TotalGpuMemoryInMiB": 98304}, + "ml.g5.48xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 196608} + }, + "ap-northeast-3": { + "ml.p5.48xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 655360}, + "ml.p4d.24xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 327680}, + "ml.p4de.24xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 655360}, + "ml.p3.2xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.p3.8xlarge": {"Count": 4, "TotalGpuMemoryInMiB": 65536}, + "ml.p3.16xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 131072}, + "ml.p3dn.24xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 262144}, + "ml.p2.xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 12288}, + "ml.p2.8xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 98304}, + "ml.p2.16xlarge": {"Count": 16, "TotalGpuMemoryInMiB": 196608}, + "ml.g4dn.xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.2xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.4xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.8xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.16xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.12xlarge": {"Count": 4, "TotalGpuMemoryInMiB": 65536}, + "ml.g5n.xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.2xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.4xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.8xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.16xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.12xlarge": {"Count": 4, "TotalGpuMemoryInMiB": 98304}, + "ml.g5.24xlarge": {"Count": 4, "TotalGpuMemoryInMiB": 98304}, + "ml.g5.48xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 196608} + }, + "ap-south-1": { + "ml.p5.48xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 655360}, + "ml.p4d.24xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 327680}, + "ml.p4de.24xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 655360}, + "ml.p3.2xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.p3.8xlarge": {"Count": 
4, "TotalGpuMemoryInMiB": 65536}, + "ml.p3.16xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 131072}, + "ml.p3dn.24xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 262144}, + "ml.p2.xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 12288}, + "ml.p2.8xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 98304}, + "ml.p2.16xlarge": {"Count": 16, "TotalGpuMemoryInMiB": 196608}, + "ml.g4dn.xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.2xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.4xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.8xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.16xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.12xlarge": {"Count": 4, "TotalGpuMemoryInMiB": 65536}, + "ml.g5n.xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.2xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.4xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.8xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.16xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.12xlarge": {"Count": 4, "TotalGpuMemoryInMiB": 98304}, + "ml.g5.24xlarge": {"Count": 4, "TotalGpuMemoryInMiB": 98304}, + "ml.g5.48xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 196608} + }, + "ap-southeast-1": { + "ml.p5.48xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 655360}, + "ml.p4d.24xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 327680}, + "ml.p4de.24xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 655360}, + "ml.p3.2xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.p3.8xlarge": {"Count": 4, "TotalGpuMemoryInMiB": 65536}, + "ml.p3.16xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 131072}, + "ml.p3dn.24xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 262144}, + "ml.p2.xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 12288}, + "ml.p2.8xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 98304}, + "ml.p2.16xlarge": {"Count": 16, "TotalGpuMemoryInMiB": 196608}, + "ml.g4dn.xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.2xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.4xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.8xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.16xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.12xlarge": {"Count": 4, "TotalGpuMemoryInMiB": 65536}, + "ml.g5n.xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.2xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.4xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.8xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.16xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.12xlarge": {"Count": 4, "TotalGpuMemoryInMiB": 98304}, + "ml.g5.24xlarge": {"Count": 4, "TotalGpuMemoryInMiB": 98304}, + "ml.g5.48xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 196608} + }, + "ap-southeast-2": { + "ml.p5.48xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 655360}, + "ml.p4d.24xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 327680}, + "ml.p4de.24xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 655360}, + "ml.p3.2xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.p3.8xlarge": {"Count": 4, "TotalGpuMemoryInMiB": 65536}, + "ml.p3.16xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 131072}, + "ml.p3dn.24xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 262144}, + "ml.p2.xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 12288}, + "ml.p2.8xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 98304}, + "ml.p2.16xlarge": {"Count": 16, "TotalGpuMemoryInMiB": 196608}, + "ml.g4dn.xlarge": {"Count": 1, 
"TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.2xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.4xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.8xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.16xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.12xlarge": {"Count": 4, "TotalGpuMemoryInMiB": 65536}, + "ml.g5n.xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.2xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.4xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.8xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.16xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.12xlarge": {"Count": 4, "TotalGpuMemoryInMiB": 98304}, + "ml.g5.24xlarge": {"Count": 4, "TotalGpuMemoryInMiB": 98304}, + "ml.g5.48xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 196608} + }, + "ap-southeast-3": { + "ml.p5.48xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 655360}, + "ml.p4d.24xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 327680}, + "ml.p4de.24xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 655360}, + "ml.p3.2xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.p3.8xlarge": {"Count": 4, "TotalGpuMemoryInMiB": 65536}, + "ml.p3.16xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 131072}, + "ml.p3dn.24xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 262144}, + "ml.p2.xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 12288}, + "ml.p2.8xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 98304}, + "ml.p2.16xlarge": {"Count": 16, "TotalGpuMemoryInMiB": 196608}, + "ml.g4dn.xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.2xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.4xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.8xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.16xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.12xlarge": {"Count": 4, "TotalGpuMemoryInMiB": 65536}, + "ml.g5n.xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.2xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.4xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.8xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.16xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.12xlarge": {"Count": 4, "TotalGpuMemoryInMiB": 98304}, + "ml.g5.24xlarge": {"Count": 4, "TotalGpuMemoryInMiB": 98304}, + "ml.g5.48xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 196608} + }, + "ca-central-1": { + "ml.p5.48xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 655360}, + "ml.p4d.24xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 327680}, + "ml.p4de.24xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 655360}, + "ml.p3.2xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.p3.8xlarge": {"Count": 4, "TotalGpuMemoryInMiB": 65536}, + "ml.p3.16xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 131072}, + "ml.p3dn.24xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 262144}, + "ml.p2.xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 12288}, + "ml.p2.8xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 98304}, + "ml.p2.16xlarge": {"Count": 16, "TotalGpuMemoryInMiB": 196608}, + "ml.g4dn.xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.2xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.4xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.8xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.16xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.12xlarge": {"Count": 4, "TotalGpuMemoryInMiB": 65536}, + "ml.g5n.xlarge": {"Count": 1, 
"TotalGpuMemoryInMiB": 24576}, + "ml.g5.2xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.4xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.8xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.16xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.12xlarge": {"Count": 4, "TotalGpuMemoryInMiB": 98304}, + "ml.g5.24xlarge": {"Count": 4, "TotalGpuMemoryInMiB": 98304}, + "ml.g5.48xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 196608} + }, + "cn-north-1": { + "ml.p5.48xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 655360}, + "ml.p4d.24xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 327680}, + "ml.p4de.24xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 655360}, + "ml.p3.2xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.p3.8xlarge": {"Count": 4, "TotalGpuMemoryInMiB": 65536}, + "ml.p3.16xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 131072}, + "ml.p3dn.24xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 262144}, + "ml.p2.xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 12288}, + "ml.p2.8xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 98304}, + "ml.p2.16xlarge": {"Count": 16, "TotalGpuMemoryInMiB": 196608}, + "ml.g4dn.xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.2xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.4xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.8xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.16xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.12xlarge": {"Count": 4, "TotalGpuMemoryInMiB": 65536}, + "ml.g5n.xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.2xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.4xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.8xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.16xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.12xlarge": {"Count": 4, "TotalGpuMemoryInMiB": 98304}, + "ml.g5.24xlarge": {"Count": 4, "TotalGpuMemoryInMiB": 98304}, + "ml.g5.48xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 196608} + }, + "cn-northwest-1": { + "ml.p5.48xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 655360}, + "ml.p4d.24xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 327680}, + "ml.p4de.24xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 655360}, + "ml.p3.2xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.p3.8xlarge": {"Count": 4, "TotalGpuMemoryInMiB": 65536}, + "ml.p3.16xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 131072}, + "ml.p3dn.24xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 262144}, + "ml.p2.xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 12288}, + "ml.p2.8xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 98304}, + "ml.p2.16xlarge": {"Count": 16, "TotalGpuMemoryInMiB": 196608}, + "ml.g4dn.xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.2xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.4xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.8xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.16xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.12xlarge": {"Count": 4, "TotalGpuMemoryInMiB": 65536}, + "ml.g5n.xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.2xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.4xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.8xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.16xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.12xlarge": {"Count": 4, "TotalGpuMemoryInMiB": 98304}, + "ml.g5.24xlarge": {"Count": 4, "TotalGpuMemoryInMiB": 98304}, 
+ "ml.g5.48xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 196608} + }, + "eu-central-1": { + "ml.p5.48xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 655360}, + "ml.p4d.24xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 327680}, + "ml.p4de.24xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 655360}, + "ml.p3.2xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.p3.8xlarge": {"Count": 4, "TotalGpuMemoryInMiB": 65536}, + "ml.p3.16xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 131072}, + "ml.p3dn.24xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 262144}, + "ml.p2.xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 12288}, + "ml.p2.8xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 98304}, + "ml.p2.16xlarge": {"Count": 16, "TotalGpuMemoryInMiB": 196608}, + "ml.g4dn.xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.2xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.4xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.8xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.16xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.12xlarge": {"Count": 4, "TotalGpuMemoryInMiB": 65536}, + "ml.g5n.xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.2xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.4xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.8xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.16xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.12xlarge": {"Count": 4, "TotalGpuMemoryInMiB": 98304}, + "ml.g5.24xlarge": {"Count": 4, "TotalGpuMemoryInMiB": 98304}, + "ml.g5.48xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 196608} + }, + "eu-central-2": { + "ml.p5.48xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 655360}, + "ml.p4d.24xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 327680}, + "ml.p4de.24xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 655360}, + "ml.p3.2xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.p3.8xlarge": {"Count": 4, "TotalGpuMemoryInMiB": 65536}, + "ml.p3.16xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 131072}, + "ml.p3dn.24xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 262144}, + "ml.p2.xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 12288}, + "ml.p2.8xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 98304}, + "ml.p2.16xlarge": {"Count": 16, "TotalGpuMemoryInMiB": 196608}, + "ml.g4dn.xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.2xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.4xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.8xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.16xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.12xlarge": {"Count": 4, "TotalGpuMemoryInMiB": 65536}, + "ml.g5n.xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.2xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.4xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.8xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.16xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.12xlarge": {"Count": 4, "TotalGpuMemoryInMiB": 98304}, + "ml.g5.24xlarge": {"Count": 4, "TotalGpuMemoryInMiB": 98304}, + "ml.g5.48xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 196608} + }, + "eu-north-1": { + "ml.p5.48xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 655360}, + "ml.p4d.24xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 327680}, + "ml.p4de.24xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 655360}, + "ml.p3.2xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.p3.8xlarge": {"Count": 4, "TotalGpuMemoryInMiB": 65536}, + 
"ml.p3.16xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 131072}, + "ml.p3dn.24xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 262144}, + "ml.p2.xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 12288}, + "ml.p2.8xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 98304}, + "ml.p2.16xlarge": {"Count": 16, "TotalGpuMemoryInMiB": 196608}, + "ml.g4dn.xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.2xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.4xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.8xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.16xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.12xlarge": {"Count": 4, "TotalGpuMemoryInMiB": 65536}, + "ml.g5n.xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.2xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.4xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.8xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.16xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.12xlarge": {"Count": 4, "TotalGpuMemoryInMiB": 98304}, + "ml.g5.24xlarge": {"Count": 4, "TotalGpuMemoryInMiB": 98304}, + "ml.g5.48xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 196608} + }, + "eu-south-1": { + "ml.p5.48xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 655360}, + "ml.p4d.24xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 327680}, + "ml.p4de.24xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 655360}, + "ml.p3.2xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.p3.8xlarge": {"Count": 4, "TotalGpuMemoryInMiB": 65536}, + "ml.p3.16xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 131072}, + "ml.p3dn.24xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 262144}, + "ml.p2.xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 12288}, + "ml.p2.8xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 98304}, + "ml.p2.16xlarge": {"Count": 16, "TotalGpuMemoryInMiB": 196608}, + "ml.g4dn.xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.2xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.4xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.8xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.16xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.12xlarge": {"Count": 4, "TotalGpuMemoryInMiB": 65536}, + "ml.g5n.xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.2xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.4xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.8xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.16xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.12xlarge": {"Count": 4, "TotalGpuMemoryInMiB": 98304}, + "ml.g5.24xlarge": {"Count": 4, "TotalGpuMemoryInMiB": 98304}, + "ml.g5.48xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 196608} + }, + "eu-south-2": { + "ml.p5.48xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 655360}, + "ml.p4d.24xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 327680}, + "ml.p4de.24xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 655360}, + "ml.p3.2xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.p3.8xlarge": {"Count": 4, "TotalGpuMemoryInMiB": 65536}, + "ml.p3.16xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 131072}, + "ml.p3dn.24xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 262144}, + "ml.p2.xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 12288}, + "ml.p2.8xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 98304}, + "ml.p2.16xlarge": {"Count": 16, "TotalGpuMemoryInMiB": 196608}, + "ml.g4dn.xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.2xlarge": {"Count": 
1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.4xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.8xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.16xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.12xlarge": {"Count": 4, "TotalGpuMemoryInMiB": 65536}, + "ml.g5n.xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.2xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.4xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.8xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.16xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.12xlarge": {"Count": 4, "TotalGpuMemoryInMiB": 98304}, + "ml.g5.24xlarge": {"Count": 4, "TotalGpuMemoryInMiB": 98304}, + "ml.g5.48xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 196608} + }, + "eu-west-1": { + "ml.p5.48xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 655360}, + "ml.p4d.24xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 327680}, + "ml.p4de.24xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 655360}, + "ml.p3.2xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.p3.8xlarge": {"Count": 4, "TotalGpuMemoryInMiB": 65536}, + "ml.p3.16xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 131072}, + "ml.p3dn.24xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 262144}, + "ml.p2.xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 12288}, + "ml.p2.8xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 98304}, + "ml.p2.16xlarge": {"Count": 16, "TotalGpuMemoryInMiB": 196608}, + "ml.g4dn.xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.2xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.4xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.8xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.16xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.12xlarge": {"Count": 4, "TotalGpuMemoryInMiB": 65536}, + "ml.g5n.xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.2xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.4xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.8xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.16xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.12xlarge": {"Count": 4, "TotalGpuMemoryInMiB": 98304}, + "ml.g5.24xlarge": {"Count": 4, "TotalGpuMemoryInMiB": 98304}, + "ml.g5.48xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 196608} + }, + "eu-west-2": { + "ml.p5.48xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 655360}, + "ml.p4d.24xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 327680}, + "ml.p4de.24xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 655360}, + "ml.p3.2xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.p3.8xlarge": {"Count": 4, "TotalGpuMemoryInMiB": 65536}, + "ml.p3.16xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 131072}, + "ml.p3dn.24xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 262144}, + "ml.p2.xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 12288}, + "ml.p2.8xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 98304}, + "ml.p2.16xlarge": {"Count": 16, "TotalGpuMemoryInMiB": 196608}, + "ml.g4dn.xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.2xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.4xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.8xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.16xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.12xlarge": {"Count": 4, "TotalGpuMemoryInMiB": 65536}, + "ml.g5n.xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.2xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 
24576}, + "ml.g5.4xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.8xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.16xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.12xlarge": {"Count": 4, "TotalGpuMemoryInMiB": 98304}, + "ml.g5.24xlarge": {"Count": 4, "TotalGpuMemoryInMiB": 98304}, + "ml.g5.48xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 196608} + }, + "eu-west-3": { + "ml.p5.48xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 655360}, + "ml.p4d.24xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 327680}, + "ml.p4de.24xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 655360}, + "ml.p3.2xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.p3.8xlarge": {"Count": 4, "TotalGpuMemoryInMiB": 65536}, + "ml.p3.16xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 131072}, + "ml.p3dn.24xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 262144}, + "ml.p2.xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 12288}, + "ml.p2.8xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 98304}, + "ml.p2.16xlarge": {"Count": 16, "TotalGpuMemoryInMiB": 196608}, + "ml.g4dn.xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.2xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.4xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.8xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.16xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.12xlarge": {"Count": 4, "TotalGpuMemoryInMiB": 65536}, + "ml.g5n.xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.2xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.4xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.8xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.16xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.12xlarge": {"Count": 4, "TotalGpuMemoryInMiB": 98304}, + "ml.g5.24xlarge": {"Count": 4, "TotalGpuMemoryInMiB": 98304}, + "ml.g5.48xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 196608} + }, + "il-central-1": { + "ml.p5.48xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 655360}, + "ml.p4d.24xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 327680}, + "ml.p4de.24xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 655360}, + "ml.p3.2xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.p3.8xlarge": {"Count": 4, "TotalGpuMemoryInMiB": 65536}, + "ml.p3.16xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 131072}, + "ml.p3dn.24xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 262144}, + "ml.p2.xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 12288}, + "ml.p2.8xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 98304}, + "ml.p2.16xlarge": {"Count": 16, "TotalGpuMemoryInMiB": 196608}, + "ml.g4dn.xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.2xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.4xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.8xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.16xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.12xlarge": {"Count": 4, "TotalGpuMemoryInMiB": 65536}, + "ml.g5n.xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.2xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.4xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.8xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.16xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.12xlarge": {"Count": 4, "TotalGpuMemoryInMiB": 98304}, + "ml.g5.24xlarge": {"Count": 4, "TotalGpuMemoryInMiB": 98304}, + "ml.g5.48xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 196608} + }, + "me-central-1": { 
+ "ml.p5.48xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 655360}, + "ml.p4d.24xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 327680}, + "ml.p4de.24xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 655360}, + "ml.p3.2xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.p3.8xlarge": {"Count": 4, "TotalGpuMemoryInMiB": 65536}, + "ml.p3.16xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 131072}, + "ml.p3dn.24xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 262144}, + "ml.p2.xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 12288}, + "ml.p2.8xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 98304}, + "ml.p2.16xlarge": {"Count": 16, "TotalGpuMemoryInMiB": 196608}, + "ml.g4dn.xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.2xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.4xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.8xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.16xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.12xlarge": {"Count": 4, "TotalGpuMemoryInMiB": 65536}, + "ml.g5n.xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.2xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.4xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.8xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.16xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.12xlarge": {"Count": 4, "TotalGpuMemoryInMiB": 98304}, + "ml.g5.24xlarge": {"Count": 4, "TotalGpuMemoryInMiB": 98304}, + "ml.g5.48xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 196608} + }, + "me-south-1": { + "ml.p5.48xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 655360}, + "ml.p4d.24xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 327680}, + "ml.p4de.24xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 655360}, + "ml.p3.2xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.p3.8xlarge": {"Count": 4, "TotalGpuMemoryInMiB": 65536}, + "ml.p3.16xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 131072}, + "ml.p3dn.24xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 262144}, + "ml.p2.xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 12288}, + "ml.p2.8xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 98304}, + "ml.p2.16xlarge": {"Count": 16, "TotalGpuMemoryInMiB": 196608}, + "ml.g4dn.xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.2xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.4xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.8xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.16xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.12xlarge": {"Count": 4, "TotalGpuMemoryInMiB": 65536}, + "ml.g5n.xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.2xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.4xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.8xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.16xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.12xlarge": {"Count": 4, "TotalGpuMemoryInMiB": 98304}, + "ml.g5.24xlarge": {"Count": 4, "TotalGpuMemoryInMiB": 98304}, + "ml.g5.48xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 196608} + }, + "sa-east-1": { + "ml.p5.48xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 655360}, + "ml.p4d.24xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 327680}, + "ml.p4de.24xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 655360}, + "ml.p3.2xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.p3.8xlarge": {"Count": 4, "TotalGpuMemoryInMiB": 65536}, + "ml.p3.16xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 131072}, + "ml.p3dn.24xlarge": 
{"Count": 8, "TotalGpuMemoryInMiB": 262144}, + "ml.p2.xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 12288}, + "ml.p2.8xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 98304}, + "ml.p2.16xlarge": {"Count": 16, "TotalGpuMemoryInMiB": 196608}, + "ml.g4dn.xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.2xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.4xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.8xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.16xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.12xlarge": {"Count": 4, "TotalGpuMemoryInMiB": 65536}, + "ml.g5n.xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.2xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.4xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.8xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.16xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.12xlarge": {"Count": 4, "TotalGpuMemoryInMiB": 98304}, + "ml.g5.24xlarge": {"Count": 4, "TotalGpuMemoryInMiB": 98304}, + "ml.g5.48xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 196608} + }, + "us-east-1": { + "ml.p5.48xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 655360}, + "ml.p4d.24xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 327680}, + "ml.p4de.24xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 655360}, + "ml.p3.2xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.p3.8xlarge": {"Count": 4, "TotalGpuMemoryInMiB": 65536}, + "ml.p3.16xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 131072}, + "ml.p3dn.24xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 262144}, + "ml.p2.xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 12288}, + "ml.p2.8xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 98304}, + "ml.p2.16xlarge": {"Count": 16, "TotalGpuMemoryInMiB": 196608}, + "ml.g4dn.xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.2xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.4xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.8xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.16xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.12xlarge": {"Count": 4, "TotalGpuMemoryInMiB": 65536}, + "ml.g5n.xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.2xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.4xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.8xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.16xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.12xlarge": {"Count": 4, "TotalGpuMemoryInMiB": 98304}, + "ml.g5.24xlarge": {"Count": 4, "TotalGpuMemoryInMiB": 98304}, + "ml.g5.48xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 196608} + }, + "us-east-2": { + "ml.p5.48xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 655360}, + "ml.p4d.24xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 327680}, + "ml.p4de.24xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 655360}, + "ml.p3.2xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.p3.8xlarge": {"Count": 4, "TotalGpuMemoryInMiB": 65536}, + "ml.p3.16xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 131072}, + "ml.p3dn.24xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 262144}, + "ml.p2.xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 12288}, + "ml.p2.8xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 98304}, + "ml.p2.16xlarge": {"Count": 16, "TotalGpuMemoryInMiB": 196608}, + "ml.g4dn.xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.2xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.4xlarge": {"Count": 1, 
"TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.8xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.16xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.12xlarge": {"Count": 4, "TotalGpuMemoryInMiB": 65536}, + "ml.g5n.xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.2xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.4xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.8xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.16xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.12xlarge": {"Count": 4, "TotalGpuMemoryInMiB": 98304}, + "ml.g5.24xlarge": {"Count": 4, "TotalGpuMemoryInMiB": 98304}, + "ml.g5.48xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 196608} + }, + "us-gov-east-1": { + "ml.p5.48xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 655360}, + "ml.p4d.24xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 327680}, + "ml.p4de.24xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 655360}, + "ml.p3.2xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.p3.8xlarge": {"Count": 4, "TotalGpuMemoryInMiB": 65536}, + "ml.p3.16xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 131072}, + "ml.p3dn.24xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 262144}, + "ml.p2.xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 12288}, + "ml.p2.8xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 98304}, + "ml.p2.16xlarge": {"Count": 16, "TotalGpuMemoryInMiB": 196608}, + "ml.g4dn.xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.2xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.4xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.8xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.16xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.12xlarge": {"Count": 4, "TotalGpuMemoryInMiB": 65536}, + "ml.g5n.xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.2xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.4xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.8xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.16xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.12xlarge": {"Count": 4, "TotalGpuMemoryInMiB": 98304}, + "ml.g5.24xlarge": {"Count": 4, "TotalGpuMemoryInMiB": 98304}, + "ml.g5.48xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 196608} + }, + "us-gov-west-1": { + "ml.p5.48xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 655360}, + "ml.p4d.24xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 327680}, + "ml.p4de.24xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 655360}, + "ml.p3.2xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.p3.8xlarge": {"Count": 4, "TotalGpuMemoryInMiB": 65536}, + "ml.p3.16xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 131072}, + "ml.p3dn.24xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 262144}, + "ml.p2.xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 12288}, + "ml.p2.8xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 98304}, + "ml.p2.16xlarge": {"Count": 16, "TotalGpuMemoryInMiB": 196608}, + "ml.g4dn.xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.2xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.4xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.8xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.16xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.12xlarge": {"Count": 4, "TotalGpuMemoryInMiB": 65536}, + "ml.g5n.xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.2xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.4xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 
24576}, + "ml.g5.8xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.16xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.12xlarge": {"Count": 4, "TotalGpuMemoryInMiB": 98304}, + "ml.g5.24xlarge": {"Count": 4, "TotalGpuMemoryInMiB": 98304}, + "ml.g5.48xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 196608} + }, + "us-west-1": { + "ml.p5.48xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 655360}, + "ml.p4d.24xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 327680}, + "ml.p4de.24xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 655360}, + "ml.p3.2xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.p3.8xlarge": {"Count": 4, "TotalGpuMemoryInMiB": 65536}, + "ml.p3.16xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 131072}, + "ml.p3dn.24xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 262144}, + "ml.p2.xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 12288}, + "ml.p2.8xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 98304}, + "ml.p2.16xlarge": {"Count": 16, "TotalGpuMemoryInMiB": 196608}, + "ml.g4dn.xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.2xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.4xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.8xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.16xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.12xlarge": {"Count": 4, "TotalGpuMemoryInMiB": 65536}, + "ml.g5n.xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.2xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.4xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.8xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.16xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.12xlarge": {"Count": 4, "TotalGpuMemoryInMiB": 98304}, + "ml.g5.24xlarge": {"Count": 4, "TotalGpuMemoryInMiB": 98304}, + "ml.g5.48xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 196608} + }, + "us-west-2": { + "ml.p5.48xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 655360}, + "ml.p4d.24xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 327680}, + "ml.p4de.24xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 655360}, + "ml.p3.2xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.p3.8xlarge": {"Count": 4, "TotalGpuMemoryInMiB": 65536}, + "ml.p3.16xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 131072}, + "ml.p3dn.24xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 262144}, + "ml.p2.xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 12288}, + "ml.p2.8xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 98304}, + "ml.p2.16xlarge": {"Count": 16, "TotalGpuMemoryInMiB": 196608}, + "ml.g4dn.xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.2xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.4xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.8xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.16xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 16384}, + "ml.g4dn.12xlarge": {"Count": 4, "TotalGpuMemoryInMiB": 65536}, + "ml.g5n.xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.2xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.4xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.8xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.16xlarge": {"Count": 1, "TotalGpuMemoryInMiB": 24576}, + "ml.g5.12xlarge": {"Count": 4, "TotalGpuMemoryInMiB": 98304}, + "ml.g5.24xlarge": {"Count": 4, "TotalGpuMemoryInMiB": 98304}, + "ml.g5.48xlarge": {"Count": 8, "TotalGpuMemoryInMiB": 196608} + } +} \ No newline at end of file diff --git a/src/sagemaker/instance_types_gpu_info.py 
b/src/sagemaker/instance_types_gpu_info.py new file mode 100644 index 0000000000..41566c6d32 --- /dev/null +++ b/src/sagemaker/instance_types_gpu_info.py @@ -0,0 +1,43 @@ +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You +# may not use this file except in compliance with the License. A copy of +# the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. This file is +# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific +# language governing permissions and limitations under the License. +"""Accessors to retrieve instance types GPU info.""" +from __future__ import absolute_import + +import json +import os +from typing import Dict + + +def retrieve(region: str) -> Dict[str, Dict[str, int]]: + """Retrieves instance types GPU info of the given region. + + Args: + region (str): The AWS region. + + Returns: + dict[str, dict[str, int]]: A dictionary that contains instance types as keys + and GPU info as values or empty dictionary if the + config for the given region is not found. + + Raises: + ValueError: If no config found. + """ + config_path = os.path.join( + os.path.dirname(__file__), "image_uri_config", "instance_gpu_info.json" + ) + try: + with open(config_path) as f: + instance_types_gpu_info_config = json.load(f) + return instance_types_gpu_info_config.get(region, {}) + except FileNotFoundError: + raise ValueError("Could not find instance types gpu info.") diff --git a/src/sagemaker/serve/utils/hardware_detector.py b/src/sagemaker/serve/utils/hardware_detector.py new file mode 100644 index 0000000000..632149dc8f --- /dev/null +++ b/src/sagemaker/serve/utils/hardware_detector.py @@ -0,0 +1,110 @@ +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You +# may not use this file except in compliance with the License. A copy of +# the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. This file is +# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific +# language governing permissions and limitations under the License. +"""Utilities for detecting available GPUs and Aggregate GPU Memory size of an instance""" +from __future__ import absolute_import + +import logging +from typing import Tuple + +from botocore.exceptions import ClientError + +from sagemaker import Session +from sagemaker import instance_types_gpu_info + +logger = logging.getLogger(__name__) + + +def _get_gpu_info(instance_type: str, session: Session) -> Tuple[int, int]: + """Get GPU info for the provided instance + + Args: + instance_type (str) + session: The session to use. + + Returns: tuple[int, int]: A tuple that contains number of GPUs available at index 0, + and aggregate memory size in MiB at index 1. + + Raises: + ValueError: If The given instance type does not exist or GPU is not enabled. 
+ """ + ec2_client = session.boto_session.client("ec2") + ec2_instance = _format_instance_type(instance_type) + + try: + instance_info = ec2_client.describe_instance_types(InstanceTypes=[ec2_instance]).get( + "InstanceTypes" + )[0] + except ClientError: + raise ValueError(f"Provided instance_type is not GPU enabled: [{ec2_instance}]") + + if instance_info is not None: + gpus_info = instance_info.get("GpuInfo") + if gpus_info is not None: + gpus = gpus_info.get("Gpus") + if gpus is not None and len(gpus) > 0: + count = gpus[0].get("Count") + total_gpu_memory_in_mib = gpus_info.get("TotalGpuMemoryInMiB") + if count and total_gpu_memory_in_mib: + instance_gpu_info = ( + count, + total_gpu_memory_in_mib, + ) + logger.info("GPU Info [%s]: %s", ec2_instance, instance_gpu_info) + return instance_gpu_info + + raise ValueError(f"Provided instance_type is not GPU enabled: [{ec2_instance}]") + + +def _get_gpu_info_fallback(instance_type: str, region: str) -> Tuple[int, int]: + """Get GPU info for the provided instance type from the config + + Args: + instance_type (str): + region: The AWS region. + + Returns: tuple[int, int]: A tuple that contains number of GPUs available at index 0, + and aggregate memory size in MiB at index 1. + + Raises: + ValueError: If the given instance type does not exist. + """ + instance_types_gpu_info_config = instance_types_gpu_info.retrieve(region) + fallback_instance_gpu_info = instance_types_gpu_info_config.get(instance_type) + + ec2_instance = _format_instance_type(instance_type) + if fallback_instance_gpu_info is None: + raise ValueError(f"Provided instance_type is not GPU enabled: [{ec2_instance}]") + + fallback_instance_gpu_info = ( + fallback_instance_gpu_info.get("Count"), + fallback_instance_gpu_info.get("TotalGpuMemoryInMiB"), + ) + logger.info("GPU Info [%s]: %s", ec2_instance, fallback_instance_gpu_info) + return fallback_instance_gpu_info + + +def _format_instance_type(instance_type: str) -> str: + """Formats provided instance type name + + Args: + instance_type (str): + + Returns: formatted instance type. + """ + split_instance = instance_type.split(".") + + if len(split_instance) > 2: + split_instance.pop(0) + + ec2_instance = ".".join(split_instance) + return ec2_instance diff --git a/tests/integ/sagemaker/serve/utils/test_hardware_detector.py b/tests/integ/sagemaker/serve/utils/test_hardware_detector.py new file mode 100644 index 0000000000..9102927c55 --- /dev/null +++ b/tests/integ/sagemaker/serve/utils/test_hardware_detector.py @@ -0,0 +1,44 @@ +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You +# may not use this file except in compliance with the License. A copy of +# the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. This file is +# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific +# language governing permissions and limitations under the License.
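To illustrate how the two lookups in hardware_detector.py are meant to compose, here is a minimal sketch (not part of the patch; gpu_info_with_fallback is a hypothetical wrapper and assumes a configured sagemaker Session with permission to call DescribeInstanceTypes):

from typing import Tuple

from sagemaker import Session
from sagemaker.serve.utils import hardware_detector


def gpu_info_with_fallback(instance_type: str, session: Session) -> Tuple[int, int]:
    """Try the live EC2 lookup first, then the bundled instance_gpu_info.json config."""
    try:
        return hardware_detector._get_gpu_info(instance_type, session)
    except ValueError:
        # The fallback reads the static config shipped with the SDK, keyed by region.
        return hardware_detector._get_gpu_info_fallback(
            instance_type, session.boto_region_name
        )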
+from __future__ import absolute_import + +import pytest + +from sagemaker.serve.utils import hardware_detector + +REGION = "us-west-2" +VALID_INSTANCE_TYPE = "ml.g5.48xlarge" +INVALID_INSTANCE_TYPE = "fl.c5.57xxlarge" +EXPECTED_INSTANCE_GPU_INFO = (8, 196608) + + +def test_get_gpu_info_success(sagemaker_session): + gpu_info = hardware_detector._get_gpu_info(VALID_INSTANCE_TYPE, sagemaker_session) + + assert gpu_info == EXPECTED_INSTANCE_GPU_INFO + + +def test_get_gpu_info_throws(sagemaker_session): + with pytest.raises(ValueError): + hardware_detector._get_gpu_info(INVALID_INSTANCE_TYPE, sagemaker_session) + + +def test_get_gpu_info_fallback_success(): + gpu_info = hardware_detector._get_gpu_info_fallback(VALID_INSTANCE_TYPE, REGION) + + assert gpu_info == EXPECTED_INSTANCE_GPU_INFO + + +def test_get_gpu_info_fallback_throws(): + with pytest.raises(ValueError): + hardware_detector._get_gpu_info_fallback(INVALID_INSTANCE_TYPE, REGION) diff --git a/tests/unit/sagemaker/serve/utils/test_hardware_detector.py b/tests/unit/sagemaker/serve/utils/test_hardware_detector.py new file mode 100644 index 0000000000..5ec493de72 --- /dev/null +++ b/tests/unit/sagemaker/serve/utils/test_hardware_detector.py @@ -0,0 +1,98 @@ +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You +# may not use this file except in compliance with the License. A copy of +# the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. This file is +# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific +# language governing permissions and limitations under the License. 
+from __future__ import absolute_import + +from botocore.exceptions import ClientError +import pytest + +from sagemaker.serve.utils import hardware_detector + +REGION = "us-west-2" +VALID_INSTANCE_TYPE = "ml.g5.48xlarge" +INVALID_INSTANCE_TYPE = "fl.c5.57xxlarge" +EXPECTED_INSTANCE_GPU_INFO = (8, 196608) + + +def test_get_gpu_info_success(sagemaker_session, boto_session): + boto_session.client("ec2").describe_instance_types.return_value = { + "InstanceTypes": [ + { + "GpuInfo": { + "Gpus": [ + { + "Name": "A10G", + "Manufacturer": "NVIDIA", + "Count": 8, + "MemoryInfo": {"SizeInMiB": 24576}, + } + ], + "TotalGpuMemoryInMiB": 196608, + }, + } + ] + } + + instance_gpu_info = hardware_detector._get_gpu_info(VALID_INSTANCE_TYPE, sagemaker_session) + + boto_session.client("ec2").describe_instance_types.assert_called_once_with( + InstanceTypes=["g5.48xlarge"] + ) + assert instance_gpu_info == EXPECTED_INSTANCE_GPU_INFO + + +def test_get_gpu_info_throws(sagemaker_session, boto_session): + boto_session.client("ec2").describe_instance_types.return_value = {"InstanceTypes": [{}]} + + with pytest.raises(ValueError): + hardware_detector._get_gpu_info(INVALID_INSTANCE_TYPE, sagemaker_session) + + +def test_get_gpu_info_describe_instance_types_throws(sagemaker_session, boto_session): + boto_session.client("ec2").describe_instance_types.side_effect = ClientError( + { + "Error": { + "Code": "InvalidInstanceType", + "Message": f"An error occurred (InvalidInstanceType) when calling the DescribeInstanceTypes " + f"operation: The following supplied instance types do not exist: [{INVALID_INSTANCE_TYPE}]", + } + }, + "DescribeInstanceTypes", + ) + + with pytest.raises(ValueError): + hardware_detector._get_gpu_info(INVALID_INSTANCE_TYPE, sagemaker_session) + + +def test_get_gpu_info_fallback_success(): + fallback_instance_gpu_info = hardware_detector._get_gpu_info_fallback( + VALID_INSTANCE_TYPE, REGION + ) + + assert fallback_instance_gpu_info == EXPECTED_INSTANCE_GPU_INFO + + +def test_get_gpu_info_fallback_throws(): + with pytest.raises(ValueError): + hardware_detector._get_gpu_info_fallback(INVALID_INSTANCE_TYPE, REGION) + + +def test_format_instance_type_success(): + formatted_instance_type = hardware_detector._format_instance_type(VALID_INSTANCE_TYPE) + + assert formatted_instance_type == "g5.48xlarge" + + +def test_format_instance_type_without_ml_success(): + formatted_instance_type = hardware_detector._format_instance_type("g5.48xlarge") + + assert formatted_instance_type == "g5.48xlarge" diff --git a/tests/unit/sagemaker/test_instance_types_gpu_info.py b/tests/unit/sagemaker/test_instance_types_gpu_info.py new file mode 100644 index 0000000000..d91cb43e51 --- /dev/null +++ b/tests/unit/sagemaker/test_instance_types_gpu_info.py @@ -0,0 +1,30 @@ +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You +# may not use this file except in compliance with the License. A copy of +# the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. This file is +# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific +# language governing permissions and limitations under the License. 
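For reference, a minimal usage sketch of the new accessor (the example values come from the us-west-2 entries of instance_gpu_info.json above):

from sagemaker import instance_types_gpu_info

gpu_info = instance_types_gpu_info.retrieve("us-west-2")
# Each entry maps an instance type to its GPU count and aggregate memory, e.g.
# gpu_info["ml.g5.48xlarge"] == {"Count": 8, "TotalGpuMemoryInMiB": 196608}
# An unknown region returns an empty dict rather than raising.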
+from __future__ import absolute_import + +from sagemaker import instance_types_gpu_info + +REGION = "us-west-2" +INVALID_REGION = "invalid-region" + + +def test_retrieve_success(): + data = instance_types_gpu_info.retrieve(REGION) + + assert len(data) > 0 + + +def test_retrieve_throws(): + data = instance_types_gpu_info.retrieve(INVALID_REGION) + + assert len(data) == 0 From 44bbe1347a7a357fe369adaca4ec585f83fd128a Mon Sep 17 00:00:00 2001 From: Keshav Chandak Date: Wed, 31 Jan 2024 01:08:38 +0530 Subject: [PATCH 69/76] fix: fixed create monitoring schedule failing after validation error (#4385) Co-authored-by: Keshav Chandak --- .../model_monitor/clarify_model_monitoring.py | 2 + .../model_monitor/model_monitoring.py | 54 ++++++------ tests/integ/test_model_monitor.py | 82 +++++++++++++++++++ 3 files changed, 114 insertions(+), 24 deletions(-) diff --git a/src/sagemaker/model_monitor/clarify_model_monitoring.py b/src/sagemaker/model_monitor/clarify_model_monitoring.py index 77f27b37f0..3edfabc747 100644 --- a/src/sagemaker/model_monitor/clarify_model_monitoring.py +++ b/src/sagemaker/model_monitor/clarify_model_monitoring.py @@ -669,6 +669,7 @@ def create_monitoring_schedule( self.monitoring_schedule_name = monitor_schedule_name except Exception: logger.exception("Failed to create monitoring schedule.") + self.monitoring_schedule_name = None # noinspection PyBroadException try: self.sagemaker_session.sagemaker_client.delete_model_bias_job_definition( @@ -1109,6 +1110,7 @@ def create_monitoring_schedule( self.monitoring_schedule_name = monitor_schedule_name except Exception: logger.exception("Failed to create monitoring schedule.") + self.monitoring_schedule_name = None # noinspection PyBroadException try: self.sagemaker_session.sagemaker_client.delete_model_explainability_job_definition( diff --git a/src/sagemaker/model_monitor/model_monitoring.py b/src/sagemaker/model_monitor/model_monitoring.py index 2800082df4..c313efcf5e 100644 --- a/src/sagemaker/model_monitor/model_monitoring.py +++ b/src/sagemaker/model_monitor/model_monitoring.py @@ -415,30 +415,34 @@ def create_monitoring_schedule( if arguments is not None: self.arguments = arguments - self.sagemaker_session.create_monitoring_schedule( - monitoring_schedule_name=self.monitoring_schedule_name, - schedule_expression=schedule_cron_expression, - statistics_s3_uri=statistics_s3_uri, - constraints_s3_uri=constraints_s3_uri, - monitoring_inputs=[normalized_monitoring_input], - monitoring_output_config=monitoring_output_config, - instance_count=self.instance_count, - instance_type=self.instance_type, - volume_size_in_gb=self.volume_size_in_gb, - volume_kms_key=self.volume_kms_key, - image_uri=self.image_uri, - entrypoint=self.entrypoint, - arguments=self.arguments, - record_preprocessor_source_uri=None, - post_analytics_processor_source_uri=None, - max_runtime_in_seconds=self.max_runtime_in_seconds, - environment=self.env, - network_config=network_config_dict, - role_arn=self.sagemaker_session.expand_role(self.role), - tags=self.tags, - data_analysis_start_time=data_analysis_start_time, - data_analysis_end_time=data_analysis_end_time, - ) + try: + self.sagemaker_session.create_monitoring_schedule( + monitoring_schedule_name=self.monitoring_schedule_name, + schedule_expression=schedule_cron_expression, + statistics_s3_uri=statistics_s3_uri, + constraints_s3_uri=constraints_s3_uri, + monitoring_inputs=[normalized_monitoring_input], + monitoring_output_config=monitoring_output_config, + instance_count=self.instance_count, + 
instance_type=self.instance_type, + volume_size_in_gb=self.volume_size_in_gb, + volume_kms_key=self.volume_kms_key, + image_uri=self.image_uri, + entrypoint=self.entrypoint, + arguments=self.arguments, + record_preprocessor_source_uri=None, + post_analytics_processor_source_uri=None, + max_runtime_in_seconds=self.max_runtime_in_seconds, + environment=self.env, + network_config=network_config_dict, + role_arn=self.sagemaker_session.expand_role(self.role), + tags=self.tags, + data_analysis_start_time=data_analysis_start_time, + data_analysis_end_time=data_analysis_end_time, + ) + except Exception: + self.monitoring_schedule_name = None + raise def update_monitoring_schedule( self, @@ -2054,6 +2058,7 @@ def create_monitoring_schedule( self.monitoring_schedule_name = monitor_schedule_name except Exception: logger.exception("Failed to create monitoring schedule.") + self.monitoring_schedule_name = None # noinspection PyBroadException try: self.sagemaker_session.sagemaker_client.delete_data_quality_job_definition( @@ -3173,6 +3178,7 @@ def create_monitoring_schedule( self.monitoring_schedule_name = monitor_schedule_name except Exception: logger.exception("Failed to create monitoring schedule.") + self.monitoring_schedule_name = None # noinspection PyBroadException try: self.sagemaker_session.sagemaker_client.delete_model_quality_job_definition( diff --git a/tests/integ/test_model_monitor.py b/tests/integ/test_model_monitor.py index 1a11b08f7d..17ea70699b 100644 --- a/tests/integ/test_model_monitor.py +++ b/tests/integ/test_model_monitor.py @@ -2488,3 +2488,85 @@ def test_one_time_monitoring_schedule(sagemaker_session): my_default_monitor.stop_monitoring_schedule() my_default_monitor.delete_monitoring_schedule() raise e + + +def test_create_monitoring_schedule_with_validation_error(sagemaker_session): + my_default_monitor = DefaultModelMonitor( + role=ROLE, + instance_count=INSTANCE_COUNT, + instance_type=INSTANCE_TYPE, + volume_size_in_gb=VOLUME_SIZE_IN_GB, + max_runtime_in_seconds=MAX_RUNTIME_IN_SECONDS, + sagemaker_session=sagemaker_session, + env=ENVIRONMENT, + tags=TAGS, + network_config=NETWORK_CONFIG, + ) + + output_s3_uri = os.path.join( + "s3://", + sagemaker_session.default_bucket(), + "integ-test-monitoring-output-bucket", + str(uuid.uuid4()), + ) + + data_captured_destination_s3_uri = os.path.join( + "s3://", + sagemaker_session.default_bucket(), + "sagemaker-serving-batch-transform", + str(uuid.uuid4()), + ) + + batch_transform_input = BatchTransformInput( + data_captured_destination_s3_uri=data_captured_destination_s3_uri, + destination="/opt/ml/processing/output", + dataset_format=MonitoringDatasetFormat.csv(header=False), + ) + + statistics = Statistics.from_file_path( + statistics_file_path=os.path.join(tests.integ.DATA_DIR, "monitor/statistics.json"), + sagemaker_session=sagemaker_session, + ) + + constraints = Constraints.from_file_path( + constraints_file_path=os.path.join(tests.integ.DATA_DIR, "monitor/constraints.json"), + sagemaker_session=sagemaker_session, + ) + + try: + my_default_monitor.create_monitoring_schedule( + monitor_schedule_name="schedule-name-more-than-63-characters-to-get-a-validation-exception", + batch_transform_input=batch_transform_input, + output_s3_uri=output_s3_uri, + statistics=statistics, + constraints=constraints, + schedule_cron_expression=CronExpressionGenerator.now(), + data_analysis_start_time="-PT1H", + data_analysis_end_time="-PT0H", + enable_cloudwatch_metrics=ENABLE_CLOUDWATCH_METRICS, + ) + except Exception as e: + assert 
"ValidationException" in str(e) + + my_default_monitor.create_monitoring_schedule( + monitor_schedule_name=unique_name_from_base("valid-schedule-name"), + batch_transform_input=batch_transform_input, + output_s3_uri=output_s3_uri, + statistics=statistics, + constraints=constraints, + schedule_cron_expression=CronExpressionGenerator.now(), + data_analysis_start_time="-PT1H", + data_analysis_end_time="-PT0H", + enable_cloudwatch_metrics=ENABLE_CLOUDWATCH_METRICS, + ) + try: + + _wait_for_schedule_changes_to_apply(monitor=my_default_monitor) + + my_default_monitor.stop_monitoring_schedule() + my_default_monitor.delete_monitoring_schedule() + + except Exception as e: + my_default_monitor.stop_monitoring_schedule() + my_default_monitor.delete_monitoring_schedule() + raise e From af9c2ab37fe30df2df03c508f309481726ca979b Mon Sep 17 00:00:00 2001 From: Suryansh Singh Date: Fri, 2 Feb 2024 19:57:04 -0800 Subject: [PATCH 70/76] Add collection type support for Feaure Group Ingestion. Add TargetStores support for PutRecord and Ingestion. --- doc/api/prep_data/feature_store.rst | 4 + src/sagemaker/feature_store/feature_group.py | 382 +++++++--- src/sagemaker/feature_store/inputs.py | 11 + src/sagemaker/session.py | 31 +- tests/integ/test_feature_store.py | 680 +++++++++++++++++- .../feature_store/test_feature_group.py | 673 ++++++++++++++++- tests/unit/test_session.py | 52 ++ 7 files changed, 1707 insertions(+), 126 deletions(-) diff --git a/doc/api/prep_data/feature_store.rst b/doc/api/prep_data/feature_store.rst index 278574e400..50a10c5089 100644 --- a/doc/api/prep_data/feature_store.rst +++ b/doc/api/prep_data/feature_store.rst @@ -91,6 +91,10 @@ Inputs :members: :show-inheritance: +.. autoclass:: sagemaker.feature_store.inputs.TtlDuration + :members: + :show-inheritance: + .. autoclass:: sagemaker.feature_store.inputs.S3StorageConfig :members: :show-inheritance: diff --git a/src/sagemaker/feature_store/feature_group.py b/src/sagemaker/feature_store/feature_group.py index 9ffb0ea9da..07f7e29b6c 100644 --- a/src/sagemaker/feature_store/feature_group.py +++ b/src/sagemaker/feature_store/feature_group.py @@ -19,7 +19,7 @@ list feature groups APIs can be used to manage feature groups. 
""" -from __future__ import absolute_import +from __future__ import absolute_import, annotations import copy import logging @@ -28,14 +28,15 @@ import tempfile from concurrent.futures import as_completed from concurrent.futures import ThreadPoolExecutor -from typing import Optional, Sequence, List, Dict, Any, Union +from typing import Optional, Sequence, List, Dict, Any, Union, Iterable from urllib.parse import urlparse from multiprocessing.pool import AsyncResult import signal import attr import pandas as pd -from pandas import DataFrame +from pandas import DataFrame, Series +from pandas.api.types import is_list_like import boto3 from botocore.config import Config @@ -50,6 +51,7 @@ from sagemaker.feature_store.feature_definition import ( FeatureDefinition, FeatureTypeEnum, + ListCollectionType, ) from sagemaker.feature_store.inputs import ( OnlineStoreConfig, @@ -66,6 +68,7 @@ OnlineStoreStorageTypeEnum, ThroughputConfig, ThroughputConfigUpdate, + TargetStoreEnum, ) from sagemaker.utils import resolve_value_from_config, format_tags, Tags @@ -95,7 +98,7 @@ class AthenaQuery: _result_file_prefix: str = attr.ib(init=False, default=None) def run( - self, query_string: str, output_location: str, kms_key: str = None, workgroup: str = None + self, query_string: str, output_location: str, kms_key: str = None, workgroup: str = None ) -> str: """Execute a SQL query given a query string, output location and kms key. @@ -182,6 +185,9 @@ class IngestionManagerPandas: Attributes: feature_group_name (str): name of the Feature Group. + feature_definitions (Dict[str, FeatureDefinition]): dictionary of feature definitions. + where the key is the feature name and the value is the FeatureDefinition. + The FeatureDefinition contains the data type of the feature. sagemaker_fs_runtime_client_config (Config): instance of the Config class for boto calls. sagemaker_session (Session): session instance to perform boto calls. @@ -194,6 +200,7 @@ class IngestionManagerPandas: """ feature_group_name: str = attr.ib() + feature_definitions: Dict[str, Dict[Any, Any]] = attr.ib() sagemaker_fs_runtime_client_config: Config = attr.ib(default=None) sagemaker_session: Session = attr.ib(default=None) max_workers: int = attr.ib(default=1) @@ -205,22 +212,28 @@ class IngestionManagerPandas: @staticmethod def _ingest_single_batch( - data_frame: DataFrame, - feature_group_name: str, - client_config: Config, - start_index: int, - end_index: int, - profile_name: str = None, + data_frame: DataFrame, + feature_group_name: str, + feature_definitions: Dict[str, FeatureDefinition], + client_config: Config, + start_index: int, + end_index: int, + target_stores: Sequence[TargetStoreEnum] = None, + profile_name: str = None, ) -> List[int]: """Ingest a single batch of DataFrame rows into FeatureStore. Args: data_frame (DataFrame): source DataFrame to be ingested. feature_group_name (str): name of the Feature Group. + feature_definitions (Dict[str, FeatureDefinition]): dictionary of feature definitions. + where the key is the feature name and the value is the FeatureDefinition. + The FeatureDefinition contains the data type of the feature. client_config (Config): Configuration for the sagemaker feature store runtime client to perform boto calls. start_index (int): starting position to ingest in this batch. end_index (int): ending position to ingest in this batch. + target_stores (Sequence[TargetStoreEnum]): stores to be used for ingestion. profile_name (str): the profile credential should be used for ``PutRecord`` (default: None). 
@@ -240,8 +253,10 @@ def _ingest_single_batch( for row in data_frame[start_index:end_index].itertuples(): IngestionManagerPandas._ingest_row( data_frame=data_frame, + target_stores=target_stores, row=row, feature_group_name=feature_group_name, + feature_definitions=feature_definitions, sagemaker_fs_runtime_client=sagemaker_fs_runtime_client, failed_rows=failed_rows, ) @@ -288,47 +303,127 @@ def wait(self, timeout=None): @staticmethod def _ingest_row( - data_frame: DataFrame, - row: int, - feature_group_name: str, - sagemaker_fs_runtime_client: Session, - failed_rows: List[int], + data_frame: DataFrame, + row: Iterable[tuple[Any, ...]], + feature_group_name: str, + feature_definitions: Dict[str, Dict[Any, Any]], + sagemaker_fs_runtime_client: Session, + failed_rows: List[int], + target_stores: Sequence[TargetStoreEnum] = None, ): """Ingest a single Dataframe row into FeatureStore. Args: data_frame (DataFrame): source DataFrame to be ingested. - row (int): current row that is being ingested + row (Iterable[tuple[Any, ...]]): current row that is being ingested feature_group_name (str): name of the Feature Group. - sagemaker_featurestore_runtime_client (Session): session instance to perform boto calls. + feature_definitions (Dict[str, FeatureDefinition]): dictionary of feature definitions. + where the key is the feature name and the value is the FeatureDefinition. + The FeatureDefinition contains the data type of the feature. + sagemaker_fs_runtime_client (Session): session instance to perform boto calls. failed_rows (List[int]): list of indices from the data frame for which ingestion failed. + target_stores (Sequence[TargetStoreEnum]): stores to be used for ingestion. Returns: int of row indices that failed to be ingested. """ - record = [ - FeatureValue( - feature_name=data_frame.columns[index - 1], - value_as_string=str(row[index]), - ) - for index in range(1, len(row)) - if pd.notna(row[index]) - ] try: - sagemaker_fs_runtime_client.put_record( - FeatureGroupName=feature_group_name, - Record=[value.to_dict() for value in record], - ) + record = [ + FeatureValue( + feature_name=data_frame.columns[index - 1], + value_as_string_list=IngestionManagerPandas._covert_feature_value_to_string_list(row[index]), + ) if IngestionManagerPandas._is_feature_collection_type( + feature_name=data_frame.columns[index - 1], feature_definitions=feature_definitions) + else FeatureValue( + feature_name=data_frame.columns[index - 1], + value_as_string=str(row[index])) + for index in range(1, len(row)) + if IngestionManagerPandas._feature_value_is_not_none(feature_value=row[index]) + ] + + put_record_params = { + 'FeatureGroupName': feature_group_name, + 'Record': [value.to_dict() for value in record], + } + if target_stores: + put_record_params['TargetStores'] = [target_store.value for target_store in target_stores] + + sagemaker_fs_runtime_client.put_record(**put_record_params) except Exception as e: # pylint: disable=broad-except logger.error("Failed to ingest row %d: %s", row[0], e) failed_rows.append(row[0]) - def _run_single_process_single_thread(self, data_frame: DataFrame): - """Ingest a utilizing single process and single thread. + @staticmethod + def _is_feature_collection_type(feature_name: str, feature_definitions: Dict[str, Dict[Any, Any]]): + """Check if the feature is a collection type. + + Args: + feature_name (str): name of the feature. + feature_definitions (Dict[str, FeatureDefinition]): dictionary of feature definitions. 
+ where the key is the feature name and the value is the FeatureDefinition. + The FeatureDefinition contains the data type of the feature and the type of collection. + If the feature is not a collection type, the value of the CollectionType attribute + is None. + + Returns: + bool: True if the feature is a collection type, False otherwise. + """ + feature_definition = feature_definitions.get(feature_name) + if feature_definition is not None: + return feature_definition.get('CollectionType') is not None + + @staticmethod + def _feature_value_is_not_none( + feature_value: Any, + ): + """Check if the feature value is not None. + + For Collection Type feature, we want to keep this check simple, where if the value is not None, + we convert and pass it to PutRecord, instead of relying on Pandas.notna(obj).all(). + + Also, we don't want to skip the collection attribute with partial None values, when calling PutRecord. Since, + vector value can have some dimensions as None. Instead, we want to let PutRecord either accept or fail the + entire record based on the service side implementation. As of this change the service fails any partial None + collection types. + + For the Scalar values (non Collection) we want to still use pd.notna() to keep the behavior same. + + Args: + feature_value (Any): feature value. + + Returns: + bool: True if the feature value is not None, False otherwise. + """ + if not is_list_like(feature_value): + return pd.notna(feature_value) + return feature_value + + @staticmethod + def _covert_feature_value_to_string_list(feature_value: List[Any]): + """Convert a list of feature values to a list of strings. + + Args: + feature_value (List[Any]): list of feature values. + + Returns: + List[str]: list of strings. + """ + if not is_list_like(feature_value): + raise ValueError(f"Invalid feature value, feature value: {feature_value} for a collection type feature" + f" must be an Array, but instead was {type(feature_value)}") + return [ + str(value) if value is not None else None + for value in feature_value + ] + + def _run_single_process_single_thread(self, data_frame: DataFrame, target_stores: Sequence[TargetStoreEnum] = None): + """Ingest utilizing a single process and a single thread. Args: data_frame (DataFrame): source DataFrame to be ingested. + target_stores (Sequence[TargetStoreEnum]): target stores to ingest to. + If not specified, ingest to both online and offline stores. """ logger.info("Started ingesting index %d to %d") failed_rows = list() @@ -336,8 +431,10 @@ def _run_single_process_single_thread(self, data_frame: DataFrame): for row in data_frame.itertuples(): IngestionManagerPandas._ingest_row( data_frame=data_frame, + target_stores=target_stores, row=row, feature_group_name=self.feature_group_name, + feature_definitions=self.feature_definitions, sagemaker_fs_runtime_client=sagemaker_fs_runtime_client, failed_rows=failed_rows, ) @@ -349,11 +446,19 @@ def _run_single_process_single_thread(self, data_frame: DataFrame): f"Failed to ingest some data into FeatureGroup {self.feature_group_name}", ) - def _run_multi_process(self, data_frame: DataFrame, wait=True, timeout=None): + def _run_multi_process( + self, + data_frame: DataFrame, + target_stores: Sequence[TargetStoreEnum] = None, + wait=True, + timeout=None + ): """Start the ingestion process with the specified number of processes. Args: data_frame (DataFrame): source DataFrame to be ingested. + target_stores (Sequence[TargetStoreEnum]): target stores to ingest to. 
+ If not specified, ingest to both online and offline stores. wait (bool): whether to wait for the ingestion to finish or not. timeout (Union[int, float]): ``concurrent.futures.TimeoutError`` will be raised if timeout is reached. @@ -370,8 +475,10 @@ def _run_multi_process(self, data_frame: DataFrame, wait=True, timeout=None): ( self.max_workers, self.feature_group_name, + self.feature_definitions, self.sagemaker_fs_runtime_client_config, data_frame[start_index:end_index], + target_stores, start_index, timeout, self.profile_name, @@ -393,18 +500,22 @@ def init_worker(): @staticmethod def _run_multi_threaded( - max_workers: int, - feature_group_name: str, - sagemaker_fs_runtime_client_config: Config, - data_frame: DataFrame, - row_offset=0, - timeout=None, - profile_name=None, + max_workers: int, + feature_group_name: str, + feature_definitions: Dict[str, FeatureDefinition], + sagemaker_fs_runtime_client_config: Config, + data_frame: DataFrame, + target_stores: Sequence[TargetStoreEnum] = None, + row_offset=0, + timeout=None, + profile_name=None, ) -> List[int]: """Start the ingestion process. Args: data_frame (DataFrame): source DataFrame to be ingested. + target_stores (Sequence[TargetStoreEnum]): target stores to ingest to. + If not specified, ingest to both online and offline stores. row_offset (int): if ``data_frame`` is a partition of a parent DataFrame, then the index of the parent where ``data_frame`` starts. Otherwise, 0. wait (bool): whether to wait for the ingestion to finish or not. @@ -429,7 +540,9 @@ def _run_multi_threaded( executor.submit( IngestionManagerPandas._ingest_single_batch, feature_group_name=feature_group_name, + feature_definitions=feature_definitions, data_frame=data_frame, + target_stores=target_stores, start_index=start_index, end_index=end_index, client_config=sagemaker_fs_runtime_client_config, @@ -449,19 +562,21 @@ def _run_multi_threaded( return failed_indices - def run(self, data_frame: DataFrame, wait=True, timeout=None): + def run(self, data_frame: DataFrame, target_stores: Sequence[TargetStoreEnum] = None, wait=True, timeout=None): """Start the ingestion process. Args: data_frame (DataFrame): source DataFrame to be ingested. + target_stores (Sequence[TargetStoreEnum]): list of target stores to be used for + the ingestion. If None, the default target store is used. wait (bool): whether to wait for the ingestion to finish or not. timeout (Union[int, float]): ``concurrent.futures.TimeoutError`` will be raised if timeout is reached. """ if self.max_workers == 1 and self.max_processes == 1 and self.profile_name is None: - self._run_single_process_single_thread(data_frame=data_frame) + self._run_single_process_single_thread(data_frame=data_frame, target_stores=target_stores) else: - self._run_multi_process(data_frame=data_frame, wait=wait, timeout=timeout) + self._run_multi_process(data_frame=data_frame, target_stores=target_stores, wait=wait, timeout=timeout) class IngestionError(Exception): @@ -699,11 +814,11 @@ def update( ) def update_feature_metadata( - self, - feature_name: str, - description: str = None, - parameter_additions: Sequence[FeatureParameter] = None, - parameter_removals: Sequence[str] = None, + self, + feature_name: str, + description: str = None, + parameter_additions: Sequence[FeatureParameter] = None, + parameter_removals: Sequence[str] = None, ) -> Dict[str, Any]: """Update a feature metadata and add/remove metadata. 
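End to end, the plumbing above is driven from FeatureGroup.ingest(), which gains a target_stores argument later in this patch; a hedged usage sketch (hypothetical feature group name, DataFrame df and session), mirroring the integration tests added below:

from sagemaker.feature_store.feature_group import FeatureGroup
from sagemaker.feature_store.inputs import TargetStoreEnum

fg = FeatureGroup(name="my-feature-group", sagemaker_session=feature_store_session)
manager = fg.ingest(
    data_frame=df,                                  # pandas DataFrame to write
    target_stores=[TargetStoreEnum.OFFLINE_STORE],  # write only to the offline store
    max_workers=3,
    max_processes=2,
    wait=False,
)
manager.wait()
assert len(manager.failed_rows) == 0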
@@ -765,9 +880,83 @@ def list_parameters_for_feature_metadata(self, feature_name: str) -> Sequence[Di feature_group_name=self.name, feature_name=feature_name ).get("Parameters") + @staticmethod + def _check_list_type(value): + """Check if the value is a list or None. + + Args: + value: value to be checked. + + Returns: + True if value is a list or None, False otherwise. + """ + return is_list_like(value) or pd.isna(value) + + @staticmethod + def _determine_collection_list_type(series: Series) -> FeatureTypeEnum | None: + """Determine the collection type of the feature. + + Args: + series (Series): column from the data frame. + + Returns: + feature type. + """ + + if (series.apply(lambda lst: + all(isinstance(x, int) or pd.isna(x) for x in lst) if is_list_like(lst) else True) + .all()): + return FeatureTypeEnum.INTEGRAL + if (series.apply(lambda lst: + all(isinstance(x, (float, int)) or pd.isna(x) for x in lst) if is_list_like(lst) else True) + .all()): + return FeatureTypeEnum.FRACTIONAL + if (series.apply(lambda lst: + all(isinstance(x, str) or pd.isna(x) for x in lst) if is_list_like(lst) else True) + .all()): + return FeatureTypeEnum.STRING + return None + + def _generate_feature_definition( + self, series: Series, online_storage_type: OnlineStoreStorageTypeEnum + ) -> FeatureDefinition: + """Generate feature definition from the Pandas Series. + + Args: + series (Series): column from the data frame. + + Returns: + feature definition. + """ + params = {"feature_name": series.name} + + dtype = str(series.dtype).lower() + if ( + online_storage_type + and online_storage_type == OnlineStoreStorageTypeEnum.IN_MEMORY + and dtype == "object" + and pd.notna(series.head(1000)).any() + and series.head(1000).apply(FeatureGroup._check_list_type).all() + ): + params["collection_type"] = ListCollectionType() + params["feature_type"] = FeatureGroup._determine_collection_list_type(series.head(1000)) + else: + params["feature_type"] = self.DTYPE_TO_FEATURE_DEFINITION_CLS_MAP.get(dtype, None) + + if params["feature_type"] is None: + raise ValueError( + f"Failed to infer Feature type based on dtype {dtype} " + f"for column {series.name}." + ) + + feature_definition = FeatureDefinition(**params) + + return feature_definition + def load_feature_definitions( - self, - data_frame: DataFrame, + self, + data_frame: DataFrame, + online_storage_type: OnlineStoreStorageTypeEnum = None ) -> Sequence[FeatureDefinition]: """Load feature definitions from a Pandas DataFrame. @@ -780,33 +969,35 @@ def load_feature_definitions( No feature definitions will be loaded if the given data_frame contains unsupported dtypes. + For IN_MEMORY online_storage_type all collection type columns within DataFrame will be inferred as a List, + instead of a String. Due to performance limitations, only first 1,000 values of the column will be sampled, + when inferring collection Type. Customers can manually update the inferred collection type as needed. + Args: - data_frame (DataFrame): + data_frame (DataFrame): A Pandas DataFrame containing features. + online_storage_type (OnlineStoreStorageTypeEnum): + Optional. Online storage type for the feature group. The value can be either STANDARD or IN_MEMORY. + If not specified, STANDARD will be used by default. + If specified as IN_MEMORY, we will infer any collection type column within DataFrame as a List + instead of a String. All collection types (List, Set and Vector) will be inferred as List. + We will only sample the first 1,000 values of the column when inferring collection Type.
+ + Returns: list of FeatureDefinition """ feature_definitions = [] for column in data_frame: - feature_type = self.DTYPE_TO_FEATURE_DEFINITION_CLS_MAP.get( - str(data_frame[column].dtype).lower(), None - ) - if feature_type: - feature_definitions.append( - FeatureDefinition(feature_name=column, feature_type=feature_type) - ) - else: - raise ValueError( - f"Failed to infer Feature type based on dtype {data_frame[column].dtype} " - f"for column {column}." - ) + feature_definition = self._generate_feature_definition(data_frame[column], online_storage_type) + feature_definitions.append(feature_definition) self.feature_definitions = feature_definitions return self.feature_definitions def get_record( - self, - record_identifier_value_as_string: str, - feature_names: Sequence[str] = None, + self, + record_identifier_value_as_string: str, + feature_names: Sequence[str] = None, ) -> Sequence[Dict[str, str]]: """Get a single record in a FeatureGroup @@ -822,31 +1013,31 @@ def get_record( feature_names=feature_names, ).get("Record") - def put_record(self, record: Sequence[FeatureValue], ttl_duration: TtlDuration = None): + def put_record( + self, + record: Sequence[FeatureValue], + target_stores: Sequence[TargetStoreEnum] = None, + ttl_duration: TtlDuration = None): """Put a single record in the FeatureGroup. Args: record (Sequence[FeatureValue]): a list contains feature values. + target_stores (Sequence[str]): a list of target stores. ttl_duration (TtlDuration): customer specified ttl duration. """ - if ttl_duration is not None: - return self.sagemaker_session.put_record( - feature_group_name=self.name, - record=[value.to_dict() for value in record], - ttl_duration=ttl_duration.to_dict(), - ) - return self.sagemaker_session.put_record( feature_group_name=self.name, record=[value.to_dict() for value in record], + target_stores=[target_store.value for target_store in target_stores] if target_stores else None, + ttl_duration=ttl_duration.to_dict() if ttl_duration is not None else None, ) def delete_record( - self, - record_identifier_value_as_string: str, - event_time: str, - deletion_mode: DeletionModeEnum = DeletionModeEnum.SOFT_DELETE, + self, + record_identifier_value_as_string: str, + event_time: str, + deletion_mode: DeletionModeEnum = DeletionModeEnum.SOFT_DELETE, ): """Delete a single record from a FeatureGroup. @@ -867,13 +1058,14 @@ def delete_record( ) def ingest( - self, - data_frame: DataFrame, - max_workers: int = 1, - max_processes: int = 1, - wait: bool = True, - timeout: Union[int, float] = None, - profile_name: str = None, + self, + data_frame: DataFrame, + target_stores: Sequence[TargetStoreEnum] = None, + max_workers: int = 1, + max_processes: int = 1, + wait: bool = True, + timeout: Union[int, float] = None, + profile_name: str = None, ) -> IngestionManagerPandas: """Ingest the content of a pandas DataFrame to feature store. @@ -895,7 +1087,7 @@ def ingest( the ``ingest`` function synchronously. To access the rows that failed to ingest, set ``wait`` to ``False``. The - ``IngestionError.failed_rows`` object saves all of the rows that failed to ingest. + ``IngestionError.failed_rows`` object saves all the rows that failed to ingest. `profile_name` argument is an optional one. It will use the default credential if None is passed. This `profile_name` is used in the sagemaker_featurestore_runtime client only. See @@ -904,6 +1096,8 @@ def ingest( Args: data_frame (DataFrame): data_frame to be ingested to feature store. 
+ target_stores (Sequence[TargetStoreEnum]): target stores to be used for + ingestion. (default: None). max_workers (int): number of threads to be created. max_processes (int): number of processes to be created. Each process spawns ``max_worker`` number of threads. @@ -925,8 +1119,11 @@ def ingest( if profile_name is None and self.sagemaker_session.boto_session.profile_name != "default": profile_name = self.sagemaker_session.boto_session.profile_name + feature_definition_dict = self._get_feature_definition_dict() + manager = IngestionManagerPandas( feature_group_name=self.name, + feature_definitions=feature_definition_dict, sagemaker_session=self.sagemaker_session, sagemaker_fs_runtime_client_config=self.sagemaker_session.sagemaker_featurestore_runtime_client.meta.config, max_workers=max_workers, @@ -934,10 +1131,23 @@ def ingest( profile_name=profile_name, ) - manager.run(data_frame=data_frame, wait=wait, timeout=timeout) + manager.run(data_frame=data_frame, target_stores=target_stores, wait=wait, timeout=timeout) return manager + def _get_feature_definition_dict(self) -> Dict[str, Dict[Any, Any]]: + """Get a dictionary of feature definitions with feature name as key. + We are converting the list of FeatureDefinitions into a dict for faster lookups. + + Returns: + Dictionary of feature definitions keyed by feature name. + """ + feature_definitions = self.describe()["FeatureDefinitions"] + feature_definition_dict = {} + for feature_definition in feature_definitions: + feature_definition_dict[feature_definition["FeatureName"]] = feature_definition + return feature_definition_dict + def athena_query(self) -> AthenaQuery: """Create an AthenaQuery instance. diff --git a/src/sagemaker/feature_store/inputs.py b/src/sagemaker/feature_store/inputs.py index aaff977d3c..78c26fe026 100644 --- a/src/sagemaker/feature_store/inputs.py +++ b/src/sagemaker/feature_store/inputs.py @@ -108,6 +108,17 @@ def to_dict(self) -> Dict[str, Any]: ) +@attr.s +class TargetStoreEnum(Enum): + """Enum of store types for put record. + + The store types can be OnlineStore or OfflineStore. + """ + + ONLINE_STORE = "OnlineStore" + OFFLINE_STORE = "OfflineStore" + + class OnlineStoreStorageTypeEnum(Enum): """Enum of storage types for online store. diff --git a/src/sagemaker/session.py b/src/sagemaker/session.py index b1342eb381..b08703cf8b 100644 --- a/src/sagemaker/session.py +++ b/src/sagemaker/session.py @@ -5939,27 +5939,34 @@ def put_record( self, feature_group_name: str, record: Sequence[Dict[str, str]], - ttl_duration: Dict[str, str] = None, + target_stores: Sequence[str] = None, + ttl_duration: Dict[str, Any] = None, ): """Puts a single record in the FeatureGroup. Args: - feature_group_name (str): name of the FeatureGroup. - record (Sequence[Dict[str, str]]): list of FeatureValue dicts to be ingested + feature_group_name (str): Name of the FeatureGroup. + record (Sequence[Dict[str, str]]): List of FeatureValue dicts to be ingested into FeatureStore. + target_stores (Sequence[str]): Optional. List of target stores to put the record. + ttl_duration (Dict[str, Any]): Optional. Time-to-Live (TTL) duration for the record. + + Returns: + Response dict from service.
""" + params = { + 'FeatureGroupName': feature_group_name, + 'Record': record, + } + if ttl_duration: - return self.sagemaker_featurestore_runtime_client.put_record( - FeatureGroupName=feature_group_name, - Record=record, - TtlDuration=ttl_duration, - ) + params['TtlDuration'] = ttl_duration - return self.sagemaker_featurestore_runtime_client.put_record( - FeatureGroupName=feature_group_name, - Record=record, - ) + if target_stores: + params['TargetStores'] = target_stores + + return self.sagemaker_featurestore_runtime_client.put_record(**params) def delete_record( self, diff --git a/tests/integ/test_feature_store.py b/tests/integ/test_feature_store.py index 319d492e83..96e31967b2 100644 --- a/tests/integ/test_feature_store.py +++ b/tests/integ/test_feature_store.py @@ -30,7 +30,9 @@ StringFeatureDefinition, ListCollectionType, ) -from sagemaker.feature_store.feature_group import FeatureGroup +from sagemaker.feature_store.feature_group import ( + FeatureGroup, + IngestionError) from sagemaker.feature_store.feature_store import FeatureStore from sagemaker.feature_store.inputs import ( FeatureValue, @@ -46,6 +48,7 @@ ThroughputConfig, ThroughputModeEnum, ThroughputConfigUpdate, + TargetStoreEnum, ) from sagemaker.feature_store.dataset_builder import ( JoinTypeEnum, @@ -137,6 +140,254 @@ def pandas_data_frame(): ) return df +@pytest.fixture +def get_record_results_for_data_frame(): + return { + "0.0": [ + {'FeatureName': 'feature1', 'ValueAsString': '0.0'}, + {'FeatureName': 'feature2', 'ValueAsString': '0'}, + {'FeatureName': 'feature3', 'ValueAsString': '2020-10-30T03:43:21Z'}, + {'FeatureName': 'feature4', 'ValueAsString': '0.0'}, + ], + "1.0": [ + {'FeatureName': 'feature1', 'ValueAsString': '1.0'}, + {'FeatureName': 'feature2', 'ValueAsString': '1'}, + {'FeatureName': 'feature3', 'ValueAsString': '2020-10-30T03:43:21Z'}, + {'FeatureName': 'feature4', 'ValueAsString': '1.0'}, + ], + "2.0": [ + {'FeatureName': 'feature1', 'ValueAsString': '2.0'}, + {'FeatureName': 'feature2', 'ValueAsString': '2'}, + {'FeatureName': 'feature3', 'ValueAsString': '2020-10-30T03:43:21Z'}, + {'FeatureName': 'feature4', 'ValueAsString': '2.0'}, + ], + "3.0": [ + {'FeatureName': 'feature1', 'ValueAsString': '3.0'}, + {'FeatureName': 'feature2', 'ValueAsString': '3'}, + {'FeatureName': 'feature3', 'ValueAsString': '2020-10-30T03:43:21Z'}, + {'FeatureName': 'feature4', 'ValueAsString': '3.0'} + ], + "4.0": [ + {'FeatureName': 'feature1', 'ValueAsString': '4.0'}, + {'FeatureName': 'feature2', 'ValueAsString': '4'}, + {'FeatureName': 'feature3', 'ValueAsString': '2020-10-30T03:43:21Z'}, + {'FeatureName': 'feature4', 'ValueAsString': '4.0'} + ], + "5.0": [ + {'FeatureName': 'feature1', 'ValueAsString': '5.0'}, + {'FeatureName': 'feature2', 'ValueAsString': '5'}, + {'FeatureName': 'feature3', 'ValueAsString': '2020-10-30T03:43:21Z'} + ], + "6.0": [ + {'FeatureName': 'feature1', 'ValueAsString': '6.0'}, + {'FeatureName': 'feature2', 'ValueAsString': '6'}, + {'FeatureName': 'feature3', 'ValueAsString': '2020-10-30T03:43:21Z'} + ], + "7.0": [ + {'FeatureName': 'feature1', 'ValueAsString': '7.0'}, + {'FeatureName': 'feature2', 'ValueAsString': '7'}, + {'FeatureName': 'feature3', 'ValueAsString': '2020-10-30T03:43:21Z'} + ], + "8.0": [ + {'FeatureName': 'feature1', 'ValueAsString': '8.0'}, + {'FeatureName': 'feature2', 'ValueAsString': '8'}, + {'FeatureName': 'feature3', 'ValueAsString': '2020-10-30T03:43:21Z'} + ], + "9.0": [ + {'FeatureName': 'feature1', 'ValueAsString': '9.0'}, + {'FeatureName': 'feature2', 
'ValueAsString': '9'}, + {'FeatureName': 'feature3', 'ValueAsString': '2020-10-30T03:43:21Z'} + ] + } + + +@pytest.fixture +def pandas_data_frame_with_collection_type(): + df = pd.DataFrame( + { + "feature1": pd.Series(np.arange(10.0), dtype="float64"), + "feature2": pd.Series(np.arange(10), dtype="int64"), + "feature3": pd.Series(["2020-10-30T03:43:21Z"] * 10, dtype="string"), + "feature4": pd.Series(np.arange(5.0), dtype="float64"), # contains nan + "feature5": pd.Series([["a", "abc"], ["b", "c"], ["c", "f"], ["d"], []], dtype="object"), + "feature6": pd.Series([[1, 2], [1, 2, 3], [1, 5], [1], []], dtype="object"), + "feature7": pd.Series([[1.1, 2.3], [1.4, 2.5, 3.2, 25], [1.0, 5.3], [1.2], []], dtype="object"), + "feature8": pd.Series([[1, 2], [1, 2, 3], [1, 5], [1], [], []], dtype="object"), + "feature9": pd.Series([[1.1, 2.3], [1.4, 25, 3.2], [1.0, 3, 4], [1.2], []], dtype="object"), + "feature10": pd.Series([["a", "abc"], ["b", "c"], ["c", "None"], ["d"], []], dtype="object"), + } + ) + return df + +@pytest.fixture +def get_record_results_for_data_frame_with_collection_type(): + return { + "0.0": [ + {'FeatureName': 'feature1', 'ValueAsString': '0.0'}, + {'FeatureName': 'feature2', 'ValueAsString': '0'}, + {'FeatureName': 'feature3', 'ValueAsString': '2020-10-30T03:43:21Z'}, + {'FeatureName': 'feature4', 'ValueAsString': '0.0'}, + {'FeatureName': 'feature5', 'ValueAsStringList': ['a', 'abc']}, + {'FeatureName': 'feature6', 'ValueAsStringList': ['1', '2']}, + {'FeatureName': 'feature7', 'ValueAsStringList': ['1.1', '2.3']}, + {'FeatureName': 'feature8', 'ValueAsStringList': ['1', '2']}, + {'FeatureName': 'feature9', 'ValueAsStringList': ['1.1', '2.3']}, + {'FeatureName': 'feature10', 'ValueAsStringList': ['a', 'abc']} + ], + "1.0": [ + {'FeatureName': 'feature1', 'ValueAsString': '1.0'}, + {'FeatureName': 'feature2', 'ValueAsString': '1'}, + {'FeatureName': 'feature3', 'ValueAsString': '2020-10-30T03:43:21Z'}, + {'FeatureName': 'feature4', 'ValueAsString': '1.0'}, + {'FeatureName': 'feature5', 'ValueAsStringList': ['b', 'c']}, + {'FeatureName': 'feature6', 'ValueAsStringList': ['1', '2', '3']}, + {'FeatureName': 'feature7', 'ValueAsStringList': ['1.4', '2.5', '3.2', '25']}, + {'FeatureName': 'feature8', 'ValueAsStringList': ['1', '2', '3']}, + {'FeatureName': 'feature9', 'ValueAsStringList': ['1.4', '25', '3.2']}, + {'FeatureName': 'feature10', 'ValueAsStringList': ['b', 'c']} + ], + "2.0": [ + {'FeatureName': 'feature1', 'ValueAsString': '2.0'}, + {'FeatureName': 'feature2', 'ValueAsString': '2'}, + {'FeatureName': 'feature3', 'ValueAsString': '2020-10-30T03:43:21Z'}, + {'FeatureName': 'feature4', 'ValueAsString': '2.0'}, + {'FeatureName': 'feature5', 'ValueAsStringList': ['c', 'f']}, + {'FeatureName': 'feature6', 'ValueAsStringList': ['1', '5']}, + {'FeatureName': 'feature7', 'ValueAsStringList': ['1.0', '5.3']}, + {'FeatureName': 'feature8', 'ValueAsStringList': ['1', '5']}, + {'FeatureName': 'feature9', 'ValueAsStringList': ['1.0', '3', '4']}, + {'FeatureName': 'feature10', 'ValueAsStringList': ['c', 'None']} + ], + "3.0": [ + {'FeatureName': 'feature1', 'ValueAsString': '3.0'}, + {'FeatureName': 'feature2', 'ValueAsString': '3'}, + {'FeatureName': 'feature3', 'ValueAsString': '2020-10-30T03:43:21Z'}, + {'FeatureName': 'feature4', 'ValueAsString': '3.0'}, + {'FeatureName': 'feature5', 'ValueAsStringList': ['d']}, + {'FeatureName': 'feature6', 'ValueAsStringList': ['1']}, + {'FeatureName': 'feature7', 'ValueAsStringList': ['1.2']}, + {'FeatureName': 'feature8', 'ValueAsStringList': 
['1']}, + {'FeatureName': 'feature9', 'ValueAsStringList': ['1.2']}, + {'FeatureName': 'feature10', 'ValueAsStringList': ['d']} + ], + "4.0": [ + {'FeatureName': 'feature1', 'ValueAsString': '4.0'}, + {'FeatureName': 'feature2', 'ValueAsString': '4'}, + {'FeatureName': 'feature3', 'ValueAsString': '2020-10-30T03:43:21Z'}, + {'FeatureName': 'feature4', 'ValueAsString': '4.0'} + ], + "5.0": [ + {'FeatureName': 'feature1', 'ValueAsString': '5.0'}, + {'FeatureName': 'feature2', 'ValueAsString': '5'}, + {'FeatureName': 'feature3', 'ValueAsString': '2020-10-30T03:43:21Z'} + ], + "6.0": [ + {'FeatureName': 'feature1', 'ValueAsString': '6.0'}, + {'FeatureName': 'feature2', 'ValueAsString': '6'}, + {'FeatureName': 'feature3', 'ValueAsString': '2020-10-30T03:43:21Z'} + ], + "7.0": [ + {'FeatureName': 'feature1', 'ValueAsString': '7.0'}, + {'FeatureName': 'feature2', 'ValueAsString': '7'}, + {'FeatureName': 'feature3', 'ValueAsString': '2020-10-30T03:43:21Z'} + ], + "8.0": [ + {'FeatureName': 'feature1', 'ValueAsString': '8.0'}, + {'FeatureName': 'feature2', 'ValueAsString': '8'}, + {'FeatureName': 'feature3', 'ValueAsString': '2020-10-30T03:43:21Z'} + ], + "9.0": [ + {'FeatureName': 'feature1', 'ValueAsString': '9.0'}, + {'FeatureName': 'feature2', 'ValueAsString': '9'}, + {'FeatureName': 'feature3', 'ValueAsString': '2020-10-30T03:43:21Z'} + ] + } + + +@pytest.fixture +def get_record_results_for_data_frame_without_collection_type(): + return { + "0.0": [ + {'FeatureName': 'feature1', 'ValueAsString': '0.0'}, + {'FeatureName': 'feature2', 'ValueAsString': '0'}, + {'FeatureName': 'feature3', 'ValueAsString': '2020-10-30T03:43:21Z'}, + {'FeatureName': 'feature4', 'ValueAsString': '0.0'}, + {'FeatureName': 'feature5', 'ValueAsString': "['a', 'abc']"}, + {'FeatureName': 'feature6', 'ValueAsString': '[1, 2]'}, + {'FeatureName': 'feature7', 'ValueAsString': '[1.1, 2.3]'}, + {'FeatureName': 'feature8', 'ValueAsString': '[1, 2]'}, + {'FeatureName': 'feature9', 'ValueAsString': '[1.1, 2.3]'}, + {'FeatureName': 'feature10', 'ValueAsString': "['a', 'abc']"} + ], + "1.0": [ + {'FeatureName': 'feature1', 'ValueAsString': '1.0'}, + {'FeatureName': 'feature2', 'ValueAsString': '1'}, + {'FeatureName': 'feature3', 'ValueAsString': '2020-10-30T03:43:21Z'}, + {'FeatureName': 'feature4', 'ValueAsString': '1.0'}, + {'FeatureName': 'feature5', 'ValueAsString': "['b', 'c']"}, + {'FeatureName': 'feature6', 'ValueAsString': '[1, 2, 3]'}, + {'FeatureName': 'feature7', 'ValueAsString': '[1.4, 2.5, 3.2, 25]'}, + {'FeatureName': 'feature8', 'ValueAsString': '[1, 2, 3]'}, + {'FeatureName': 'feature9', 'ValueAsString': '[1.4, 25, 3.2]'}, + {'FeatureName': 'feature10', 'ValueAsString': "['b', 'c']"} + ], + "2.0": [ + {'FeatureName': 'feature1', 'ValueAsString': '2.0'}, + {'FeatureName': 'feature2', 'ValueAsString': '2'}, + {'FeatureName': 'feature3', 'ValueAsString': '2020-10-30T03:43:21Z'}, + {'FeatureName': 'feature4', 'ValueAsString': '2.0'}, + {'FeatureName': 'feature5', 'ValueAsString': "['c', 'f']"}, + {'FeatureName': 'feature6', 'ValueAsString': '[1, 5]'}, + {'FeatureName': 'feature7', 'ValueAsString': '[1.0, 5.3]'}, + {'FeatureName': 'feature8', 'ValueAsString': '[1, 5]'}, + {'FeatureName': 'feature9', 'ValueAsString': '[1.0, 3, 4]'}, + {'FeatureName': 'feature10', 'ValueAsString': "['c', 'None']"} + ], + "3.0": [ + {'FeatureName': 'feature1', 'ValueAsString': '3.0'}, + {'FeatureName': 'feature2', 'ValueAsString': '3'}, + {'FeatureName': 'feature3', 'ValueAsString': '2020-10-30T03:43:21Z'}, + {'FeatureName': 'feature4', 
'ValueAsString': '3.0'}, + {'FeatureName': 'feature5', 'ValueAsString': "['d']"}, + {'FeatureName': 'feature6', 'ValueAsString': '[1]'}, + {'FeatureName': 'feature7', 'ValueAsString': '[1.2]'}, + {'FeatureName': 'feature8', 'ValueAsString': '[1]'}, + {'FeatureName': 'feature9', 'ValueAsString': '[1.2]'}, + {'FeatureName': 'feature10', 'ValueAsString': "['d']"} + ], + "4.0": [ + {'FeatureName': 'feature1', 'ValueAsString': '4.0'}, + {'FeatureName': 'feature2', 'ValueAsString': '4'}, + {'FeatureName': 'feature3', 'ValueAsString': '2020-10-30T03:43:21Z'}, + {'FeatureName': 'feature4', 'ValueAsString': '4.0'} + ], + "5.0": [ + {'FeatureName': 'feature1', 'ValueAsString': '5.0'}, + {'FeatureName': 'feature2', 'ValueAsString': '5'}, + {'FeatureName': 'feature3', 'ValueAsString': '2020-10-30T03:43:21Z'} + ], + "6.0": [ + {'FeatureName': 'feature1', 'ValueAsString': '6.0'}, + {'FeatureName': 'feature2', 'ValueAsString': '6'}, + {'FeatureName': 'feature3', 'ValueAsString': '2020-10-30T03:43:21Z'} + ], + "7.0": [ + {'FeatureName': 'feature1', 'ValueAsString': '7.0'}, + {'FeatureName': 'feature2', 'ValueAsString': '7'}, + {'FeatureName': 'feature3', 'ValueAsString': '2020-10-30T03:43:21Z'} + ], + "8.0": [ + {'FeatureName': 'feature1', 'ValueAsString': '8.0'}, + {'FeatureName': 'feature2', 'ValueAsString': '8'}, + {'FeatureName': 'feature3', 'ValueAsString': '2020-10-30T03:43:21Z'} + ], + "9.0": [ + {'FeatureName': 'feature1', 'ValueAsString': '9.0'}, + {'FeatureName': 'feature2', 'ValueAsString': '9'}, + {'FeatureName': 'feature3', 'ValueAsString': '2020-10-30T03:43:21Z'} + ] + } + @pytest.fixture def base_dataframe(): @@ -315,6 +566,293 @@ def test_create_feature_store( assert output["FeatureGroupArn"].endswith(f"feature-group/{feature_group_name}") +def test_create_feature_store_ingest_with_offline_target_stores( + feature_store_session, + role, + feature_group_name, + offline_store_s3_uri, + pandas_data_frame, + record, + create_table_ddl, + get_record_results_for_data_frame +): + feature_group = FeatureGroup(name=feature_group_name, sagemaker_session=feature_store_session) + feature_group.load_feature_definitions(data_frame=pandas_data_frame) + with cleanup_feature_group(feature_group): + output = feature_group.create( + s3_uri=offline_store_s3_uri, + record_identifier_name="feature1", + event_time_feature_name="feature3", + role_arn=role, + enable_online_store=True, + ) + _wait_for_feature_group_create(feature_group) + + resolved_output_s3_uri = ( + feature_group.describe() + .get("OfflineStoreConfig") + .get("S3StorageConfig") + .get("ResolvedOutputS3Uri") + ) + # Ingest data + feature_group.put_record(record=record) + ingestion_manager = feature_group.ingest( + data_frame=pandas_data_frame, + target_stores=[TargetStoreEnum.OFFLINE_STORE], + max_workers=3, + max_processes=2, + wait=False + ) + ingestion_manager.wait() + assert 0 == len(ingestion_manager.failed_rows) + + for index, value in pandas_data_frame["feature1"].items(): + assert feature_group.get_record( + record_identifier_value_as_string=str(value) + ) is None + + # Query the integrated Glue table. 
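+ # Offline-store writes land in S3 asynchronously, so the block below polls
+ # Athena (bounded by the 10-minute timeout) until all 11 rows are visible:
+ # the 10 rows ingested to the offline store plus the initial put_record.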
+ athena_query = feature_group.athena_query() + df = DataFrame() + with timeout(minutes=10): + while df.shape[0] < 11: + athena_query.run( + query_string=f'SELECT * FROM "{athena_query.table_name}"', + output_location=f"{offline_store_s3_uri}/query_results", + ) + athena_query.wait() + assert "SUCCEEDED" == athena_query.get_query_execution().get("QueryExecution").get( + "Status" + ).get("State") + df = athena_query.as_dataframe() + print(f"Found {df.shape[0]} records.") + time.sleep(60) + + assert df.shape[0] == 11 + nans = pd.isna(df.loc[df["feature1"].isin([5, 6, 7, 8, 9])]["feature4"]) + for is_na in nans.items(): + assert is_na + assert ( + create_table_ddl.format( + feature_group_name=feature_group_name, + region=feature_store_session.boto_session.region_name, + account=feature_store_session.account_id(), + resolved_output_s3_uri=resolved_output_s3_uri, + ) + == feature_group.as_hive_ddl() + ) + assert output["FeatureGroupArn"].endswith(f"feature-group/{feature_group_name}") + + +def test_create_feature_store_ingest_with_online_offline_target_stores( + feature_store_session, + role, + feature_group_name, + offline_store_s3_uri, + pandas_data_frame, + record, + create_table_ddl, + get_record_results_for_data_frame +): + feature_group = FeatureGroup(name=feature_group_name, sagemaker_session=feature_store_session) + feature_group.load_feature_definitions(data_frame=pandas_data_frame) + with cleanup_feature_group(feature_group): + output = feature_group.create( + s3_uri=offline_store_s3_uri, + record_identifier_name="feature1", + event_time_feature_name="feature3", + role_arn=role, + enable_online_store=True, + ) + _wait_for_feature_group_create(feature_group) + + resolved_output_s3_uri = ( + feature_group.describe() + .get("OfflineStoreConfig") + .get("S3StorageConfig") + .get("ResolvedOutputS3Uri") + ) + # Ingest data + feature_group.put_record(record=record) + ingestion_manager = feature_group.ingest( + data_frame=pandas_data_frame, + target_stores=[TargetStoreEnum.ONLINE_STORE, TargetStoreEnum.OFFLINE_STORE], + max_workers=3, + max_processes=2, + wait=False + ) + ingestion_manager.wait() + assert 0 == len(ingestion_manager.failed_rows) + + for index, value in pandas_data_frame["feature1"].items(): + assert feature_group.get_record( + record_identifier_value_as_string=str(value) + ) == get_record_results_for_data_frame[str(value)] + + # Query the integrated Glue table. 
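+ # The same 10 rows just verified through get_record were also written to the
+ # offline store, so poll Athena until they and the initial put_record
+ # (11 rows in total) become queryable.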
+ athena_query = feature_group.athena_query() + df = DataFrame() + with timeout(minutes=10): + while df.shape[0] < 11: + athena_query.run( + query_string=f'SELECT * FROM "{athena_query.table_name}"', + output_location=f"{offline_store_s3_uri}/query_results", + ) + athena_query.wait() + assert "SUCCEEDED" == athena_query.get_query_execution().get("QueryExecution").get( + "Status" + ).get("State") + df = athena_query.as_dataframe() + print(f"Found {df.shape[0]} records.") + time.sleep(60) + + assert df.shape[0] == 11 + nans = pd.isna(df.loc[df["feature1"].isin([5, 6, 7, 8, 9])]["feature4"]) + for is_na in nans.items(): + assert is_na + assert ( + create_table_ddl.format( + feature_group_name=feature_group_name, + region=feature_store_session.boto_session.region_name, + account=feature_store_session.account_id(), + resolved_output_s3_uri=resolved_output_s3_uri, + ) + == feature_group.as_hive_ddl() + ) + assert output["FeatureGroupArn"].endswith(f"feature-group/{feature_group_name}") + + +def test_create_feature_store_ingest_with_online_target_stores( + feature_store_session, + role, + feature_group_name, + offline_store_s3_uri, + pandas_data_frame, + record, + create_table_ddl, + get_record_results_for_data_frame +): + feature_group = FeatureGroup(name=feature_group_name, sagemaker_session=feature_store_session) + feature_group.load_feature_definitions(data_frame=pandas_data_frame) + with cleanup_feature_group(feature_group): + output = feature_group.create( + s3_uri=offline_store_s3_uri, + record_identifier_name="feature1", + event_time_feature_name="feature3", + role_arn=role, + enable_online_store=True, + ) + _wait_for_feature_group_create(feature_group) + + resolved_output_s3_uri = ( + feature_group.describe() + .get("OfflineStoreConfig") + .get("S3StorageConfig") + .get("ResolvedOutputS3Uri") + ) + # Ingest data + ingestion_manager = feature_group.ingest( + data_frame=pandas_data_frame, + target_stores=[TargetStoreEnum.ONLINE_STORE], + max_workers=3, + max_processes=2, + wait=False + ) + ingestion_manager.wait() + assert 0 == len(ingestion_manager.failed_rows) + + for index, value in pandas_data_frame["feature1"].items(): + assert feature_group.get_record( + record_identifier_value_as_string=str(value) + ) == get_record_results_for_data_frame[str(value)] + + + feature_group.put_record( + record=[ + FeatureValue(feature_name='feature1', value_as_string='100.0'), + FeatureValue(feature_name='feature2', value_as_string='100'), + FeatureValue(feature_name='feature3', value_as_string='2020-10-30T03:43:21Z'), + FeatureValue(feature_name='feature4', value_as_string='100.0') + ], + target_stores=[TargetStoreEnum.OFFLINE_STORE] + ) + assert feature_group.get_record(record_identifier_value_as_string='100.0') == None + + + # Query the integrated Glue table. 
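+ # The bulk ingest targeted the online store only, so the sole record expected
+ # in the offline store is the explicit OFFLINE_STORE put_record above; poll
+ # Athena until that single row appears.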
+ athena_query = feature_group.athena_query() + df = DataFrame() + with timeout(minutes=10): + while df.shape[0] < 1: + athena_query.run( + query_string=f'SELECT * FROM "{athena_query.table_name}"', + output_location=f"{offline_store_s3_uri}/query_results", + ) + athena_query.wait() + assert "SUCCEEDED" == athena_query.get_query_execution().get("QueryExecution").get( + "Status" + ).get("State") + df = athena_query.as_dataframe() + print(f"Found {df.shape[0]} records.") + time.sleep(60) + + assert df.shape[0] == 1 + assert df.loc[0, 'feature1'] == 100.0 + assert df.loc[0, 'feature2'] == 100 + assert df.loc[0, 'feature3'] == "2020-10-30T03:43:21Z" + assert df.loc[0, 'feature4'] == 100.0 + assert output["FeatureGroupArn"].endswith(f"feature-group/{feature_group_name}") + + +def test_put_record_with_target_stores( + feature_store_session, + role, + feature_group_name, + offline_store_s3_uri, + pandas_data_frame, + record, + create_table_ddl, + get_record_results_for_data_frame +): + feature_group = FeatureGroup(name=feature_group_name, sagemaker_session=feature_store_session) + feature_group.load_feature_definitions(data_frame=pandas_data_frame) + with cleanup_feature_group(feature_group): + output = feature_group.create( + s3_uri=offline_store_s3_uri, + record_identifier_name="feature1", + event_time_feature_name="feature3", + role_arn=role, + enable_online_store=True, + ) + _wait_for_feature_group_create(feature_group) + feature_group.put_record( + record=[ + FeatureValue(feature_name='feature1', value_as_string='100.0'), + FeatureValue(feature_name='feature2', value_as_string='100'), + FeatureValue(feature_name='feature3', value_as_string='2020-10-30T03:43:21Z'), + FeatureValue(feature_name='feature4', value_as_string='100.0') + ], + target_stores=[TargetStoreEnum.OFFLINE_STORE] + ) + assert feature_group.get_record(record_identifier_value_as_string='100.0') == None + + feature_group.put_record( + record=[ + FeatureValue(feature_name='feature1', value_as_string='100.0'), + FeatureValue(feature_name='feature2', value_as_string='100'), + FeatureValue(feature_name='feature3', value_as_string='2020-10-30T03:43:21Z'), + FeatureValue(feature_name='feature4', value_as_string='100.0') + ], + target_stores=[TargetStoreEnum.ONLINE_STORE] + ) + assert feature_group.get_record(record_identifier_value_as_string='100.0') == [ + {'FeatureName': 'feature1', 'ValueAsString': '100.0'}, + {'FeatureName': 'feature2', 'ValueAsString': '100'}, + {'FeatureName': 'feature3', 'ValueAsString': '2020-10-30T03:43:21Z'}, + {'FeatureName': 'feature4', 'ValueAsString': '100.0'} + ] + + def test_create_feature_group_iceberg_table_format( feature_store_session, role, @@ -1630,6 +2168,146 @@ def test_get_feature_group_with_session( assert output["FeatureGroupArn"].endswith(f"feature-group/{feature_group_name}") +def test_ingest_in_memory_multi_process_with_collection_types( + feature_store_session, + role, + feature_group_name, + offline_store_s3_uri, + pandas_data_frame_with_collection_type, + get_record_results_for_data_frame_with_collection_type, +): + feature_group = FeatureGroup(feature_group_name, sagemaker_session=feature_store_session) + feature_group.load_feature_definitions(data_frame=pandas_data_frame_with_collection_type, + online_storage_type=OnlineStoreStorageTypeEnum.IN_MEMORY) + + with (cleanup_feature_group(feature_group)): + output = feature_group.create( + record_identifier_name="feature1", + event_time_feature_name="feature3", + role_arn=role, + enable_online_store=True, + 
online_store_storage_type=OnlineStoreStorageTypeEnum.IN_MEMORY, + s3_uri=False + ) + _wait_for_feature_group_create(feature_group) + + ingestion_manager = feature_group.ingest( + data_frame=pandas_data_frame_with_collection_type, max_workers=3, max_processes=2, wait=True + ) + ingestion_manager.wait() + assert 0 == len(ingestion_manager.failed_rows) + assert output["FeatureGroupArn"].endswith(f"feature-group/{feature_group_name}") + for index, value in pandas_data_frame_with_collection_type["feature1"].items(): + assert feature_group.get_record( + record_identifier_value_as_string=str(value) + ) == get_record_results_for_data_frame_with_collection_type[str(value)] + + new_row_data = [ + 10.0, 10, "2020-10-30T03:43:21Z", 5.0, ["a", "b"], [1, 2, None], [3.0, 4.0], [1, 2], [3.0, 4.0], ["a", "b"] + ] + pandas_data_frame_with_collection_type.loc[len(pandas_data_frame_with_collection_type)] = new_row_data + with pytest.raises(IngestionError) as error: + feature_group.ingest( + data_frame=pandas_data_frame_with_collection_type, max_workers=1, max_processes=1, wait=True + ) + + +def test_ingest_in_memory_single_process_with_collection_types( + feature_store_session, + role, + feature_group_name, + offline_store_s3_uri, + pandas_data_frame_with_collection_type, + get_record_results_for_data_frame_with_collection_type, +): + feature_group = FeatureGroup(feature_group_name, sagemaker_session=feature_store_session) + feature_group.load_feature_definitions(data_frame=pandas_data_frame_with_collection_type, + online_storage_type=OnlineStoreStorageTypeEnum.IN_MEMORY) + + with (cleanup_feature_group(feature_group)): + output = feature_group.create( + record_identifier_name="feature1", + event_time_feature_name="feature3", + role_arn=role, + enable_online_store=True, + online_store_storage_type=OnlineStoreStorageTypeEnum.IN_MEMORY, + s3_uri=False + ) + _wait_for_feature_group_create(feature_group) + + ingestion_manager = feature_group.ingest( + data_frame=pandas_data_frame_with_collection_type, max_workers=1, max_processes=1, wait=True + ) + assert 0 == len(ingestion_manager.failed_rows) + assert output["FeatureGroupArn"].endswith(f"feature-group/{feature_group_name}") + for index, value in pandas_data_frame_with_collection_type["feature1"].items(): + assert feature_group.get_record( + record_identifier_value_as_string=str(value) + ) == get_record_results_for_data_frame_with_collection_type[str(value)] + + new_row_data = [ + 10.0, 10, "2020-10-30T03:43:21Z", 5.0, ["a", "b"], [1, 2, None], [3.0, 4.0], [1, 2], [3.0, 4.0], ["a", "b"] + ] + pandas_data_frame_with_collection_type.loc[len(pandas_data_frame_with_collection_type)] = new_row_data + with pytest.raises(IngestionError) as error: + feature_group.ingest( + data_frame=pandas_data_frame_with_collection_type, max_workers=1, max_processes=1, wait=True + ) + + +def test_ingest_standard_multi_process_with_collection_types( + feature_store_session, + role, + feature_group_name, + offline_store_s3_uri, + pandas_data_frame_with_collection_type, + get_record_results_for_data_frame_without_collection_type, +): + feature_group = FeatureGroup(feature_group_name, sagemaker_session=feature_store_session) + feature_group.load_feature_definitions(data_frame=pandas_data_frame_with_collection_type, + online_storage_type=OnlineStoreStorageTypeEnum.STANDARD) + + with (cleanup_feature_group(feature_group)): + output = feature_group.create( + record_identifier_name="feature1", + event_time_feature_name="feature3", + role_arn=role, + enable_online_store=True, + 
online_store_storage_type=OnlineStoreStorageTypeEnum.STANDARD, + s3_uri=False + ) + _wait_for_feature_group_create(feature_group) + + new_row_data = [ + 10.0, 10, "2020-10-30T03:43:21Z", 5.0, ["a", "b"], [1, 2, None], [3.0, 4.0], [1, 2], [3.0, 4.0], ["a", "b"] + ] + pandas_data_frame_with_collection_type.loc[len(pandas_data_frame_with_collection_type)] = new_row_data + + ingestion_manager = feature_group.ingest( + data_frame=pandas_data_frame_with_collection_type, max_workers=3, max_processes=2, wait=True + ) + ingestion_manager.wait() + assert 0 == len(ingestion_manager.failed_rows) + get_record_results_for_data_frame_without_collection_type["10.0"] = [ + {'FeatureName': 'feature1', 'ValueAsString': '10.0'}, + {'FeatureName': 'feature2', 'ValueAsString': '10'}, + {'FeatureName': 'feature3', 'ValueAsString': '2020-10-30T03:43:21Z'}, + {'FeatureName': 'feature4', 'ValueAsString': '5.0'}, + {'FeatureName': 'feature5', 'ValueAsString': "['a', 'b']"}, + {'FeatureName': 'feature6', 'ValueAsString': '[1, 2, None]'}, + {'FeatureName': 'feature7', 'ValueAsString': '[3.0, 4.0]'}, + {'FeatureName': 'feature8', 'ValueAsString': '[1, 2]'}, + {'FeatureName': 'feature9', 'ValueAsString': '[3.0, 4.0]'}, + {'FeatureName': 'feature10', 'ValueAsString': "['a', 'b']"} + ] + assert output["FeatureGroupArn"].endswith(f"feature-group/{feature_group_name}") + for index, value in pandas_data_frame_with_collection_type["feature1"].items(): + assert feature_group.get_record( + record_identifier_value_as_string=str(value) + ) == get_record_results_for_data_frame_without_collection_type[str(value)] + + + @contextmanager def cleanup_feature_group(feature_group: FeatureGroup): try: diff --git a/tests/unit/sagemaker/feature_store/test_feature_group.py b/tests/unit/sagemaker/feature_store/test_feature_group.py index 394ecb25b3..c05c330dbb 100644 --- a/tests/unit/sagemaker/feature_store/test_feature_group.py +++ b/tests/unit/sagemaker/feature_store/test_feature_group.py @@ -13,10 +13,10 @@ # language governing permissions and limitations under the License. 
from __future__ import absolute_import - import pandas as pd +import numpy as np import pytest -from mock import Mock, patch, MagicMock +from mock import Mock, patch, MagicMock, call from botocore.exceptions import ProfileNotFound from sagemaker.feature_store.feature_definition import ( @@ -27,6 +27,7 @@ VectorCollectionType, SetCollectionType, ListCollectionType, + FeatureDefinition, ) from sagemaker.feature_store.feature_group import ( FeatureGroup, @@ -43,6 +44,7 @@ ThroughputModeEnum, ThroughputConfig, ThroughputConfigUpdate, + TargetStoreEnum, ) from tests.unit import SAGEMAKER_CONFIG_FEATURE_GROUP @@ -84,6 +86,126 @@ def feature_group_dummy_definitions(): ] +@pytest.fixture +def feature_group_describe_dummy_definitions(): + return [ + {'FeatureName': 'feature1', 'FeatureType': 'Fractional'}, + {'FeatureName': 'feature2', 'FeatureType': 'Integral'}, + {'FeatureName': 'feature3', 'FeatureType': 'String'}, + ] + + +@pytest.fixture +def feature_group_dummy_definition_dict(): + return { + 'feature1': {'FeatureName': 'feature1', 'FeatureType': 'Fractional'}, + 'feature2': {'FeatureName': 'feature2', 'FeatureType': 'Integral'}, + 'feature3': {'FeatureName': 'feature3', 'FeatureType': 'String'}, + } + + +@pytest.fixture +def feature_group_dummy_definitions(): + return [ + FractionalFeatureDefinition(feature_name="feature1"), + IntegralFeatureDefinition(feature_name="feature2"), + StringFeatureDefinition(feature_name="feature3"), + ] + + +@pytest.fixture +def data_frame_with_collection_type(): + df = pd.DataFrame( + { + "feature1": pd.Series(np.arange(10.0), dtype="float64"), + "feature2": pd.Series(np.arange(10), dtype="int64"), + "feature3": pd.Series(["2020-10-30T03:43:21Z"] * 10, dtype="string"), + "feature4": pd.Series(np.arange(5.0), dtype="float64"), # contains nan + "feature5": pd.Series([["a", "abc"], ["b", "c"], ["c", "f"], ["d"], []], dtype="object"), + "feature6": pd.Series([[1, 2], [1, 2, 3], [1, 5], [1], []], dtype="object"), + "feature7": pd.Series([[1.1, 2.3], [1.4, 2.5, 3.2, 25], [1.0, 5.3], [1.2], []], dtype="object"), + "feature8": pd.Series([[1, 2], [1, 2, None], [1, 5], [1], [], [None]], dtype="object"), + "feature9": pd.Series([[1.1, 2.3], [1.4, 25, 3.2], [1.0, 3, None], [1.2], [], [None]], dtype="object"), + "feature10": pd.Series([["a", "abc"], ["b", "c"], ["c", None], ["d"], [], [None]], dtype="object"), + } + ) + return df + + +@pytest.fixture +def expected_standard_feature_definitions(): + return [ + FeatureDefinition(feature_name="feature1", feature_type=FeatureTypeEnum.FRACTIONAL), + FeatureDefinition(feature_name="feature2", feature_type=FeatureTypeEnum.INTEGRAL), + FeatureDefinition(feature_name="feature3", feature_type=FeatureTypeEnum.STRING), + FeatureDefinition(feature_name="feature4", feature_type=FeatureTypeEnum.FRACTIONAL), + FeatureDefinition(feature_name="feature5", feature_type=FeatureTypeEnum.STRING), + FeatureDefinition(feature_name="feature6", feature_type=FeatureTypeEnum.STRING), + FeatureDefinition(feature_name="feature7", feature_type=FeatureTypeEnum.STRING), + FeatureDefinition(feature_name="feature8", feature_type=FeatureTypeEnum.STRING), + FeatureDefinition(feature_name="feature9", feature_type=FeatureTypeEnum.STRING), + FeatureDefinition(feature_name="feature10", feature_type=FeatureTypeEnum.STRING), + ] + + +@pytest.fixture +def expected_standard_feature_definition_dict(): + return { + 'feature1': {'FeatureName': 'feature1', 'FeatureType': 'Fractional'}, + 'feature2': {'FeatureName': 'feature2', 'FeatureType': 'Integral'}, + 'feature3': 
{'FeatureName': 'feature3', 'FeatureType': 'String'}, + 'feature4': {'FeatureName': 'feature4', 'FeatureType': 'Fractional', 'CollectionType': None}, + 'feature5': {'FeatureName': 'feature5', 'FeatureType': 'String'}, + 'feature6': {'FeatureName': 'feature6', 'FeatureType': 'Integral'}, + 'feature7': {'FeatureName': 'feature7', 'FeatureType': 'Fractional'}, + 'feature8': {'FeatureName': 'feature8', 'FeatureType': 'Integral'}, + 'feature9': {'FeatureName': 'feature9', 'FeatureType': 'Fractional'}, + 'feature10': {'FeatureName': 'feature10', 'FeatureType': 'String'} + } + + +@pytest.fixture +def expected_in_memory_feature_definitions(): + return [ + FeatureDefinition( + feature_name="feature1", feature_type=FeatureTypeEnum.FRACTIONAL), + FeatureDefinition( + feature_name="feature2", feature_type=FeatureTypeEnum.INTEGRAL), + FeatureDefinition( + feature_name="feature3", feature_type=FeatureTypeEnum.STRING), + FeatureDefinition( + feature_name="feature4", feature_type=FeatureTypeEnum.FRACTIONAL), + FeatureDefinition( + feature_name="feature5", feature_type=FeatureTypeEnum.STRING, collection_type=ListCollectionType()), + FeatureDefinition( + feature_name="feature6", feature_type=FeatureTypeEnum.INTEGRAL, collection_type=ListCollectionType()), + FeatureDefinition( + feature_name="feature7", feature_type=FeatureTypeEnum.FRACTIONAL, collection_type=ListCollectionType()), + FeatureDefinition( + feature_name="feature8", feature_type=FeatureTypeEnum.INTEGRAL, collection_type=ListCollectionType()), + FeatureDefinition( + feature_name="feature9", feature_type=FeatureTypeEnum.FRACTIONAL, collection_type=ListCollectionType()), + FeatureDefinition( + feature_name="feature10", feature_type=FeatureTypeEnum.STRING, collection_type=ListCollectionType()), + ] + + +@pytest.fixture +def expected_in_memory_feature_definition_dict(): + return { + 'feature1': {'FeatureName': 'feature1', 'FeatureType': 'Fractional'}, + 'feature2': {'FeatureName': 'feature2', 'FeatureType': 'Integral'}, + 'feature3': {'FeatureName': 'feature3', 'FeatureType': 'String'}, + 'feature4': {'FeatureName': 'feature4', 'FeatureType': 'Fractional'}, + 'feature5': {'FeatureName': 'feature5', 'FeatureType': 'String', 'CollectionType': 'List'}, + 'feature6': {'FeatureName': 'feature6', 'FeatureType': 'Integral', 'CollectionType': 'List'}, + 'feature7': {'FeatureName': 'feature7', 'FeatureType': 'Fractional', 'CollectionType': 'List'}, + 'feature8': {'FeatureName': 'feature8', 'FeatureType': 'Integral', 'CollectionType': 'List'}, + 'feature9': {'FeatureName': 'feature9', 'FeatureType': 'Fractional', 'CollectionType': 'List'}, + 'feature10': {'FeatureName': 'feature10', 'FeatureType': 'String', 'CollectionType': 'List'} + } + + @pytest.fixture def create_table_ddl(): return ( @@ -104,7 +226,7 @@ def create_table_ddl(): def test_feature_group_create_without_role( - sagemaker_session_mock, feature_group_dummy_definitions, s3_uri + sagemaker_session_mock, feature_group_dummy_definitions, s3_uri ): feature_group = FeatureGroup(name="MyFeatureGroup", sagemaker_session=sagemaker_session_mock) feature_group.feature_definitions = feature_group_dummy_definitions @@ -118,9 +240,8 @@ def test_feature_group_create_without_role( def test_feature_store_create_with_config_injection( - sagemaker_session, role_arn, feature_group_dummy_definitions, s3_uri + sagemaker_session, role_arn, feature_group_dummy_definitions, s3_uri ): - sagemaker_session.sagemaker_config = SAGEMAKER_CONFIG_FEATURE_GROUP sagemaker_session.create_feature_group = Mock() @@ -161,8 +282,45 @@ 
def test_feature_store_create_with_config_injection( ) +def test_feature_group_load_definition( + sagemaker_session_mock, + data_frame_with_collection_type, + expected_standard_feature_definitions, + expected_in_memory_feature_definitions +): + feature_group = FeatureGroup(name="MyFeatureGroup", sagemaker_session=sagemaker_session_mock) + + feature_group.load_feature_definitions(data_frame=data_frame_with_collection_type) + assert feature_group.feature_definitions == expected_standard_feature_definitions + + feature_group.load_feature_definitions( + data_frame=data_frame_with_collection_type, online_storage_type=OnlineStoreStorageTypeEnum.STANDARD + ) + assert feature_group.feature_definitions == expected_standard_feature_definitions + + feature_group.load_feature_definitions( + data_frame=data_frame_with_collection_type, online_storage_type=OnlineStoreStorageTypeEnum.IN_MEMORY + ) + assert feature_group.feature_definitions == expected_in_memory_feature_definitions + + data_frame_with_collection_type["feature11"] = pd.Series( + [[1.1, "2.3"], [1.4, 2.5, 3.2, 25], [1.0, 5.3], [1.2], []], dtype="object") + + feature_group.load_feature_definitions( + data_frame=data_frame_with_collection_type, online_storage_type=OnlineStoreStorageTypeEnum.STANDARD + ) + expected_standard_feature_definitions.append( + FeatureDefinition(feature_name='feature11', feature_type=FeatureTypeEnum.STRING)) + assert feature_group.feature_definitions == expected_standard_feature_definitions + + with pytest.raises(ValueError): + feature_group.load_feature_definitions( + data_frame=data_frame_with_collection_type, online_storage_type=OnlineStoreStorageTypeEnum.IN_MEMORY + ) + + def test_feature_store_create( - sagemaker_session_mock, role_arn, feature_group_dummy_definitions, s3_uri + sagemaker_session_mock, role_arn, feature_group_dummy_definitions, s3_uri ): feature_group = FeatureGroup(name="MyFeatureGroup", sagemaker_session=sagemaker_session_mock) feature_group.feature_definitions = feature_group_dummy_definitions @@ -190,7 +348,7 @@ def test_feature_store_create( def test_feature_store_create_with_ttl_duration( - sagemaker_session_mock, role_arn, feature_group_dummy_definitions, s3_uri + sagemaker_session_mock, role_arn, feature_group_dummy_definitions, s3_uri ): feature_group = FeatureGroup(name="MyFeatureGroup", sagemaker_session=sagemaker_session_mock) feature_group.feature_definitions = feature_group_dummy_definitions @@ -223,7 +381,7 @@ def test_feature_store_create_with_ttl_duration( def test_feature_store_create_online_only( - sagemaker_session_mock, role_arn, feature_group_dummy_definitions + sagemaker_session_mock, role_arn, feature_group_dummy_definitions ): feature_group = FeatureGroup(name="MyFeatureGroup", sagemaker_session=sagemaker_session_mock) feature_group.feature_definitions = feature_group_dummy_definitions @@ -247,7 +405,7 @@ def test_feature_store_create_online_only( def test_feature_store_create_online_only_with_in_memory( - sagemaker_session_mock, role_arn, feature_group_dummy_definitions + sagemaker_session_mock, role_arn, feature_group_dummy_definitions ): feature_group = FeatureGroup(name="MyFeatureGroup", sagemaker_session=sagemaker_session_mock) feature_group.feature_definitions = feature_group_dummy_definitions @@ -272,7 +430,7 @@ def test_feature_store_create_online_only_with_in_memory( def test_feature_store_create_with_in_memory_collection_types( - sagemaker_session_mock, role_arn, feature_group_dummy_definitions + sagemaker_session_mock, role_arn, feature_group_dummy_definitions ): 
feature_group = FeatureGroup(name="MyFeatureGroup", sagemaker_session=sagemaker_session_mock) feature_definition_with_collection = [ @@ -491,7 +649,7 @@ def test_put_record(sagemaker_session_mock): feature_group = FeatureGroup(name="MyFeatureGroup", sagemaker_session=sagemaker_session_mock) feature_group.put_record(record=[]) sagemaker_session_mock.put_record.assert_called_with( - feature_group_name="MyFeatureGroup", record=[] + feature_group_name="MyFeatureGroup", record=[], target_stores=None, ttl_duration=None ) @@ -502,6 +660,22 @@ def test_put_record_ttl_duration(sagemaker_session_mock): sagemaker_session_mock.put_record.assert_called_with( feature_group_name="MyFeatureGroup", record=[], + target_stores=None, + ttl_duration=ttl_duration.to_dict(), + ) + + +def test_put_record_target_stores(sagemaker_session_mock): + feature_group = FeatureGroup(name="MyFeatureGroup", sagemaker_session=sagemaker_session_mock) + ttl_duration = TtlDuration(unit="Minutes", value=123) + feature_group.put_record( + record=[], + target_stores=[TargetStoreEnum.ONLINE_STORE, TargetStoreEnum.OFFLINE_STORE], + ttl_duration=ttl_duration) + sagemaker_session_mock.put_record.assert_called_with( + feature_group_name="MyFeatureGroup", + record=[], + target_stores=[TargetStoreEnum.ONLINE_STORE.value, TargetStoreEnum.OFFLINE_STORE.value], ttl_duration=ttl_duration.to_dict(), ) @@ -628,10 +802,19 @@ def test_ingest_zero_workers(): @patch("sagemaker.feature_store.feature_group.IngestionManagerPandas") -def test_ingest(ingestion_manager_init, sagemaker_session_mock, fs_runtime_client_config_mock): +def test_ingest( + ingestion_manager_init, + sagemaker_session_mock, + fs_runtime_client_config_mock, + feature_group_describe_dummy_definitions, + feature_group_dummy_definition_dict +): sagemaker_session_mock.sagemaker_featurestore_runtime_client.meta.config = ( fs_runtime_client_config_mock ) + sagemaker_session_mock.describe_feature_group.return_value = { + 'FeatureDefinitions': feature_group_describe_dummy_definitions + } feature_group = FeatureGroup(name="MyGroup", sagemaker_session=sagemaker_session_mock) df = pd.DataFrame(dict((f"float{i}", pd.Series([2.0], dtype="float64")) for i in range(300))) @@ -642,6 +825,7 @@ def test_ingest(ingestion_manager_init, sagemaker_session_mock, fs_runtime_clien ingestion_manager_init.assert_called_once_with( feature_group_name="MyGroup", + feature_definitions=feature_group_dummy_definition_dict, sagemaker_session=sagemaker_session_mock, sagemaker_fs_runtime_client_config=fs_runtime_client_config_mock, max_workers=10, @@ -649,16 +833,26 @@ def test_ingest(ingestion_manager_init, sagemaker_session_mock, fs_runtime_clien profile_name=sagemaker_session_mock.boto_session.profile_name, ) mock_ingestion_manager_instance.run.assert_called_once_with( - data_frame=df, wait=True, timeout=None + data_frame=df, target_stores=None, wait=True, timeout=None ) @patch("sagemaker.feature_store.feature_group.IngestionManagerPandas") -def test_ingest_default(ingestion_manager_init, sagemaker_session_mock): +def test_ingest_default( + ingestion_manager_init, + sagemaker_session_mock, + fs_runtime_client_config_mock, + feature_group_describe_dummy_definitions, + feature_group_dummy_definition_dict + +): sagemaker_session_mock.sagemaker_featurestore_runtime_client.meta.config = ( fs_runtime_client_config_mock ) sagemaker_session_mock.boto_session.profile_name = "default" + sagemaker_session_mock.describe_feature_group.return_value = { + 'FeatureDefinitions': feature_group_describe_dummy_definitions + } 
feature_group = FeatureGroup(name="MyGroup", sagemaker_session=sagemaker_session_mock) df = pd.DataFrame(dict((f"float{i}", pd.Series([2.0], dtype="float64")) for i in range(300))) @@ -669,6 +863,7 @@ def test_ingest_default(ingestion_manager_init, sagemaker_session_mock): ingestion_manager_init.assert_called_once_with( feature_group_name="MyGroup", + feature_definitions=feature_group_dummy_definition_dict, sagemaker_session=sagemaker_session_mock, sagemaker_fs_runtime_client_config=fs_runtime_client_config_mock, max_workers=1, @@ -676,17 +871,90 @@ def test_ingest_default(ingestion_manager_init, sagemaker_session_mock): profile_name=None, ) mock_ingestion_manager_instance.run.assert_called_once_with( - data_frame=df, wait=True, timeout=None + data_frame=df, target_stores=None, wait=True, timeout=None ) +@patch("sagemaker.feature_store.feature_group.IngestionManagerPandas") +def test_ingest_with_target_stores( + ingestion_manager_init, + sagemaker_session_mock, + fs_runtime_client_config_mock, + feature_group_describe_dummy_definitions, + feature_group_dummy_definition_dict +): + sagemaker_session_mock.sagemaker_featurestore_runtime_client.meta.config = ( + fs_runtime_client_config_mock + ) + sagemaker_session_mock.describe_feature_group.return_value = { + 'FeatureDefinitions': feature_group_describe_dummy_definitions + } + + feature_group = FeatureGroup(name="MyGroup", sagemaker_session=sagemaker_session_mock) + df = pd.DataFrame(dict((f"float{i}", pd.Series([2.0], dtype="float64")) for i in range(300))) + + mock_ingestion_manager_instance = Mock() + ingestion_manager_init.return_value = mock_ingestion_manager_instance + feature_group.ingest(data_frame=df, max_workers=10, target_stores=[TargetStoreEnum.ONLINE_STORE]) + feature_group.ingest(data_frame=df, max_workers=10, target_stores=[TargetStoreEnum.OFFLINE_STORE]) + feature_group.ingest( + data_frame=df, max_workers=10, target_stores=[TargetStoreEnum.ONLINE_STORE, TargetStoreEnum.OFFLINE_STORE]) + + actual_ingestion_manager_init_calls = ingestion_manager_init.mock_calls + expected_ingestion_manager_init_calls = [ + call( + feature_group_name="MyGroup", + feature_definitions=feature_group_dummy_definition_dict, + sagemaker_session=sagemaker_session_mock, + sagemaker_fs_runtime_client_config=fs_runtime_client_config_mock, + max_workers=10, + max_processes=1, + profile_name=sagemaker_session_mock.boto_session.profile_name, + ), + call().run(data_frame=df, target_stores=[TargetStoreEnum.ONLINE_STORE], wait=True, timeout=None), + call( + feature_group_name="MyGroup", + feature_definitions=feature_group_dummy_definition_dict, + sagemaker_session=sagemaker_session_mock, + sagemaker_fs_runtime_client_config=fs_runtime_client_config_mock, + max_workers=10, + max_processes=1, + profile_name=sagemaker_session_mock.boto_session.profile_name, + ), + call().run(data_frame=df, target_stores=[TargetStoreEnum.OFFLINE_STORE], wait=True, timeout=None), + call( + feature_group_name="MyGroup", + feature_definitions=feature_group_dummy_definition_dict, + sagemaker_session=sagemaker_session_mock, + sagemaker_fs_runtime_client_config=fs_runtime_client_config_mock, + max_workers=10, + max_processes=1, + profile_name=sagemaker_session_mock.boto_session.profile_name, + ), + call().run( + data_frame=df, + target_stores=[TargetStoreEnum.ONLINE_STORE, TargetStoreEnum.OFFLINE_STORE], + wait=True, + timeout=None), + ] + assert actual_ingestion_manager_init_calls == expected_ingestion_manager_init_calls, \ + f"Expected {expected_ingestion_manager_init_calls} calls, but 
got {actual_ingestion_manager_init_calls}" + + @patch("sagemaker.feature_store.feature_group.IngestionManagerPandas") def test_ingest_with_profile_name( - ingestion_manager_init, sagemaker_session_mock, fs_runtime_client_config_mock + ingestion_manager_init, + sagemaker_session_mock, + fs_runtime_client_config_mock, + feature_group_describe_dummy_definitions, + feature_group_dummy_definition_dict ): sagemaker_session_mock.sagemaker_featurestore_runtime_client.meta.config = ( fs_runtime_client_config_mock ) + sagemaker_session_mock.describe_feature_group.return_value = { + 'FeatureDefinitions': feature_group_describe_dummy_definitions + } feature_group = FeatureGroup(name="MyGroup", sagemaker_session=sagemaker_session_mock) df = pd.DataFrame(dict((f"float{i}", pd.Series([2.0], dtype="float64")) for i in range(300))) @@ -697,6 +965,7 @@ def test_ingest_with_profile_name( ingestion_manager_init.assert_called_once_with( feature_group_name="MyGroup", + feature_definitions=feature_group_dummy_definition_dict, sagemaker_session=sagemaker_session_mock, sagemaker_fs_runtime_client_config=fs_runtime_client_config_mock, max_workers=10, @@ -704,12 +973,12 @@ def test_ingest_with_profile_name( profile_name="profile_name", ) mock_ingestion_manager_instance.run.assert_called_once_with( - data_frame=df, wait=True, timeout=None + data_frame=df, target_stores=None, wait=True, timeout=None ) def test_as_hive_ddl_with_default_values( - create_table_ddl, feature_group_dummy_definitions, sagemaker_session_mock + create_table_ddl, feature_group_dummy_definitions, sagemaker_session_mock ): sagemaker_session_mock.describe_feature_group.return_value = { "OfflineStoreConfig": { @@ -725,14 +994,14 @@ def test_as_hive_ddl_with_default_values( feature_group = FeatureGroup(name="MyGroup", sagemaker_session=sagemaker_session_mock) feature_group.feature_definitions = feature_group_dummy_definitions assert ( - create_table_ddl.format( - database="sagemaker_featurestore", - table_name="MyGroup", - account="1234", - region="us-west-2", - feature_group_name="MyGroup", - ) - == feature_group.as_hive_ddl() + create_table_ddl.format( + database="sagemaker_featurestore", + table_name="MyGroup", + account="1234", + region="us-west-2", + feature_group_name="MyGroup", + ) + == feature_group.as_hive_ddl() ) @@ -767,6 +1036,7 @@ def test_ingestion_manager_run_success(): df = pd.DataFrame({"float": pd.Series([2.0], dtype="float64")}) manager = IngestionManagerPandas( feature_group_name="MyGroup", + feature_definitions=feature_group_dummy_definition_dict, sagemaker_session=sagemaker_session_mock, sagemaker_fs_runtime_client_config=fs_runtime_client_config_mock, max_workers=10, @@ -781,11 +1051,12 @@ def test_ingestion_manager_run_success(): PicklableMock(return_value=[]), ) def test_ingestion_manager_run_multi_process_with_multi_thread_success( - fs_runtime_client_config_mock, + fs_runtime_client_config_mock, ): df = pd.DataFrame({"float": pd.Series([2.0], dtype="float64")}) manager = IngestionManagerPandas( feature_group_name="MyGroup", + feature_definitions=feature_group_dummy_definition_dict, sagemaker_session=sagemaker_session_mock, sagemaker_fs_runtime_client_config=fs_runtime_client_config_mock, max_workers=2, @@ -802,6 +1073,7 @@ def test_ingestion_manager_run_failure(): df = pd.DataFrame({"float": pd.Series([2.0], dtype="float64")}) manager = IngestionManagerPandas( feature_group_name="MyGroup", + feature_definitions=feature_group_dummy_definition_dict, sagemaker_session=sagemaker_session_mock, 
sagemaker_fs_runtime_client_config=fs_runtime_client_config_mock, max_workers=2, @@ -815,6 +1087,351 @@ def test_ingestion_manager_run_failure(): assert manager.failed_rows == [1, 1] +@patch( + "sagemaker.feature_store.feature_group.IngestionManagerPandas._ingest_row", + MagicMock(return_value=[1]), +) +def test_ingestion_manager_run_success(sagemaker_session_mock, fs_runtime_client_config_mock): + sagemaker_session_mock.sagemaker_featurestore_runtime_client = fs_runtime_client_config_mock + df = pd.DataFrame({"float": pd.Series([2.0], dtype="float64")}) + manager = IngestionManagerPandas( + feature_group_name="MyGroup", + feature_definitions=feature_group_dummy_definition_dict, + sagemaker_session=sagemaker_session_mock, + sagemaker_fs_runtime_client_config=fs_runtime_client_config_mock, + ) + + manager.run(df) + for row in df.itertuples(): + manager._ingest_row.assert_called_with( + data_frame=df, + target_stores=None, + row=row, + feature_group_name='MyGroup', + feature_definitions=feature_group_dummy_definition_dict, + sagemaker_fs_runtime_client=fs_runtime_client_config_mock, + failed_rows=[] + ) + + expected_invocation_count = 1 # Set your expected count + actual_invocation_count = len(manager._ingest_row.mock_calls) + assert actual_invocation_count == expected_invocation_count, \ + f"Expected {expected_invocation_count} calls, but got {actual_invocation_count}" + + +def test_ingestion_manager_run_standard( + sagemaker_session_mock, + fs_runtime_client_config_mock, + feature_group_dummy_definition_dict +): + sagemaker_session_mock.sagemaker_featurestore_runtime_client = fs_runtime_client_config_mock + df = pd.DataFrame(data={'feature1': [2.0, 3.0], 'feature2': [3, 4], 'feature3': ['abc', 'edf']}) + + manager = IngestionManagerPandas( + feature_group_name="MyGroup", + feature_definitions=feature_group_dummy_definition_dict, + sagemaker_session=sagemaker_session_mock, + sagemaker_fs_runtime_client_config=fs_runtime_client_config_mock, + ) + + manager.run(df) + + actual_put_record_calls = fs_runtime_client_config_mock.put_record.mock_calls + expected_put_record_calls = [ + call( + FeatureGroupName="MyGroup", + Record=[ + {'FeatureName': 'feature1', 'ValueAsString': '2.0'}, + {'FeatureName': 'feature2', 'ValueAsString': '3'}, + {'FeatureName': 'feature3', 'ValueAsString': 'abc'} + ] + ), + call( + FeatureGroupName="MyGroup", + Record=[ + {'FeatureName': 'feature1', 'ValueAsString': '3.0'}, + {'FeatureName': 'feature2', 'ValueAsString': '4'}, + {'FeatureName': 'feature3', 'ValueAsString': 'edf'} + ] + ), + ] + assert actual_put_record_calls == expected_put_record_calls, \ + f"Expected {expected_put_record_calls} calls, but got {actual_put_record_calls}" + + +def test_ingestion_manager_run_non_collection_type( + sagemaker_session_mock, + fs_runtime_client_config_mock, + feature_group_dummy_definition_dict, + data_frame_with_collection_type, + expected_standard_feature_definition_dict +): + sagemaker_session_mock.sagemaker_featurestore_runtime_client = fs_runtime_client_config_mock + manager = IngestionManagerPandas( + feature_group_name="MyGroup", + feature_definitions=expected_standard_feature_definition_dict, + sagemaker_session=sagemaker_session_mock, + sagemaker_fs_runtime_client_config=fs_runtime_client_config_mock, + ) + + manager.run(data_frame_with_collection_type) + + actual_put_record_calls = fs_runtime_client_config_mock.put_record.mock_calls + expected_put_record_calls = [ + call( + FeatureGroupName='MyGroup', + Record=[ + {'FeatureName': 'feature1', 'ValueAsString': '0.0'}, + 
{'FeatureName': 'feature2', 'ValueAsString': '0'}, + {'FeatureName': 'feature3', 'ValueAsString': '2020-10-30T03:43:21Z'}, + {'FeatureName': 'feature4', 'ValueAsString': '0.0'}, + {'FeatureName': 'feature5', 'ValueAsString': "['a', 'abc']"}, + {'FeatureName': 'feature6', 'ValueAsString': '[1, 2]'}, + {'FeatureName': 'feature7', 'ValueAsString': '[1.1, 2.3]'}, + {'FeatureName': 'feature8', 'ValueAsString': '[1, 2]'}, + {'FeatureName': 'feature9', 'ValueAsString': '[1.1, 2.3]'}, + {'FeatureName': 'feature10', 'ValueAsString': "['a', 'abc']"} + ] + ), + call( + FeatureGroupName='MyGroup', + Record=[ + {'FeatureName': 'feature1', 'ValueAsString': '1.0'}, + {'FeatureName': 'feature2', 'ValueAsString': '1'}, + {'FeatureName': 'feature3', 'ValueAsString': '2020-10-30T03:43:21Z'}, + {'FeatureName': 'feature4', 'ValueAsString': '1.0'}, + {'FeatureName': 'feature5', 'ValueAsString': "['b', 'c']"}, + {'FeatureName': 'feature6', 'ValueAsString': '[1, 2, 3]'}, + {'FeatureName': 'feature7', 'ValueAsString': '[1.4, 2.5, 3.2, 25]'}, + {'FeatureName': 'feature8', 'ValueAsString': '[1, 2, None]'}, + {'FeatureName': 'feature9', 'ValueAsString': '[1.4, 25, 3.2]'}, + {'FeatureName': 'feature10', 'ValueAsString': "['b', 'c']"} + ] + ), + call( + FeatureGroupName='MyGroup', + Record=[ + {'FeatureName': 'feature1', 'ValueAsString': '2.0'}, + {'FeatureName': 'feature2', 'ValueAsString': '2'}, + {'FeatureName': 'feature3', 'ValueAsString': '2020-10-30T03:43:21Z'}, + {'FeatureName': 'feature4', 'ValueAsString': '2.0'}, + {'FeatureName': 'feature5', 'ValueAsString': "['c', 'f']"}, + {'FeatureName': 'feature6', 'ValueAsString': '[1, 5]'}, + {'FeatureName': 'feature7', 'ValueAsString': '[1.0, 5.3]'}, + {'FeatureName': 'feature8', 'ValueAsString': '[1, 5]'}, + {'FeatureName': 'feature9', 'ValueAsString': '[1.0, 3, None]'}, + {'FeatureName': 'feature10', 'ValueAsString': "['c', None]"} + ] + ), + call( + FeatureGroupName='MyGroup', + Record=[ + {'FeatureName': 'feature1', 'ValueAsString': '3.0'}, + {'FeatureName': 'feature2', 'ValueAsString': '3'}, + {'FeatureName': 'feature3', 'ValueAsString': '2020-10-30T03:43:21Z'}, + {'FeatureName': 'feature4', 'ValueAsString': '3.0'}, + {'FeatureName': 'feature5', 'ValueAsString': "['d']"}, + {'FeatureName': 'feature6', 'ValueAsString': '[1]'}, + {'FeatureName': 'feature7', 'ValueAsString': '[1.2]'}, + {'FeatureName': 'feature8', 'ValueAsString': '[1]'}, + {'FeatureName': 'feature9', 'ValueAsString': '[1.2]'}, + {'FeatureName': 'feature10', 'ValueAsString': "['d']"} + ] + ), + call( + FeatureGroupName='MyGroup', + Record=[ + {'FeatureName': 'feature1', 'ValueAsString': '4.0'}, + {'FeatureName': 'feature2', 'ValueAsString': '4'}, + {'FeatureName': 'feature3', 'ValueAsString': '2020-10-30T03:43:21Z'}, + {'FeatureName': 'feature4', 'ValueAsString': '4.0'} + ] + ), + call( + FeatureGroupName='MyGroup', + Record=[ + {'FeatureName': 'feature1', 'ValueAsString': '5.0'}, + {'FeatureName': 'feature2', 'ValueAsString': '5'}, + {'FeatureName': 'feature3', 'ValueAsString': '2020-10-30T03:43:21Z'}, + {'FeatureName': 'feature8', 'ValueAsString': '[None]'}, + {'FeatureName': 'feature9', 'ValueAsString': '[None]'}, + {'FeatureName': 'feature10', 'ValueAsString': '[None]'} + ] + ), + call( + FeatureGroupName='MyGroup', + Record=[ + {'FeatureName': 'feature1', 'ValueAsString': '6.0'}, + {'FeatureName': 'feature2', 'ValueAsString': '6'}, + {'FeatureName': 'feature3', 'ValueAsString': '2020-10-30T03:43:21Z'} + ] + ), + call( + FeatureGroupName='MyGroup', + Record=[ + {'FeatureName': 'feature1', 
'ValueAsString': '7.0'}, + {'FeatureName': 'feature2', 'ValueAsString': '7'}, + {'FeatureName': 'feature3', 'ValueAsString': '2020-10-30T03:43:21Z'} + ] + ), + call( + FeatureGroupName='MyGroup', + Record=[ + {'FeatureName': 'feature1', 'ValueAsString': '8.0'}, + {'FeatureName': 'feature2', 'ValueAsString': '8'}, + {'FeatureName': 'feature3', 'ValueAsString': '2020-10-30T03:43:21Z'} + ] + ), + call( + FeatureGroupName='MyGroup', + Record=[ + {'FeatureName': 'feature1', 'ValueAsString': '9.0'}, + {'FeatureName': 'feature2', 'ValueAsString': '9'}, + {'FeatureName': 'feature3', 'ValueAsString': '2020-10-30T03:43:21Z'} + ] + ) + ] + assert actual_put_record_calls == expected_put_record_calls, \ + f"Expected {expected_put_record_calls} calls, but got {actual_put_record_calls}" + + +def test_ingestion_manager_run_collection_type( + sagemaker_session_mock, + fs_runtime_client_config_mock, + feature_group_dummy_definition_dict, + data_frame_with_collection_type, + expected_in_memory_feature_definition_dict +): + sagemaker_session_mock.sagemaker_featurestore_runtime_client = fs_runtime_client_config_mock + + manager = IngestionManagerPandas( + feature_group_name="MyGroup", + feature_definitions=expected_in_memory_feature_definition_dict, + sagemaker_session=sagemaker_session_mock, + sagemaker_fs_runtime_client_config=fs_runtime_client_config_mock, + ) + + manager.run(data_frame_with_collection_type) + + actual_put_record_calls = fs_runtime_client_config_mock.put_record.mock_calls + expected_put_record_calls = [ + call( + FeatureGroupName='MyGroup', + Record=[ + {'FeatureName': 'feature1', 'ValueAsString': '0.0'}, + {'FeatureName': 'feature2', 'ValueAsString': '0'}, + {'FeatureName': 'feature3', 'ValueAsString': '2020-10-30T03:43:21Z'}, + {'FeatureName': 'feature4', 'ValueAsString': '0.0'}, + {'FeatureName': 'feature5', 'ValueAsStringList': ['a', 'abc']}, + {'FeatureName': 'feature6', 'ValueAsStringList': ['1', '2']}, + {'FeatureName': 'feature7', 'ValueAsStringList': ['1.1', '2.3']}, + {'FeatureName': 'feature8', 'ValueAsStringList': ['1', '2']}, + {'FeatureName': 'feature9', 'ValueAsStringList': ['1.1', '2.3']}, + {'FeatureName': 'feature10', 'ValueAsStringList': ['a', 'abc']} + ] + ), + call( + FeatureGroupName='MyGroup', + Record=[ + {'FeatureName': 'feature1', 'ValueAsString': '1.0'}, + {'FeatureName': 'feature2', 'ValueAsString': '1'}, + {'FeatureName': 'feature3', 'ValueAsString': '2020-10-30T03:43:21Z'}, + {'FeatureName': 'feature4', 'ValueAsString': '1.0'}, + {'FeatureName': 'feature5', 'ValueAsStringList': ['b', 'c']}, + {'FeatureName': 'feature6', 'ValueAsStringList': ['1', '2', '3']}, + {'FeatureName': 'feature7', 'ValueAsStringList': ['1.4', '2.5', '3.2', '25']}, + {'FeatureName': 'feature8', 'ValueAsStringList': ['1', '2', None]}, + {'FeatureName': 'feature9', 'ValueAsStringList': ['1.4', '25', '3.2']}, + {'FeatureName': 'feature10', 'ValueAsStringList': ['b', 'c']} + ] + ), + call( + FeatureGroupName='MyGroup', + Record=[ + {'FeatureName': 'feature1', 'ValueAsString': '2.0'}, + {'FeatureName': 'feature2', 'ValueAsString': '2'}, + {'FeatureName': 'feature3', 'ValueAsString': '2020-10-30T03:43:21Z'}, + {'FeatureName': 'feature4', 'ValueAsString': '2.0'}, + {'FeatureName': 'feature5', 'ValueAsStringList': ['c', 'f']}, + {'FeatureName': 'feature6', 'ValueAsStringList': ['1', '5']}, + {'FeatureName': 'feature7', 'ValueAsStringList': ['1.0', '5.3']}, + {'FeatureName': 'feature8', 'ValueAsStringList': ['1', '5']}, + {'FeatureName': 'feature9', 'ValueAsStringList': ['1.0', '3', None]}, + 
{'FeatureName': 'feature10', 'ValueAsStringList': ['c', None]} + ] + ), + call( + FeatureGroupName='MyGroup', + Record=[ + {'FeatureName': 'feature1', 'ValueAsString': '3.0'}, + {'FeatureName': 'feature2', 'ValueAsString': '3'}, + {'FeatureName': 'feature3', 'ValueAsString': '2020-10-30T03:43:21Z'}, + {'FeatureName': 'feature4', 'ValueAsString': '3.0'}, + {'FeatureName': 'feature5', 'ValueAsStringList': ['d']}, + {'FeatureName': 'feature6', 'ValueAsStringList': ['1']}, + {'FeatureName': 'feature7', 'ValueAsStringList': ['1.2']}, + {'FeatureName': 'feature8', 'ValueAsStringList': ['1']}, + {'FeatureName': 'feature9', 'ValueAsStringList': ['1.2']}, + {'FeatureName': 'feature10', 'ValueAsStringList': ['d']} + ] + ), + call( + FeatureGroupName='MyGroup', + Record=[ + {'FeatureName': 'feature1', 'ValueAsString': '4.0'}, + {'FeatureName': 'feature2', 'ValueAsString': '4'}, + {'FeatureName': 'feature3', 'ValueAsString': '2020-10-30T03:43:21Z'}, + {'FeatureName': 'feature4', 'ValueAsString': '4.0'} + ] + ), + call( + FeatureGroupName='MyGroup', + Record=[ + {'FeatureName': 'feature1', 'ValueAsString': '5.0'}, + {'FeatureName': 'feature2', 'ValueAsString': '5'}, + {'FeatureName': 'feature3', 'ValueAsString': '2020-10-30T03:43:21Z'}, + {'FeatureName': 'feature8', 'ValueAsStringList': [None]}, + {'FeatureName': 'feature9', 'ValueAsStringList': [None]}, + {'FeatureName': 'feature10', 'ValueAsStringList': [None]} + ] + ), + call( + FeatureGroupName='MyGroup', + Record=[ + {'FeatureName': 'feature1', 'ValueAsString': '6.0'}, + {'FeatureName': 'feature2', 'ValueAsString': '6'}, + {'FeatureName': 'feature3', 'ValueAsString': '2020-10-30T03:43:21Z'} + ] + ), + call( + FeatureGroupName='MyGroup', + Record=[ + {'FeatureName': 'feature1', 'ValueAsString': '7.0'}, + {'FeatureName': 'feature2', 'ValueAsString': '7'}, + {'FeatureName': 'feature3', 'ValueAsString': '2020-10-30T03:43:21Z'} + ] + ), + call( + FeatureGroupName='MyGroup', + Record=[ + {'FeatureName': 'feature1', 'ValueAsString': '8.0'}, + {'FeatureName': 'feature2', 'ValueAsString': '8'}, + {'FeatureName': 'feature3', 'ValueAsString': '2020-10-30T03:43:21Z'} + ] + ), + call( + FeatureGroupName='MyGroup', + Record=[ + {'FeatureName': 'feature1', 'ValueAsString': '9.0'}, + {'FeatureName': 'feature2', 'ValueAsString': '9'}, + {'FeatureName': 'feature3', 'ValueAsString': '2020-10-30T03:43:21Z'} + ] + ) + ] + assert actual_put_record_calls == expected_put_record_calls, \ + f"Expected {expected_put_record_calls} calls, but got {actual_put_record_calls}" + + @patch( "sagemaker.feature_store.feature_group.IngestionManagerPandas._ingest_single_batch", MagicMock(side_effect=ProfileNotFound(profile="non_exist")), @@ -823,6 +1440,7 @@ def test_ingestion_manager_with_profile_name_run_failure(): df = pd.DataFrame({"float": pd.Series([2.0], dtype="float64")}) manager = IngestionManagerPandas( feature_group_name="MyGroup", + feature_definitions=feature_group_dummy_definition_dict, sagemaker_session=sagemaker_session_mock, sagemaker_fs_runtime_client_config=fs_runtime_client_config_mock, max_workers=1, @@ -843,6 +1461,7 @@ def test_ingestion_manager_run_multi_process_failure(): df = pd.DataFrame({"float": pd.Series([2.0], dtype="float64")}) manager = IngestionManagerPandas( feature_group_name="MyGroup", + feature_definitions=feature_group_dummy_definition_dict, sagemaker_session=None, sagemaker_fs_runtime_client_config=None, max_workers=2, diff --git a/tests/unit/test_session.py b/tests/unit/test_session.py index 93828d882f..308536c8d7 100644 --- 
a/tests/unit/test_session.py +++ b/tests/unit/test_session.py @@ -5261,6 +5261,58 @@ def test_list_feature_groups(sagemaker_session): ) +@pytest.fixture() +def sagemaker_session_with_featurestore_runtime_client(): + boto_mock = MagicMock(name="boto_session") + sagemaker_session = sagemaker.Session(boto_session=boto_mock, sagemaker_featurestore_runtime_client=MagicMock()) + return sagemaker_session + + +def test_feature_group_put_record(sagemaker_session_with_featurestore_runtime_client): + sagemaker_session_with_featurestore_runtime_client.put_record( + feature_group_name="MyFeatureGroup", + record=[{ + "FeatureName": "feature1", + "ValueAsString": "value1" + }] + ) + assert sagemaker_session_with_featurestore_runtime_client.sagemaker_featurestore_runtime_client.put_record.called_with( + FeatureGroupName="MyFeatureGroup", + record=[{ + "FeatureName": "feature1", + "ValueAsString": "value1" + }], + ) + + +def test_feature_group_put_record_with_ttl_and_target_stores(sagemaker_session_with_featurestore_runtime_client): + sagemaker_session_with_featurestore_runtime_client.put_record( + feature_group_name="MyFeatureGroup", + record=[{ + "FeatureName": "feature1", + "ValueAsString": "value1" + }], + ttl_duration={ + "Unit": "Seconds", + "Value": 123 + }, + target_stores=["OnlineStore", "OfflineStore"] + ) + assert (sagemaker_session_with_featurestore_runtime_client.sagemaker_featurestore_runtime_client + .put_record.called_with( + FeatureGroupName="MyFeatureGroup", + record=[{ + "FeatureName": "feature1", + "ValueAsString": "value1" + }], + target_stores=["OnlineStore", "OfflineStore"], + ttl_duration={ + "Unit": "Seconds", + "Value": 123 + } + )) + + def test_start_query_execution(sagemaker_session): athena_mock = Mock() sagemaker_session.boto_session.client( From 3451c330f0fa2cc271d085a283aaf1608cda2644 Mon Sep 17 00:00:00 2001 From: Suryansh Singh Date: Mon, 5 Feb 2024 14:14:29 -0800 Subject: [PATCH 71/76] Remove merge conflicts. 
--- tests/unit/sagemaker/image_uris/test_smp_v2.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/tests/unit/sagemaker/image_uris/test_smp_v2.py b/tests/unit/sagemaker/image_uris/test_smp_v2.py index 21b541771a..36accdebbb 100644 --- a/tests/unit/sagemaker/image_uris/test_smp_v2.py +++ b/tests/unit/sagemaker/image_uris/test_smp_v2.py @@ -34,13 +34,10 @@ def test_smp_v2(load_config): for py_version in PY_VERSIONS: for region in ACCOUNTS.keys(): for instance_type in CONTAINER_VERSIONS.keys(): -<<<<<<< HEAD cuda_vers = CONTAINER_VERSIONS[instance_type] if "2.1" in version: cuda_vers = "cu121" -======= ->>>>>>> staging/master-feature-store-collection-type uri = image_uris.get_training_image_uri( region, framework="pytorch", @@ -52,11 +49,7 @@ def test_smp_v2(load_config): expected = expected_uris.framework_uri( repo="smdistributed-modelparallel", fw_version=version, -<<<<<<< HEAD py_version=f"{py_version}-{cuda_vers}", -======= - py_version=f"{py_version}-{CONTAINER_VERSIONS[instance_type]}", ->>>>>>> staging/master-feature-store-collection-type processor=processor, region=region, account=ACCOUNTS[region], From e228b89a198ff077894d8d95a2366bbbbfe8049a Mon Sep 17 00:00:00 2001 From: Suryansh Singh Date: Mon, 5 Feb 2024 15:08:55 -0800 Subject: [PATCH 72/76] Update the feature definition type --- src/sagemaker/feature_store/feature_group.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/sagemaker/feature_store/feature_group.py b/src/sagemaker/feature_store/feature_group.py index 0896194662..f0a35be822 100644 --- a/src/sagemaker/feature_store/feature_group.py +++ b/src/sagemaker/feature_store/feature_group.py @@ -186,7 +186,7 @@ class IngestionManagerPandas: Attributes: feature_group_name (str): name of the Feature Group. - feature_definitions (Dict[str, FeatureDefinition]): dictionary of feature definitions. + feature_definitions (Dict[str, Dict[Any, Any]]): dictionary of feature definitions. where the key is the feature name and the value is the FeatureDefinition. The FeatureDefinition contains the data type of the feature. sagemaker_fs_runtime_client_config (Config): instance of the Config class @@ -215,7 +215,7 @@ class IngestionManagerPandas: def _ingest_single_batch( data_frame: DataFrame, feature_group_name: str, - feature_definitions: Dict[str, FeatureDefinition], + feature_definitions: Dict[str, Dict[Any, Any]], client_config: Config, start_index: int, end_index: int, @@ -227,7 +227,7 @@ def _ingest_single_batch( Args: data_frame (DataFrame): source DataFrame to be ingested. feature_group_name (str): name of the Feature Group. - feature_definitions (Dict[str, FeatureDefinition]): dictionary of feature definitions. + feature_definitions (Dict[str, Dict[Any, Any]]): dictionary of feature definitions. where the key is the feature name and the value is the FeatureDefinition. The FeatureDefinition contains the data type of the feature. client_config (Config): Configuration for the sagemaker feature store runtime @@ -318,7 +318,7 @@ def _ingest_row( data_frame (DataFrame): source DataFrame to be ingested. row (Iterable[tuple[Any, ...]]): current row that is being ingested feature_group_name (str): name of the Feature Group. - feature_definitions (Dict[str, FeatureDefinition]): dictionary of feature definitions. + feature_definitions (Dict[str, Dict[Any, Any]]): dictionary of feature definitions. where the key is the feature name and the value is the FeatureDefinition. The FeatureDefinition contains the data type of the feature. 
sagemaker_fs_runtime_client (Session): session instance to perform boto calls. @@ -361,7 +361,7 @@ def _is_feature_collection_type(feature_name: str, feature_definitions: Dict[str Args: feature_name (str): name of the feature. - feature_definitions (Dict[str, FeatureDefinition]): dictionary of feature definitions. + feature_definitions (Dict[str, Dict[Any, Any]]): dictionary of feature definitions. where the key is the feature name and the value is the FeatureDefinition. The FeatureDefinition contains the data type of the feature and the type of collection. If the feature is not a collection type, the value of the CollectionType attribute @@ -503,7 +503,7 @@ def init_worker(): def _run_multi_threaded( max_workers: int, feature_group_name: str, - feature_definitions: Dict[str, FeatureDefinition], + feature_definitions: Dict[str, Dict[Any, Any]], sagemaker_fs_runtime_client_config: Config, data_frame: DataFrame, target_stores: Sequence[TargetStoreEnum] = None, From e7c6861e38895a31ef76481023d3b8ffaf8a8446 Mon Sep 17 00:00:00 2001 From: Suryansh Singh Date: Mon, 5 Feb 2024 19:18:47 -0800 Subject: [PATCH 73/76] Black formatting --- src/sagemaker/feature_store/feature_group.py | 222 +++--- src/sagemaker/session.py | 8 +- tests/integ/test_feature_store.py | 676 ++++++++++-------- .../feature_store/test_feature_group.py | 673 +++++++++-------- tests/unit/test_session.py | 45 +- 5 files changed, 883 insertions(+), 741 deletions(-) diff --git a/src/sagemaker/feature_store/feature_group.py b/src/sagemaker/feature_store/feature_group.py index f0a35be822..800cc2dff9 100644 --- a/src/sagemaker/feature_store/feature_group.py +++ b/src/sagemaker/feature_store/feature_group.py @@ -99,7 +99,7 @@ class AthenaQuery: _result_file_prefix: str = attr.ib(init=False, default=None) def run( - self, query_string: str, output_location: str, kms_key: str = None, workgroup: str = None + self, query_string: str, output_location: str, kms_key: str = None, workgroup: str = None ) -> str: """Execute a SQL query given a query string, output location and kms key. @@ -213,14 +213,14 @@ class IngestionManagerPandas: @staticmethod def _ingest_single_batch( - data_frame: DataFrame, - feature_group_name: str, - feature_definitions: Dict[str, Dict[Any, Any]], - client_config: Config, - start_index: int, - end_index: int, - target_stores: Sequence[TargetStoreEnum] = None, - profile_name: str = None, + data_frame: DataFrame, + feature_group_name: str, + feature_definitions: Dict[str, Dict[Any, Any]], + client_config: Config, + start_index: int, + end_index: int, + target_stores: Sequence[TargetStoreEnum] = None, + profile_name: str = None, ) -> List[int]: """Ingest a single batch of DataFrame rows into FeatureStore. @@ -304,13 +304,13 @@ def wait(self, timeout=None): @staticmethod def _ingest_row( - data_frame: DataFrame, - row: Iterable[tuple[Any, ...]], - feature_group_name: str, - feature_definitions: Dict[str, Dict[Any, Any]], - sagemaker_fs_runtime_client: Session, - failed_rows: List[int], - target_stores: Sequence[TargetStoreEnum] = None, + data_frame: DataFrame, + row: Iterable[tuple[Any, ...]], + feature_group_name: str, + feature_definitions: Dict[str, Dict[Any, Any]], + sagemaker_fs_runtime_client: Session, + failed_rows: List[int], + target_stores: Sequence[TargetStoreEnum] = None, ): """Ingest a single Dataframe row into FeatureStore. 
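To make the type change above concrete, feature_definitions is now keyed by feature name and holds the raw FeatureDefinition dicts returned by DescribeFeatureGroup rather than FeatureDefinition objects. A minimal sketch of the expected shape (values mirror the fixtures used in the unit tests later in this series):

# Illustrative example of the feature_definitions mapping; keys are feature
# names, values are raw FeatureDefinition dicts as returned by the service.
feature_definitions = {
    "feature1": {"FeatureName": "feature1", "FeatureType": "Fractional"},
    "feature5": {
        "FeatureName": "feature5",
        "FeatureType": "String",
        "CollectionType": "List",  # present only for collection-type features
    },
}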
@@ -333,22 +333,29 @@ def _ingest_row( record = [ FeatureValue( feature_name=data_frame.columns[index - 1], - value_as_string_list=IngestionManagerPandas._covert_feature_value_to_string_list(row[index]), - ) if IngestionManagerPandas._is_feature_collection_type( - feature_name=data_frame.columns[index - 1], feature_definitions=feature_definitions) - else FeatureValue( + value_as_string_list=IngestionManagerPandas._covert_feature_value_to_string_list( + row[index] + ), + ) + if IngestionManagerPandas._is_feature_collection_type( feature_name=data_frame.columns[index - 1], - value_as_string=str(row[index])) + feature_definitions=feature_definitions, + ) + else FeatureValue( + feature_name=data_frame.columns[index - 1], value_as_string=str(row[index]) + ) for index in range(1, len(row)) if IngestionManagerPandas._feature_value_is_not_none(feature_value=row[index]) ] put_record_params = { - 'FeatureGroupName': feature_group_name, - 'Record': [value.to_dict() for value in record], + "FeatureGroupName": feature_group_name, + "Record": [value.to_dict() for value in record], } if target_stores: - put_record_params['TargetStores'] = [target_store.value for target_store in target_stores] + put_record_params["TargetStores"] = [ + target_store.value for target_store in target_stores + ] sagemaker_fs_runtime_client.put_record(**put_record_params) except Exception as e: # pylint: disable=broad-except @@ -356,7 +363,9 @@ def _ingest_row( failed_rows.append(row[0]) @staticmethod - def _is_feature_collection_type(feature_name: str, feature_definitions: Dict[str, Dict[Any, Any]]): + def _is_feature_collection_type( + feature_name: str, feature_definitions: Dict[str, Dict[Any, Any]] + ): """Check if the feature is a collection type. Args: @@ -372,11 +381,11 @@ def _is_feature_collection_type(feature_name: str, feature_definitions: Dict[str """ feature_definition = feature_definitions.get(feature_name) if feature_definition is not None: - return feature_definition.get('CollectionType') is not None + return feature_definition.get("CollectionType") is not None @staticmethod def _feature_value_is_not_none( - feature_value: Any, + feature_value: Any, ): """Check if the feature value is not None. @@ -411,14 +420,15 @@ def _covert_feature_value_to_string_list(feature_value: List[Any]): List[str]: list of strings. """ if not is_list_like(feature_value): - raise ValueError(f"Invalid feature value, feature value: {feature_value} for a collection type feature" - f" must be an Array, but instead was {type(feature_value)}") - return [ - str(value) if value is not None else None - for value in feature_value - ] + raise ValueError( + f"Invalid feature value, feature value: {feature_value} for a collection type feature" + f" must be an Array, but instead was {type(feature_value)}" + ) + return [str(value) if value is not None else None for value in feature_value] - def _run_single_process_single_thread(self, data_frame: DataFrame, target_stores: Sequence[TargetStoreEnum] = None): + def _run_single_process_single_thread( + self, data_frame: DataFrame, target_stores: Sequence[TargetStoreEnum] = None + ): """Ingest utilizing a single process and a single thread. 
Args: @@ -448,11 +458,11 @@ def _run_single_process_single_thread(self, data_frame: DataFrame, target_stores ) def _run_multi_process( - self, - data_frame: DataFrame, - target_stores: Sequence[TargetStoreEnum] = None, - wait=True, - timeout=None + self, + data_frame: DataFrame, + target_stores: Sequence[TargetStoreEnum] = None, + wait=True, + timeout=None, ): """Start the ingestion process with the specified number of processes. @@ -501,15 +511,15 @@ def init_worker(): @staticmethod def _run_multi_threaded( - max_workers: int, - feature_group_name: str, - feature_definitions: Dict[str, Dict[Any, Any]], - sagemaker_fs_runtime_client_config: Config, - data_frame: DataFrame, - target_stores: Sequence[TargetStoreEnum] = None, - row_offset=0, - timeout=None, - profile_name=None, + max_workers: int, + feature_group_name: str, + feature_definitions: Dict[str, Dict[Any, Any]], + sagemaker_fs_runtime_client_config: Config, + data_frame: DataFrame, + target_stores: Sequence[TargetStoreEnum] = None, + row_offset=0, + timeout=None, + profile_name=None, ) -> List[int]: """Start the ingestion process. @@ -563,7 +573,13 @@ def _run_multi_threaded( return failed_indices - def run(self, data_frame: DataFrame, target_stores: Sequence[TargetStoreEnum] = None, wait=True, timeout=None): + def run( + self, + data_frame: DataFrame, + target_stores: Sequence[TargetStoreEnum] = None, + wait=True, + timeout=None, + ): """Start the ingestion process. Args: @@ -575,9 +591,13 @@ def run(self, data_frame: DataFrame, target_stores: Sequence[TargetStoreEnum] = if timeout is reached. """ if self.max_workers == 1 and self.max_processes == 1 and self.profile_name is None: - self._run_single_process_single_thread(data_frame=data_frame, target_stores=target_stores) + self._run_single_process_single_thread( + data_frame=data_frame, target_stores=target_stores + ) else: - self._run_multi_process(data_frame=data_frame, target_stores=target_stores, wait=wait, timeout=timeout) + self._run_multi_process( + data_frame=data_frame, target_stores=target_stores, wait=wait, timeout=timeout + ) class IngestionError(Exception): @@ -815,11 +835,11 @@ def update( ) def update_feature_metadata( - self, - feature_name: str, - description: str = None, - parameter_additions: Sequence[FeatureParameter] = None, - parameter_removals: Sequence[str] = None, + self, + feature_name: str, + description: str = None, + parameter_additions: Sequence[FeatureParameter] = None, + parameter_removals: Sequence[str] = None, ) -> Dict[str, Any]: """Update a feature metadata and add/remove metadata. @@ -904,22 +924,28 @@ def _determine_collection_list_type(series: Series) -> FeatureTypeEnum | None: feature type. 
""" - if (series.apply(lambda lst: - all(isinstance(x, int) or pd.isna(x) for x in lst) if is_list_like(lst) else True) - .all()): + if series.apply( + lambda lst: all(isinstance(x, int) or pd.isna(x) for x in lst) + if is_list_like(lst) + else True + ).all(): return FeatureTypeEnum.INTEGRAL - if (series.apply(lambda lst: - all(isinstance(x, (float, int)) or pd.isna(x) for x in lst) if is_list_like(lst) else True) - .all()): + if series.apply( + lambda lst: all(isinstance(x, (float, int)) or pd.isna(x) for x in lst) + if is_list_like(lst) + else True + ).all(): return FeatureTypeEnum.FRACTIONAL - if (series.apply(lambda lst: - all(isinstance(x, str) or pd.isna(x) for x in lst) if is_list_like(lst) else True) - .all()): + if series.apply( + lambda lst: all(isinstance(x, str) or pd.isna(x) for x in lst) + if is_list_like(lst) + else True + ).all(): return FeatureTypeEnum.STRING return None def _generate_feature_definition( - self, series: Series, online_storage_type: OnlineStoreStorageTypeEnum + self, series: Series, online_storage_type: OnlineStoreStorageTypeEnum ) -> FeatureDefinition: """Generate feature definition from the Panda Series. @@ -933,11 +959,11 @@ def _generate_feature_definition( dtype = str(series.dtype).lower() if ( - online_storage_type - and online_storage_type == OnlineStoreStorageTypeEnum.IN_MEMORY - and dtype == "object" - and pd.notna(series.head(1000)).any() - and series.head(1000).apply(FeatureGroup._check_list_type).all() + online_storage_type + and online_storage_type == OnlineStoreStorageTypeEnum.IN_MEMORY + and dtype == "object" + and pd.notna(series.head(1000)).any() + and series.head(1000).apply(FeatureGroup._check_list_type).all() ): params["collection_type"] = ListCollectionType() params["feature_type"] = FeatureGroup._determine_collection_list_type(series.head(1000)) @@ -946,8 +972,7 @@ def _generate_feature_definition( if params["feature_type"] is None: raise ValueError( - f"Failed to infer Feature type based on dtype {dtype} " - f"for column {series.name}." + f"Failed to infer Feature type based on dtype {dtype} " f"for column {series.name}." ) feature_definition = FeatureDefinition(**params) @@ -955,9 +980,7 @@ def _generate_feature_definition( return feature_definition def load_feature_definitions( - self, - data_frame: DataFrame, - online_storage_type: OnlineStoreStorageTypeEnum = None + self, data_frame: DataFrame, online_storage_type: OnlineStoreStorageTypeEnum = None ) -> Sequence[FeatureDefinition]: """Load feature definitions from a Pandas DataFrame. 
@@ -990,15 +1013,17 @@ def load_feature_definitions( """ feature_definitions = [] for column in data_frame: - feature_definition = self._generate_feature_definition(data_frame[column], online_storage_type) + feature_definition = self._generate_feature_definition( + data_frame[column], online_storage_type + ) feature_definitions.append(feature_definition) self.feature_definitions = feature_definitions return self.feature_definitions def get_record( - self, - record_identifier_value_as_string: str, - feature_names: Sequence[str] = None, + self, + record_identifier_value_as_string: str, + feature_names: Sequence[str] = None, ) -> Sequence[Dict[str, str]]: """Get a single record in a FeatureGroup @@ -1015,10 +1040,11 @@ def get_record( ).get("Record") def put_record( - self, - record: Sequence[FeatureValue], - target_stores: Sequence[TargetStoreEnum] = None, - ttl_duration: TtlDuration = None): + self, + record: Sequence[FeatureValue], + target_stores: Sequence[TargetStoreEnum] = None, + ttl_duration: TtlDuration = None, + ): """Put a single record in the FeatureGroup. Args: @@ -1030,15 +1056,17 @@ def put_record( return self.sagemaker_session.put_record( feature_group_name=self.name, record=[value.to_dict() for value in record], - target_stores=[target_store.value for target_store in target_stores] if target_stores else None, + target_stores=[target_store.value for target_store in target_stores] + if target_stores + else None, ttl_duration=ttl_duration.to_dict() if ttl_duration is not None else None, ) def delete_record( - self, - record_identifier_value_as_string: str, - event_time: str, - deletion_mode: DeletionModeEnum = DeletionModeEnum.SOFT_DELETE, + self, + record_identifier_value_as_string: str, + event_time: str, + deletion_mode: DeletionModeEnum = DeletionModeEnum.SOFT_DELETE, ): """Delete a single record from a FeatureGroup. @@ -1059,14 +1087,14 @@ def delete_record( ) def ingest( - self, - data_frame: DataFrame, - target_stores: Sequence[TargetStoreEnum] = None, - max_workers: int = 1, - max_processes: int = 1, - wait: bool = True, - timeout: Union[int, float] = None, - profile_name: str = None, + self, + data_frame: DataFrame, + target_stores: Sequence[TargetStoreEnum] = None, + max_workers: int = 1, + max_processes: int = 1, + wait: bool = True, + timeout: Union[int, float] = None, + profile_name: str = None, ) -> IngestionManagerPandas: """Ingest the content of a pandas DataFrame to feature store. 
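Continuing the sketch above, the new target_stores and ttl_duration arguments can be exercised as follows (the TtlDuration constructor keywords are assumed from sagemaker.feature_store.inputs; this is illustrative, not part of the patch):

from sagemaker.feature_store.inputs import FeatureValue, TargetStoreEnum, TtlDuration

# Write one record to the online store only, expiring after 123 seconds.
fg.put_record(
    record=[
        FeatureValue(feature_name="feature1", value_as_string="100.0"),
        FeatureValue(feature_name="feature3", value_as_string="2020-10-30T03:43:21Z"),
    ],
    target_stores=[TargetStoreEnum.ONLINE_STORE],
    ttl_duration=TtlDuration(unit="Seconds", value=123),
)

# Bulk-ingest the DataFrame to the offline store only.
fg.ingest(
    data_frame=df,
    target_stores=[TargetStoreEnum.OFFLINE_STORE],
    max_workers=3,
    max_processes=2,
    wait=True,
)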
diff --git a/src/sagemaker/session.py b/src/sagemaker/session.py index 869a7888b1..ff5a82a902 100644 --- a/src/sagemaker/session.py +++ b/src/sagemaker/session.py @@ -6009,15 +6009,15 @@ def put_record( """ params = { - 'FeatureGroupName': feature_group_name, - 'Record': record, + "FeatureGroupName": feature_group_name, + "Record": record, } if ttl_duration: - params['TtlDuration'] = ttl_duration + params["TtlDuration"] = ttl_duration if target_stores: - params['TargetStores'] = target_stores + params["TargetStores"] = target_stores return self.sagemaker_featurestore_runtime_client.put_record(**params) diff --git a/tests/integ/test_feature_store.py b/tests/integ/test_feature_store.py index 96e31967b2..ec51205dcd 100644 --- a/tests/integ/test_feature_store.py +++ b/tests/integ/test_feature_store.py @@ -30,9 +30,7 @@ StringFeatureDefinition, ListCollectionType, ) -from sagemaker.feature_store.feature_group import ( - FeatureGroup, - IngestionError) +from sagemaker.feature_store.feature_group import FeatureGroup, IngestionError from sagemaker.feature_store.feature_store import FeatureStore from sagemaker.feature_store.inputs import ( FeatureValue, @@ -140,64 +138,65 @@ def pandas_data_frame(): ) return df + @pytest.fixture def get_record_results_for_data_frame(): return { "0.0": [ - {'FeatureName': 'feature1', 'ValueAsString': '0.0'}, - {'FeatureName': 'feature2', 'ValueAsString': '0'}, - {'FeatureName': 'feature3', 'ValueAsString': '2020-10-30T03:43:21Z'}, - {'FeatureName': 'feature4', 'ValueAsString': '0.0'}, + {"FeatureName": "feature1", "ValueAsString": "0.0"}, + {"FeatureName": "feature2", "ValueAsString": "0"}, + {"FeatureName": "feature3", "ValueAsString": "2020-10-30T03:43:21Z"}, + {"FeatureName": "feature4", "ValueAsString": "0.0"}, ], "1.0": [ - {'FeatureName': 'feature1', 'ValueAsString': '1.0'}, - {'FeatureName': 'feature2', 'ValueAsString': '1'}, - {'FeatureName': 'feature3', 'ValueAsString': '2020-10-30T03:43:21Z'}, - {'FeatureName': 'feature4', 'ValueAsString': '1.0'}, + {"FeatureName": "feature1", "ValueAsString": "1.0"}, + {"FeatureName": "feature2", "ValueAsString": "1"}, + {"FeatureName": "feature3", "ValueAsString": "2020-10-30T03:43:21Z"}, + {"FeatureName": "feature4", "ValueAsString": "1.0"}, ], "2.0": [ - {'FeatureName': 'feature1', 'ValueAsString': '2.0'}, - {'FeatureName': 'feature2', 'ValueAsString': '2'}, - {'FeatureName': 'feature3', 'ValueAsString': '2020-10-30T03:43:21Z'}, - {'FeatureName': 'feature4', 'ValueAsString': '2.0'}, + {"FeatureName": "feature1", "ValueAsString": "2.0"}, + {"FeatureName": "feature2", "ValueAsString": "2"}, + {"FeatureName": "feature3", "ValueAsString": "2020-10-30T03:43:21Z"}, + {"FeatureName": "feature4", "ValueAsString": "2.0"}, ], "3.0": [ - {'FeatureName': 'feature1', 'ValueAsString': '3.0'}, - {'FeatureName': 'feature2', 'ValueAsString': '3'}, - {'FeatureName': 'feature3', 'ValueAsString': '2020-10-30T03:43:21Z'}, - {'FeatureName': 'feature4', 'ValueAsString': '3.0'} + {"FeatureName": "feature1", "ValueAsString": "3.0"}, + {"FeatureName": "feature2", "ValueAsString": "3"}, + {"FeatureName": "feature3", "ValueAsString": "2020-10-30T03:43:21Z"}, + {"FeatureName": "feature4", "ValueAsString": "3.0"}, ], "4.0": [ - {'FeatureName': 'feature1', 'ValueAsString': '4.0'}, - {'FeatureName': 'feature2', 'ValueAsString': '4'}, - {'FeatureName': 'feature3', 'ValueAsString': '2020-10-30T03:43:21Z'}, - {'FeatureName': 'feature4', 'ValueAsString': '4.0'} + {"FeatureName": "feature1", "ValueAsString": "4.0"}, + {"FeatureName": "feature2", 
"ValueAsString": "4"}, + {"FeatureName": "feature3", "ValueAsString": "2020-10-30T03:43:21Z"}, + {"FeatureName": "feature4", "ValueAsString": "4.0"}, ], "5.0": [ - {'FeatureName': 'feature1', 'ValueAsString': '5.0'}, - {'FeatureName': 'feature2', 'ValueAsString': '5'}, - {'FeatureName': 'feature3', 'ValueAsString': '2020-10-30T03:43:21Z'} + {"FeatureName": "feature1", "ValueAsString": "5.0"}, + {"FeatureName": "feature2", "ValueAsString": "5"}, + {"FeatureName": "feature3", "ValueAsString": "2020-10-30T03:43:21Z"}, ], "6.0": [ - {'FeatureName': 'feature1', 'ValueAsString': '6.0'}, - {'FeatureName': 'feature2', 'ValueAsString': '6'}, - {'FeatureName': 'feature3', 'ValueAsString': '2020-10-30T03:43:21Z'} + {"FeatureName": "feature1", "ValueAsString": "6.0"}, + {"FeatureName": "feature2", "ValueAsString": "6"}, + {"FeatureName": "feature3", "ValueAsString": "2020-10-30T03:43:21Z"}, ], "7.0": [ - {'FeatureName': 'feature1', 'ValueAsString': '7.0'}, - {'FeatureName': 'feature2', 'ValueAsString': '7'}, - {'FeatureName': 'feature3', 'ValueAsString': '2020-10-30T03:43:21Z'} + {"FeatureName": "feature1", "ValueAsString": "7.0"}, + {"FeatureName": "feature2", "ValueAsString": "7"}, + {"FeatureName": "feature3", "ValueAsString": "2020-10-30T03:43:21Z"}, ], "8.0": [ - {'FeatureName': 'feature1', 'ValueAsString': '8.0'}, - {'FeatureName': 'feature2', 'ValueAsString': '8'}, - {'FeatureName': 'feature3', 'ValueAsString': '2020-10-30T03:43:21Z'} + {"FeatureName": "feature1", "ValueAsString": "8.0"}, + {"FeatureName": "feature2", "ValueAsString": "8"}, + {"FeatureName": "feature3", "ValueAsString": "2020-10-30T03:43:21Z"}, ], "9.0": [ - {'FeatureName': 'feature1', 'ValueAsString': '9.0'}, - {'FeatureName': 'feature2', 'ValueAsString': '9'}, - {'FeatureName': 'feature3', 'ValueAsString': '2020-10-30T03:43:21Z'} - ] + {"FeatureName": "feature1", "ValueAsString": "9.0"}, + {"FeatureName": "feature2", "ValueAsString": "9"}, + {"FeatureName": "feature3", "ValueAsString": "2020-10-30T03:43:21Z"}, + ], } @@ -209,98 +208,107 @@ def pandas_data_frame_with_collection_type(): "feature2": pd.Series(np.arange(10), dtype="int64"), "feature3": pd.Series(["2020-10-30T03:43:21Z"] * 10, dtype="string"), "feature4": pd.Series(np.arange(5.0), dtype="float64"), # contains nan - "feature5": pd.Series([["a", "abc"], ["b", "c"], ["c", "f"], ["d"], []], dtype="object"), + "feature5": pd.Series( + [["a", "abc"], ["b", "c"], ["c", "f"], ["d"], []], dtype="object" + ), "feature6": pd.Series([[1, 2], [1, 2, 3], [1, 5], [1], []], dtype="object"), - "feature7": pd.Series([[1.1, 2.3], [1.4, 2.5, 3.2, 25], [1.0, 5.3], [1.2], []], dtype="object"), + "feature7": pd.Series( + [[1.1, 2.3], [1.4, 2.5, 3.2, 25], [1.0, 5.3], [1.2], []], dtype="object" + ), "feature8": pd.Series([[1, 2], [1, 2, 3], [1, 5], [1], [], []], dtype="object"), - "feature9": pd.Series([[1.1, 2.3], [1.4, 25, 3.2], [1.0, 3, 4], [1.2], []], dtype="object"), - "feature10": pd.Series([["a", "abc"], ["b", "c"], ["c", "None"], ["d"], []], dtype="object"), + "feature9": pd.Series( + [[1.1, 2.3], [1.4, 25, 3.2], [1.0, 3, 4], [1.2], []], dtype="object" + ), + "feature10": pd.Series( + [["a", "abc"], ["b", "c"], ["c", "None"], ["d"], []], dtype="object" + ), } ) return df + @pytest.fixture def get_record_results_for_data_frame_with_collection_type(): return { "0.0": [ - {'FeatureName': 'feature1', 'ValueAsString': '0.0'}, - {'FeatureName': 'feature2', 'ValueAsString': '0'}, - {'FeatureName': 'feature3', 'ValueAsString': '2020-10-30T03:43:21Z'}, - {'FeatureName': 'feature4', 
'ValueAsString': '0.0'}, - {'FeatureName': 'feature5', 'ValueAsStringList': ['a', 'abc']}, - {'FeatureName': 'feature6', 'ValueAsStringList': ['1', '2']}, - {'FeatureName': 'feature7', 'ValueAsStringList': ['1.1', '2.3']}, - {'FeatureName': 'feature8', 'ValueAsStringList': ['1', '2']}, - {'FeatureName': 'feature9', 'ValueAsStringList': ['1.1', '2.3']}, - {'FeatureName': 'feature10', 'ValueAsStringList': ['a', 'abc']} + {"FeatureName": "feature1", "ValueAsString": "0.0"}, + {"FeatureName": "feature2", "ValueAsString": "0"}, + {"FeatureName": "feature3", "ValueAsString": "2020-10-30T03:43:21Z"}, + {"FeatureName": "feature4", "ValueAsString": "0.0"}, + {"FeatureName": "feature5", "ValueAsStringList": ["a", "abc"]}, + {"FeatureName": "feature6", "ValueAsStringList": ["1", "2"]}, + {"FeatureName": "feature7", "ValueAsStringList": ["1.1", "2.3"]}, + {"FeatureName": "feature8", "ValueAsStringList": ["1", "2"]}, + {"FeatureName": "feature9", "ValueAsStringList": ["1.1", "2.3"]}, + {"FeatureName": "feature10", "ValueAsStringList": ["a", "abc"]}, ], "1.0": [ - {'FeatureName': 'feature1', 'ValueAsString': '1.0'}, - {'FeatureName': 'feature2', 'ValueAsString': '1'}, - {'FeatureName': 'feature3', 'ValueAsString': '2020-10-30T03:43:21Z'}, - {'FeatureName': 'feature4', 'ValueAsString': '1.0'}, - {'FeatureName': 'feature5', 'ValueAsStringList': ['b', 'c']}, - {'FeatureName': 'feature6', 'ValueAsStringList': ['1', '2', '3']}, - {'FeatureName': 'feature7', 'ValueAsStringList': ['1.4', '2.5', '3.2', '25']}, - {'FeatureName': 'feature8', 'ValueAsStringList': ['1', '2', '3']}, - {'FeatureName': 'feature9', 'ValueAsStringList': ['1.4', '25', '3.2']}, - {'FeatureName': 'feature10', 'ValueAsStringList': ['b', 'c']} + {"FeatureName": "feature1", "ValueAsString": "1.0"}, + {"FeatureName": "feature2", "ValueAsString": "1"}, + {"FeatureName": "feature3", "ValueAsString": "2020-10-30T03:43:21Z"}, + {"FeatureName": "feature4", "ValueAsString": "1.0"}, + {"FeatureName": "feature5", "ValueAsStringList": ["b", "c"]}, + {"FeatureName": "feature6", "ValueAsStringList": ["1", "2", "3"]}, + {"FeatureName": "feature7", "ValueAsStringList": ["1.4", "2.5", "3.2", "25"]}, + {"FeatureName": "feature8", "ValueAsStringList": ["1", "2", "3"]}, + {"FeatureName": "feature9", "ValueAsStringList": ["1.4", "25", "3.2"]}, + {"FeatureName": "feature10", "ValueAsStringList": ["b", "c"]}, ], "2.0": [ - {'FeatureName': 'feature1', 'ValueAsString': '2.0'}, - {'FeatureName': 'feature2', 'ValueAsString': '2'}, - {'FeatureName': 'feature3', 'ValueAsString': '2020-10-30T03:43:21Z'}, - {'FeatureName': 'feature4', 'ValueAsString': '2.0'}, - {'FeatureName': 'feature5', 'ValueAsStringList': ['c', 'f']}, - {'FeatureName': 'feature6', 'ValueAsStringList': ['1', '5']}, - {'FeatureName': 'feature7', 'ValueAsStringList': ['1.0', '5.3']}, - {'FeatureName': 'feature8', 'ValueAsStringList': ['1', '5']}, - {'FeatureName': 'feature9', 'ValueAsStringList': ['1.0', '3', '4']}, - {'FeatureName': 'feature10', 'ValueAsStringList': ['c', 'None']} + {"FeatureName": "feature1", "ValueAsString": "2.0"}, + {"FeatureName": "feature2", "ValueAsString": "2"}, + {"FeatureName": "feature3", "ValueAsString": "2020-10-30T03:43:21Z"}, + {"FeatureName": "feature4", "ValueAsString": "2.0"}, + {"FeatureName": "feature5", "ValueAsStringList": ["c", "f"]}, + {"FeatureName": "feature6", "ValueAsStringList": ["1", "5"]}, + {"FeatureName": "feature7", "ValueAsStringList": ["1.0", "5.3"]}, + {"FeatureName": "feature8", "ValueAsStringList": ["1", "5"]}, + {"FeatureName": "feature9", 
"ValueAsStringList": ["1.0", "3", "4"]}, + {"FeatureName": "feature10", "ValueAsStringList": ["c", "None"]}, ], "3.0": [ - {'FeatureName': 'feature1', 'ValueAsString': '3.0'}, - {'FeatureName': 'feature2', 'ValueAsString': '3'}, - {'FeatureName': 'feature3', 'ValueAsString': '2020-10-30T03:43:21Z'}, - {'FeatureName': 'feature4', 'ValueAsString': '3.0'}, - {'FeatureName': 'feature5', 'ValueAsStringList': ['d']}, - {'FeatureName': 'feature6', 'ValueAsStringList': ['1']}, - {'FeatureName': 'feature7', 'ValueAsStringList': ['1.2']}, - {'FeatureName': 'feature8', 'ValueAsStringList': ['1']}, - {'FeatureName': 'feature9', 'ValueAsStringList': ['1.2']}, - {'FeatureName': 'feature10', 'ValueAsStringList': ['d']} + {"FeatureName": "feature1", "ValueAsString": "3.0"}, + {"FeatureName": "feature2", "ValueAsString": "3"}, + {"FeatureName": "feature3", "ValueAsString": "2020-10-30T03:43:21Z"}, + {"FeatureName": "feature4", "ValueAsString": "3.0"}, + {"FeatureName": "feature5", "ValueAsStringList": ["d"]}, + {"FeatureName": "feature6", "ValueAsStringList": ["1"]}, + {"FeatureName": "feature7", "ValueAsStringList": ["1.2"]}, + {"FeatureName": "feature8", "ValueAsStringList": ["1"]}, + {"FeatureName": "feature9", "ValueAsStringList": ["1.2"]}, + {"FeatureName": "feature10", "ValueAsStringList": ["d"]}, ], "4.0": [ - {'FeatureName': 'feature1', 'ValueAsString': '4.0'}, - {'FeatureName': 'feature2', 'ValueAsString': '4'}, - {'FeatureName': 'feature3', 'ValueAsString': '2020-10-30T03:43:21Z'}, - {'FeatureName': 'feature4', 'ValueAsString': '4.0'} + {"FeatureName": "feature1", "ValueAsString": "4.0"}, + {"FeatureName": "feature2", "ValueAsString": "4"}, + {"FeatureName": "feature3", "ValueAsString": "2020-10-30T03:43:21Z"}, + {"FeatureName": "feature4", "ValueAsString": "4.0"}, ], "5.0": [ - {'FeatureName': 'feature1', 'ValueAsString': '5.0'}, - {'FeatureName': 'feature2', 'ValueAsString': '5'}, - {'FeatureName': 'feature3', 'ValueAsString': '2020-10-30T03:43:21Z'} + {"FeatureName": "feature1", "ValueAsString": "5.0"}, + {"FeatureName": "feature2", "ValueAsString": "5"}, + {"FeatureName": "feature3", "ValueAsString": "2020-10-30T03:43:21Z"}, ], "6.0": [ - {'FeatureName': 'feature1', 'ValueAsString': '6.0'}, - {'FeatureName': 'feature2', 'ValueAsString': '6'}, - {'FeatureName': 'feature3', 'ValueAsString': '2020-10-30T03:43:21Z'} + {"FeatureName": "feature1", "ValueAsString": "6.0"}, + {"FeatureName": "feature2", "ValueAsString": "6"}, + {"FeatureName": "feature3", "ValueAsString": "2020-10-30T03:43:21Z"}, ], "7.0": [ - {'FeatureName': 'feature1', 'ValueAsString': '7.0'}, - {'FeatureName': 'feature2', 'ValueAsString': '7'}, - {'FeatureName': 'feature3', 'ValueAsString': '2020-10-30T03:43:21Z'} + {"FeatureName": "feature1", "ValueAsString": "7.0"}, + {"FeatureName": "feature2", "ValueAsString": "7"}, + {"FeatureName": "feature3", "ValueAsString": "2020-10-30T03:43:21Z"}, ], "8.0": [ - {'FeatureName': 'feature1', 'ValueAsString': '8.0'}, - {'FeatureName': 'feature2', 'ValueAsString': '8'}, - {'FeatureName': 'feature3', 'ValueAsString': '2020-10-30T03:43:21Z'} + {"FeatureName": "feature1", "ValueAsString": "8.0"}, + {"FeatureName": "feature2", "ValueAsString": "8"}, + {"FeatureName": "feature3", "ValueAsString": "2020-10-30T03:43:21Z"}, ], "9.0": [ - {'FeatureName': 'feature1', 'ValueAsString': '9.0'}, - {'FeatureName': 'feature2', 'ValueAsString': '9'}, - {'FeatureName': 'feature3', 'ValueAsString': '2020-10-30T03:43:21Z'} - ] + {"FeatureName": "feature1", "ValueAsString": "9.0"}, + {"FeatureName": "feature2", 
"ValueAsString": "9"}, + {"FeatureName": "feature3", "ValueAsString": "2020-10-30T03:43:21Z"}, + ], } @@ -308,84 +316,84 @@ def get_record_results_for_data_frame_with_collection_type(): def get_record_results_for_data_frame_without_collection_type(): return { "0.0": [ - {'FeatureName': 'feature1', 'ValueAsString': '0.0'}, - {'FeatureName': 'feature2', 'ValueAsString': '0'}, - {'FeatureName': 'feature3', 'ValueAsString': '2020-10-30T03:43:21Z'}, - {'FeatureName': 'feature4', 'ValueAsString': '0.0'}, - {'FeatureName': 'feature5', 'ValueAsString': "['a', 'abc']"}, - {'FeatureName': 'feature6', 'ValueAsString': '[1, 2]'}, - {'FeatureName': 'feature7', 'ValueAsString': '[1.1, 2.3]'}, - {'FeatureName': 'feature8', 'ValueAsString': '[1, 2]'}, - {'FeatureName': 'feature9', 'ValueAsString': '[1.1, 2.3]'}, - {'FeatureName': 'feature10', 'ValueAsString': "['a', 'abc']"} + {"FeatureName": "feature1", "ValueAsString": "0.0"}, + {"FeatureName": "feature2", "ValueAsString": "0"}, + {"FeatureName": "feature3", "ValueAsString": "2020-10-30T03:43:21Z"}, + {"FeatureName": "feature4", "ValueAsString": "0.0"}, + {"FeatureName": "feature5", "ValueAsString": "['a', 'abc']"}, + {"FeatureName": "feature6", "ValueAsString": "[1, 2]"}, + {"FeatureName": "feature7", "ValueAsString": "[1.1, 2.3]"}, + {"FeatureName": "feature8", "ValueAsString": "[1, 2]"}, + {"FeatureName": "feature9", "ValueAsString": "[1.1, 2.3]"}, + {"FeatureName": "feature10", "ValueAsString": "['a', 'abc']"}, ], "1.0": [ - {'FeatureName': 'feature1', 'ValueAsString': '1.0'}, - {'FeatureName': 'feature2', 'ValueAsString': '1'}, - {'FeatureName': 'feature3', 'ValueAsString': '2020-10-30T03:43:21Z'}, - {'FeatureName': 'feature4', 'ValueAsString': '1.0'}, - {'FeatureName': 'feature5', 'ValueAsString': "['b', 'c']"}, - {'FeatureName': 'feature6', 'ValueAsString': '[1, 2, 3]'}, - {'FeatureName': 'feature7', 'ValueAsString': '[1.4, 2.5, 3.2, 25]'}, - {'FeatureName': 'feature8', 'ValueAsString': '[1, 2, 3]'}, - {'FeatureName': 'feature9', 'ValueAsString': '[1.4, 25, 3.2]'}, - {'FeatureName': 'feature10', 'ValueAsString': "['b', 'c']"} + {"FeatureName": "feature1", "ValueAsString": "1.0"}, + {"FeatureName": "feature2", "ValueAsString": "1"}, + {"FeatureName": "feature3", "ValueAsString": "2020-10-30T03:43:21Z"}, + {"FeatureName": "feature4", "ValueAsString": "1.0"}, + {"FeatureName": "feature5", "ValueAsString": "['b', 'c']"}, + {"FeatureName": "feature6", "ValueAsString": "[1, 2, 3]"}, + {"FeatureName": "feature7", "ValueAsString": "[1.4, 2.5, 3.2, 25]"}, + {"FeatureName": "feature8", "ValueAsString": "[1, 2, 3]"}, + {"FeatureName": "feature9", "ValueAsString": "[1.4, 25, 3.2]"}, + {"FeatureName": "feature10", "ValueAsString": "['b', 'c']"}, ], "2.0": [ - {'FeatureName': 'feature1', 'ValueAsString': '2.0'}, - {'FeatureName': 'feature2', 'ValueAsString': '2'}, - {'FeatureName': 'feature3', 'ValueAsString': '2020-10-30T03:43:21Z'}, - {'FeatureName': 'feature4', 'ValueAsString': '2.0'}, - {'FeatureName': 'feature5', 'ValueAsString': "['c', 'f']"}, - {'FeatureName': 'feature6', 'ValueAsString': '[1, 5]'}, - {'FeatureName': 'feature7', 'ValueAsString': '[1.0, 5.3]'}, - {'FeatureName': 'feature8', 'ValueAsString': '[1, 5]'}, - {'FeatureName': 'feature9', 'ValueAsString': '[1.0, 3, 4]'}, - {'FeatureName': 'feature10', 'ValueAsString': "['c', 'None']"} + {"FeatureName": "feature1", "ValueAsString": "2.0"}, + {"FeatureName": "feature2", "ValueAsString": "2"}, + {"FeatureName": "feature3", "ValueAsString": "2020-10-30T03:43:21Z"}, + {"FeatureName": "feature4", 
"ValueAsString": "2.0"}, + {"FeatureName": "feature5", "ValueAsString": "['c', 'f']"}, + {"FeatureName": "feature6", "ValueAsString": "[1, 5]"}, + {"FeatureName": "feature7", "ValueAsString": "[1.0, 5.3]"}, + {"FeatureName": "feature8", "ValueAsString": "[1, 5]"}, + {"FeatureName": "feature9", "ValueAsString": "[1.0, 3, 4]"}, + {"FeatureName": "feature10", "ValueAsString": "['c', 'None']"}, ], "3.0": [ - {'FeatureName': 'feature1', 'ValueAsString': '3.0'}, - {'FeatureName': 'feature2', 'ValueAsString': '3'}, - {'FeatureName': 'feature3', 'ValueAsString': '2020-10-30T03:43:21Z'}, - {'FeatureName': 'feature4', 'ValueAsString': '3.0'}, - {'FeatureName': 'feature5', 'ValueAsString': "['d']"}, - {'FeatureName': 'feature6', 'ValueAsString': '[1]'}, - {'FeatureName': 'feature7', 'ValueAsString': '[1.2]'}, - {'FeatureName': 'feature8', 'ValueAsString': '[1]'}, - {'FeatureName': 'feature9', 'ValueAsString': '[1.2]'}, - {'FeatureName': 'feature10', 'ValueAsString': "['d']"} + {"FeatureName": "feature1", "ValueAsString": "3.0"}, + {"FeatureName": "feature2", "ValueAsString": "3"}, + {"FeatureName": "feature3", "ValueAsString": "2020-10-30T03:43:21Z"}, + {"FeatureName": "feature4", "ValueAsString": "3.0"}, + {"FeatureName": "feature5", "ValueAsString": "['d']"}, + {"FeatureName": "feature6", "ValueAsString": "[1]"}, + {"FeatureName": "feature7", "ValueAsString": "[1.2]"}, + {"FeatureName": "feature8", "ValueAsString": "[1]"}, + {"FeatureName": "feature9", "ValueAsString": "[1.2]"}, + {"FeatureName": "feature10", "ValueAsString": "['d']"}, ], "4.0": [ - {'FeatureName': 'feature1', 'ValueAsString': '4.0'}, - {'FeatureName': 'feature2', 'ValueAsString': '4'}, - {'FeatureName': 'feature3', 'ValueAsString': '2020-10-30T03:43:21Z'}, - {'FeatureName': 'feature4', 'ValueAsString': '4.0'} + {"FeatureName": "feature1", "ValueAsString": "4.0"}, + {"FeatureName": "feature2", "ValueAsString": "4"}, + {"FeatureName": "feature3", "ValueAsString": "2020-10-30T03:43:21Z"}, + {"FeatureName": "feature4", "ValueAsString": "4.0"}, ], "5.0": [ - {'FeatureName': 'feature1', 'ValueAsString': '5.0'}, - {'FeatureName': 'feature2', 'ValueAsString': '5'}, - {'FeatureName': 'feature3', 'ValueAsString': '2020-10-30T03:43:21Z'} + {"FeatureName": "feature1", "ValueAsString": "5.0"}, + {"FeatureName": "feature2", "ValueAsString": "5"}, + {"FeatureName": "feature3", "ValueAsString": "2020-10-30T03:43:21Z"}, ], "6.0": [ - {'FeatureName': 'feature1', 'ValueAsString': '6.0'}, - {'FeatureName': 'feature2', 'ValueAsString': '6'}, - {'FeatureName': 'feature3', 'ValueAsString': '2020-10-30T03:43:21Z'} + {"FeatureName": "feature1", "ValueAsString": "6.0"}, + {"FeatureName": "feature2", "ValueAsString": "6"}, + {"FeatureName": "feature3", "ValueAsString": "2020-10-30T03:43:21Z"}, ], "7.0": [ - {'FeatureName': 'feature1', 'ValueAsString': '7.0'}, - {'FeatureName': 'feature2', 'ValueAsString': '7'}, - {'FeatureName': 'feature3', 'ValueAsString': '2020-10-30T03:43:21Z'} + {"FeatureName": "feature1", "ValueAsString": "7.0"}, + {"FeatureName": "feature2", "ValueAsString": "7"}, + {"FeatureName": "feature3", "ValueAsString": "2020-10-30T03:43:21Z"}, ], "8.0": [ - {'FeatureName': 'feature1', 'ValueAsString': '8.0'}, - {'FeatureName': 'feature2', 'ValueAsString': '8'}, - {'FeatureName': 'feature3', 'ValueAsString': '2020-10-30T03:43:21Z'} + {"FeatureName": "feature1", "ValueAsString": "8.0"}, + {"FeatureName": "feature2", "ValueAsString": "8"}, + {"FeatureName": "feature3", "ValueAsString": "2020-10-30T03:43:21Z"}, ], "9.0": [ - {'FeatureName': 
'feature1', 'ValueAsString': '9.0'}, - {'FeatureName': 'feature2', 'ValueAsString': '9'}, - {'FeatureName': 'feature3', 'ValueAsString': '2020-10-30T03:43:21Z'} - ] + {"FeatureName": "feature1", "ValueAsString": "9.0"}, + {"FeatureName": "feature2", "ValueAsString": "9"}, + {"FeatureName": "feature3", "ValueAsString": "2020-10-30T03:43:21Z"}, + ], } @@ -567,14 +575,14 @@ def test_create_feature_store( def test_create_feature_store_ingest_with_offline_target_stores( - feature_store_session, - role, - feature_group_name, - offline_store_s3_uri, - pandas_data_frame, - record, - create_table_ddl, - get_record_results_for_data_frame + feature_store_session, + role, + feature_group_name, + offline_store_s3_uri, + pandas_data_frame, + record, + create_table_ddl, + get_record_results_for_data_frame, ): feature_group = FeatureGroup(name=feature_group_name, sagemaker_session=feature_store_session) feature_group.load_feature_definitions(data_frame=pandas_data_frame) @@ -601,15 +609,13 @@ def test_create_feature_store_ingest_with_offline_target_stores( target_stores=[TargetStoreEnum.OFFLINE_STORE], max_workers=3, max_processes=2, - wait=False + wait=False, ) ingestion_manager.wait() assert 0 == len(ingestion_manager.failed_rows) for index, value in pandas_data_frame["feature1"].items(): - assert feature_group.get_record( - record_identifier_value_as_string=str(value) - ) is None + assert feature_group.get_record(record_identifier_value_as_string=str(value)) is None # Query the integrated Glue table. athena_query = feature_group.athena_query() @@ -633,26 +639,26 @@ def test_create_feature_store_ingest_with_offline_target_stores( for is_na in nans.items(): assert is_na assert ( - create_table_ddl.format( - feature_group_name=feature_group_name, - region=feature_store_session.boto_session.region_name, - account=feature_store_session.account_id(), - resolved_output_s3_uri=resolved_output_s3_uri, - ) - == feature_group.as_hive_ddl() + create_table_ddl.format( + feature_group_name=feature_group_name, + region=feature_store_session.boto_session.region_name, + account=feature_store_session.account_id(), + resolved_output_s3_uri=resolved_output_s3_uri, + ) + == feature_group.as_hive_ddl() ) assert output["FeatureGroupArn"].endswith(f"feature-group/{feature_group_name}") def test_create_feature_store_ingest_with_online_offline_target_stores( - feature_store_session, - role, - feature_group_name, - offline_store_s3_uri, - pandas_data_frame, - record, - create_table_ddl, - get_record_results_for_data_frame + feature_store_session, + role, + feature_group_name, + offline_store_s3_uri, + pandas_data_frame, + record, + create_table_ddl, + get_record_results_for_data_frame, ): feature_group = FeatureGroup(name=feature_group_name, sagemaker_session=feature_store_session) feature_group.load_feature_definitions(data_frame=pandas_data_frame) @@ -679,15 +685,16 @@ def test_create_feature_store_ingest_with_online_offline_target_stores( target_stores=[TargetStoreEnum.ONLINE_STORE, TargetStoreEnum.OFFLINE_STORE], max_workers=3, max_processes=2, - wait=False + wait=False, ) ingestion_manager.wait() assert 0 == len(ingestion_manager.failed_rows) for index, value in pandas_data_frame["feature1"].items(): - assert feature_group.get_record( - record_identifier_value_as_string=str(value) - ) == get_record_results_for_data_frame[str(value)] + assert ( + feature_group.get_record(record_identifier_value_as_string=str(value)) + == get_record_results_for_data_frame[str(value)] + ) # Query the integrated Glue table. 
athena_query = feature_group.athena_query() @@ -711,26 +718,26 @@ def test_create_feature_store_ingest_with_online_offline_target_stores( for is_na in nans.items(): assert is_na assert ( - create_table_ddl.format( - feature_group_name=feature_group_name, - region=feature_store_session.boto_session.region_name, - account=feature_store_session.account_id(), - resolved_output_s3_uri=resolved_output_s3_uri, - ) - == feature_group.as_hive_ddl() + create_table_ddl.format( + feature_group_name=feature_group_name, + region=feature_store_session.boto_session.region_name, + account=feature_store_session.account_id(), + resolved_output_s3_uri=resolved_output_s3_uri, + ) + == feature_group.as_hive_ddl() ) assert output["FeatureGroupArn"].endswith(f"feature-group/{feature_group_name}") def test_create_feature_store_ingest_with_online_target_stores( - feature_store_session, - role, - feature_group_name, - offline_store_s3_uri, - pandas_data_frame, - record, - create_table_ddl, - get_record_results_for_data_frame + feature_store_session, + role, + feature_group_name, + offline_store_s3_uri, + pandas_data_frame, + record, + create_table_ddl, + get_record_results_for_data_frame, ): feature_group = FeatureGroup(name=feature_group_name, sagemaker_session=feature_store_session) feature_group.load_feature_definitions(data_frame=pandas_data_frame) @@ -756,28 +763,27 @@ def test_create_feature_store_ingest_with_online_target_stores( target_stores=[TargetStoreEnum.ONLINE_STORE], max_workers=3, max_processes=2, - wait=False + wait=False, ) ingestion_manager.wait() assert 0 == len(ingestion_manager.failed_rows) for index, value in pandas_data_frame["feature1"].items(): - assert feature_group.get_record( - record_identifier_value_as_string=str(value) - ) == get_record_results_for_data_frame[str(value)] - + assert ( + feature_group.get_record(record_identifier_value_as_string=str(value)) + == get_record_results_for_data_frame[str(value)] + ) feature_group.put_record( record=[ - FeatureValue(feature_name='feature1', value_as_string='100.0'), - FeatureValue(feature_name='feature2', value_as_string='100'), - FeatureValue(feature_name='feature3', value_as_string='2020-10-30T03:43:21Z'), - FeatureValue(feature_name='feature4', value_as_string='100.0') + FeatureValue(feature_name="feature1", value_as_string="100.0"), + FeatureValue(feature_name="feature2", value_as_string="100"), + FeatureValue(feature_name="feature3", value_as_string="2020-10-30T03:43:21Z"), + FeatureValue(feature_name="feature4", value_as_string="100.0"), ], - target_stores=[TargetStoreEnum.OFFLINE_STORE] + target_stores=[TargetStoreEnum.OFFLINE_STORE], ) - assert feature_group.get_record(record_identifier_value_as_string='100.0') == None - + assert feature_group.get_record(record_identifier_value_as_string="100.0") == None # Query the integrated Glue table. 
athena_query = feature_group.athena_query() @@ -797,22 +803,22 @@ def test_create_feature_store_ingest_with_online_target_stores( time.sleep(60) assert df.shape[0] == 1 - assert df.loc[0, 'feature1'] == 100.0 - assert df.loc[0, 'feature2'] == 100 - assert df.loc[0, 'feature3'] == "2020-10-30T03:43:21Z" - assert df.loc[0, 'feature4'] == 100.0 + assert df.loc[0, "feature1"] == 100.0 + assert df.loc[0, "feature2"] == 100 + assert df.loc[0, "feature3"] == "2020-10-30T03:43:21Z" + assert df.loc[0, "feature4"] == 100.0 assert output["FeatureGroupArn"].endswith(f"feature-group/{feature_group_name}") def test_put_record_with_target_stores( - feature_store_session, - role, - feature_group_name, - offline_store_s3_uri, - pandas_data_frame, - record, - create_table_ddl, - get_record_results_for_data_frame + feature_store_session, + role, + feature_group_name, + offline_store_s3_uri, + pandas_data_frame, + record, + create_table_ddl, + get_record_results_for_data_frame, ): feature_group = FeatureGroup(name=feature_group_name, sagemaker_session=feature_store_session) feature_group.load_feature_definitions(data_frame=pandas_data_frame) @@ -827,29 +833,29 @@ def test_put_record_with_target_stores( _wait_for_feature_group_create(feature_group) feature_group.put_record( record=[ - FeatureValue(feature_name='feature1', value_as_string='100.0'), - FeatureValue(feature_name='feature2', value_as_string='100'), - FeatureValue(feature_name='feature3', value_as_string='2020-10-30T03:43:21Z'), - FeatureValue(feature_name='feature4', value_as_string='100.0') + FeatureValue(feature_name="feature1", value_as_string="100.0"), + FeatureValue(feature_name="feature2", value_as_string="100"), + FeatureValue(feature_name="feature3", value_as_string="2020-10-30T03:43:21Z"), + FeatureValue(feature_name="feature4", value_as_string="100.0"), ], - target_stores=[TargetStoreEnum.OFFLINE_STORE] + target_stores=[TargetStoreEnum.OFFLINE_STORE], ) - assert feature_group.get_record(record_identifier_value_as_string='100.0') == None + assert feature_group.get_record(record_identifier_value_as_string="100.0") == None feature_group.put_record( record=[ - FeatureValue(feature_name='feature1', value_as_string='100.0'), - FeatureValue(feature_name='feature2', value_as_string='100'), - FeatureValue(feature_name='feature3', value_as_string='2020-10-30T03:43:21Z'), - FeatureValue(feature_name='feature4', value_as_string='100.0') + FeatureValue(feature_name="feature1", value_as_string="100.0"), + FeatureValue(feature_name="feature2", value_as_string="100"), + FeatureValue(feature_name="feature3", value_as_string="2020-10-30T03:43:21Z"), + FeatureValue(feature_name="feature4", value_as_string="100.0"), ], - target_stores=[TargetStoreEnum.ONLINE_STORE] + target_stores=[TargetStoreEnum.ONLINE_STORE], ) - assert feature_group.get_record(record_identifier_value_as_string='100.0') == [ - {'FeatureName': 'feature1', 'ValueAsString': '100.0'}, - {'FeatureName': 'feature2', 'ValueAsString': '100'}, - {'FeatureName': 'feature3', 'ValueAsString': '2020-10-30T03:43:21Z'}, - {'FeatureName': 'feature4', 'ValueAsString': '100.0'} + assert feature_group.get_record(record_identifier_value_as_string="100.0") == [ + {"FeatureName": "feature1", "ValueAsString": "100.0"}, + {"FeatureName": "feature2", "ValueAsString": "100"}, + {"FeatureName": "feature3", "ValueAsString": "2020-10-30T03:43:21Z"}, + {"FeatureName": "feature4", "ValueAsString": "100.0"}, ] @@ -2169,16 +2175,18 @@ def test_get_feature_group_with_session( def 
test_ingest_in_memory_multi_process_with_collection_types( - feature_store_session, - role, - feature_group_name, - offline_store_s3_uri, - pandas_data_frame_with_collection_type, - get_record_results_for_data_frame_with_collection_type, + feature_store_session, + role, + feature_group_name, + offline_store_s3_uri, + pandas_data_frame_with_collection_type, + get_record_results_for_data_frame_with_collection_type, ): feature_group = FeatureGroup(feature_group_name, sagemaker_session=feature_store_session) - feature_group.load_feature_definitions(data_frame=pandas_data_frame_with_collection_type, - online_storage_type=OnlineStoreStorageTypeEnum.IN_MEMORY) + feature_group.load_feature_definitions( + data_frame=pandas_data_frame_with_collection_type, + online_storage_type=OnlineStoreStorageTypeEnum.IN_MEMORY, + ) with (cleanup_feature_group(feature_group)): output = feature_group.create( @@ -2187,42 +2195,62 @@ def test_ingest_in_memory_multi_process_with_collection_types( role_arn=role, enable_online_store=True, online_store_storage_type=OnlineStoreStorageTypeEnum.IN_MEMORY, - s3_uri=False + s3_uri=False, ) _wait_for_feature_group_create(feature_group) ingestion_manager = feature_group.ingest( - data_frame=pandas_data_frame_with_collection_type, max_workers=3, max_processes=2, wait=True + data_frame=pandas_data_frame_with_collection_type, + max_workers=3, + max_processes=2, + wait=True, ) ingestion_manager.wait() assert 0 == len(ingestion_manager.failed_rows) assert output["FeatureGroupArn"].endswith(f"feature-group/{feature_group_name}") for index, value in pandas_data_frame_with_collection_type["feature1"].items(): - assert feature_group.get_record( - record_identifier_value_as_string=str(value) - ) == get_record_results_for_data_frame_with_collection_type[str(value)] + assert ( + feature_group.get_record(record_identifier_value_as_string=str(value)) + == get_record_results_for_data_frame_with_collection_type[str(value)] + ) new_row_data = [ - 10.0, 10, "2020-10-30T03:43:21Z", 5.0, ["a", "b"], [1, 2, None], [3.0, 4.0], [1, 2], [3.0, 4.0], ["a", "b"] + 10.0, + 10, + "2020-10-30T03:43:21Z", + 5.0, + ["a", "b"], + [1, 2, None], + [3.0, 4.0], + [1, 2], + [3.0, 4.0], + ["a", "b"], ] - pandas_data_frame_with_collection_type.loc[len(pandas_data_frame_with_collection_type)] = new_row_data + pandas_data_frame_with_collection_type.loc[ + len(pandas_data_frame_with_collection_type) + ] = new_row_data with pytest.raises(IngestionError) as error: feature_group.ingest( - data_frame=pandas_data_frame_with_collection_type, max_workers=1, max_processes=1, wait=True + data_frame=pandas_data_frame_with_collection_type, + max_workers=1, + max_processes=1, + wait=True, ) def test_ingest_in_memory_single_process_with_collection_types( - feature_store_session, - role, - feature_group_name, - offline_store_s3_uri, - pandas_data_frame_with_collection_type, - get_record_results_for_data_frame_with_collection_type, + feature_store_session, + role, + feature_group_name, + offline_store_s3_uri, + pandas_data_frame_with_collection_type, + get_record_results_for_data_frame_with_collection_type, ): feature_group = FeatureGroup(feature_group_name, sagemaker_session=feature_store_session) - feature_group.load_feature_definitions(data_frame=pandas_data_frame_with_collection_type, - online_storage_type=OnlineStoreStorageTypeEnum.IN_MEMORY) + feature_group.load_feature_definitions( + data_frame=pandas_data_frame_with_collection_type, + online_storage_type=OnlineStoreStorageTypeEnum.IN_MEMORY, + ) with 
(cleanup_feature_group(feature_group)): output = feature_group.create( @@ -2231,41 +2259,61 @@ def test_ingest_in_memory_single_process_with_collection_types( role_arn=role, enable_online_store=True, online_store_storage_type=OnlineStoreStorageTypeEnum.IN_MEMORY, - s3_uri=False + s3_uri=False, ) _wait_for_feature_group_create(feature_group) ingestion_manager = feature_group.ingest( - data_frame=pandas_data_frame_with_collection_type, max_workers=1, max_processes=1, wait=True + data_frame=pandas_data_frame_with_collection_type, + max_workers=1, + max_processes=1, + wait=True, ) assert 0 == len(ingestion_manager.failed_rows) assert output["FeatureGroupArn"].endswith(f"feature-group/{feature_group_name}") for index, value in pandas_data_frame_with_collection_type["feature1"].items(): - assert feature_group.get_record( - record_identifier_value_as_string=str(value) - ) == get_record_results_for_data_frame_with_collection_type[str(value)] + assert ( + feature_group.get_record(record_identifier_value_as_string=str(value)) + == get_record_results_for_data_frame_with_collection_type[str(value)] + ) new_row_data = [ - 10.0, 10, "2020-10-30T03:43:21Z", 5.0, ["a", "b"], [1, 2, None], [3.0, 4.0], [1, 2], [3.0, 4.0], ["a", "b"] + 10.0, + 10, + "2020-10-30T03:43:21Z", + 5.0, + ["a", "b"], + [1, 2, None], + [3.0, 4.0], + [1, 2], + [3.0, 4.0], + ["a", "b"], ] - pandas_data_frame_with_collection_type.loc[len(pandas_data_frame_with_collection_type)] = new_row_data + pandas_data_frame_with_collection_type.loc[ + len(pandas_data_frame_with_collection_type) + ] = new_row_data with pytest.raises(IngestionError) as error: feature_group.ingest( - data_frame=pandas_data_frame_with_collection_type, max_workers=1, max_processes=1, wait=True + data_frame=pandas_data_frame_with_collection_type, + max_workers=1, + max_processes=1, + wait=True, ) def test_ingest_standard_multi_process_with_collection_types( - feature_store_session, - role, - feature_group_name, - offline_store_s3_uri, - pandas_data_frame_with_collection_type, - get_record_results_for_data_frame_without_collection_type, + feature_store_session, + role, + feature_group_name, + offline_store_s3_uri, + pandas_data_frame_with_collection_type, + get_record_results_for_data_frame_without_collection_type, ): feature_group = FeatureGroup(feature_group_name, sagemaker_session=feature_store_session) - feature_group.load_feature_definitions(data_frame=pandas_data_frame_with_collection_type, - online_storage_type=OnlineStoreStorageTypeEnum.STANDARD) + feature_group.load_feature_definitions( + data_frame=pandas_data_frame_with_collection_type, + online_storage_type=OnlineStoreStorageTypeEnum.STANDARD, + ) with (cleanup_feature_group(feature_group)): output = feature_group.create( @@ -2274,38 +2322,52 @@ def test_ingest_standard_multi_process_with_collection_types( role_arn=role, enable_online_store=True, online_store_storage_type=OnlineStoreStorageTypeEnum.STANDARD, - s3_uri=False + s3_uri=False, ) _wait_for_feature_group_create(feature_group) new_row_data = [ - 10.0, 10, "2020-10-30T03:43:21Z", 5.0, ["a", "b"], [1, 2, None], [3.0, 4.0], [1, 2], [3.0, 4.0], ["a", "b"] + 10.0, + 10, + "2020-10-30T03:43:21Z", + 5.0, + ["a", "b"], + [1, 2, None], + [3.0, 4.0], + [1, 2], + [3.0, 4.0], + ["a", "b"], ] - pandas_data_frame_with_collection_type.loc[len(pandas_data_frame_with_collection_type)] = new_row_data + pandas_data_frame_with_collection_type.loc[ + len(pandas_data_frame_with_collection_type) + ] = new_row_data ingestion_manager = feature_group.ingest( - 
data_frame=pandas_data_frame_with_collection_type, max_workers=3, max_processes=2, wait=True + data_frame=pandas_data_frame_with_collection_type, + max_workers=3, + max_processes=2, + wait=True, ) ingestion_manager.wait() assert 0 == len(ingestion_manager.failed_rows) get_record_results_for_data_frame_without_collection_type["10.0"] = [ - {'FeatureName': 'feature1', 'ValueAsString': '10.0'}, - {'FeatureName': 'feature2', 'ValueAsString': '10'}, - {'FeatureName': 'feature3', 'ValueAsString': '2020-10-30T03:43:21Z'}, - {'FeatureName': 'feature4', 'ValueAsString': '5.0'}, - {'FeatureName': 'feature5', 'ValueAsString': "['a', 'b']"}, - {'FeatureName': 'feature6', 'ValueAsString': '[1, 2, None]'}, - {'FeatureName': 'feature7', 'ValueAsString': '[3.0, 4.0]'}, - {'FeatureName': 'feature8', 'ValueAsString': '[1, 2]'}, - {'FeatureName': 'feature9', 'ValueAsString': '[3.0, 4.0]'}, - {'FeatureName': 'feature10', 'ValueAsString': "['a', 'b']"} + {"FeatureName": "feature1", "ValueAsString": "10.0"}, + {"FeatureName": "feature2", "ValueAsString": "10"}, + {"FeatureName": "feature3", "ValueAsString": "2020-10-30T03:43:21Z"}, + {"FeatureName": "feature4", "ValueAsString": "5.0"}, + {"FeatureName": "feature5", "ValueAsString": "['a', 'b']"}, + {"FeatureName": "feature6", "ValueAsString": "[1, 2, None]"}, + {"FeatureName": "feature7", "ValueAsString": "[3.0, 4.0]"}, + {"FeatureName": "feature8", "ValueAsString": "[1, 2]"}, + {"FeatureName": "feature9", "ValueAsString": "[3.0, 4.0]"}, + {"FeatureName": "feature10", "ValueAsString": "['a', 'b']"}, ] assert output["FeatureGroupArn"].endswith(f"feature-group/{feature_group_name}") for index, value in pandas_data_frame_with_collection_type["feature1"].items(): - assert feature_group.get_record( - record_identifier_value_as_string=str(value) - ) == get_record_results_for_data_frame_without_collection_type[str(value)] - + assert ( + feature_group.get_record(record_identifier_value_as_string=str(value)) + == get_record_results_for_data_frame_without_collection_type[str(value)] + ) @contextmanager diff --git a/tests/unit/sagemaker/feature_store/test_feature_group.py b/tests/unit/sagemaker/feature_store/test_feature_group.py index c05c330dbb..5a26fce026 100644 --- a/tests/unit/sagemaker/feature_store/test_feature_group.py +++ b/tests/unit/sagemaker/feature_store/test_feature_group.py @@ -89,18 +89,18 @@ def feature_group_dummy_definitions(): @pytest.fixture def feature_group_describe_dummy_definitions(): return [ - {'FeatureName': 'feature1', 'FeatureType': 'Fractional'}, - {'FeatureName': 'feature2', 'FeatureType': 'Integral'}, - {'FeatureName': 'feature3', 'FeatureType': 'String'}, + {"FeatureName": "feature1", "FeatureType": "Fractional"}, + {"FeatureName": "feature2", "FeatureType": "Integral"}, + {"FeatureName": "feature3", "FeatureType": "String"}, ] @pytest.fixture def feature_group_dummy_definition_dict(): return { - 'feature1': {'FeatureName': 'feature1', 'FeatureType': 'Fractional'}, - 'feature2': {'FeatureName': 'feature2', 'FeatureType': 'Integral'}, - 'feature3': {'FeatureName': 'feature3', 'FeatureType': 'String'}, + "feature1": {"FeatureName": "feature1", "FeatureType": "Fractional"}, + "feature2": {"FeatureName": "feature2", "FeatureType": "Integral"}, + "feature3": {"FeatureName": "feature3", "FeatureType": "String"}, } @@ -121,12 +121,20 @@ def data_frame_with_collection_type(): "feature2": pd.Series(np.arange(10), dtype="int64"), "feature3": pd.Series(["2020-10-30T03:43:21Z"] * 10, dtype="string"), "feature4": pd.Series(np.arange(5.0), 
dtype="float64"), # contains nan - "feature5": pd.Series([["a", "abc"], ["b", "c"], ["c", "f"], ["d"], []], dtype="object"), + "feature5": pd.Series( + [["a", "abc"], ["b", "c"], ["c", "f"], ["d"], []], dtype="object" + ), "feature6": pd.Series([[1, 2], [1, 2, 3], [1, 5], [1], []], dtype="object"), - "feature7": pd.Series([[1.1, 2.3], [1.4, 2.5, 3.2, 25], [1.0, 5.3], [1.2], []], dtype="object"), + "feature7": pd.Series( + [[1.1, 2.3], [1.4, 2.5, 3.2, 25], [1.0, 5.3], [1.2], []], dtype="object" + ), "feature8": pd.Series([[1, 2], [1, 2, None], [1, 5], [1], [], [None]], dtype="object"), - "feature9": pd.Series([[1.1, 2.3], [1.4, 25, 3.2], [1.0, 3, None], [1.2], [], [None]], dtype="object"), - "feature10": pd.Series([["a", "abc"], ["b", "c"], ["c", None], ["d"], [], [None]], dtype="object"), + "feature9": pd.Series( + [[1.1, 2.3], [1.4, 25, 3.2], [1.0, 3, None], [1.2], [], [None]], dtype="object" + ), + "feature10": pd.Series( + [["a", "abc"], ["b", "c"], ["c", None], ["d"], [], [None]], dtype="object" + ), } ) return df @@ -151,58 +159,96 @@ def expected_standard_feature_definitions(): @pytest.fixture def expected_standard_feature_definition_dict(): return { - 'feature1': {'FeatureName': 'feature1', 'FeatureType': 'Fractional'}, - 'feature2': {'FeatureName': 'feature2', 'FeatureType': 'Integral'}, - 'feature3': {'FeatureName': 'feature3', 'FeatureType': 'String'}, - 'feature4': {'FeatureName': 'feature4', 'FeatureType': 'Fractional', 'CollectionType': None}, - 'feature5': {'FeatureName': 'feature5', 'FeatureType': 'String'}, - 'feature6': {'FeatureName': 'feature6', 'FeatureType': 'Integral'}, - 'feature7': {'FeatureName': 'feature7', 'FeatureType': 'Fractional'}, - 'feature8': {'FeatureName': 'feature8', 'FeatureType': 'Integral'}, - 'feature9': {'FeatureName': 'feature9', 'FeatureType': 'Fractional'}, - 'feature10': {'FeatureName': 'feature10', 'FeatureType': 'String'} + "feature1": {"FeatureName": "feature1", "FeatureType": "Fractional"}, + "feature2": {"FeatureName": "feature2", "FeatureType": "Integral"}, + "feature3": {"FeatureName": "feature3", "FeatureType": "String"}, + "feature4": { + "FeatureName": "feature4", + "FeatureType": "Fractional", + "CollectionType": None, + }, + "feature5": {"FeatureName": "feature5", "FeatureType": "String"}, + "feature6": {"FeatureName": "feature6", "FeatureType": "Integral"}, + "feature7": {"FeatureName": "feature7", "FeatureType": "Fractional"}, + "feature8": {"FeatureName": "feature8", "FeatureType": "Integral"}, + "feature9": {"FeatureName": "feature9", "FeatureType": "Fractional"}, + "feature10": {"FeatureName": "feature10", "FeatureType": "String"}, } @pytest.fixture def expected_in_memory_feature_definitions(): return [ + FeatureDefinition(feature_name="feature1", feature_type=FeatureTypeEnum.FRACTIONAL), + FeatureDefinition(feature_name="feature2", feature_type=FeatureTypeEnum.INTEGRAL), + FeatureDefinition(feature_name="feature3", feature_type=FeatureTypeEnum.STRING), + FeatureDefinition(feature_name="feature4", feature_type=FeatureTypeEnum.FRACTIONAL), FeatureDefinition( - feature_name="feature1", feature_type=FeatureTypeEnum.FRACTIONAL), - FeatureDefinition( - feature_name="feature2", feature_type=FeatureTypeEnum.INTEGRAL), - FeatureDefinition( - feature_name="feature3", feature_type=FeatureTypeEnum.STRING), - FeatureDefinition( - feature_name="feature4", feature_type=FeatureTypeEnum.FRACTIONAL), - FeatureDefinition( - feature_name="feature5", feature_type=FeatureTypeEnum.STRING, collection_type=ListCollectionType()), + 
feature_name="feature5", + feature_type=FeatureTypeEnum.STRING, + collection_type=ListCollectionType(), + ), FeatureDefinition( - feature_name="feature6", feature_type=FeatureTypeEnum.INTEGRAL, collection_type=ListCollectionType()), + feature_name="feature6", + feature_type=FeatureTypeEnum.INTEGRAL, + collection_type=ListCollectionType(), + ), FeatureDefinition( - feature_name="feature7", feature_type=FeatureTypeEnum.FRACTIONAL, collection_type=ListCollectionType()), + feature_name="feature7", + feature_type=FeatureTypeEnum.FRACTIONAL, + collection_type=ListCollectionType(), + ), FeatureDefinition( - feature_name="feature8", feature_type=FeatureTypeEnum.INTEGRAL, collection_type=ListCollectionType()), + feature_name="feature8", + feature_type=FeatureTypeEnum.INTEGRAL, + collection_type=ListCollectionType(), + ), FeatureDefinition( - feature_name="feature9", feature_type=FeatureTypeEnum.FRACTIONAL, collection_type=ListCollectionType()), + feature_name="feature9", + feature_type=FeatureTypeEnum.FRACTIONAL, + collection_type=ListCollectionType(), + ), FeatureDefinition( - feature_name="feature10", feature_type=FeatureTypeEnum.STRING, collection_type=ListCollectionType()), + feature_name="feature10", + feature_type=FeatureTypeEnum.STRING, + collection_type=ListCollectionType(), + ), ] @pytest.fixture def expected_in_memory_feature_definition_dict(): return { - 'feature1': {'FeatureName': 'feature1', 'FeatureType': 'Fractional'}, - 'feature2': {'FeatureName': 'feature2', 'FeatureType': 'Integral'}, - 'feature3': {'FeatureName': 'feature3', 'FeatureType': 'String'}, - 'feature4': {'FeatureName': 'feature4', 'FeatureType': 'Fractional'}, - 'feature5': {'FeatureName': 'feature5', 'FeatureType': 'String', 'CollectionType': 'List'}, - 'feature6': {'FeatureName': 'feature6', 'FeatureType': 'Integral', 'CollectionType': 'List'}, - 'feature7': {'FeatureName': 'feature7', 'FeatureType': 'Fractional', 'CollectionType': 'List'}, - 'feature8': {'FeatureName': 'feature8', 'FeatureType': 'Integral', 'CollectionType': 'List'}, - 'feature9': {'FeatureName': 'feature9', 'FeatureType': 'Fractional', 'CollectionType': 'List'}, - 'feature10': {'FeatureName': 'feature10', 'FeatureType': 'String', 'CollectionType': 'List'} + "feature1": {"FeatureName": "feature1", "FeatureType": "Fractional"}, + "feature2": {"FeatureName": "feature2", "FeatureType": "Integral"}, + "feature3": {"FeatureName": "feature3", "FeatureType": "String"}, + "feature4": {"FeatureName": "feature4", "FeatureType": "Fractional"}, + "feature5": {"FeatureName": "feature5", "FeatureType": "String", "CollectionType": "List"}, + "feature6": { + "FeatureName": "feature6", + "FeatureType": "Integral", + "CollectionType": "List", + }, + "feature7": { + "FeatureName": "feature7", + "FeatureType": "Fractional", + "CollectionType": "List", + }, + "feature8": { + "FeatureName": "feature8", + "FeatureType": "Integral", + "CollectionType": "List", + }, + "feature9": { + "FeatureName": "feature9", + "FeatureType": "Fractional", + "CollectionType": "List", + }, + "feature10": { + "FeatureName": "feature10", + "FeatureType": "String", + "CollectionType": "List", + }, } @@ -226,7 +272,7 @@ def create_table_ddl(): def test_feature_group_create_without_role( - sagemaker_session_mock, feature_group_dummy_definitions, s3_uri + sagemaker_session_mock, feature_group_dummy_definitions, s3_uri ): feature_group = FeatureGroup(name="MyFeatureGroup", sagemaker_session=sagemaker_session_mock) feature_group.feature_definitions = feature_group_dummy_definitions @@ -240,7 +286,7 
@@ def test_feature_group_create_without_role( def test_feature_store_create_with_config_injection( - sagemaker_session, role_arn, feature_group_dummy_definitions, s3_uri + sagemaker_session, role_arn, feature_group_dummy_definitions, s3_uri ): sagemaker_session.sagemaker_config = SAGEMAKER_CONFIG_FEATURE_GROUP sagemaker_session.create_feature_group = Mock() @@ -283,10 +329,10 @@ def test_feature_store_create_with_config_injection( def test_feature_group_load_definition( - sagemaker_session_mock, - data_frame_with_collection_type, - expected_standard_feature_definitions, - expected_in_memory_feature_definitions + sagemaker_session_mock, + data_frame_with_collection_type, + expected_standard_feature_definitions, + expected_in_memory_feature_definitions, ): feature_group = FeatureGroup(name="MyFeatureGroup", sagemaker_session=sagemaker_session_mock) @@ -294,33 +340,39 @@ def test_feature_group_load_definition( assert feature_group.feature_definitions == expected_standard_feature_definitions feature_group.load_feature_definitions( - data_frame=data_frame_with_collection_type, online_storage_type=OnlineStoreStorageTypeEnum.STANDARD + data_frame=data_frame_with_collection_type, + online_storage_type=OnlineStoreStorageTypeEnum.STANDARD, ) assert feature_group.feature_definitions == expected_standard_feature_definitions feature_group.load_feature_definitions( - data_frame=data_frame_with_collection_type, online_storage_type=OnlineStoreStorageTypeEnum.IN_MEMORY + data_frame=data_frame_with_collection_type, + online_storage_type=OnlineStoreStorageTypeEnum.IN_MEMORY, ) assert feature_group.feature_definitions == expected_in_memory_feature_definitions data_frame_with_collection_type["feature11"] = pd.Series( - [[1.1, "2.3"], [1.4, 2.5, 3.2, 25], [1.0, 5.3], [1.2], []], dtype="object") + [[1.1, "2.3"], [1.4, 2.5, 3.2, 25], [1.0, 5.3], [1.2], []], dtype="object" + ) feature_group.load_feature_definitions( - data_frame=data_frame_with_collection_type, online_storage_type=OnlineStoreStorageTypeEnum.STANDARD + data_frame=data_frame_with_collection_type, + online_storage_type=OnlineStoreStorageTypeEnum.STANDARD, ) expected_standard_feature_definitions.append( - FeatureDefinition(feature_name='feature11', feature_type=FeatureTypeEnum.STRING)) + FeatureDefinition(feature_name="feature11", feature_type=FeatureTypeEnum.STRING) + ) assert feature_group.feature_definitions == expected_standard_feature_definitions with pytest.raises(ValueError): feature_group.load_feature_definitions( - data_frame=data_frame_with_collection_type, online_storage_type=OnlineStoreStorageTypeEnum.IN_MEMORY + data_frame=data_frame_with_collection_type, + online_storage_type=OnlineStoreStorageTypeEnum.IN_MEMORY, ) def test_feature_store_create( - sagemaker_session_mock, role_arn, feature_group_dummy_definitions, s3_uri + sagemaker_session_mock, role_arn, feature_group_dummy_definitions, s3_uri ): feature_group = FeatureGroup(name="MyFeatureGroup", sagemaker_session=sagemaker_session_mock) feature_group.feature_definitions = feature_group_dummy_definitions @@ -348,7 +400,7 @@ def test_feature_store_create( def test_feature_store_create_with_ttl_duration( - sagemaker_session_mock, role_arn, feature_group_dummy_definitions, s3_uri + sagemaker_session_mock, role_arn, feature_group_dummy_definitions, s3_uri ): feature_group = FeatureGroup(name="MyFeatureGroup", sagemaker_session=sagemaker_session_mock) feature_group.feature_definitions = feature_group_dummy_definitions @@ -381,7 +433,7 @@ def test_feature_store_create_with_ttl_duration( def 
test_feature_store_create_online_only( - sagemaker_session_mock, role_arn, feature_group_dummy_definitions + sagemaker_session_mock, role_arn, feature_group_dummy_definitions ): feature_group = FeatureGroup(name="MyFeatureGroup", sagemaker_session=sagemaker_session_mock) feature_group.feature_definitions = feature_group_dummy_definitions @@ -405,7 +457,7 @@ def test_feature_store_create_online_only( def test_feature_store_create_online_only_with_in_memory( - sagemaker_session_mock, role_arn, feature_group_dummy_definitions + sagemaker_session_mock, role_arn, feature_group_dummy_definitions ): feature_group = FeatureGroup(name="MyFeatureGroup", sagemaker_session=sagemaker_session_mock) feature_group.feature_definitions = feature_group_dummy_definitions @@ -430,7 +482,7 @@ def test_feature_store_create_online_only_with_in_memory( def test_feature_store_create_with_in_memory_collection_types( - sagemaker_session_mock, role_arn, feature_group_dummy_definitions + sagemaker_session_mock, role_arn, feature_group_dummy_definitions ): feature_group = FeatureGroup(name="MyFeatureGroup", sagemaker_session=sagemaker_session_mock) feature_definition_with_collection = [ @@ -671,7 +723,8 @@ def test_put_record_target_stores(sagemaker_session_mock): feature_group.put_record( record=[], target_stores=[TargetStoreEnum.ONLINE_STORE, TargetStoreEnum.OFFLINE_STORE], - ttl_duration=ttl_duration) + ttl_duration=ttl_duration, + ) sagemaker_session_mock.put_record.assert_called_with( feature_group_name="MyFeatureGroup", record=[], @@ -803,17 +856,17 @@ def test_ingest_zero_workers(): @patch("sagemaker.feature_store.feature_group.IngestionManagerPandas") def test_ingest( - ingestion_manager_init, - sagemaker_session_mock, - fs_runtime_client_config_mock, - feature_group_describe_dummy_definitions, - feature_group_dummy_definition_dict + ingestion_manager_init, + sagemaker_session_mock, + fs_runtime_client_config_mock, + feature_group_describe_dummy_definitions, + feature_group_dummy_definition_dict, ): sagemaker_session_mock.sagemaker_featurestore_runtime_client.meta.config = ( fs_runtime_client_config_mock ) sagemaker_session_mock.describe_feature_group.return_value = { - 'FeatureDefinitions': feature_group_describe_dummy_definitions + "FeatureDefinitions": feature_group_describe_dummy_definitions } feature_group = FeatureGroup(name="MyGroup", sagemaker_session=sagemaker_session_mock) @@ -839,19 +892,18 @@ def test_ingest( @patch("sagemaker.feature_store.feature_group.IngestionManagerPandas") def test_ingest_default( - ingestion_manager_init, - sagemaker_session_mock, - fs_runtime_client_config_mock, - feature_group_describe_dummy_definitions, - feature_group_dummy_definition_dict - + ingestion_manager_init, + sagemaker_session_mock, + fs_runtime_client_config_mock, + feature_group_describe_dummy_definitions, + feature_group_dummy_definition_dict, ): sagemaker_session_mock.sagemaker_featurestore_runtime_client.meta.config = ( fs_runtime_client_config_mock ) sagemaker_session_mock.boto_session.profile_name = "default" sagemaker_session_mock.describe_feature_group.return_value = { - 'FeatureDefinitions': feature_group_describe_dummy_definitions + "FeatureDefinitions": feature_group_describe_dummy_definitions } feature_group = FeatureGroup(name="MyGroup", sagemaker_session=sagemaker_session_mock) @@ -877,17 +929,17 @@ def test_ingest_default( @patch("sagemaker.feature_store.feature_group.IngestionManagerPandas") def test_ingest_with_target_stores( - ingestion_manager_init, - sagemaker_session_mock, - 
fs_runtime_client_config_mock, - feature_group_describe_dummy_definitions, - feature_group_dummy_definition_dict + ingestion_manager_init, + sagemaker_session_mock, + fs_runtime_client_config_mock, + feature_group_describe_dummy_definitions, + feature_group_dummy_definition_dict, ): sagemaker_session_mock.sagemaker_featurestore_runtime_client.meta.config = ( fs_runtime_client_config_mock ) sagemaker_session_mock.describe_feature_group.return_value = { - 'FeatureDefinitions': feature_group_describe_dummy_definitions + "FeatureDefinitions": feature_group_describe_dummy_definitions } feature_group = FeatureGroup(name="MyGroup", sagemaker_session=sagemaker_session_mock) @@ -895,10 +947,17 @@ def test_ingest_with_target_stores( mock_ingestion_manager_instance = Mock() ingestion_manager_init.return_value = mock_ingestion_manager_instance - feature_group.ingest(data_frame=df, max_workers=10, target_stores=[TargetStoreEnum.ONLINE_STORE]) - feature_group.ingest(data_frame=df, max_workers=10, target_stores=[TargetStoreEnum.OFFLINE_STORE]) feature_group.ingest( - data_frame=df, max_workers=10, target_stores=[TargetStoreEnum.ONLINE_STORE, TargetStoreEnum.OFFLINE_STORE]) + data_frame=df, max_workers=10, target_stores=[TargetStoreEnum.ONLINE_STORE] + ) + feature_group.ingest( + data_frame=df, max_workers=10, target_stores=[TargetStoreEnum.OFFLINE_STORE] + ) + feature_group.ingest( + data_frame=df, + max_workers=10, + target_stores=[TargetStoreEnum.ONLINE_STORE, TargetStoreEnum.OFFLINE_STORE], + ) actual_ingestion_manager_init_calls = ingestion_manager_init.mock_calls expected_ingestion_manager_init_calls = [ @@ -911,7 +970,9 @@ def test_ingest_with_target_stores( max_processes=1, profile_name=sagemaker_session_mock.boto_session.profile_name, ), - call().run(data_frame=df, target_stores=[TargetStoreEnum.ONLINE_STORE], wait=True, timeout=None), + call().run( + data_frame=df, target_stores=[TargetStoreEnum.ONLINE_STORE], wait=True, timeout=None + ), call( feature_group_name="MyGroup", feature_definitions=feature_group_dummy_definition_dict, @@ -921,7 +982,9 @@ def test_ingest_with_target_stores( max_processes=1, profile_name=sagemaker_session_mock.boto_session.profile_name, ), - call().run(data_frame=df, target_stores=[TargetStoreEnum.OFFLINE_STORE], wait=True, timeout=None), + call().run( + data_frame=df, target_stores=[TargetStoreEnum.OFFLINE_STORE], wait=True, timeout=None + ), call( feature_group_name="MyGroup", feature_definitions=feature_group_dummy_definition_dict, @@ -935,25 +998,27 @@ def test_ingest_with_target_stores( data_frame=df, target_stores=[TargetStoreEnum.ONLINE_STORE, TargetStoreEnum.OFFLINE_STORE], wait=True, - timeout=None), + timeout=None, + ), ] - assert actual_ingestion_manager_init_calls == expected_ingestion_manager_init_calls, \ - f"Expected {expected_ingestion_manager_init_calls} calls, but got {actual_ingestion_manager_init_calls}" + assert ( + actual_ingestion_manager_init_calls == expected_ingestion_manager_init_calls + ), f"Expected {expected_ingestion_manager_init_calls} calls, but got {actual_ingestion_manager_init_calls}" @patch("sagemaker.feature_store.feature_group.IngestionManagerPandas") def test_ingest_with_profile_name( - ingestion_manager_init, - sagemaker_session_mock, - fs_runtime_client_config_mock, - feature_group_describe_dummy_definitions, - feature_group_dummy_definition_dict + ingestion_manager_init, + sagemaker_session_mock, + fs_runtime_client_config_mock, + feature_group_describe_dummy_definitions, + feature_group_dummy_definition_dict, ): 
sagemaker_session_mock.sagemaker_featurestore_runtime_client.meta.config = ( fs_runtime_client_config_mock ) sagemaker_session_mock.describe_feature_group.return_value = { - 'FeatureDefinitions': feature_group_describe_dummy_definitions + "FeatureDefinitions": feature_group_describe_dummy_definitions } feature_group = FeatureGroup(name="MyGroup", sagemaker_session=sagemaker_session_mock) @@ -978,7 +1043,7 @@ def test_ingest_with_profile_name( def test_as_hive_ddl_with_default_values( - create_table_ddl, feature_group_dummy_definitions, sagemaker_session_mock + create_table_ddl, feature_group_dummy_definitions, sagemaker_session_mock ): sagemaker_session_mock.describe_feature_group.return_value = { "OfflineStoreConfig": { @@ -994,14 +1059,14 @@ def test_as_hive_ddl_with_default_values( feature_group = FeatureGroup(name="MyGroup", sagemaker_session=sagemaker_session_mock) feature_group.feature_definitions = feature_group_dummy_definitions assert ( - create_table_ddl.format( - database="sagemaker_featurestore", - table_name="MyGroup", - account="1234", - region="us-west-2", - feature_group_name="MyGroup", - ) - == feature_group.as_hive_ddl() + create_table_ddl.format( + database="sagemaker_featurestore", + table_name="MyGroup", + account="1234", + region="us-west-2", + feature_group_name="MyGroup", + ) + == feature_group.as_hive_ddl() ) @@ -1051,7 +1116,7 @@ def test_ingestion_manager_run_success(): PicklableMock(return_value=[]), ) def test_ingestion_manager_run_multi_process_with_multi_thread_success( - fs_runtime_client_config_mock, + fs_runtime_client_config_mock, ): df = pd.DataFrame({"float": pd.Series([2.0], dtype="float64")}) manager = IngestionManagerPandas( @@ -1107,25 +1172,24 @@ def test_ingestion_manager_run_success(sagemaker_session_mock, fs_runtime_client data_frame=df, target_stores=None, row=row, - feature_group_name='MyGroup', + feature_group_name="MyGroup", feature_definitions=feature_group_dummy_definition_dict, sagemaker_fs_runtime_client=fs_runtime_client_config_mock, - failed_rows=[] + failed_rows=[], ) expected_invocation_count = 1 # Set your expected count actual_invocation_count = len(manager._ingest_row.mock_calls) - assert actual_invocation_count == expected_invocation_count, \ - f"Expected {expected_invocation_count} calls, but got {actual_invocation_count}" + assert ( + actual_invocation_count == expected_invocation_count + ), f"Expected {expected_invocation_count} calls, but got {actual_invocation_count}" def test_ingestion_manager_run_standard( - sagemaker_session_mock, - fs_runtime_client_config_mock, - feature_group_dummy_definition_dict + sagemaker_session_mock, fs_runtime_client_config_mock, feature_group_dummy_definition_dict ): sagemaker_session_mock.sagemaker_featurestore_runtime_client = fs_runtime_client_config_mock - df = pd.DataFrame(data={'feature1': [2.0, 3.0], 'feature2': [3, 4], 'feature3': ['abc', 'edf']}) + df = pd.DataFrame(data={"feature1": [2.0, 3.0], "feature2": [3, 4], "feature3": ["abc", "edf"]}) manager = IngestionManagerPandas( feature_group_name="MyGroup", @@ -1141,30 +1205,31 @@ def test_ingestion_manager_run_standard( call( FeatureGroupName="MyGroup", Record=[ - {'FeatureName': 'feature1', 'ValueAsString': '2.0'}, - {'FeatureName': 'feature2', 'ValueAsString': '3'}, - {'FeatureName': 'feature3', 'ValueAsString': 'abc'} - ] + {"FeatureName": "feature1", "ValueAsString": "2.0"}, + {"FeatureName": "feature2", "ValueAsString": "3"}, + {"FeatureName": "feature3", "ValueAsString": "abc"}, + ], ), call( FeatureGroupName="MyGroup", Record=[ 
- {'FeatureName': 'feature1', 'ValueAsString': '3.0'}, - {'FeatureName': 'feature2', 'ValueAsString': '4'}, - {'FeatureName': 'feature3', 'ValueAsString': 'edf'} - ] + {"FeatureName": "feature1", "ValueAsString": "3.0"}, + {"FeatureName": "feature2", "ValueAsString": "4"}, + {"FeatureName": "feature3", "ValueAsString": "edf"}, + ], ), ] - assert actual_put_record_calls == expected_put_record_calls, \ - f"Expected {expected_put_record_calls} calls, but got {actual_put_record_calls}" + assert ( + actual_put_record_calls == expected_put_record_calls + ), f"Expected {expected_put_record_calls} calls, but got {actual_put_record_calls}" def test_ingestion_manager_run_non_collection_type( - sagemaker_session_mock, - fs_runtime_client_config_mock, - feature_group_dummy_definition_dict, - data_frame_with_collection_type, - expected_standard_feature_definition_dict + sagemaker_session_mock, + fs_runtime_client_config_mock, + feature_group_dummy_definition_dict, + data_frame_with_collection_type, + expected_standard_feature_definition_dict, ): sagemaker_session_mock.sagemaker_featurestore_runtime_client = fs_runtime_client_config_mock manager = IngestionManagerPandas( @@ -1179,128 +1244,129 @@ def test_ingestion_manager_run_non_collection_type( actual_put_record_calls = fs_runtime_client_config_mock.put_record.mock_calls expected_put_record_calls = [ call( - FeatureGroupName='MyGroup', + FeatureGroupName="MyGroup", Record=[ - {'FeatureName': 'feature1', 'ValueAsString': '0.0'}, - {'FeatureName': 'feature2', 'ValueAsString': '0'}, - {'FeatureName': 'feature3', 'ValueAsString': '2020-10-30T03:43:21Z'}, - {'FeatureName': 'feature4', 'ValueAsString': '0.0'}, - {'FeatureName': 'feature5', 'ValueAsString': "['a', 'abc']"}, - {'FeatureName': 'feature6', 'ValueAsString': '[1, 2]'}, - {'FeatureName': 'feature7', 'ValueAsString': '[1.1, 2.3]'}, - {'FeatureName': 'feature8', 'ValueAsString': '[1, 2]'}, - {'FeatureName': 'feature9', 'ValueAsString': '[1.1, 2.3]'}, - {'FeatureName': 'feature10', 'ValueAsString': "['a', 'abc']"} - ] + {"FeatureName": "feature1", "ValueAsString": "0.0"}, + {"FeatureName": "feature2", "ValueAsString": "0"}, + {"FeatureName": "feature3", "ValueAsString": "2020-10-30T03:43:21Z"}, + {"FeatureName": "feature4", "ValueAsString": "0.0"}, + {"FeatureName": "feature5", "ValueAsString": "['a', 'abc']"}, + {"FeatureName": "feature6", "ValueAsString": "[1, 2]"}, + {"FeatureName": "feature7", "ValueAsString": "[1.1, 2.3]"}, + {"FeatureName": "feature8", "ValueAsString": "[1, 2]"}, + {"FeatureName": "feature9", "ValueAsString": "[1.1, 2.3]"}, + {"FeatureName": "feature10", "ValueAsString": "['a', 'abc']"}, + ], ), call( - FeatureGroupName='MyGroup', + FeatureGroupName="MyGroup", Record=[ - {'FeatureName': 'feature1', 'ValueAsString': '1.0'}, - {'FeatureName': 'feature2', 'ValueAsString': '1'}, - {'FeatureName': 'feature3', 'ValueAsString': '2020-10-30T03:43:21Z'}, - {'FeatureName': 'feature4', 'ValueAsString': '1.0'}, - {'FeatureName': 'feature5', 'ValueAsString': "['b', 'c']"}, - {'FeatureName': 'feature6', 'ValueAsString': '[1, 2, 3]'}, - {'FeatureName': 'feature7', 'ValueAsString': '[1.4, 2.5, 3.2, 25]'}, - {'FeatureName': 'feature8', 'ValueAsString': '[1, 2, None]'}, - {'FeatureName': 'feature9', 'ValueAsString': '[1.4, 25, 3.2]'}, - {'FeatureName': 'feature10', 'ValueAsString': "['b', 'c']"} - ] + {"FeatureName": "feature1", "ValueAsString": "1.0"}, + {"FeatureName": "feature2", "ValueAsString": "1"}, + {"FeatureName": "feature3", "ValueAsString": "2020-10-30T03:43:21Z"}, + {"FeatureName": 
"feature4", "ValueAsString": "1.0"}, + {"FeatureName": "feature5", "ValueAsString": "['b', 'c']"}, + {"FeatureName": "feature6", "ValueAsString": "[1, 2, 3]"}, + {"FeatureName": "feature7", "ValueAsString": "[1.4, 2.5, 3.2, 25]"}, + {"FeatureName": "feature8", "ValueAsString": "[1, 2, None]"}, + {"FeatureName": "feature9", "ValueAsString": "[1.4, 25, 3.2]"}, + {"FeatureName": "feature10", "ValueAsString": "['b', 'c']"}, + ], ), call( - FeatureGroupName='MyGroup', + FeatureGroupName="MyGroup", Record=[ - {'FeatureName': 'feature1', 'ValueAsString': '2.0'}, - {'FeatureName': 'feature2', 'ValueAsString': '2'}, - {'FeatureName': 'feature3', 'ValueAsString': '2020-10-30T03:43:21Z'}, - {'FeatureName': 'feature4', 'ValueAsString': '2.0'}, - {'FeatureName': 'feature5', 'ValueAsString': "['c', 'f']"}, - {'FeatureName': 'feature6', 'ValueAsString': '[1, 5]'}, - {'FeatureName': 'feature7', 'ValueAsString': '[1.0, 5.3]'}, - {'FeatureName': 'feature8', 'ValueAsString': '[1, 5]'}, - {'FeatureName': 'feature9', 'ValueAsString': '[1.0, 3, None]'}, - {'FeatureName': 'feature10', 'ValueAsString': "['c', None]"} - ] + {"FeatureName": "feature1", "ValueAsString": "2.0"}, + {"FeatureName": "feature2", "ValueAsString": "2"}, + {"FeatureName": "feature3", "ValueAsString": "2020-10-30T03:43:21Z"}, + {"FeatureName": "feature4", "ValueAsString": "2.0"}, + {"FeatureName": "feature5", "ValueAsString": "['c', 'f']"}, + {"FeatureName": "feature6", "ValueAsString": "[1, 5]"}, + {"FeatureName": "feature7", "ValueAsString": "[1.0, 5.3]"}, + {"FeatureName": "feature8", "ValueAsString": "[1, 5]"}, + {"FeatureName": "feature9", "ValueAsString": "[1.0, 3, None]"}, + {"FeatureName": "feature10", "ValueAsString": "['c', None]"}, + ], ), call( - FeatureGroupName='MyGroup', + FeatureGroupName="MyGroup", Record=[ - {'FeatureName': 'feature1', 'ValueAsString': '3.0'}, - {'FeatureName': 'feature2', 'ValueAsString': '3'}, - {'FeatureName': 'feature3', 'ValueAsString': '2020-10-30T03:43:21Z'}, - {'FeatureName': 'feature4', 'ValueAsString': '3.0'}, - {'FeatureName': 'feature5', 'ValueAsString': "['d']"}, - {'FeatureName': 'feature6', 'ValueAsString': '[1]'}, - {'FeatureName': 'feature7', 'ValueAsString': '[1.2]'}, - {'FeatureName': 'feature8', 'ValueAsString': '[1]'}, - {'FeatureName': 'feature9', 'ValueAsString': '[1.2]'}, - {'FeatureName': 'feature10', 'ValueAsString': "['d']"} - ] + {"FeatureName": "feature1", "ValueAsString": "3.0"}, + {"FeatureName": "feature2", "ValueAsString": "3"}, + {"FeatureName": "feature3", "ValueAsString": "2020-10-30T03:43:21Z"}, + {"FeatureName": "feature4", "ValueAsString": "3.0"}, + {"FeatureName": "feature5", "ValueAsString": "['d']"}, + {"FeatureName": "feature6", "ValueAsString": "[1]"}, + {"FeatureName": "feature7", "ValueAsString": "[1.2]"}, + {"FeatureName": "feature8", "ValueAsString": "[1]"}, + {"FeatureName": "feature9", "ValueAsString": "[1.2]"}, + {"FeatureName": "feature10", "ValueAsString": "['d']"}, + ], ), call( - FeatureGroupName='MyGroup', + FeatureGroupName="MyGroup", Record=[ - {'FeatureName': 'feature1', 'ValueAsString': '4.0'}, - {'FeatureName': 'feature2', 'ValueAsString': '4'}, - {'FeatureName': 'feature3', 'ValueAsString': '2020-10-30T03:43:21Z'}, - {'FeatureName': 'feature4', 'ValueAsString': '4.0'} - ] + {"FeatureName": "feature1", "ValueAsString": "4.0"}, + {"FeatureName": "feature2", "ValueAsString": "4"}, + {"FeatureName": "feature3", "ValueAsString": "2020-10-30T03:43:21Z"}, + {"FeatureName": "feature4", "ValueAsString": "4.0"}, + ], ), call( - FeatureGroupName='MyGroup', 
+ FeatureGroupName="MyGroup", Record=[ - {'FeatureName': 'feature1', 'ValueAsString': '5.0'}, - {'FeatureName': 'feature2', 'ValueAsString': '5'}, - {'FeatureName': 'feature3', 'ValueAsString': '2020-10-30T03:43:21Z'}, - {'FeatureName': 'feature8', 'ValueAsString': '[None]'}, - {'FeatureName': 'feature9', 'ValueAsString': '[None]'}, - {'FeatureName': 'feature10', 'ValueAsString': '[None]'} - ] + {"FeatureName": "feature1", "ValueAsString": "5.0"}, + {"FeatureName": "feature2", "ValueAsString": "5"}, + {"FeatureName": "feature3", "ValueAsString": "2020-10-30T03:43:21Z"}, + {"FeatureName": "feature8", "ValueAsString": "[None]"}, + {"FeatureName": "feature9", "ValueAsString": "[None]"}, + {"FeatureName": "feature10", "ValueAsString": "[None]"}, + ], ), call( - FeatureGroupName='MyGroup', + FeatureGroupName="MyGroup", Record=[ - {'FeatureName': 'feature1', 'ValueAsString': '6.0'}, - {'FeatureName': 'feature2', 'ValueAsString': '6'}, - {'FeatureName': 'feature3', 'ValueAsString': '2020-10-30T03:43:21Z'} - ] + {"FeatureName": "feature1", "ValueAsString": "6.0"}, + {"FeatureName": "feature2", "ValueAsString": "6"}, + {"FeatureName": "feature3", "ValueAsString": "2020-10-30T03:43:21Z"}, + ], ), call( - FeatureGroupName='MyGroup', + FeatureGroupName="MyGroup", Record=[ - {'FeatureName': 'feature1', 'ValueAsString': '7.0'}, - {'FeatureName': 'feature2', 'ValueAsString': '7'}, - {'FeatureName': 'feature3', 'ValueAsString': '2020-10-30T03:43:21Z'} - ] + {"FeatureName": "feature1", "ValueAsString": "7.0"}, + {"FeatureName": "feature2", "ValueAsString": "7"}, + {"FeatureName": "feature3", "ValueAsString": "2020-10-30T03:43:21Z"}, + ], ), call( - FeatureGroupName='MyGroup', + FeatureGroupName="MyGroup", Record=[ - {'FeatureName': 'feature1', 'ValueAsString': '8.0'}, - {'FeatureName': 'feature2', 'ValueAsString': '8'}, - {'FeatureName': 'feature3', 'ValueAsString': '2020-10-30T03:43:21Z'} - ] + {"FeatureName": "feature1", "ValueAsString": "8.0"}, + {"FeatureName": "feature2", "ValueAsString": "8"}, + {"FeatureName": "feature3", "ValueAsString": "2020-10-30T03:43:21Z"}, + ], ), call( - FeatureGroupName='MyGroup', + FeatureGroupName="MyGroup", Record=[ - {'FeatureName': 'feature1', 'ValueAsString': '9.0'}, - {'FeatureName': 'feature2', 'ValueAsString': '9'}, - {'FeatureName': 'feature3', 'ValueAsString': '2020-10-30T03:43:21Z'} - ] - ) + {"FeatureName": "feature1", "ValueAsString": "9.0"}, + {"FeatureName": "feature2", "ValueAsString": "9"}, + {"FeatureName": "feature3", "ValueAsString": "2020-10-30T03:43:21Z"}, + ], + ), ] - assert actual_put_record_calls == expected_put_record_calls, \ - f"Expected {expected_put_record_calls} calls, but got {actual_put_record_calls}" + assert ( + actual_put_record_calls == expected_put_record_calls + ), f"Expected {expected_put_record_calls} calls, but got {actual_put_record_calls}" def test_ingestion_manager_run_collection_type( - sagemaker_session_mock, - fs_runtime_client_config_mock, - feature_group_dummy_definition_dict, - data_frame_with_collection_type, - expected_in_memory_feature_definition_dict + sagemaker_session_mock, + fs_runtime_client_config_mock, + feature_group_dummy_definition_dict, + data_frame_with_collection_type, + expected_in_memory_feature_definition_dict, ): sagemaker_session_mock.sagemaker_featurestore_runtime_client = fs_runtime_client_config_mock @@ -1316,120 +1382,121 @@ def test_ingestion_manager_run_collection_type( actual_put_record_calls = fs_runtime_client_config_mock.put_record.mock_calls expected_put_record_calls = [ call( - 
FeatureGroupName='MyGroup', + FeatureGroupName="MyGroup", Record=[ - {'FeatureName': 'feature1', 'ValueAsString': '0.0'}, - {'FeatureName': 'feature2', 'ValueAsString': '0'}, - {'FeatureName': 'feature3', 'ValueAsString': '2020-10-30T03:43:21Z'}, - {'FeatureName': 'feature4', 'ValueAsString': '0.0'}, - {'FeatureName': 'feature5', 'ValueAsStringList': ['a', 'abc']}, - {'FeatureName': 'feature6', 'ValueAsStringList': ['1', '2']}, - {'FeatureName': 'feature7', 'ValueAsStringList': ['1.1', '2.3']}, - {'FeatureName': 'feature8', 'ValueAsStringList': ['1', '2']}, - {'FeatureName': 'feature9', 'ValueAsStringList': ['1.1', '2.3']}, - {'FeatureName': 'feature10', 'ValueAsStringList': ['a', 'abc']} - ] + {"FeatureName": "feature1", "ValueAsString": "0.0"}, + {"FeatureName": "feature2", "ValueAsString": "0"}, + {"FeatureName": "feature3", "ValueAsString": "2020-10-30T03:43:21Z"}, + {"FeatureName": "feature4", "ValueAsString": "0.0"}, + {"FeatureName": "feature5", "ValueAsStringList": ["a", "abc"]}, + {"FeatureName": "feature6", "ValueAsStringList": ["1", "2"]}, + {"FeatureName": "feature7", "ValueAsStringList": ["1.1", "2.3"]}, + {"FeatureName": "feature8", "ValueAsStringList": ["1", "2"]}, + {"FeatureName": "feature9", "ValueAsStringList": ["1.1", "2.3"]}, + {"FeatureName": "feature10", "ValueAsStringList": ["a", "abc"]}, + ], ), call( - FeatureGroupName='MyGroup', + FeatureGroupName="MyGroup", Record=[ - {'FeatureName': 'feature1', 'ValueAsString': '1.0'}, - {'FeatureName': 'feature2', 'ValueAsString': '1'}, - {'FeatureName': 'feature3', 'ValueAsString': '2020-10-30T03:43:21Z'}, - {'FeatureName': 'feature4', 'ValueAsString': '1.0'}, - {'FeatureName': 'feature5', 'ValueAsStringList': ['b', 'c']}, - {'FeatureName': 'feature6', 'ValueAsStringList': ['1', '2', '3']}, - {'FeatureName': 'feature7', 'ValueAsStringList': ['1.4', '2.5', '3.2', '25']}, - {'FeatureName': 'feature8', 'ValueAsStringList': ['1', '2', None]}, - {'FeatureName': 'feature9', 'ValueAsStringList': ['1.4', '25', '3.2']}, - {'FeatureName': 'feature10', 'ValueAsStringList': ['b', 'c']} - ] + {"FeatureName": "feature1", "ValueAsString": "1.0"}, + {"FeatureName": "feature2", "ValueAsString": "1"}, + {"FeatureName": "feature3", "ValueAsString": "2020-10-30T03:43:21Z"}, + {"FeatureName": "feature4", "ValueAsString": "1.0"}, + {"FeatureName": "feature5", "ValueAsStringList": ["b", "c"]}, + {"FeatureName": "feature6", "ValueAsStringList": ["1", "2", "3"]}, + {"FeatureName": "feature7", "ValueAsStringList": ["1.4", "2.5", "3.2", "25"]}, + {"FeatureName": "feature8", "ValueAsStringList": ["1", "2", None]}, + {"FeatureName": "feature9", "ValueAsStringList": ["1.4", "25", "3.2"]}, + {"FeatureName": "feature10", "ValueAsStringList": ["b", "c"]}, + ], ), call( - FeatureGroupName='MyGroup', + FeatureGroupName="MyGroup", Record=[ - {'FeatureName': 'feature1', 'ValueAsString': '2.0'}, - {'FeatureName': 'feature2', 'ValueAsString': '2'}, - {'FeatureName': 'feature3', 'ValueAsString': '2020-10-30T03:43:21Z'}, - {'FeatureName': 'feature4', 'ValueAsString': '2.0'}, - {'FeatureName': 'feature5', 'ValueAsStringList': ['c', 'f']}, - {'FeatureName': 'feature6', 'ValueAsStringList': ['1', '5']}, - {'FeatureName': 'feature7', 'ValueAsStringList': ['1.0', '5.3']}, - {'FeatureName': 'feature8', 'ValueAsStringList': ['1', '5']}, - {'FeatureName': 'feature9', 'ValueAsStringList': ['1.0', '3', None]}, - {'FeatureName': 'feature10', 'ValueAsStringList': ['c', None]} - ] + {"FeatureName": "feature1", "ValueAsString": "2.0"}, + {"FeatureName": "feature2", 
"ValueAsString": "2"}, + {"FeatureName": "feature3", "ValueAsString": "2020-10-30T03:43:21Z"}, + {"FeatureName": "feature4", "ValueAsString": "2.0"}, + {"FeatureName": "feature5", "ValueAsStringList": ["c", "f"]}, + {"FeatureName": "feature6", "ValueAsStringList": ["1", "5"]}, + {"FeatureName": "feature7", "ValueAsStringList": ["1.0", "5.3"]}, + {"FeatureName": "feature8", "ValueAsStringList": ["1", "5"]}, + {"FeatureName": "feature9", "ValueAsStringList": ["1.0", "3", None]}, + {"FeatureName": "feature10", "ValueAsStringList": ["c", None]}, + ], ), call( - FeatureGroupName='MyGroup', + FeatureGroupName="MyGroup", Record=[ - {'FeatureName': 'feature1', 'ValueAsString': '3.0'}, - {'FeatureName': 'feature2', 'ValueAsString': '3'}, - {'FeatureName': 'feature3', 'ValueAsString': '2020-10-30T03:43:21Z'}, - {'FeatureName': 'feature4', 'ValueAsString': '3.0'}, - {'FeatureName': 'feature5', 'ValueAsStringList': ['d']}, - {'FeatureName': 'feature6', 'ValueAsStringList': ['1']}, - {'FeatureName': 'feature7', 'ValueAsStringList': ['1.2']}, - {'FeatureName': 'feature8', 'ValueAsStringList': ['1']}, - {'FeatureName': 'feature9', 'ValueAsStringList': ['1.2']}, - {'FeatureName': 'feature10', 'ValueAsStringList': ['d']} - ] + {"FeatureName": "feature1", "ValueAsString": "3.0"}, + {"FeatureName": "feature2", "ValueAsString": "3"}, + {"FeatureName": "feature3", "ValueAsString": "2020-10-30T03:43:21Z"}, + {"FeatureName": "feature4", "ValueAsString": "3.0"}, + {"FeatureName": "feature5", "ValueAsStringList": ["d"]}, + {"FeatureName": "feature6", "ValueAsStringList": ["1"]}, + {"FeatureName": "feature7", "ValueAsStringList": ["1.2"]}, + {"FeatureName": "feature8", "ValueAsStringList": ["1"]}, + {"FeatureName": "feature9", "ValueAsStringList": ["1.2"]}, + {"FeatureName": "feature10", "ValueAsStringList": ["d"]}, + ], ), call( - FeatureGroupName='MyGroup', + FeatureGroupName="MyGroup", Record=[ - {'FeatureName': 'feature1', 'ValueAsString': '4.0'}, - {'FeatureName': 'feature2', 'ValueAsString': '4'}, - {'FeatureName': 'feature3', 'ValueAsString': '2020-10-30T03:43:21Z'}, - {'FeatureName': 'feature4', 'ValueAsString': '4.0'} - ] + {"FeatureName": "feature1", "ValueAsString": "4.0"}, + {"FeatureName": "feature2", "ValueAsString": "4"}, + {"FeatureName": "feature3", "ValueAsString": "2020-10-30T03:43:21Z"}, + {"FeatureName": "feature4", "ValueAsString": "4.0"}, + ], ), call( - FeatureGroupName='MyGroup', + FeatureGroupName="MyGroup", Record=[ - {'FeatureName': 'feature1', 'ValueAsString': '5.0'}, - {'FeatureName': 'feature2', 'ValueAsString': '5'}, - {'FeatureName': 'feature3', 'ValueAsString': '2020-10-30T03:43:21Z'}, - {'FeatureName': 'feature8', 'ValueAsStringList': [None]}, - {'FeatureName': 'feature9', 'ValueAsStringList': [None]}, - {'FeatureName': 'feature10', 'ValueAsStringList': [None]} - ] + {"FeatureName": "feature1", "ValueAsString": "5.0"}, + {"FeatureName": "feature2", "ValueAsString": "5"}, + {"FeatureName": "feature3", "ValueAsString": "2020-10-30T03:43:21Z"}, + {"FeatureName": "feature8", "ValueAsStringList": [None]}, + {"FeatureName": "feature9", "ValueAsStringList": [None]}, + {"FeatureName": "feature10", "ValueAsStringList": [None]}, + ], ), call( - FeatureGroupName='MyGroup', + FeatureGroupName="MyGroup", Record=[ - {'FeatureName': 'feature1', 'ValueAsString': '6.0'}, - {'FeatureName': 'feature2', 'ValueAsString': '6'}, - {'FeatureName': 'feature3', 'ValueAsString': '2020-10-30T03:43:21Z'} - ] + {"FeatureName": "feature1", "ValueAsString": "6.0"}, + {"FeatureName": "feature2", "ValueAsString": 
"6"}, + {"FeatureName": "feature3", "ValueAsString": "2020-10-30T03:43:21Z"}, + ], ), call( - FeatureGroupName='MyGroup', + FeatureGroupName="MyGroup", Record=[ - {'FeatureName': 'feature1', 'ValueAsString': '7.0'}, - {'FeatureName': 'feature2', 'ValueAsString': '7'}, - {'FeatureName': 'feature3', 'ValueAsString': '2020-10-30T03:43:21Z'} - ] + {"FeatureName": "feature1", "ValueAsString": "7.0"}, + {"FeatureName": "feature2", "ValueAsString": "7"}, + {"FeatureName": "feature3", "ValueAsString": "2020-10-30T03:43:21Z"}, + ], ), call( - FeatureGroupName='MyGroup', + FeatureGroupName="MyGroup", Record=[ - {'FeatureName': 'feature1', 'ValueAsString': '8.0'}, - {'FeatureName': 'feature2', 'ValueAsString': '8'}, - {'FeatureName': 'feature3', 'ValueAsString': '2020-10-30T03:43:21Z'} - ] + {"FeatureName": "feature1", "ValueAsString": "8.0"}, + {"FeatureName": "feature2", "ValueAsString": "8"}, + {"FeatureName": "feature3", "ValueAsString": "2020-10-30T03:43:21Z"}, + ], ), call( - FeatureGroupName='MyGroup', + FeatureGroupName="MyGroup", Record=[ - {'FeatureName': 'feature1', 'ValueAsString': '9.0'}, - {'FeatureName': 'feature2', 'ValueAsString': '9'}, - {'FeatureName': 'feature3', 'ValueAsString': '2020-10-30T03:43:21Z'} - ] - ) + {"FeatureName": "feature1", "ValueAsString": "9.0"}, + {"FeatureName": "feature2", "ValueAsString": "9"}, + {"FeatureName": "feature3", "ValueAsString": "2020-10-30T03:43:21Z"}, + ], + ), ] - assert actual_put_record_calls == expected_put_record_calls, \ - f"Expected {expected_put_record_calls} calls, but got {actual_put_record_calls}" + assert ( + actual_put_record_calls == expected_put_record_calls + ), f"Expected {expected_put_record_calls} calls, but got {actual_put_record_calls}" @patch( diff --git a/tests/unit/test_session.py b/tests/unit/test_session.py index 6b2a17714e..54c1c43391 100644 --- a/tests/unit/test_session.py +++ b/tests/unit/test_session.py @@ -5264,53 +5264,38 @@ def test_list_feature_groups(sagemaker_session): @pytest.fixture() def sagemaker_session_with_featurestore_runtime_client(): boto_mock = MagicMock(name="boto_session") - sagemaker_session = sagemaker.Session(boto_session=boto_mock, sagemaker_featurestore_runtime_client=MagicMock()) + sagemaker_session = sagemaker.Session( + boto_session=boto_mock, sagemaker_featurestore_runtime_client=MagicMock() + ) return sagemaker_session def test_feature_group_put_record(sagemaker_session_with_featurestore_runtime_client): sagemaker_session_with_featurestore_runtime_client.put_record( feature_group_name="MyFeatureGroup", - record=[{ - "FeatureName": "feature1", - "ValueAsString": "value1" - }] + record=[{"FeatureName": "feature1", "ValueAsString": "value1"}], ) assert sagemaker_session_with_featurestore_runtime_client.sagemaker_featurestore_runtime_client.put_record.called_with( FeatureGroupName="MyFeatureGroup", - record=[{ - "FeatureName": "feature1", - "ValueAsString": "value1" - }], + record=[{"FeatureName": "feature1", "ValueAsString": "value1"}], ) -def test_feature_group_put_record_with_ttl_and_target_stores(sagemaker_session_with_featurestore_runtime_client): +def test_feature_group_put_record_with_ttl_and_target_stores( + sagemaker_session_with_featurestore_runtime_client, +): sagemaker_session_with_featurestore_runtime_client.put_record( feature_group_name="MyFeatureGroup", - record=[{ - "FeatureName": "feature1", - "ValueAsString": "value1" - }], - ttl_duration={ - "Unit": "Seconds", - "Value": 123 - }, - target_stores=["OnlineStore", "OfflineStore"] + record=[{"FeatureName": "feature1", 
"ValueAsString": "value1"}], + ttl_duration={"Unit": "Seconds", "Value": 123}, + target_stores=["OnlineStore", "OfflineStore"], ) - assert (sagemaker_session_with_featurestore_runtime_client.sagemaker_featurestore_runtime_client - .put_record.called_with( + assert sagemaker_session_with_featurestore_runtime_client.sagemaker_featurestore_runtime_client.put_record.called_with( FeatureGroupName="MyFeatureGroup", - record=[{ - "FeatureName": "feature1", - "ValueAsString": "value1" - }], + record=[{"FeatureName": "feature1", "ValueAsString": "value1"}], target_stores=["OnlineStore", "OfflineStore"], - ttl_duration={ - "Unit": "Seconds", - "Value": 123 - } - )) + ttl_duration={"Unit": "Seconds", "Value": 123}, + ) def test_start_query_execution(sagemaker_session): From 0e290dbfdbffb4c3aeabc0e089be3acff4aedf55 Mon Sep 17 00:00:00 2001 From: Suryansh Singh Date: Mon, 5 Feb 2024 20:19:48 -0800 Subject: [PATCH 74/76] Fix Flake8 formatting --- src/sagemaker/feature_store/feature_group.py | 53 ++++++++++--------- tests/integ/test_feature_store.py | 22 +++----- .../feature_store/test_feature_group.py | 11 +--- tests/unit/test_session.py | 23 +++++--- 4 files changed, 52 insertions(+), 57 deletions(-) diff --git a/src/sagemaker/feature_store/feature_group.py b/src/sagemaker/feature_store/feature_group.py index 800cc2dff9..e2c52c94d3 100644 --- a/src/sagemaker/feature_store/feature_group.py +++ b/src/sagemaker/feature_store/feature_group.py @@ -28,7 +28,6 @@ import tempfile from concurrent.futures import as_completed from concurrent.futures import ThreadPoolExecutor -from typing import Optional, Sequence, List, Dict, Any, Union from typing import Optional, Sequence, List, Dict, Any, Union, Iterable from urllib.parse import urlparse @@ -331,18 +330,20 @@ def _ingest_row( """ try: record = [ - FeatureValue( - feature_name=data_frame.columns[index - 1], - value_as_string_list=IngestionManagerPandas._covert_feature_value_to_string_list( - row[index] - ), - ) - if IngestionManagerPandas._is_feature_collection_type( - feature_name=data_frame.columns[index - 1], - feature_definitions=feature_definitions, - ) - else FeatureValue( - feature_name=data_frame.columns[index - 1], value_as_string=str(row[index]) + ( + FeatureValue( + feature_name=data_frame.columns[index - 1], + value_as_string_list=IngestionManagerPandas._covert_feature_value_to_string_list( + row[index] + ), + ) + if IngestionManagerPandas._is_feature_collection_type( + feature_name=data_frame.columns[index - 1], + feature_definitions=feature_definitions, + ) + else FeatureValue( + feature_name=data_frame.columns[index - 1], value_as_string=str(row[index]) + ) ) for index in range(1, len(row)) if IngestionManagerPandas._feature_value_is_not_none(feature_value=row[index]) @@ -925,21 +926,23 @@ def _determine_collection_list_type(series: Series) -> FeatureTypeEnum | None: """ if series.apply( - lambda lst: all(isinstance(x, int) or pd.isna(x) for x in lst) - if is_list_like(lst) - else True + lambda lst: ( + all(isinstance(x, int) or pd.isna(x) for x in lst) if is_list_like(lst) else True + ) ).all(): return FeatureTypeEnum.INTEGRAL if series.apply( - lambda lst: all(isinstance(x, (float, int)) or pd.isna(x) for x in lst) - if is_list_like(lst) - else True + lambda lst: ( + all(isinstance(x, (float, int)) or pd.isna(x) for x in lst) + if is_list_like(lst) + else True + ) ).all(): return FeatureTypeEnum.FRACTIONAL if series.apply( - lambda lst: all(isinstance(x, str) or pd.isna(x) for x in lst) - if is_list_like(lst) - else True + lambda lst: ( + 
all(isinstance(x, str) or pd.isna(x) for x in lst) if is_list_like(lst) else True + ) ).all(): return FeatureTypeEnum.STRING return None @@ -1056,9 +1059,9 @@ def put_record( return self.sagemaker_session.put_record( feature_group_name=self.name, record=[value.to_dict() for value in record], - target_stores=[target_store.value for target_store in target_stores] - if target_stores - else None, + target_stores=( + [target_store.value for target_store in target_stores] if target_stores else None + ), ttl_duration=ttl_duration.to_dict() if ttl_duration is not None else None, ) diff --git a/tests/integ/test_feature_store.py b/tests/integ/test_feature_store.py index ec51205dcd..949c41e79c 100644 --- a/tests/integ/test_feature_store.py +++ b/tests/integ/test_feature_store.py @@ -751,12 +751,6 @@ def test_create_feature_store_ingest_with_online_target_stores( ) _wait_for_feature_group_create(feature_group) - resolved_output_s3_uri = ( - feature_group.describe() - .get("OfflineStoreConfig") - .get("S3StorageConfig") - .get("ResolvedOutputS3Uri") - ) # Ingest data ingestion_manager = feature_group.ingest( data_frame=pandas_data_frame, @@ -783,7 +777,7 @@ def test_create_feature_store_ingest_with_online_target_stores( ], target_stores=[TargetStoreEnum.OFFLINE_STORE], ) - assert feature_group.get_record(record_identifier_value_as_string="100.0") == None + assert feature_group.get_record(record_identifier_value_as_string="100.0") is None # Query the integrated Glue table. athena_query = feature_group.athena_query() @@ -823,7 +817,7 @@ def test_put_record_with_target_stores( feature_group = FeatureGroup(name=feature_group_name, sagemaker_session=feature_store_session) feature_group.load_feature_definitions(data_frame=pandas_data_frame) with cleanup_feature_group(feature_group): - output = feature_group.create( + feature_group.create( s3_uri=offline_store_s3_uri, record_identifier_name="feature1", event_time_feature_name="feature3", @@ -840,7 +834,7 @@ def test_put_record_with_target_stores( ], target_stores=[TargetStoreEnum.OFFLINE_STORE], ) - assert feature_group.get_record(record_identifier_value_as_string="100.0") == None + assert feature_group.get_record(record_identifier_value_as_string="100.0") is None feature_group.put_record( record=[ @@ -2188,7 +2182,7 @@ def test_ingest_in_memory_multi_process_with_collection_types( online_storage_type=OnlineStoreStorageTypeEnum.IN_MEMORY, ) - with (cleanup_feature_group(feature_group)): + with cleanup_feature_group(feature_group): output = feature_group.create( record_identifier_name="feature1", event_time_feature_name="feature3", @@ -2229,7 +2223,7 @@ def test_ingest_in_memory_multi_process_with_collection_types( pandas_data_frame_with_collection_type.loc[ len(pandas_data_frame_with_collection_type) ] = new_row_data - with pytest.raises(IngestionError) as error: + with pytest.raises(IngestionError): feature_group.ingest( data_frame=pandas_data_frame_with_collection_type, max_workers=1, @@ -2252,7 +2246,7 @@ def test_ingest_in_memory_single_process_with_collection_types( online_storage_type=OnlineStoreStorageTypeEnum.IN_MEMORY, ) - with (cleanup_feature_group(feature_group)): + with cleanup_feature_group(feature_group): output = feature_group.create( record_identifier_name="feature1", event_time_feature_name="feature3", @@ -2292,7 +2286,7 @@ def test_ingest_in_memory_single_process_with_collection_types( pandas_data_frame_with_collection_type.loc[ len(pandas_data_frame_with_collection_type) ] = new_row_data - with pytest.raises(IngestionError) as error: + 
with pytest.raises(IngestionError): feature_group.ingest( data_frame=pandas_data_frame_with_collection_type, max_workers=1, @@ -2315,7 +2309,7 @@ def test_ingest_standard_multi_process_with_collection_types( online_storage_type=OnlineStoreStorageTypeEnum.STANDARD, ) - with (cleanup_feature_group(feature_group)): + with cleanup_feature_group(feature_group): output = feature_group.create( record_identifier_name="feature1", event_time_feature_name="feature3", diff --git a/tests/unit/sagemaker/feature_store/test_feature_group.py b/tests/unit/sagemaker/feature_store/test_feature_group.py index 5a26fce026..1084d338f2 100644 --- a/tests/unit/sagemaker/feature_store/test_feature_group.py +++ b/tests/unit/sagemaker/feature_store/test_feature_group.py @@ -104,15 +104,6 @@ def feature_group_dummy_definition_dict(): } -@pytest.fixture -def feature_group_dummy_definitions(): - return [ - FractionalFeatureDefinition(feature_name="feature1"), - IntegralFeatureDefinition(feature_name="feature2"), - StringFeatureDefinition(feature_name="feature3"), - ] - - @pytest.fixture def data_frame_with_collection_type(): df = pd.DataFrame( @@ -1097,7 +1088,7 @@ def test_as_hive_ddl(create_table_ddl, feature_group_dummy_definitions, sagemake "sagemaker.feature_store.feature_group.IngestionManagerPandas._run_multi_process", MagicMock(), ) -def test_ingestion_manager_run_success(): +def test_ingestion_manager__run_multi_process_success(): df = pd.DataFrame({"float": pd.Series([2.0], dtype="float64")}) manager = IngestionManagerPandas( feature_group_name="MyGroup", diff --git a/tests/unit/test_session.py b/tests/unit/test_session.py index 54c1c43391..aa3306de1d 100644 --- a/tests/unit/test_session.py +++ b/tests/unit/test_session.py @@ -5275,12 +5275,16 @@ def test_feature_group_put_record(sagemaker_session_with_featurestore_runtime_cl feature_group_name="MyFeatureGroup", record=[{"FeatureName": "feature1", "ValueAsString": "value1"}], ) - assert sagemaker_session_with_featurestore_runtime_client.sagemaker_featurestore_runtime_client.put_record.called_with( - FeatureGroupName="MyFeatureGroup", - record=[{"FeatureName": "feature1", "ValueAsString": "value1"}], + assert ( + sagemaker_session_with_featurestore_runtime_client.sagemaker_featurestore_runtime_client + .put_record.called_with( + FeatureGroupName="MyFeatureGroup", + record=[{"FeatureName": "feature1", "ValueAsString": "value1"}], + ) ) + def test_feature_group_put_record_with_ttl_and_target_stores( sagemaker_session_with_featurestore_runtime_client, ): @@ -5290,11 +5294,14 @@ def test_feature_group_put_record_with_ttl_and_target_stores( ttl_duration={"Unit": "Seconds", "Value": 123}, target_stores=["OnlineStore", "OfflineStore"], ) - assert sagemaker_session_with_featurestore_runtime_client.sagemaker_featurestore_runtime_client.put_record.called_with( - FeatureGroupName="MyFeatureGroup", - record=[{"FeatureName": "feature1", "ValueAsString": "value1"}], - target_stores=["OnlineStore", "OfflineStore"], - ttl_duration={"Unit": "Seconds", "Value": 123}, + assert ( + sagemaker_session_with_featurestore_runtime_client.sagemaker_featurestore_runtime_client + .put_record.called_with( + FeatureGroupName="MyFeatureGroup", + record=[{"FeatureName": "feature1", "ValueAsString": "value1"}], + target_stores=["OnlineStore", "OfflineStore"], + ttl_duration={"Unit": "Seconds", "Value": 123}, + ) ) From 7228b4720e5be93b93100b7821448d44bf9aebab Mon Sep 17 00:00:00 2001 From: Suryansh Singh Date: Mon, 5 Feb 2024 21:30:22 -0800 Subject: [PATCH 75/76] Fix Pylint --- 
src/sagemaker/feature_store/feature_group.py | 50 +++++++++++++------ .../feature_store/test_feature_group.py | 4 +- tests/unit/test_session.py | 14 +++--- 3 files changed, 44 insertions(+), 24 deletions(-) diff --git a/src/sagemaker/feature_store/feature_group.py b/src/sagemaker/feature_store/feature_group.py index e2c52c94d3..dab9843be9 100644 --- a/src/sagemaker/feature_store/feature_group.py +++ b/src/sagemaker/feature_store/feature_group.py @@ -333,7 +333,8 @@ def _ingest_row( ( FeatureValue( feature_name=data_frame.columns[index - 1], - value_as_string_list=IngestionManagerPandas._covert_feature_value_to_string_list( + value_as_string_list + =IngestionManagerPandas._covert_feature_value_to_string_list( row[index] ), ) @@ -373,7 +374,8 @@ def _is_feature_collection_type( feature_name (str): name of the feature. feature_definitions (Dict[str, Dict[Any, Any]]): dictionary of feature definitions. where the key is the feature name and the value is the FeatureDefinition. - The FeatureDefinition contains the data type of the feature and the type of collection. + The FeatureDefinition contains the data type of the feature and + the type of collection. If the feature is not a collection type, the value of the CollectionType attribute is None. @@ -383,6 +385,7 @@ def _is_feature_collection_type( feature_definition = feature_definitions.get(feature_name) if feature_definition is not None: return feature_definition.get("CollectionType") is not None + return None @staticmethod def _feature_value_is_not_none( @@ -390,15 +393,20 @@ def _feature_value_is_not_none( ): """Check if the feature value is not None. - For Collection Type feature, we want to keep this check simple, where if the value is not None, + For Collection Type feature, we want to keep this check simple, + where if the value is not None, we convert and pass it to PutRecord, instead of relying on Pandas.notna(obj).all(). - Also, we don't want to skip the collection attribute with partial None values, when calling PutRecord. Since, - vector value can have some dimensions as None. Instead, we want to let PutRecord either accept or fail the - entire record based on the service side implementation. As of this change the service fails any partial None + Also, we don't want to skip the collection attribute with partial None values, + when calling PutRecord. Since, + vector value can have some dimensions as None. Instead, + we want to let PutRecord either accept or fail the + entire record based on the service side implementation. + As of this change the service fails any partial None collection types. - For the Scalar values (non Collection) we want to still use pd.notna() to keep the behavior same. + For the Scalar values (non Collection) we want to still use pd.notna() + to keep the behavior same. Args: feature_value (Any): feature value. @@ -422,7 +430,8 @@ def _covert_feature_value_to_string_list(feature_value: List[Any]): """ if not is_list_like(feature_value): raise ValueError( - f"Invalid feature value, feature value: {feature_value} for a collection type feature" + f"Invalid feature value, feature value: {feature_value}" + f" for a collection type feature" f" must be an Array, but instead was {type(feature_value)}" ) return [str(value) if value is not None else None for value in feature_value] @@ -996,18 +1005,25 @@ def load_feature_definitions( No feature definitions will be loaded if the given data_frame contains unsupported dtypes. 
- For IN_MEMORY online_storage_type all collection type columns within DataFrame will be inferred as a List, - instead of a String. Due to performance limitations, only first 1,000 values of the column will be sampled, - when inferring collection Type. Customers can manually update the inferred collection type as needed. + For IN_MEMORY online_storage_type all collection type columns within DataFrame + will be inferred as a List, + instead of a String. Due to performance limitations, + only first 1,000 values of the column will be sampled, + when inferring collection Type. + Customers can manually update the inferred collection type as needed. Args: data_frame (DataFrame): A Pandas DataFrame containing features. online_storage_type (OnlineStoreStorageTypeEnum): - Optional. Online storage type for the feature group. The value can be either STANDARD or IN_MEMORY + Optional. Online storage type for the feature group. + The value can be either STANDARD or IN_MEMORY If not specified,STANDARD will be used by default. - If specified as IN_MEMORY, we will infer any collection type column within DataFrame as a List - instead of a String. All, collection types (List, Set and Vector) will be inferred as List. - We will only sample the first 1,000 values of the column when inferring collection Type. + If specified as IN_MEMORY, + we will infer any collection type column within DataFrame as a List instead of a + String. + All, collection types (List, Set and Vector) will be inferred as List. + We will only sample the first 1,000 values of the column when inferring + collection Type. @@ -1157,7 +1173,8 @@ def ingest( feature_group_name=self.name, feature_definitions=feature_definition_dict, sagemaker_session=self.sagemaker_session, - sagemaker_fs_runtime_client_config=self.sagemaker_session.sagemaker_featurestore_runtime_client.meta.config, + sagemaker_fs_runtime_client_config + =self.sagemaker_session.sagemaker_featurestore_runtime_client.meta.config, max_workers=max_workers, max_processes=max_processes, profile_name=profile_name, @@ -1169,6 +1186,7 @@ def ingest( def _get_feature_definition_dict(self) -> Dict[str, Dict[Any, Any]]: """Get a dictionary of feature definitions with Feature Name as Key. + We are converting the FeatureDefinition into a List for faster lookups. 
Returns: diff --git a/tests/unit/sagemaker/feature_store/test_feature_group.py b/tests/unit/sagemaker/feature_store/test_feature_group.py index 1084d338f2..5155a802d4 100644 --- a/tests/unit/sagemaker/feature_store/test_feature_group.py +++ b/tests/unit/sagemaker/feature_store/test_feature_group.py @@ -1099,7 +1099,9 @@ def test_ingestion_manager__run_multi_process_success(): ) manager.run(df) - manager._run_multi_process.assert_called_once_with(data_frame=df, wait=True, timeout=None) + manager._run_multi_process.assert_called_once_with( + data_frame=df, target_stores=None, wait=True, timeout=None + ) @patch( diff --git a/tests/unit/test_session.py b/tests/unit/test_session.py index aa3306de1d..fe87bc8def 100644 --- a/tests/unit/test_session.py +++ b/tests/unit/test_session.py @@ -5262,7 +5262,7 @@ def test_list_feature_groups(sagemaker_session): @pytest.fixture() -def sagemaker_session_with_featurestore_runtime_client(): +def sagemaker_session_with_fs_runtime_client(): boto_mock = MagicMock(name="boto_session") sagemaker_session = sagemaker.Session( boto_session=boto_mock, sagemaker_featurestore_runtime_client=MagicMock() @@ -5270,13 +5270,13 @@ def sagemaker_session_with_featurestore_runtime_client(): return sagemaker_session -def test_feature_group_put_record(sagemaker_session_with_featurestore_runtime_client): - sagemaker_session_with_featurestore_runtime_client.put_record( +def test_feature_group_put_record(sagemaker_session_with_fs_runtime_client): + sagemaker_session_with_fs_runtime_client.put_record( feature_group_name="MyFeatureGroup", record=[{"FeatureName": "feature1", "ValueAsString": "value1"}], ) assert ( - sagemaker_session_with_featurestore_runtime_client.sagemaker_featurestore_runtime_client + sagemaker_session_with_fs_runtime_client.sagemaker_featurestore_runtime_client .put_record.called_with( FeatureGroupName="MyFeatureGroup", record=[{"FeatureName": "feature1", "ValueAsString": "value1"}], @@ -5286,16 +5286,16 @@ def test_feature_group_put_record(sagemaker_session_with_featurestore_runtime_cl def test_feature_group_put_record_with_ttl_and_target_stores( - sagemaker_session_with_featurestore_runtime_client, + sagemaker_session_with_fs_runtime_client, ): - sagemaker_session_with_featurestore_runtime_client.put_record( + sagemaker_session_with_fs_runtime_client.put_record( feature_group_name="MyFeatureGroup", record=[{"FeatureName": "feature1", "ValueAsString": "value1"}], ttl_duration={"Unit": "Seconds", "Value": 123}, target_stores=["OnlineStore", "OfflineStore"], ) assert ( - sagemaker_session_with_featurestore_runtime_client.sagemaker_featurestore_runtime_client + sagemaker_session_with_fs_runtime_client.sagemaker_featurestore_runtime_client .put_record.called_with( FeatureGroupName="MyFeatureGroup", record=[{"FeatureName": "feature1", "ValueAsString": "value1"}], From 85e1536dcf8220b29d167d40fc466e239d44a28b Mon Sep 17 00:00:00 2001 From: Suryansh Singh Date: Tue, 6 Feb 2024 12:23:34 -0800 Subject: [PATCH 76/76] Fix Formatting. 
--- src/sagemaker/feature_store/feature_group.py | 10 +++---- tests/unit/test_session.py | 28 +++++++++----------- 2 files changed, 17 insertions(+), 21 deletions(-) diff --git a/src/sagemaker/feature_store/feature_group.py b/src/sagemaker/feature_store/feature_group.py index dab9843be9..39915b60dc 100644 --- a/src/sagemaker/feature_store/feature_group.py +++ b/src/sagemaker/feature_store/feature_group.py @@ -333,8 +333,7 @@ def _ingest_row( ( FeatureValue( feature_name=data_frame.columns[index - 1], - value_as_string_list - =IngestionManagerPandas._covert_feature_value_to_string_list( + value_as_string_list=IngestionManagerPandas._covert_feature_to_string_list( row[index] ), ) @@ -419,7 +418,7 @@ def _feature_value_is_not_none( return feature_value @staticmethod - def _covert_feature_value_to_string_list(feature_value: List[Any]): + def _covert_feature_to_string_list(feature_value: List[Any]): """Convert a list of feature values to a list of strings. Args: @@ -1173,8 +1172,9 @@ def ingest( feature_group_name=self.name, feature_definitions=feature_definition_dict, sagemaker_session=self.sagemaker_session, - sagemaker_fs_runtime_client_config - =self.sagemaker_session.sagemaker_featurestore_runtime_client.meta.config, + sagemaker_fs_runtime_client_config=( + self.sagemaker_session.sagemaker_featurestore_runtime_client.meta.config + ), max_workers=max_workers, max_processes=max_processes, profile_name=profile_name, diff --git a/tests/unit/test_session.py b/tests/unit/test_session.py index fe87bc8def..de543b6f53 100644 --- a/tests/unit/test_session.py +++ b/tests/unit/test_session.py @@ -5275,18 +5275,16 @@ def test_feature_group_put_record(sagemaker_session_with_fs_runtime_client): feature_group_name="MyFeatureGroup", record=[{"FeatureName": "feature1", "ValueAsString": "value1"}], ) - assert ( - sagemaker_session_with_fs_runtime_client.sagemaker_featurestore_runtime_client - .put_record.called_with( - FeatureGroupName="MyFeatureGroup", - record=[{"FeatureName": "feature1", "ValueAsString": "value1"}], - ) - ) + fs_client_mock = sagemaker_session_with_fs_runtime_client.sagemaker_featurestore_runtime_client + assert fs_client_mock.put_record.called_with( + FeatureGroupName="MyFeatureGroup", + record=[{"FeatureName": "feature1", "ValueAsString": "value1"}], + ) def test_feature_group_put_record_with_ttl_and_target_stores( - sagemaker_session_with_fs_runtime_client, + sagemaker_session_with_fs_runtime_client, ): sagemaker_session_with_fs_runtime_client.put_record( feature_group_name="MyFeatureGroup", @@ -5294,14 +5292,12 @@ def test_feature_group_put_record_with_ttl_and_target_stores( ttl_duration={"Unit": "Seconds", "Value": 123}, target_stores=["OnlineStore", "OfflineStore"], ) - assert ( - sagemaker_session_with_fs_runtime_client.sagemaker_featurestore_runtime_client - .put_record.called_with( - FeatureGroupName="MyFeatureGroup", - record=[{"FeatureName": "feature1", "ValueAsString": "value1"}], - target_stores=["OnlineStore", "OfflineStore"], - ttl_duration={"Unit": "Seconds", "Value": 123}, - ) + fs_client_mock = sagemaker_session_with_fs_runtime_client.sagemaker_featurestore_runtime_client + assert fs_client_mock.put_record.called_with( + FeatureGroupName="MyFeatureGroup", + record=[{"FeatureName": "feature1", "ValueAsString": "value1"}], + target_stores=["OnlineStore", "OfflineStore"], + ttl_duration={"Unit": "Seconds", "Value": 123}, )
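For context on the Session-level contract these test_session.py hunks pin down: a minimal usage sketch, assuming a configured sagemaker.Session with Feature Store permissions and an already-created feature group (the group name and feature values below are placeholders for illustration, not taken from this patch series):

    import sagemaker

    # Minimal sketch, not part of the patch. Assumes AWS credentials and region
    # are configured and that a feature group named "MyFeatureGroup" (placeholder)
    # exists with a String feature called "feature1". A real record must also
    # include the group's record identifier and event time features.
    session = sagemaker.Session()
    session.put_record(
        feature_group_name="MyFeatureGroup",
        record=[{"FeatureName": "feature1", "ValueAsString": "value1"}],
        # Optional record-level TTL and explicit target stores, matching the
        # keyword arguments exercised by the tests above.
        ttl_duration={"Unit": "Seconds", "Value": 123},
        target_stores=["OnlineStore", "OfflineStore"],
    )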