From 1e7fcc8f5d4f553700db04283cf7bc00f96fa116 Mon Sep 17 00:00:00 2001
From: Marcio Vinicius dos Santos
Date: Wed, 24 Jul 2019 09:59:55 -0700
Subject: [PATCH 1/4] Removing unnecessary test case

---
 tests/integ/test_tf_script_mode.py | 12 +-----------
 1 file changed, 1 insertion(+), 11 deletions(-)

diff --git a/tests/integ/test_tf_script_mode.py b/tests/integ/test_tf_script_mode.py
index 5892c2736a..ea4541c5bb 100644
--- a/tests/integ/test_tf_script_mode.py
+++ b/tests/integ/test_tf_script_mode.py
@@ -40,17 +40,7 @@
 @pytest.fixture(
     scope="session",
-    params=[
-        "ml.c4.xlarge",
-        pytest.param(
-            "ml.p2.xlarge",
-            marks=pytest.mark.skipif(
-                tests.integ.test_region() in tests.integ.HOSTING_NO_P2_REGIONS
-                or tests.integ.test_region() in tests.integ.TRAINING_NO_P2_REGIONS,
-                reason="no ml.p2 instances in this region",
-            ),
-        ),
-    ],
+    params=["ml.c4.xlarge"]
 )
 def instance_type(request):
     return request.param

From 2b4dcf9ae7f688c8134e8b9e31b2ea7c26587465 Mon Sep 17 00:00:00 2001
From: Marcio Vinicius dos Santos
Date: Wed, 24 Jul 2019 10:07:24 -0700
Subject: [PATCH 2/4] Update test_tf_script_mode.py

---
 tests/integ/test_tf_script_mode.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tests/integ/test_tf_script_mode.py b/tests/integ/test_tf_script_mode.py
index ea4541c5bb..bcfe094859 100644
--- a/tests/integ/test_tf_script_mode.py
+++ b/tests/integ/test_tf_script_mode.py
@@ -40,7 +40,9 @@
 @pytest.fixture(
     scope="session",
-    params=["ml.c4.xlarge"]
+    params=[
+        "ml.c4.xlarge",
+    ],
 )
 def instance_type(request):
     return request.param

From 09642f5d7f1e6225e082158179a3fef9d54d5d19 Mon Sep 17 00:00:00 2001
From: Marcio Vinicius dos Santos
Date: Wed, 24 Jul 2019 16:27:41 -0700
Subject: [PATCH 3/4] Update test_tf_script_mode.py

---
 tests/integ/test_tf_script_mode.py | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/tests/integ/test_tf_script_mode.py b/tests/integ/test_tf_script_mode.py
index bcfe094859..a0dce773fe 100644
--- a/tests/integ/test_tf_script_mode.py
+++ b/tests/integ/test_tf_script_mode.py
@@ -38,12 +38,7 @@
 TAGS = [{"Key": "some-key", "Value": "some-value"}]


-@pytest.fixture(
-    scope="session",
-    params=[
-        "ml.c4.xlarge",
-    ],
-)
+@pytest.fixture(scope="session", params=["ml.c4.xlarge"])
 def instance_type(request):
     return request.param

From b6a735fc977531c57fd608160de9a66100c4ab77 Mon Sep 17 00:00:00 2001
From: Marcio Dos Santos
Date: Thu, 25 Jul 2019 11:23:51 -0700
Subject: [PATCH 4/4] removed additional tests

---
 tests/integ/test_horovod.py        | 15 +-----
 tests/integ/test_pytorch_train.py  | 38 --------
 tests/integ/test_tf_cifar.py       | 83 ------
 tests/integ/test_tf_keras.py       |  6 +--
 tests/integ/test_tf_script_mode.py | 21 ++++----
 5 files changed, 15 insertions(+), 148 deletions(-)
 delete mode 100644 tests/integ/test_tf_cifar.py

diff --git a/tests/integ/test_horovod.py b/tests/integ/test_horovod.py
index 76e50b15e6..befc428fa4 100644
--- a/tests/integ/test_horovod.py
+++ b/tests/integ/test_horovod.py
@@ -23,23 +23,12 @@
 import sagemaker.utils
 import tests.integ as integ
 from sagemaker.tensorflow import TensorFlow
-from tests.integ import test_region, timeout, HOSTING_NO_P3_REGIONS
+from tests.integ import timeout

 horovod_dir = os.path.join(os.path.dirname(__file__), "..", "data", "horovod")


-@pytest.fixture(
-    scope="session",
-    params=[
-        "ml.c4.xlarge",
-        pytest.param(
-            "ml.p3.2xlarge",
-            marks=pytest.mark.skipif(
-                test_region() in HOSTING_NO_P3_REGIONS, reason="no ml.p3 instances in this region"
-            ),
-        ),
-    ],
-)
+@pytest.fixture(scope="session", params=["ml.c4.xlarge"])
 def instance_type(request):
     return request.param
diff --git a/tests/integ/test_pytorch_train.py b/tests/integ/test_pytorch_train.py
index f21450ceed..9e6d32b9e0 100644
--- a/tests/integ/test_pytorch_train.py
+++ b/tests/integ/test_pytorch_train.py
@@ -13,11 +13,9 @@
 from __future__ import absolute_import

 import os
-import time

 import numpy
 import pytest
-import tests.integ
 from tests.integ import DATA_DIR, PYTHON_VERSION, TRAINING_DEFAULT_TIMEOUT_MINUTES
 from tests.integ.timeout import timeout, timeout_and_delete_endpoint_by_name
@@ -80,42 +78,6 @@ def test_deploy_model(pytorch_training_job, sagemaker_session):
     assert output.shape == (batch_size, 10)


-@pytest.mark.skipif(
-    tests.integ.test_region() in tests.integ.HOSTING_NO_P2_REGIONS
-    or tests.integ.test_region() in tests.integ.TRAINING_NO_P2_REGIONS,
-    reason="no ml.p2 instances in these regions",
-)
-def test_async_fit_deploy(sagemaker_session, pytorch_full_version):
-    training_job_name = ""
-    # TODO: add tests against local mode when it's ready to be used
-    instance_type = "ml.p2.xlarge"
-
-    with timeout(minutes=10):
-        pytorch = _get_pytorch_estimator(sagemaker_session, pytorch_full_version, instance_type)
-
-        pytorch.fit({"training": _upload_training_data(pytorch)}, wait=False)
-        training_job_name = pytorch.latest_training_job.name
-
-    print("Waiting to re-attach to the training job: %s" % training_job_name)
-    time.sleep(20)
-
-    if not _is_local_mode(instance_type):
-        endpoint_name = "test-pytorch-async-fit-attach-deploy-{}".format(sagemaker_timestamp())
-
-        with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
-            print("Re-attaching now to: %s" % training_job_name)
-            estimator = PyTorch.attach(
-                training_job_name=training_job_name, sagemaker_session=sagemaker_session
-            )
-            predictor = estimator.deploy(1, instance_type, endpoint_name=endpoint_name)
-
-            batch_size = 100
-            data = numpy.random.rand(batch_size, 1, 28, 28).astype(numpy.float32)
-            output = predictor.predict(data)
-
-            assert output.shape == (batch_size, 10)
-
-
 def _upload_training_data(pytorch):
     return pytorch.sagemaker_session.upload_data(
         path=os.path.join(MNIST_DIR, "training"),
diff --git a/tests/integ/test_tf_cifar.py b/tests/integ/test_tf_cifar.py
deleted file mode 100644
index 75a1f8635f..0000000000
--- a/tests/integ/test_tf_cifar.py
+++ /dev/null
@@ -1,83 +0,0 @@
-# Copyright 2017-2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"). You
-# may not use this file except in compliance with the License. A copy of
-# the License is located at
-#
-#     http://aws.amazon.com/apache2.0/
-#
-# or in the "license" file accompanying this file. This file is
-# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
-# ANY KIND, either express or implied. See the License for the specific
-# language governing permissions and limitations under the License.
-from __future__ import absolute_import
-
-import os
-import pickle
-
-import numpy as np
-import pytest
-
-import tests.integ
-from tests.integ.timeout import timeout_and_delete_endpoint_by_name, timeout
-
-from sagemaker.tensorflow import TensorFlow
-from sagemaker.utils import unique_name_from_base
-
-PICKLE_CONTENT_TYPE = "application/python-pickle"
-
-
-class PickleSerializer(object):
-    def __init__(self):
-        self.content_type = PICKLE_CONTENT_TYPE
-
-    def __call__(self, data):
-        return pickle.dumps(data, protocol=2)
-
-
-@pytest.mark.canary_quick
-@pytest.mark.skipif(
-    tests.integ.PYTHON_VERSION != "py2", reason="TensorFlow image supports only python 2."
-)
-@pytest.mark.skipif(
-    tests.integ.test_region() in tests.integ.HOSTING_NO_P2_REGIONS
-    or tests.integ.test_region() in tests.integ.TRAINING_NO_P2_REGIONS,
-    reason="no ml.p2 instances in these regions",
-)
-def test_cifar(sagemaker_session):
-    with timeout(minutes=45):
-        script_path = os.path.join(tests.integ.DATA_DIR, "cifar_10", "source")
-
-        dataset_path = os.path.join(tests.integ.DATA_DIR, "cifar_10", "data")
-
-        estimator = TensorFlow(
-            entry_point="resnet_cifar_10.py",
-            source_dir=script_path,
-            role="SageMakerRole",
-            framework_version="1.12",
-            training_steps=50,
-            evaluation_steps=5,
-            train_instance_count=2,
-            train_instance_type="ml.p2.xlarge",
-            sagemaker_session=sagemaker_session,
-            train_max_run=45 * 60,
-            base_job_name="test-cifar",
-        )
-
-        inputs = estimator.sagemaker_session.upload_data(
-            path=dataset_path, key_prefix="data/cifar10"
-        )
-        job_name = unique_name_from_base("test-tf-cifar")
-
-        estimator.fit(inputs, logs=False, job_name=job_name)
-        print("job succeeded: {}".format(estimator.latest_training_job.name))
-
-    endpoint_name = estimator.latest_training_job.name
-    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
-        predictor = estimator.deploy(initial_instance_count=1, instance_type="ml.p2.xlarge")
-        predictor.serializer = PickleSerializer()
-        predictor.content_type = PICKLE_CONTENT_TYPE
-
-        data = np.random.randn(32, 32, 3)
-        predict_response = predictor.predict(data)
-        assert len(predict_response["outputs"]["probabilities"]["floatVal"]) == 10
diff --git a/tests/integ/test_tf_keras.py b/tests/integ/test_tf_keras.py
index 9939e67b2a..34d4d49238 100644
--- a/tests/integ/test_tf_keras.py
+++ b/tests/integ/test_tf_keras.py
@@ -28,10 +28,6 @@
 @pytest.mark.skipif(
     tests.integ.PYTHON_VERSION != "py2", reason="TensorFlow image supports only python 2."
 )
-@pytest.mark.skipif(
-    tests.integ.test_region() in tests.integ.HOSTING_NO_P2_REGIONS,
-    reason="no ml.p2 instances in these regions",
-)
 def test_keras(sagemaker_session):
     script_path = os.path.join(tests.integ.DATA_DIR, "cifar_10", "source")
     dataset_path = os.path.join(tests.integ.DATA_DIR, "cifar_10", "data")
@@ -60,7 +56,7 @@ def test_keras(sagemaker_session):

     endpoint_name = estimator.latest_training_job.name
     with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
-        predictor = estimator.deploy(initial_instance_count=1, instance_type="ml.p2.xlarge")
+        predictor = estimator.deploy(initial_instance_count=1, instance_type="ml.c4.xlarge")

         data = np.random.randn(32, 32, 3)
         predict_response = predictor.predict(data)
diff --git a/tests/integ/test_tf_script_mode.py b/tests/integ/test_tf_script_mode.py
index bcfe094859..204f47ebde 100644
--- a/tests/integ/test_tf_script_mode.py
+++ b/tests/integ/test_tf_script_mode.py
@@ -38,12 +38,7 @@
 TAGS = [{"Key": "some-key", "Value": "some-value"}]


-@pytest.fixture(
-    scope="session",
-    params=[
-        "ml.c4.xlarge",
-    ],
-)
+@pytest.fixture(scope="session", params=["ml.c4.xlarge"])
 def instance_type(request):
     return request.param
@@ -220,8 +215,15 @@ def _assert_s3_files_exist(s3_url, files):
             raise ValueError("File {} is not found under {}".format(f, s3_url))


-def _assert_tags_match(sagemaker_client, resource_arn, tags):
-    actual_tags = sagemaker_client.list_tags(ResourceArn=resource_arn)["Tags"]
+def _assert_tags_match(sagemaker_client, resource_arn, tags, retries=1):
+    actual_tags = None
+    for _ in range(retries):
+        actual_tags = sagemaker_client.list_tags(ResourceArn=resource_arn)["Tags"]
+        if actual_tags:
+            break
+        else:
+            # endpoint tags might take minutes to propagate. Sleeping.
+            time.sleep(30)
     assert actual_tags == tags
@@ -232,7 +234,8 @@ def _assert_model_tags_match(sagemaker_client, model_name, tags):

 def _assert_endpoint_tags_match(sagemaker_client, endpoint_name, tags):
     endpoint_description = sagemaker_client.describe_endpoint(EndpointName=endpoint_name)
-    _assert_tags_match(sagemaker_client, endpoint_description["EndpointArn"], tags)
+
+    _assert_tags_match(sagemaker_client, endpoint_description["EndpointArn"], tags, retries=10)


 def _assert_training_job_tags_match(sagemaker_client, training_job_name, tags):