From 30deecd8630d694c3a76a47d2966357f7c11c71c Mon Sep 17 00:00:00 2001
From: jesterhazy
Date: Sat, 27 Apr 2019 05:07:19 -0700
Subject: [PATCH 1/4] skip p2/p3 tests in eu-central-1

---
 tests/integ/__init__.py            |  7 ++--
 tests/integ/test_tf_script_mode.py | 64 +++++++++++++++++-------------
 2 files changed, 41 insertions(+), 30 deletions(-)

diff --git a/tests/integ/__init__.py b/tests/integ/__init__.py
index 4d28b692ce..9a3d99c681 100644
--- a/tests/integ/__init__.py
+++ b/tests/integ/__init__.py
@@ -24,10 +24,11 @@
 TRANSFORM_DEFAULT_TIMEOUT_MINUTES = 20
 PYTHON_VERSION = 'py' + str(sys.version_info.major)
 
-# 'eu-central-1' has some p2, but no enough for continuous testing
-HOSTING_NO_P2_REGIONS = ['ca-central-1', 'eu-west-2', 'us-west-1', 'eu-central-1']
+# these regions have some p2 and p3 instances, but not enough for continuous testing
+HOSTING_NO_P2_REGIONS = ['ca-central-1', 'eu-central-1', 'eu-west-2', 'us-west-1']
 HOSTING_NO_P3_REGIONS = ['ap-southeast-1', 'ap-southeast-2', 'ap-south-1', 'ca-central-1',
-                         'eu-west-2', 'us-west-1']
+                         'eu-central-1', 'eu-west-2', 'us-west-1']
+
 # EI is currently only supported in the following regions
 # regions were derived from https://aws.amazon.com/machine-learning/elastic-inference/pricing/
 EI_SUPPORTED_REGIONS = ['us-east-1', 'us-east-2', 'us-west-2', 'eu-west-1', 'ap-northeast-1', 'ap-northeast-2']
diff --git a/tests/integ/test_tf_script_mode.py b/tests/integ/test_tf_script_mode.py
index c17f14961b..23bb86d94d 100644
--- a/tests/integ/test_tf_script_mode.py
+++ b/tests/integ/test_tf_script_mode.py
@@ -11,6 +11,7 @@
 # ANY KIND, either express or implied. See the License for the specific
 # language governing permissions and limitations under the License.
 from __future__ import absolute_import
+from __future__ import absolute_import
 
 import numpy as np
 import os
@@ -22,9 +23,8 @@
 from sagemaker.tensorflow import TensorFlow
 from six.moves.urllib.parse import urlparse
 from sagemaker.utils import unique_name_from_base
-import tests.integ as integ
-from tests.integ import kms_utils
-import tests.integ.timeout as timeout
+
+import tests.integ
 
 ROLE = 'SageMakerRole'
 
@@ -35,14 +35,18 @@
 TAGS = [{'Key': 'some-key', 'Value': 'some-value'}]
 
 
-@pytest.fixture(scope='session', params=['ml.c5.xlarge', 'ml.p2.xlarge'])
+@pytest.fixture(scope='session', params=[
+    'ml.c5.xlarge',
+    pytest.param('ml.p2.xlarge',
+                 marks=pytest.mark.skipif(
+                     tests.integ.test_region() in tests.integ.HOSTING_NO_P2_REGIONS,
+                     reason='no ml.p2 instances in this region'))])
 def instance_type(request):
     return request.param
 
 
-@pytest.mark.skipif(integ.test_region() in integ.HOSTING_NO_P2_REGIONS,
-                    reason='no ml.p2 instances in these regions')
-@pytest.mark.skipif(integ.PYTHON_VERSION != 'py3', reason="Script Mode tests are only configured to run with Python 3")
+@pytest.mark.skipif(tests.integ.PYTHON_VERSION != 'py3',
+                    reason="Script Mode tests are only configured to run with Python 3")
 def test_mnist(sagemaker_session, instance_type):
     estimator = TensorFlow(entry_point=SCRIPT,
                            role='SageMakerRole',
@@ -51,26 +55,26 @@
                            sagemaker_session=sagemaker_session,
                            py_version='py3',
                            framework_version=TensorFlow.LATEST_VERSION,
-                           metric_definitions=[{'Name': 'train:global_steps', 'Regex': r'global_step\/sec:\s(.*)'}])
+                           metric_definitions=[
+                               {'Name': 'train:global_steps', 'Regex': r'global_step\/sec:\s(.*)'}])
     inputs = estimator.sagemaker_session.upload_data(
         path=os.path.join(RESOURCE_PATH, 'data'),
        key_prefix='scriptmode/mnist')
 
-    with timeout.timeout(minutes=integ.TRAINING_DEFAULT_TIMEOUT_MINUTES):
+    with tests.integ.timeout.timeout(minutes=tests.integ.TRAINING_DEFAULT_TIMEOUT_MINUTES):
         estimator.fit(inputs=inputs, job_name=unique_name_from_base('test-tf-sm-mnist'))
 
     _assert_s3_files_exist(estimator.model_dir,
                            ['graph.pbtxt', 'model.ckpt-0.index', 'model.ckpt-0.meta'])
     df = estimator.training_job_analytics.dataframe()
-    print(df)
     assert df.size > 0
 
 
 def test_server_side_encryption(sagemaker_session):
-    boto_session = sagemaker_session.boto_session
-    with kms_utils.bucket_with_encryption(boto_session, ROLE) as (bucket_with_kms, kms_key):
-
-        output_path = os.path.join(bucket_with_kms, 'test-server-side-encryption', time.strftime('%y%m%d-%H%M'))
+    with tests.integ.kms_utils.bucket_with_encryption(boto_session, ROLE) as (
+            bucket_with_kms, kms_key):
+        output_path = os.path.join(bucket_with_kms, 'test-server-side-encryption',
+                                   time.strftime('%y%m%d-%H%M'))
 
         estimator = TensorFlow(entry_point=SCRIPT,
                                role=ROLE,
@@ -88,20 +92,21 @@
             path=os.path.join(RESOURCE_PATH, 'data'),
             key_prefix='scriptmode/mnist')
 
-        with timeout.timeout(minutes=integ.TRAINING_DEFAULT_TIMEOUT_MINUTES):
-            estimator.fit(inputs=inputs, job_name=unique_name_from_base('test-server-side-encryption'))
+        with tests.integ.timeout.timeout(minutes=tests.integ.TRAINING_DEFAULT_TIMEOUT_MINUTES):
+            estimator.fit(inputs=inputs,
+                          job_name=unique_name_from_base('test-server-side-encryption'))
 
 
 @pytest.mark.canary_quick
-@pytest.mark.skipif(integ.PYTHON_VERSION != 'py3', reason="Script Mode tests are only configured to run with Python 3")
+@pytest.mark.skipif(tests.integ.PYTHON_VERSION != 'py3',
+                    reason="Script Mode tests are only configured to run with Python 3")
 def test_mnist_distributed(sagemaker_session, instance_type):
     estimator = TensorFlow(entry_point=SCRIPT,
                            role=ROLE,
                            train_instance_count=2,
-                           # TODO: change train_instance_type to instance_type once the test is passing consistently
-                           train_instance_type='ml.c5.xlarge',
+                           train_instance_type=instance_type,
                            sagemaker_session=sagemaker_session,
-                           py_version=integ.PYTHON_VERSION,
+                           py_version=tests.integ.PYTHON_VERSION,
                            script_mode=True,
                            framework_version=TensorFlow.LATEST_VERSION,
                            distributions=PARAMETER_SERVER_DISTRIBUTION)
@@ -109,7 +114,7 @@
         path=os.path.join(RESOURCE_PATH, 'data'),
         key_prefix='scriptmode/distributed_mnist')
 
-    with timeout.timeout(minutes=integ.TRAINING_DEFAULT_TIMEOUT_MINUTES):
+    with tests.integ.timeout.timeout(minutes=tests.integ.TRAINING_DEFAULT_TIMEOUT_MINUTES):
         estimator.fit(inputs=inputs, job_name=unique_name_from_base('test-tf-sm-distributed'))
     _assert_s3_files_exist(estimator.model_dir,
                            ['graph.pbtxt', 'model.ckpt-0.index', 'model.ckpt-0.meta'])
@@ -131,22 +136,26 @@
     training_job_name = estimator.latest_training_job.name
     time.sleep(20)
     endpoint_name = training_job_name
-    _assert_training_job_tags_match(sagemaker_session.sagemaker_client, estimator.latest_training_job.name, TAGS)
-    with timeout.timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
-        estimator = TensorFlow.attach(training_job_name=training_job_name, sagemaker_session=sagemaker_session)
+    _assert_training_job_tags_match(sagemaker_session.sagemaker_client,
+                                    estimator.latest_training_job.name, TAGS)
+    with tests.integ.timeout.timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
+        estimator = TensorFlow.attach(training_job_name=training_job_name,
+                                      sagemaker_session=sagemaker_session)
         predictor = estimator.deploy(initial_instance_count=1, instance_type='ml.c4.xlarge',
                                      endpoint_name=endpoint_name)
 
         result = predictor.predict(np.zeros(784))
         print('predict result: {}'.format(result))
         _assert_endpoint_tags_match(sagemaker_session.sagemaker_client, predictor.endpoint, TAGS)
-        _assert_model_tags_match(sagemaker_session.sagemaker_client, estimator.latest_training_job.name, TAGS)
+        _assert_model_tags_match(sagemaker_session.sagemaker_client,
+                                 estimator.latest_training_job.name, TAGS)
 
 
 def _assert_s3_files_exist(s3_url, files):
     parsed_url = urlparse(s3_url)
     s3 = boto3.client('s3')
-    contents = s3.list_objects_v2(Bucket=parsed_url.netloc, Prefix=parsed_url.path.lstrip('/'))["Contents"]
+    contents = s3.list_objects_v2(Bucket=parsed_url.netloc, Prefix=parsed_url.path.lstrip('/'))[
+        "Contents"]
     for f in files:
         found = [x['Key'] for x in contents if x['Key'].endswith(f)]
         if not found:
@@ -169,5 +178,6 @@ def _assert_endpoint_tags_match(sagemaker_client, endpoint_name, tags):
 
 
 def _assert_training_job_tags_match(sagemaker_client, training_job_name, tags):
-    training_job_description = sagemaker_client.describe_training_job(TrainingJobName=training_job_name)
+    training_job_description = sagemaker_client.describe_training_job(
+        TrainingJobName=training_job_name)
     _assert_tags_match(sagemaker_client, training_job_description['TrainingJobArn'], tags)

From e0a4b9219f9cdc688e12fda170d9caa543121ebc Mon Sep 17 00:00:00 2001
From: jesterhazy
Date: Sat, 27 Apr 2019 05:08:07 -0700
Subject: [PATCH 2/4] skip unnecessary steps during tox tests

---
 tox.ini | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/tox.ini b/tox.ini
index ca754d6c4d..1bc127b896 100644
--- a/tox.ini
+++ b/tox.ini
@@ -60,7 +60,9 @@ commands =
 deps = .[test]
 
 [testenv:flake8]
-basepython = python
+basepython = python3
+skipdist = true
+skip_install = true
 deps =
     flake8
     flake8-future-import
@@ -68,13 +70,17 @@ commands = flake8
 
 [testenv:pylint]
 basepython = python3
+skipdist = true
+skip_install = true
 deps =
     pylint==2.3.1
 commands =
     python -m pylint --rcfile=.pylintrc -j 0 src/sagemaker
 
 [testenv:twine]
-basepython = python
+basepython = python3
+skipdist = true
+skip_install = true
 # twine check was added starting in 1.12.0
 # https://github.com/pypa/twine/blob/master/docs/changelog.rst
 deps =

From 4cd567a16022a3726db72aff54b2f701c4d533c6 Mon Sep 17 00:00:00 2001
From: jesterhazy
Date: Mon, 29 Apr 2019 11:52:59 -0700
Subject: [PATCH 3/4] remove duplicate import

---
 tests/integ/test_tf_script_mode.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/integ/test_tf_script_mode.py b/tests/integ/test_tf_script_mode.py
index 23bb86d94d..c8dba8fd81 100644
--- a/tests/integ/test_tf_script_mode.py
+++ b/tests/integ/test_tf_script_mode.py
@@ -11,7 +11,6 @@
 # ANY KIND, either express or implied. See the License for the specific
 # language governing permissions and limitations under the License.
 from __future__ import absolute_import
-from __future__ import absolute_import
 
 import numpy as np
 import os

From af677044a026aad29784fa807770ecccfee16692 Mon Sep 17 00:00:00 2001
From: jesterhazy
Date: Mon, 29 Apr 2019 11:55:22 -0700
Subject: [PATCH 4/4] restore sdist/install step during sphinx test

---
 tox.ini | 2 --
 1 file changed, 2 deletions(-)

diff --git a/tox.ini b/tox.ini
index 1bc127b896..b9baae836f 100644
--- a/tox.ini
+++ b/tox.ini
@@ -79,8 +79,6 @@ commands =
 
 [testenv:twine]
 basepython = python3
-skipdist = true
-skip_install = true
 # twine check was added starting in 1.12.0
 # https://github.com/pypa/twine/blob/master/docs/changelog.rst
 deps =
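
Note on the pattern in PATCH 1/4: the region-based GPU skip moves off the test functions and onto the fixture parameter itself, so only the ml.p2 case is skipped in the listed regions while the ml.c5 case keeps running everywhere. The following standalone sketch shows that pytest pattern in isolation; NO_P2_REGIONS and _current_region() are illustrative stand-ins for tests.integ.HOSTING_NO_P2_REGIONS and tests.integ.test_region(), not the SDK's real test helpers.

import pytest

# Illustrative stand-in for tests.integ.HOSTING_NO_P2_REGIONS.
NO_P2_REGIONS = ['ca-central-1', 'eu-central-1', 'eu-west-2', 'us-west-1']


def _current_region():
    # Hard-coded for the sketch; the real helper looks up the configured AWS region.
    return 'eu-central-1'


@pytest.fixture(scope='session', params=[
    'ml.c5.xlarge',
    pytest.param('ml.p2.xlarge',
                 marks=pytest.mark.skipif(_current_region() in NO_P2_REGIONS,
                                          reason='no ml.p2 instances in this region'))])
def instance_type(request):
    # Each test that requests this fixture is collected once per parameter.
    return request.param


def test_instance_type_is_ml(instance_type):
    # The ml.p2 case is reported as skipped in the regions listed above;
    # the ml.c5 case always runs.
    assert instance_type.startswith('ml.')

Attaching the skipif mark to the individual pytest.param, rather than decorating the whole test as the removed code did, is what lets the CPU variant continue to run in regions without GPU capacity.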