From 9cd234257f492ed26f50bba3d9e81a18e7568dcc Mon Sep 17 00:00:00 2001 From: Dana Benson Date: Tue, 3 Mar 2020 15:25:00 -0800 Subject: [PATCH] add sagemaker-experiments --- docker/2.1.0/py3/Dockerfile.cpu | 1 + docker/2.1.0/py3/Dockerfile.gpu | 1 + setup.py | 3 +- test/integration/conftest.py | 7 ++ .../integration/sagemaker/test_experiments.py | 97 +++++++++++++++++++ 5 files changed, 108 insertions(+), 1 deletion(-) create mode 100644 test/integration/sagemaker/test_experiments.py diff --git a/docker/2.1.0/py3/Dockerfile.cpu b/docker/2.1.0/py3/Dockerfile.cpu index c34393c2..3a7dc324 100644 --- a/docker/2.1.0/py3/Dockerfile.cpu +++ b/docker/2.1.0/py3/Dockerfile.cpu @@ -109,6 +109,7 @@ RUN ${PIP} install --no-cache-dir -U \ awscli \ mpi4py==3.0.3 \ opencv-python==4.2.0.32 \ + sagemaker-experiments==0.1.7 \ "sagemaker-tensorflow>=2.1,<2.2" \ # Let's install TensorFlow separately in the end to avoid # the library version to be overwritten diff --git a/docker/2.1.0/py3/Dockerfile.gpu b/docker/2.1.0/py3/Dockerfile.gpu index 4def95ee..6f405f10 100644 --- a/docker/2.1.0/py3/Dockerfile.gpu +++ b/docker/2.1.0/py3/Dockerfile.gpu @@ -153,6 +153,7 @@ RUN ${PIP} install --no-cache-dir -U \ awscli \ mpi4py==3.0.3 \ opencv-python==4.2.0.32 \ + sagemaker-experiments==0.1.7 \ "sagemaker-tensorflow>=2.1,<2.2" \ # Let's install TensorFlow separately in the end to avoid # the library version to be overwritten diff --git a/setup.py b/setup.py index 983ebd13..1dd4d593 100644 --- a/setup.py +++ b/setup.py @@ -60,6 +60,7 @@ def read_version(): 'sagemaker==1.50.1', 'tensorflow<2.0', 'docker-compose', 'boto3==1.10.50', 'six==1.13.0', 'python-dateutil>=2.1,<2.8.1', 'botocore==1.13.50', 'requests-mock', 'awscli==1.16.314'], - 'benchmark': ['click'] + 'benchmark': ['click'], + ':python_version=="3.6"': ['sagemaker-experiments==0.1.7'] }, ) diff --git a/test/integration/conftest.py b/test/integration/conftest.py index 4b599675..102326aa 100644 --- a/test/integration/conftest.py +++ 
b/test/integration/conftest.py
@@ -116,3 +116,10 @@ def docker_image(docker_base_name, tag):
 def ecr_image(account_id, docker_base_name, tag, region):
     return '{}.dkr.ecr.{}.amazonaws.com/{}:{}'.format(
         account_id, region, docker_base_name, tag)
+
+
+@pytest.fixture(autouse=True)
+def skip_py2_containers(request, tag):
+    """Skip a test marked ``skip_py2_containers`` when the image tag contains 'py2'."""
+    if request.node.get_closest_marker('skip_py2_containers') and 'py2' in tag:
+        pytest.skip('Skipping python2 container with tag {}'.format(tag))
diff --git a/test/integration/sagemaker/test_experiments.py b/test/integration/sagemaker/test_experiments.py
new file mode 100644
index 00000000..3504331a
--- /dev/null
+++ b/test/integration/sagemaker/test_experiments.py
@@ -0,0 +1,97 @@
+# Copyright 2018-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License").
+# You may not use this file except in compliance with the License.
+# A copy of the License is located at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# or in the "license" file accompanying this file. This file is distributed
+# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+# express or implied. See the License for the specific language governing
+# permissions and limitations under the License.
+from __future__ import absolute_import
+
+import os
+import time
+
+import pytest
+from sagemaker import utils
+from sagemaker.tensorflow import TensorFlow
+from smexperiments.experiment import Experiment
+from smexperiments.trial import Trial
+from smexperiments.trial_component import TrialComponent
+
+from test.integration import DEFAULT_TIMEOUT
+from test.integration import RESOURCE_PATH
+from timeout import timeout
+
+DATA_PATH = os.path.join(RESOURCE_PATH, "mnist")
+SCRIPT_PATH = os.path.join(DATA_PATH, "mnist_gluon_basic_hook_demo.py")
+
+
+@pytest.mark.skip_py2_containers
+def test_training(sagemaker_session, ecr_image, instance_type, framework_version):
+    """Train MNIST, then check the job's trial component can be associated with a trial."""
+    sm_client = sagemaker_session.sagemaker_client
+    experiment_name = f"tf-container-integ-test-{int(time.time())}"
+    experiment = Experiment.create(
+        experiment_name=experiment_name,
+        description="Integration test experiment from sagemaker-tf-container",
+        sagemaker_boto_client=sm_client,
+    )
+    trial = trial_component = None
+    associated = False
+    try:
+        trial = Trial.create(
+            experiment_name=experiment_name,
+            trial_name=f"tf-container-integ-test-{int(time.time())}",
+            sagemaker_boto_client=sm_client,
+        )
+        training_job_name = utils.unique_name_from_base("test-tf-experiments-mnist")
+
+        # create a training job and wait for it to complete
+        with timeout(minutes=DEFAULT_TIMEOUT):
+            resource_path = os.path.join(os.path.dirname(__file__), "..", "..", "resources")
+            estimator = TensorFlow(
+                entry_point=os.path.join(resource_path, "mnist", "mnist.py"),
+                role="SageMakerRole",
+                train_instance_type=instance_type,
+                train_instance_count=1,
+                sagemaker_session=sagemaker_session,
+                image_name=ecr_image,
+                framework_version=framework_version,
+                script_mode=True,
+            )
+            inputs = estimator.sagemaker_session.upload_data(
+                path=os.path.join(resource_path, "mnist", "data"), key_prefix="scriptmode/mnist"
+            )
+            estimator.fit(inputs, job_name=training_job_name)
+
+        training_job = sm_client.describe_training_job(TrainingJobName=training_job_name)
+        # verify trial component auto created from the training job
+        trial_components = list(
+            TrialComponent.list(
+                source_arn=training_job["TrainingJobArn"], sagemaker_boto_client=sm_client
+            )
+        )
+        assert trial_components, "no trial component found for the training job"
+        trial_component = TrialComponent.load(
+            trial_component_name=trial_components[0].trial_component_name,
+            # same client keyword as every other smexperiments call in this test
+            sagemaker_boto_client=sm_client,
+        )
+
+        # associate the trial component with the trial, then verify the association
+        trial.add_trial_component(trial_component)
+        associated = True
+        assert len(list(trial.list_trial_components())) == 1
+    finally:
+        # cleanup runs even when a step above fails, so the experiment and trial are not leaked
+        if associated:
+            trial.remove_trial_component(trial_components[0].trial_component_name)
+        if trial_component is not None:
+            trial_component.delete()
+        if trial is not None:
+            trial.delete()
+        experiment.delete()