From bf0a76d4c2578a36ff0f119b6c03384ab330e601 Mon Sep 17 00:00:00 2001 From: Vandana Kannan Date: Tue, 10 Mar 2020 21:17:13 -0700 Subject: [PATCH 1/6] SM integration test for TF 1.x --- test/integration/sagemaker/test_mnist.py | 20 ++++ test/resources/mnist/tf1x_mnist_smdebug.py | 103 +++++++++++++++++++++ 2 files changed, 123 insertions(+) create mode 100644 test/resources/mnist/tf1x_mnist_smdebug.py diff --git a/test/integration/sagemaker/test_mnist.py b/test/integration/sagemaker/test_mnist.py index 25c8db3e..03067b3e 100644 --- a/test/integration/sagemaker/test_mnist.py +++ b/test/integration/sagemaker/test_mnist.py @@ -146,6 +146,26 @@ def test_tuning(sagemaker_session, ecr_image, instance_type, framework_version): tuner.wait() +def test_tf1x_smdebug(sagemaker_session, ecr_image, instance_type, framework_version): + resource_path = os.path.join(os.path.dirname(__file__), '..', '..', 'resources') + script = os.path.join(resource_path, 'mnist', 'tf1x_mnist_smdebug.py') + hyperparameters = {'smdebug_path': '/opt/ml/output/tensors'} + estimator = TensorFlow(entry_point=script, + role='SageMakerRole', + train_instance_type=instance_type, + train_instance_count=1, + sagemaker_session=sagemaker_session, + image_name=ecr_image, + framework_version=framework_version, + script_mode=True, + hyperparameters=hyperparameters) + inputs = estimator.sagemaker_session.upload_data( + path=os.path.join(resource_path, 'mnist', 'data'), + key_prefix='scriptmode/mnist_smdebug') + estimator.fit(inputs, job_name=unique_name_from_base('test-sagemaker-mnist-smdebug')) + _assert_s3_file_exists(sagemaker_session.boto_region_name, estimator.model_data) + + def _assert_checkpoint_exists(region, model_dir, checkpoint_number): _assert_s3_file_exists(region, os.path.join(model_dir, 'graph.pbtxt')) _assert_s3_file_exists(region, diff --git a/test/resources/mnist/tf1x_mnist_smdebug.py b/test/resources/mnist/tf1x_mnist_smdebug.py new file mode 100644 index 00000000..b54b90cc --- /dev/null +++ b/test/resources/mnist/tf1x_mnist_smdebug.py @@ -0,0 +1,103 @@ +import argparse +import json +import os +import sys + +import numpy as np +import tensorflow.compat.v1 as tf + +import smdebug.tensorflow as smd +from smdebug.core.collection import CollectionKeys +from smdebug.core.reduction_config import ALLOWED_NORMS, ALLOWED_REDUCTIONS +from smdebug.tensorflow import ReductionConfig, SaveConfig +from smdebug.trials import create_trial + + +def _parse_args(): + + parser = argparse.ArgumentParser() + + # hyperparameters sent by the client are passed as command-line arguments to the script. + parser.add_argument('--epochs', type=int, default=1) + # Data, model, and output directories + parser.add_argument('--model-dir', type=str, default=os.environ['SM_MODEL_DIR']) + parser.add_argument( + "--smdebug_path", + type=str, + default=None, + help="S3 URI of the bucket where tensor data will be stored.", + ) + parser.add_argument('--train', type=str, default=os.environ['SM_CHANNEL_TRAINING']) + parser.add_argument('--hosts', type=list, default=json.loads(os.environ['SM_HOSTS'])) + parser.add_argument('--current-host', type=str, default=os.environ['SM_CURRENT_HOST']) + + return parser.parse_known_args() + + +def _load_training_data(base_dir): + x_train = np.load(os.path.join(base_dir, 'train', 'x_train.npy')) + y_train = np.load(os.path.join(base_dir, 'train', 'y_train.npy')) + return x_train, y_train + + +def _load_testing_data(base_dir): + x_test = np.load(os.path.join(base_dir, 'test', 'x_test.npy')) + y_test = np.load(os.path.join(base_dir, 'test', 'y_test.npy')) + return x_test, y_test + + +def create_smdebug_hook(out_dir): + include_collections = [ + CollectionKeys.WEIGHTS, + CollectionKeys.BIASES, + CollectionKeys.GRADIENTS, + CollectionKeys.LOSSES, + CollectionKeys.OUTPUTS, + CollectionKeys.METRICS, + CollectionKeys.LOSSES, + CollectionKeys.OPTIMIZER_VARIABLES, + ] + save_config = SaveConfig(save_interval=3) + hook = smd.KerasHook( + out_dir, + save_config=save_config, + include_collections=include_collections, + reduction_config=ReductionConfig(norms=ALLOWED_NORMS, reductions=ALLOWED_REDUCTIONS), + ) + return hook + + +args, unknown = _parse_args() + +hook = create_smdebug_hook(args.smdebug_path) +hooks = [hook] + +model = tf.keras.models.Sequential([ + tf.keras.layers.Flatten(input_shape=(28, 28)), + tf.keras.layers.Dense(512, activation=tf.nn.relu), + tf.keras.layers.Dropout(0.2), + tf.keras.layers.Dense(10, activation=tf.nn.softmax) +]) + +model.compile(optimizer='adam', + loss='sparse_categorical_crossentropy', + metrics=['accuracy']) +x_train, y_train = _load_training_data(args.train) +x_test, y_test = _load_testing_data(args.train) +model.fit(x_train, y_train, epochs=args.epochs, callbacks=hooks) +model.evaluate(x_test, y_test, callbacks=hooks) + +if args.current_host == args.hosts[0]: + model.save(os.path.join('/opt/ml/model', 'my_model.h5')) + +print("Created the trial with out_dir {0}".format(args.smdebug_path)) +trial = create_trial(args.smdebug_path) +assert trial + +print(f"trial.tensor_names() = {trial.tensor_names()}") + +weights_tensors = hook.collection_manager.get("weights").tensor_names +assert len(weights_tensors) > 0 + +losses_tensors = hook.collection_manager.get("losses").tensor_names +assert len(losses_tensors) > 0 From 42b747f036ed6ba2697372c51858319a1ec914a8 Mon Sep 17 00:00:00 2001 From: Vandana Kannan Date: Wed, 11 Mar 2020 08:10:52 -0700 Subject: [PATCH 2/6] add skip py2 fixture --- test/integration/conftest.py | 7 +++++++ test/integration/sagemaker/test_mnist.py | 1 + 2 files changed, 8 insertions(+) diff --git a/test/integration/conftest.py b/test/integration/conftest.py index 4b599675..102326aa 100644 --- a/test/integration/conftest.py +++ b/test/integration/conftest.py @@ -116,3 +116,10 @@ def docker_image(docker_base_name, tag): def ecr_image(account_id, docker_base_name, tag, region): return '{}.dkr.ecr.{}.amazonaws.com/{}:{}'.format( account_id, region, docker_base_name, tag) + + +@pytest.fixture(autouse=True) +def skip_py2_containers(request, tag): + if request.node.get_closest_marker('skip_py2_containers'): + if 'py2' in tag: + pytest.skip('Skipping python2 container with tag {}'.format(tag)) diff --git a/test/integration/sagemaker/test_mnist.py b/test/integration/sagemaker/test_mnist.py index 03067b3e..db320680 100644 --- a/test/integration/sagemaker/test_mnist.py +++ b/test/integration/sagemaker/test_mnist.py @@ -146,6 +146,7 @@ def test_tuning(sagemaker_session, ecr_image, instance_type, framework_version): tuner.wait() +@pytest.mark.skip_py2_containers def test_tf1x_smdebug(sagemaker_session, ecr_image, instance_type, framework_version): resource_path = os.path.join(os.path.dirname(__file__), '..', '..', 'resources') script = os.path.join(resource_path, 'mnist', 'tf1x_mnist_smdebug.py') From 4fc504085e8f488de4804e8c2164873e3a1368e8 Mon Sep 17 00:00:00 2001 From: Vandana Kannan Date: Tue, 10 Mar 2020 21:17:13 -0700 Subject: [PATCH 3/6] SM integration test for TF 1.x --- test/integration/sagemaker/test_mnist.py | 20 ++++ test/resources/mnist/tf1x_mnist_smdebug.py | 103 +++++++++++++++++++++ 2 files changed, 123 insertions(+) create mode 100644 test/resources/mnist/tf1x_mnist_smdebug.py diff --git a/test/integration/sagemaker/test_mnist.py b/test/integration/sagemaker/test_mnist.py index 25c8db3e..03067b3e 100644 --- a/test/integration/sagemaker/test_mnist.py +++ b/test/integration/sagemaker/test_mnist.py @@ -146,6 +146,26 @@ def test_tuning(sagemaker_session, ecr_image, instance_type, framework_version): tuner.wait() +def test_tf1x_smdebug(sagemaker_session, ecr_image, instance_type, framework_version): + resource_path = os.path.join(os.path.dirname(__file__), '..', '..', 'resources') + script = os.path.join(resource_path, 'mnist', 'tf1x_mnist_smdebug.py') + hyperparameters = {'smdebug_path': '/opt/ml/output/tensors'} + estimator = TensorFlow(entry_point=script, + role='SageMakerRole', + train_instance_type=instance_type, + train_instance_count=1, + sagemaker_session=sagemaker_session, + image_name=ecr_image, + framework_version=framework_version, + script_mode=True, + hyperparameters=hyperparameters) + inputs = estimator.sagemaker_session.upload_data( + path=os.path.join(resource_path, 'mnist', 'data'), + key_prefix='scriptmode/mnist_smdebug') + estimator.fit(inputs, job_name=unique_name_from_base('test-sagemaker-mnist-smdebug')) + _assert_s3_file_exists(sagemaker_session.boto_region_name, estimator.model_data) + + def _assert_checkpoint_exists(region, model_dir, checkpoint_number): _assert_s3_file_exists(region, os.path.join(model_dir, 'graph.pbtxt')) _assert_s3_file_exists(region, diff --git a/test/resources/mnist/tf1x_mnist_smdebug.py b/test/resources/mnist/tf1x_mnist_smdebug.py new file mode 100644 index 00000000..b54b90cc --- /dev/null +++ b/test/resources/mnist/tf1x_mnist_smdebug.py @@ -0,0 +1,103 @@ +import argparse +import json +import os +import sys + +import numpy as np +import tensorflow.compat.v1 as tf + +import smdebug.tensorflow as smd +from smdebug.core.collection import CollectionKeys +from smdebug.core.reduction_config import ALLOWED_NORMS, ALLOWED_REDUCTIONS +from smdebug.tensorflow import ReductionConfig, SaveConfig +from smdebug.trials import create_trial + + +def _parse_args(): + + parser = argparse.ArgumentParser() + + # hyperparameters sent by the client are passed as command-line arguments to the script. + parser.add_argument('--epochs', type=int, default=1) + # Data, model, and output directories + parser.add_argument('--model-dir', type=str, default=os.environ['SM_MODEL_DIR']) + parser.add_argument( + "--smdebug_path", + type=str, + default=None, + help="S3 URI of the bucket where tensor data will be stored.", + ) + parser.add_argument('--train', type=str, default=os.environ['SM_CHANNEL_TRAINING']) + parser.add_argument('--hosts', type=list, default=json.loads(os.environ['SM_HOSTS'])) + parser.add_argument('--current-host', type=str, default=os.environ['SM_CURRENT_HOST']) + + return parser.parse_known_args() + + +def _load_training_data(base_dir): + x_train = np.load(os.path.join(base_dir, 'train', 'x_train.npy')) + y_train = np.load(os.path.join(base_dir, 'train', 'y_train.npy')) + return x_train, y_train + + +def _load_testing_data(base_dir): + x_test = np.load(os.path.join(base_dir, 'test', 'x_test.npy')) + y_test = np.load(os.path.join(base_dir, 'test', 'y_test.npy')) + return x_test, y_test + + +def create_smdebug_hook(out_dir): + include_collections = [ + CollectionKeys.WEIGHTS, + CollectionKeys.BIASES, + CollectionKeys.GRADIENTS, + CollectionKeys.LOSSES, + CollectionKeys.OUTPUTS, + CollectionKeys.METRICS, + CollectionKeys.LOSSES, + CollectionKeys.OPTIMIZER_VARIABLES, + ] + save_config = SaveConfig(save_interval=3) + hook = smd.KerasHook( + out_dir, + save_config=save_config, + include_collections=include_collections, + reduction_config=ReductionConfig(norms=ALLOWED_NORMS, reductions=ALLOWED_REDUCTIONS), + ) + return hook + + +args, unknown = _parse_args() + +hook = create_smdebug_hook(args.smdebug_path) +hooks = [hook] + +model = tf.keras.models.Sequential([ + tf.keras.layers.Flatten(input_shape=(28, 28)), + tf.keras.layers.Dense(512, activation=tf.nn.relu), + tf.keras.layers.Dropout(0.2), + tf.keras.layers.Dense(10, activation=tf.nn.softmax) +]) + +model.compile(optimizer='adam', + loss='sparse_categorical_crossentropy', + metrics=['accuracy']) +x_train, y_train = _load_training_data(args.train) +x_test, y_test = _load_testing_data(args.train) +model.fit(x_train, y_train, epochs=args.epochs, callbacks=hooks) +model.evaluate(x_test, y_test, callbacks=hooks) + +if args.current_host == args.hosts[0]: + model.save(os.path.join('/opt/ml/model', 'my_model.h5')) + +print("Created the trial with out_dir {0}".format(args.smdebug_path)) +trial = create_trial(args.smdebug_path) +assert trial + +print(f"trial.tensor_names() = {trial.tensor_names()}") + +weights_tensors = hook.collection_manager.get("weights").tensor_names +assert len(weights_tensors) > 0 + +losses_tensors = hook.collection_manager.get("losses").tensor_names +assert len(losses_tensors) > 0 From 404e5aa6554a1e94ec79d4803183fe6e972ce7db Mon Sep 17 00:00:00 2001 From: Vandana Kannan Date: Wed, 11 Mar 2020 08:10:52 -0700 Subject: [PATCH 4/6] add skip py2 fixture --- test/integration/conftest.py | 7 +++++++ test/integration/sagemaker/test_mnist.py | 1 + 2 files changed, 8 insertions(+) diff --git a/test/integration/conftest.py b/test/integration/conftest.py index 4b8d674b..ce5eaad0 100644 --- a/test/integration/conftest.py +++ b/test/integration/conftest.py @@ -123,3 +123,10 @@ def skip_py2_containers(request, tag): def ecr_image(account_id, docker_base_name, tag, region): return '{}.dkr.ecr.{}.amazonaws.com/{}:{}'.format( account_id, region, docker_base_name, tag) + + +@pytest.fixture(autouse=True) +def skip_py2_containers(request, tag): + if request.node.get_closest_marker('skip_py2_containers'): + if 'py2' in tag: + pytest.skip('Skipping python2 container with tag {}'.format(tag)) diff --git a/test/integration/sagemaker/test_mnist.py b/test/integration/sagemaker/test_mnist.py index 03067b3e..db320680 100644 --- a/test/integration/sagemaker/test_mnist.py +++ b/test/integration/sagemaker/test_mnist.py @@ -146,6 +146,7 @@ def test_tuning(sagemaker_session, ecr_image, instance_type, framework_version): tuner.wait() +@pytest.mark.skip_py2_containers def test_tf1x_smdebug(sagemaker_session, ecr_image, instance_type, framework_version): resource_path = os.path.join(os.path.dirname(__file__), '..', '..', 'resources') script = os.path.join(resource_path, 'mnist', 'tf1x_mnist_smdebug.py') From 14573f2d67320afa995d13d5a5da1d6fbd5b29d2 Mon Sep 17 00:00:00 2001 From: Vandana Kannan Date: Wed, 11 Mar 2020 10:41:01 -0700 Subject: [PATCH 5/6] remove py2 fixture --- test/integration/conftest.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/test/integration/conftest.py b/test/integration/conftest.py index ce5eaad0..4b8d674b 100644 --- a/test/integration/conftest.py +++ b/test/integration/conftest.py @@ -123,10 +123,3 @@ def skip_py2_containers(request, tag): def ecr_image(account_id, docker_base_name, tag, region): return '{}.dkr.ecr.{}.amazonaws.com/{}:{}'.format( account_id, region, docker_base_name, tag) - - -@pytest.fixture(autouse=True) -def skip_py2_containers(request, tag): - if request.node.get_closest_marker('skip_py2_containers'): - if 'py2' in tag: - pytest.skip('Skipping python2 container with tag {}'.format(tag)) From cbcb15309be337c1e242f9f7d51479cf61437c61 Mon Sep 17 00:00:00 2001 From: Vandana Kannan Date: Wed, 11 Mar 2020 10:44:37 -0700 Subject: [PATCH 6/6] remove py2 fixture --- test/integration/conftest.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/test/integration/conftest.py b/test/integration/conftest.py index ce5eaad0..4b8d674b 100644 --- a/test/integration/conftest.py +++ b/test/integration/conftest.py @@ -123,10 +123,3 @@ def skip_py2_containers(request, tag): def ecr_image(account_id, docker_base_name, tag, region): return '{}.dkr.ecr.{}.amazonaws.com/{}:{}'.format( account_id, region, docker_base_name, tag) - - -@pytest.fixture(autouse=True) -def skip_py2_containers(request, tag): - if request.node.get_closest_marker('skip_py2_containers'): - if 'py2' in tag: - pytest.skip('Skipping python2 container with tag {}'.format(tag))