From 36c9ac667fe1ba65e035760e96ee47041616c8e7 Mon Sep 17 00:00:00 2001 From: Chuyang Deng Date: Fri, 11 Oct 2019 15:35:59 -0700 Subject: [PATCH 1/5] fix: use default bucket for checkpoint_s3_uri --- tests/integ/test_tf_script_mode.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integ/test_tf_script_mode.py b/tests/integ/test_tf_script_mode.py index 4264d693db..f26c516db5 100644 --- a/tests/integ/test_tf_script_mode.py +++ b/tests/integ/test_tf_script_mode.py @@ -71,7 +71,7 @@ def test_mnist(sagemaker_session, instance_type): reason="checkpoint s3 bucket is in us-east-1, ListObjectsV2 will fail in other regions", ) def test_checkpoint_config(sagemaker_session, instance_type): - checkpoint_s3_uri = "s3://142577830533-us-east-1-sagemaker-checkpoint" + checkpoint_s3_uri = "s3://{}".format(sagemaker_session.default_bucket()) checkpoint_local_path = "/test/checkpoint/path" estimator = TensorFlow( entry_point=SCRIPT, From 8ebcf7bf9eb3a6afc20da29bd3b2e980ec51d682 Mon Sep 17 00:00:00 2001 From: Chuyang Deng Date: Fri, 11 Oct 2019 15:37:20 -0700 Subject: [PATCH 2/5] use default bucket for checkpoint_s3_uri --- tests/integ/test_tf_script_mode.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tests/integ/test_tf_script_mode.py b/tests/integ/test_tf_script_mode.py index f26c516db5..d04b02523d 100644 --- a/tests/integ/test_tf_script_mode.py +++ b/tests/integ/test_tf_script_mode.py @@ -66,10 +66,6 @@ def test_mnist(sagemaker_session, instance_type): assert df.size > 0 -@pytest.mark.skipif( - tests.integ.test_region() != "us-east-1", - reason="checkpoint s3 bucket is in us-east-1, ListObjectsV2 will fail in other regions", -) def test_checkpoint_config(sagemaker_session, instance_type): checkpoint_s3_uri = "s3://{}".format(sagemaker_session.default_bucket()) checkpoint_local_path = "/test/checkpoint/path" From 1f7180818689661d63e680b7afaea23edcd361cc Mon Sep 17 00:00:00 2001 From: Chuyang Deng Date: Fri, 11 Oct 2019 18:20:15 -0700 Subject: [PATCH 3/5] creating a unique object for checkpointing --- tests/integ/test_tf_script_mode.py | 36 +++++++++--------------------- 1 file changed, 10 insertions(+), 26 deletions(-) diff --git a/tests/integ/test_tf_script_mode.py b/tests/integ/test_tf_script_mode.py index d04b02523d..0dfee0723a 100644 --- a/tests/integ/test_tf_script_mode.py +++ b/tests/integ/test_tf_script_mode.py @@ -19,7 +19,7 @@ import pytest from sagemaker.tensorflow import TensorFlow -from sagemaker.utils import unique_name_from_base +from sagemaker.utils import unique_name_from_base, sagemaker_timestamp import tests.integ from tests.integ import timeout @@ -39,7 +39,11 @@ TAGS = [{"Key": "some-key", "Value": "some-value"}] -def test_mnist(sagemaker_session, instance_type): +def test_mnist_with_checkpoint_config(sagemaker_session, instance_type): + checkpoint_s3_uri = "s3://{}/tf-{}".format( + sagemaker_session.default_bucket(), sagemaker_timestamp() + ) + checkpoint_local_path = "/test/checkpoint/path" estimator = TensorFlow( entry_point=SCRIPT, role="SageMakerRole", @@ -50,13 +54,16 @@ def test_mnist(sagemaker_session, instance_type): framework_version=TensorFlow.LATEST_VERSION, py_version=tests.integ.PYTHON_VERSION, metric_definitions=[{"Name": "train:global_steps", "Regex": r"global_step\/sec:\s(.*)"}], + checkpoint_s3_uri=checkpoint_s3_uri, + checkpoint_local_path=checkpoint_local_path ) inputs = estimator.sagemaker_session.upload_data( path=os.path.join(MNIST_RESOURCE_PATH, "data"), key_prefix="scriptmode/mnist" ) + training_job_name = unique_name_from_base("test-tf-sm-mnist") with tests.integ.timeout.timeout(minutes=tests.integ.TRAINING_DEFAULT_TIMEOUT_MINUTES): - estimator.fit(inputs=inputs, job_name=unique_name_from_base("test-tf-sm-mnist")) + estimator.fit(inputs=inputs, job_name=training_job_name) assert_s3_files_exist( sagemaker_session, estimator.model_dir, @@ -65,29 +72,6 @@ def test_mnist(sagemaker_session, instance_type): df = estimator.training_job_analytics.dataframe() assert df.size > 0 - -def test_checkpoint_config(sagemaker_session, instance_type): - checkpoint_s3_uri = "s3://{}".format(sagemaker_session.default_bucket()) - checkpoint_local_path = "/test/checkpoint/path" - estimator = TensorFlow( - entry_point=SCRIPT, - role="SageMakerRole", - train_instance_count=1, - train_instance_type=instance_type, - sagemaker_session=sagemaker_session, - script_mode=True, - framework_version=TensorFlow.LATEST_VERSION, - py_version=tests.integ.PYTHON_VERSION, - checkpoint_s3_uri=checkpoint_s3_uri, - checkpoint_local_path=checkpoint_local_path, - ) - inputs = estimator.sagemaker_session.upload_data( - path=os.path.join(MNIST_RESOURCE_PATH, "data"), key_prefix="script/mnist" - ) - training_job_name = unique_name_from_base("test-tf-sm-checkpoint") - with tests.integ.timeout.timeout(minutes=tests.integ.TRAINING_DEFAULT_TIMEOUT_MINUTES): - estimator.fit(inputs=inputs, job_name=training_job_name) - expected_training_checkpoint_config = { "S3Uri": checkpoint_s3_uri, "LocalPath": checkpoint_local_path, From 34fdb930ad0bbfbd7fa63d758570be48b03b5679 Mon Sep 17 00:00:00 2001 From: Chuyang Deng Date: Fri, 11 Oct 2019 18:34:40 -0700 Subject: [PATCH 4/5] add checkpoint in url --- tests/integ/test_tf_script_mode.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/integ/test_tf_script_mode.py b/tests/integ/test_tf_script_mode.py index 0dfee0723a..0f6dd9f71e 100644 --- a/tests/integ/test_tf_script_mode.py +++ b/tests/integ/test_tf_script_mode.py @@ -40,7 +40,7 @@ def test_mnist_with_checkpoint_config(sagemaker_session, instance_type): - checkpoint_s3_uri = "s3://{}/tf-{}".format( + checkpoint_s3_uri = "s3://{}//checkpoints/tf-{}".format( sagemaker_session.default_bucket(), sagemaker_timestamp() ) checkpoint_local_path = "/test/checkpoint/path" @@ -55,7 +55,7 @@ def test_mnist_with_checkpoint_config(sagemaker_session, instance_type): py_version=tests.integ.PYTHON_VERSION, metric_definitions=[{"Name": "train:global_steps", "Regex": r"global_step\/sec:\s(.*)"}], checkpoint_s3_uri=checkpoint_s3_uri, - checkpoint_local_path=checkpoint_local_path + checkpoint_local_path=checkpoint_local_path, ) inputs = estimator.sagemaker_session.upload_data( path=os.path.join(MNIST_RESOURCE_PATH, "data"), key_prefix="scriptmode/mnist" From 88cdef99baa3a779b2c53e414aea58cf2bb37c76 Mon Sep 17 00:00:00 2001 From: Chuyang Date: Fri, 11 Oct 2019 18:50:09 -0700 Subject: [PATCH 5/5] fix typo --- tests/integ/test_tf_script_mode.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integ/test_tf_script_mode.py b/tests/integ/test_tf_script_mode.py index 0f6dd9f71e..2586465841 100644 --- a/tests/integ/test_tf_script_mode.py +++ b/tests/integ/test_tf_script_mode.py @@ -40,7 +40,7 @@ def test_mnist_with_checkpoint_config(sagemaker_session, instance_type): - checkpoint_s3_uri = "s3://{}//checkpoints/tf-{}".format( + checkpoint_s3_uri = "s3://{}/checkpoints/tf-{}".format( sagemaker_session.default_bucket(), sagemaker_timestamp() ) checkpoint_local_path = "/test/checkpoint/path"