From ab0fcae717187b45db5c1374ae51688e915bdb2c Mon Sep 17 00:00:00 2001 From: Rui Wang Date: Mon, 17 Jun 2019 12:03:12 -0700 Subject: [PATCH 1/6] feature: add TensorFlow 1.13 support --- README.rst | 2 +- src/sagemaker/tensorflow/estimator.py | 15 ++++++++++++--- src/sagemaker/utils.py | 4 ++++ tests/data/tensorflow_mnist/mnist.py | 8 -------- tests/unit/test_tf_estimator.py | 7 +++++++ tests/unit/test_utils.py | 5 +++++ 6 files changed, 29 insertions(+), 12 deletions(-) diff --git a/README.rst b/README.rst index a8261dfe21..7aeedb3fb0 100644 --- a/README.rst +++ b/README.rst @@ -173,7 +173,7 @@ TensorFlow SageMaker Estimators By using TensorFlow SageMaker Estimators, you can train and host TensorFlow models on Amazon SageMaker. -Supported versions of TensorFlow: ``1.4.1``, ``1.5.0``, ``1.6.0``, ``1.7.0``, ``1.8.0``, ``1.9.0``, ``1.10.0``, ``1.11.0``, ``1.12.0``. +Supported versions of TensorFlow: ``1.4.1``, ``1.5.0``, ``1.6.0``, ``1.7.0``, ``1.8.0``, ``1.9.0``, ``1.10.0``, ``1.11.0``, ``1.12.0``, ``1.13.1``. Supported versions of TensorFlow for Elastic Inference: ``1.11.0``, ``1.12.0``. 
diff --git a/src/sagemaker/tensorflow/estimator.py b/src/sagemaker/tensorflow/estimator.py index ef22bd15ca..cebc9ecdf5 100644 --- a/src/sagemaker/tensorflow/estimator.py +++ b/src/sagemaker/tensorflow/estimator.py @@ -26,7 +26,7 @@ from sagemaker.tensorflow.defaults import TF_VERSION from sagemaker.tensorflow.model import TensorFlowModel from sagemaker.tensorflow.serving import Model -from sagemaker.utils import get_config_value +from sagemaker.utils import get_config_value, get_short_version from sagemaker.vpc_utils import VPC_CONFIG_DEFAULT logger = logging.getLogger("sagemaker") @@ -190,9 +190,11 @@ class TensorFlow(Framework): __framework_name__ = "tensorflow" - LATEST_VERSION = "1.12" + LATEST_VERSION = '1.13' """The latest version of TensorFlow included in the SageMaker pre-built Docker images.""" + _LOWEST_SCRIPT_MODE_ONLY_VERSION = [1, 13] + def __init__( self, training_steps=None, @@ -321,6 +323,13 @@ def _validate_args( ) ) + if (not self._script_mode_enabled()) and \ + [int(s) for s in self.framework_version.split('.')] >= self._LOWEST_SCRIPT_MODE_ONLY_VERSION: + raise AttributeError( + 'Legacy mode is deprecated in versions 1.13 and higher.' 
+ 'Please set the script_mode argument to True to use Script Mode' + ) + def _validate_requirements_file(self, requirements_file): if not requirements_file: return @@ -489,7 +498,7 @@ def _create_tfs_model(self, role=None, vpc_config_override=VPC_CONFIG_DEFAULT): image=self.image_name, name=self._current_job_name, container_log_level=self.container_log_level, - framework_version=self.framework_version, + framework_version=get_short_version(self.framework_version), sagemaker_session=self.sagemaker_session, vpc_config=self.get_vpc_config(vpc_config_override), ) diff --git a/src/sagemaker/utils.py b/src/sagemaker/utils.py index 68f5f9d47f..17b36418d9 100644 --- a/src/sagemaker/utils.py +++ b/src/sagemaker/utils.py @@ -123,6 +123,10 @@ def get_config_value(key_path, config): return current_section +def get_short_version(framework_version): + return '.'.join(framework_version.split('.')[:2]) + + def to_str(value): """Convert the input to a string, unless it is a unicode string in Python 2. diff --git a/tests/data/tensorflow_mnist/mnist.py b/tests/data/tensorflow_mnist/mnist.py index 5882011815..bb68c29fbd 100644 --- a/tests/data/tensorflow_mnist/mnist.py +++ b/tests/data/tensorflow_mnist/mnist.py @@ -14,17 +14,11 @@ import argparse import json -import logging as _logging import numpy as np import os -import sys as _sys import tensorflow as tf -from tensorflow.python.platform import tf_logging tf.logging.set_verbosity(tf.logging.DEBUG) -_handler = _logging.StreamHandler(_sys.stdout) -tf_logger = tf_logging._get_logger() -tf_logger.handlers = [_handler] def cnn_model_fn(features, labels, mode): @@ -179,5 +173,3 @@ def serving_input_fn(): if args.current_host == args.hosts[0]: mnist_classifier.export_savedmodel("/opt/ml/model", serving_input_fn) - - tf_logger.info("====== Training finished =========") diff --git a/tests/unit/test_tf_estimator.py b/tests/unit/test_tf_estimator.py index fd97f08d63..a5bbfbc358 100644 --- a/tests/unit/test_tf_estimator.py +++ 
b/tests/unit/test_tf_estimator.py @@ -870,6 +870,13 @@ def test_script_mode_deprecated_args(sagemaker_session): ) in str(e.value) +def test_legacy_mode_deprecation_error(sagemaker_session): + with pytest.raises(AttributeError) as e: + _build_tf(sagemaker_session=sagemaker_session, framework_version='1.13.1', + py_version='py2', script_mode=False) + assert 'Legacy mode is deprecated' in str(e.value) + + def test_script_mode_enabled(sagemaker_session): tf = _build_tf(sagemaker_session=sagemaker_session, py_version="py3") assert tf._script_mode_enabled() is True diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py index e59c05f865..ff7bfff356 100644 --- a/tests/unit/test_utils.py +++ b/tests/unit/test_utils.py @@ -47,6 +47,11 @@ def test_get_config_value(): assert sagemaker.utils.get_config_value("other.key", None) is None +def test_get_short_version(): + assert sagemaker.utils.get_short_version('1.13.1') == '1.13' + assert sagemaker.utils.get_short_version('1.13') == '1.13' + + def test_deferred_error(): de = sagemaker.utils.DeferredError(ImportError("pretend the import failed")) with pytest.raises(ImportError) as _: # noqa: F841 From 632c5c605f08cd133bdd4ef1e43a31d181518389 Mon Sep 17 00:00:00 2001 From: Rui Wang Date: Thu, 20 Jun 2019 14:52:09 -0700 Subject: [PATCH 2/6] Use script mode in 1.13 and newer --- src/sagemaker/tensorflow/estimator.py | 12 ++++++------ tests/unit/test_tf_estimator.py | 9 ++++----- tox.ini | 2 +- 3 files changed, 11 insertions(+), 12 deletions(-) diff --git a/src/sagemaker/tensorflow/estimator.py b/src/sagemaker/tensorflow/estimator.py index cebc9ecdf5..eec1c55cd1 100644 --- a/src/sagemaker/tensorflow/estimator.py +++ b/src/sagemaker/tensorflow/estimator.py @@ -323,12 +323,12 @@ def _validate_args( ) ) - if (not self._script_mode_enabled()) and \ - [int(s) for s in self.framework_version.split('.')] >= self._LOWEST_SCRIPT_MODE_ONLY_VERSION: - raise AttributeError( - 'Legacy mode is deprecated in versions 1.13 and higher.' 
- 'Please set the script_mode argument to True to use Script Mode' - ) + if (not self._script_mode_enabled()) and self._only_script_mode_supported(): + logger.warning('Legacy mode is deprecated in versions 1.13 and higher. Using script mode instead.') + self.script_mode = True + + def _only_script_mode_supported(self): + return [int(s) for s in self.framework_version.split('.')] >= self._LOWEST_SCRIPT_MODE_ONLY_VERSION def _validate_requirements_file(self, requirements_file): if not requirements_file: diff --git a/tests/unit/test_tf_estimator.py b/tests/unit/test_tf_estimator.py index a5bbfbc358..848a9df628 100644 --- a/tests/unit/test_tf_estimator.py +++ b/tests/unit/test_tf_estimator.py @@ -870,11 +870,10 @@ def test_script_mode_deprecated_args(sagemaker_session): ) in str(e.value) -def test_legacy_mode_deprecation_error(sagemaker_session): - with pytest.raises(AttributeError) as e: - _build_tf(sagemaker_session=sagemaker_session, framework_version='1.13.1', - py_version='py2', script_mode=False) - assert 'Legacy mode is deprecated' in str(e.value) +def test_legacy_mode_deprecated(sagemaker_session): + tf = _build_tf(sagemaker_session=sagemaker_session, framework_version='1.13.1', + py_version='py2', script_mode=False) + assert tf._script_mode_enabled() is True def test_script_mode_enabled(sagemaker_session): diff --git a/tox.ini b/tox.ini index c1d8d5196f..0382d9d1e7 100644 --- a/tox.ini +++ b/tox.ini @@ -58,7 +58,7 @@ passenv = commands = coverage run --source sagemaker -m pytest {posargs} {env:IGNORE_COVERAGE:} coverage report --fail-under=90 --omit */tensorflow/tensorflow_serving/* -deps = .[test] +extras = test [testenv:flake8] basepython = python3 From aa2b998871bb93557627f3ac480746701980b9ed Mon Sep 17 00:00:00 2001 From: Rui Wang Date: Thu, 20 Jun 2019 17:17:39 -0700 Subject: [PATCH 3/6] changes based on cr feedback --- src/sagemaker/utils.py | 8 ++++++++ tests/unit/test_tf_estimator.py | 4 ++++ 2 files changed, 12 insertions(+) diff --git 
a/src/sagemaker/utils.py b/src/sagemaker/utils.py index 17b36418d9..6b2c7dcee5 100644 --- a/src/sagemaker/utils.py +++ b/src/sagemaker/utils.py @@ -124,6 +124,14 @@ def get_config_value(key_path, config): def get_short_version(framework_version): + """Return short version in the format of x.x + + Args: + framework_version: The version string to be shortened. + + Returns: + str: The short version string + """ return '.'.join(framework_version.split('.')[:2]) diff --git a/tests/unit/test_tf_estimator.py b/tests/unit/test_tf_estimator.py index 848a9df628..ad9c55f661 100644 --- a/tests/unit/test_tf_estimator.py +++ b/tests/unit/test_tf_estimator.py @@ -875,6 +875,10 @@ def test_legacy_mode_deprecated(sagemaker_session): py_version='py2', script_mode=False) assert tf._script_mode_enabled() is True + tf = _build_tf(sagemaker_session=sagemaker_session, framework_version='1.12', + py_version='py2', script_mode=False) + assert tf._script_mode_enabled() is False + def test_script_mode_enabled(sagemaker_session): tf = _build_tf(sagemaker_session=sagemaker_session, py_version="py3") From b4a6f6c11dc61e522b8fad6fce902088df921792 Mon Sep 17 00:00:00 2001 From: Rui Wang Date: Mon, 24 Jun 2019 09:42:25 -0700 Subject: [PATCH 4/6] reformatting --- src/sagemaker/tensorflow/estimator.py | 10 +++++++--- src/sagemaker/utils.py | 2 +- tests/unit/test_tf_estimator.py | 16 ++++++++++++---- tests/unit/test_utils.py | 4 ++-- 4 files changed, 22 insertions(+), 10 deletions(-) diff --git a/src/sagemaker/tensorflow/estimator.py b/src/sagemaker/tensorflow/estimator.py index eec1c55cd1..dbc4e88b9a 100644 --- a/src/sagemaker/tensorflow/estimator.py +++ b/src/sagemaker/tensorflow/estimator.py @@ -190,7 +190,7 @@ class TensorFlow(Framework): __framework_name__ = "tensorflow" - LATEST_VERSION = '1.13' + LATEST_VERSION = "1.13" """The latest version of TensorFlow included in the SageMaker pre-built Docker images.""" _LOWEST_SCRIPT_MODE_ONLY_VERSION = [1, 13] @@ -324,11 +324,15 @@ def _validate_args( ) 
if (not self._script_mode_enabled()) and self._only_script_mode_supported(): - logger.warning('Legacy mode is deprecated in versions 1.13 and higher. Using script mode instead.') + logger.warning( + "Legacy mode is deprecated in versions 1.13 and higher. Using script mode instead." + ) self.script_mode = True def _only_script_mode_supported(self): - return [int(s) for s in self.framework_version.split('.')] >= self._LOWEST_SCRIPT_MODE_ONLY_VERSION + return [ + int(s) for s in self.framework_version.split(".") + ] >= self._LOWEST_SCRIPT_MODE_ONLY_VERSION def _validate_requirements_file(self, requirements_file): if not requirements_file: diff --git a/src/sagemaker/utils.py b/src/sagemaker/utils.py index 6b2c7dcee5..3fa07fb678 100644 --- a/src/sagemaker/utils.py +++ b/src/sagemaker/utils.py @@ -132,7 +132,7 @@ def get_short_version(framework_version): Returns: str: The short version string """ - return '.'.join(framework_version.split('.')[:2]) + return ".".join(framework_version.split(".")[:2]) def to_str(value): diff --git a/tests/unit/test_tf_estimator.py b/tests/unit/test_tf_estimator.py index ad9c55f661..9b4c5333df 100644 --- a/tests/unit/test_tf_estimator.py +++ b/tests/unit/test_tf_estimator.py @@ -871,12 +871,20 @@ def test_script_mode_deprecated_args(sagemaker_session): def test_legacy_mode_deprecated(sagemaker_session): - tf = _build_tf(sagemaker_session=sagemaker_session, framework_version='1.13.1', - py_version='py2', script_mode=False) + tf = _build_tf( + sagemaker_session=sagemaker_session, + framework_version="1.13.1", + py_version="py2", + script_mode=False, + ) assert tf._script_mode_enabled() is True - tf = _build_tf(sagemaker_session=sagemaker_session, framework_version='1.12', - py_version='py2', script_mode=False) + tf = _build_tf( + sagemaker_session=sagemaker_session, + framework_version="1.12", + py_version="py2", + script_mode=False, + ) assert tf._script_mode_enabled() is False diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py 
index ff7bfff356..d962125d3c 100644 --- a/tests/unit/test_utils.py +++ b/tests/unit/test_utils.py @@ -48,8 +48,8 @@ def test_get_config_value(): def test_get_short_version(): - assert sagemaker.utils.get_short_version('1.13.1') == '1.13' - assert sagemaker.utils.get_short_version('1.13') == '1.13' + assert sagemaker.utils.get_short_version("1.13.1") == "1.13" + assert sagemaker.utils.get_short_version("1.13") == "1.13" def test_deferred_error(): From fcbb7a6493efe5a529cd0749f4cd8036952f8043 Mon Sep 17 00:00:00 2001 From: Rui Wang Date: Mon, 24 Jun 2019 11:45:28 -0700 Subject: [PATCH 5/6] doc fix and remove framework mode tests --- doc/using_tf.rst | 16 +- src/sagemaker/tensorflow/deploying_python.rst | 199 ------------- src/sagemaker/tensorflow/estimator.py | 6 +- tests/integ/test_local_mode.py | 7 +- tests/integ/test_tf.py | 261 ------------------ tests/integ/test_tf_cifar.py | 6 +- tests/integ/test_tf_keras.py | 3 +- 7 files changed, 16 insertions(+), 482 deletions(-) delete mode 100644 src/sagemaker/tensorflow/deploying_python.rst delete mode 100644 tests/integ/test_tf.py diff --git a/doc/using_tf.rst b/doc/using_tf.rst index d8463942ce..d2d9228153 100644 --- a/doc/using_tf.rst +++ b/doc/using_tf.rst @@ -443,20 +443,10 @@ After a TensorFlow estimator has been fit, it saves a TensorFlow SavedModel in the S3 location defined by ``output_path``. You can call ``deploy`` on a TensorFlow estimator to create a SageMaker Endpoint. -SageMaker provides two different options for deploying TensorFlow models to a SageMaker -Endpoint: +Your model will be deployed to a TensorFlow Serving-based server. The server provides a super-set of the +`TensorFlow Serving REST API `_. -- The first option uses a Python-based server that allows you to specify your own custom - input and output handling functions in a Python script. This is the default option. - - See `Deploying to Python-based Endpoints `_ to learn how to use this option. 
- - -- The second option uses a TensorFlow Serving-based server to provide a super-set of the - `TensorFlow Serving REST API `_. This option - does not require (or allow) a custom python script. - - See `Deploying to TensorFlow Serving Endpoints `_ to learn how to use this option. +See `Deploying to TensorFlow Serving Endpoints `_ to learn how to deploy your model and make inference requests. SageMaker TensorFlow Docker containers diff --git a/src/sagemaker/tensorflow/deploying_python.rst b/src/sagemaker/tensorflow/deploying_python.rst deleted file mode 100644 index 7722e5124a..0000000000 --- a/src/sagemaker/tensorflow/deploying_python.rst +++ /dev/null @@ -1,199 +0,0 @@ -Deploying to Python-based Endpoints -=================================== - -Deploying from an Estimator -~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -After a TensorFlow estimator has been fit, it saves a TensorFlow ``SavedModel`` in -the S3 location defined by ``output_path``. You can call ``deploy`` on a TensorFlow -estimator to create a SageMaker Endpoint. - -A common usage of the ``deploy`` method, after the TensorFlow estimator has been fit look -like this: - -.. code:: python - - from sagemaker.tensorflow import TensorFlow - - estimator = TensorFlow(entry_point='tf-train.py', ..., train_instance_count=1, - train_instance_type='ml.c4.xlarge', framework_version='1.10.0') - - estimator.fit(inputs) - - predictor = estimator.deploy(initial_instance_count=1, instance_type='ml.c4.xlarge') - - -The code block above deploys a SageMaker Endpoint with one instance of the type 'ml.c4.xlarge'. - -Python-based TensorFlow serving on SageMaker has support for `Elastic Inference `_, which allows for inference acceleration to a hosted endpoint for a fraction of the cost of using a full GPU instance. In order to attach an Elastic Inference accelerator to your endpoint provide the accelerator type to ``accelerator_type`` to your ``deploy`` call. - -.. 
code:: python - - predictor = estimator.deploy(initial_instance_count=1, - instance_type='ml.c5.xlarge', - accelerator_type='ml.eia1.medium') - -What happens when deploy is called -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Calling ``deploy`` starts the process of creating a SageMaker Endpoint. This process includes the following steps. - -- Starts ``initial_instance_count`` EC2 instances of the type ``instance_type``. -- On each instance, it will do the following steps: - - - start a Docker container optimized for TensorFlow Serving, see `SageMaker TensorFlow Docker containers`_. - - start a `TensorFlow Serving` process configured to run your model. - - start a Python-based HTTP server which supports protobuf, JSON and CSV content types, and can run your custom - input and output python functions. See `Making predictions against a SageMaker Endpoint`_. - - -When the ``deploy`` call finishes, the created SageMaker Endpoint is ready for prediction requests. The next chapter will explain -how to make predictions against the Endpoint, how to use different content-types in your requests, and how to extend the Web server -functionality. - -Deploying directly from model artifacts -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -If you already have existing model artifacts, you can skip training and deploy them directly to an endpoint: - -.. code:: python - - from sagemaker.tensorflow import TensorFlowModel - - tf_model = TensorFlowModel(model_data='s3://mybucket/model.tar.gz', - role='MySageMakerRole', - entry_point='entry.py', - name='model_name') - - predictor = tf_model.deploy(initial_instance_count=1, instance_type='ml.c4.xlarge') - -You can also optionally specify a pip `requirements file `_ if you need to install additional packages into the deployed -runtime environment by including it in your source_dir and specifying it in the ``'SAGEMAKER_REQUIREMENTS'`` env variable: - -.. 
code:: python - - from sagemaker.tensorflow import TensorFlowModel - - tf_model = TensorFlowModel(model_data='s3://mybucket/model.tar.gz', - role='MySageMakerRole', - entry_point='entry.py', - source_dir='my_src', # directory which contains entry_point script and requirements file - name='model_name', - env={'SAGEMAKER_REQUIREMENTS': 'requirements.txt'}) # path relative to source_dir - - predictor = tf_model.deploy(initial_instance_count=1, instance_type='ml.c4.xlarge') - - -Making predictions against a SageMaker Endpoint -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -The following code adds a prediction request to the previous code example: - -.. code:: python - - estimator = TensorFlow(entry_point='tf-train.py', ..., train_instance_count=1, - train_instance_type='ml.c4.xlarge', framework_version='1.10.0') - - estimator.fit(inputs) - - predictor = estimator.deploy(initial_instance_count=1, instance_type='ml.c4.xlarge') - - result = predictor.predict([6.4, 3.2, 4.5, 1.5]) - -The ``predictor.predict`` method call takes one parameter, the input ``data`` for which you want the SageMaker Endpoint -to provide inference. ``predict`` will serialize the input data, and send it in as request to the SageMaker Endpoint by -an ``InvokeEndpoint`` SageMaker operation. ``InvokeEndpoint`` operation requests can be made by ``predictor.predict``, -by boto3 `SageMakerRuntime `_ -client or by AWS CLI. - -The SageMaker Endpoint web server will process the request, make an inference using the deployed model, and return a response. -The ``result`` returned by ``predict`` is -a Python dictionary with the model prediction. In the code example above, the prediction ``result`` looks like this: - -.. 
code:: python - - {'result': - {'classifications': [ - {'classes': [ - {'label': '0', 'score': 0.0012890376383438706}, - {'label': '1', 'score': 0.9814321994781494}, - {'label': '2', 'score': 0.017278732731938362} - ]} - ]} - } - -Specifying the output of a prediction request -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -The format of the prediction ``result`` is determined by the parameter ``export_outputs`` of the `tf.estimator.EstimatorSpec `_ that you returned when you created your ``model_fn``, see -`Example of a complete model_fn`_ for an example of ``export_outputs``. - -More information on how to create ``export_outputs`` can find in `specifying the outputs of a custom model `_. - -Endpoint prediction request handling -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Whenever a prediction request is made to a SageMaker Endpoint via a ``InvokeEndpoint`` SageMaker operation, the request will -be deserialized by the web server, sent to TensorFlow Serving, and serialized back to the client as response. - -The TensorFlow Web server breaks request handling into three steps: - -- input processing, -- TensorFlow Serving prediction, and -- output processing. - -The SageMaker Endpoint provides default input and output processing, which support by default JSON, CSV, and protobuf requests. -This process looks like this: - -.. 
code:: python - - # Deserialize the Invoke request body into an object we can perform prediction on - deserialized_input = input_fn(serialized_input, request_content_type) - - # Perform prediction on the deserialized object, with the loaded model - prediction_result = make_tensorflow_serving_prediction(deserialized_input) - - # Serialize the prediction result into the desired response content type - serialized_output = output_fn(prediction_result, accepts) - -The common functionality can be extended by the addiction of the following two functions to your training script: - -Overriding input preprocessing with an ``input_fn`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -An example of ``input_fn`` for the content-type "application/python-pickle" can be seen below: - -.. code:: python - - import numpy as np - - def input_fn(serialized_input, content_type): - """An input_fn that loads a pickled object""" - if request_content_type == "application/python-pickle": - deserialized_input = pickle.loads(serialized_input) - return deserialized_input - else: - # Handle other content-types here or raise an Exception - # if the content type is not supported. - pass - -Overriding output postprocessing with an ``output_fn`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -An example of ``output_fn`` for the accept type "application/python-pickle" can be seen below: - -.. code:: python - - import numpy as np - - def output_fn(prediction_result, accepts): - """An output_fn that dumps a pickled object as response""" - if request_content_type == "application/python-pickle": - return np.dumps(prediction_result) - else: - # Handle other content-types here or raise an Exception - # if the content type is not supported. - pass - -A example with ``input_fn`` and ``output_fn`` above can be found in -`here `_. 
diff --git a/src/sagemaker/tensorflow/estimator.py b/src/sagemaker/tensorflow/estimator.py index dbc4e88b9a..ba807742f2 100644 --- a/src/sagemaker/tensorflow/estimator.py +++ b/src/sagemaker/tensorflow/estimator.py @@ -26,7 +26,7 @@ from sagemaker.tensorflow.defaults import TF_VERSION from sagemaker.tensorflow.model import TensorFlowModel from sagemaker.tensorflow.serving import Model -from sagemaker.utils import get_config_value, get_short_version +from sagemaker import utils from sagemaker.vpc_utils import VPC_CONFIG_DEFAULT logger = logging.getLogger("sagemaker") @@ -502,7 +502,7 @@ def _create_tfs_model(self, role=None, vpc_config_override=VPC_CONFIG_DEFAULT): image=self.image_name, name=self._current_job_name, container_log_level=self.container_log_level, - framework_version=get_short_version(self.framework_version), + framework_version=utils.get_short_version(self.framework_version), sagemaker_session=self.sagemaker_session, vpc_config=self.get_vpc_config(vpc_config_override), ) @@ -566,7 +566,7 @@ def hyperparameters(self): return hyperparameters def _default_s3_path(self, directory, mpi=False): - local_code = get_config_value("local.local_code", self.sagemaker_session.config) + local_code = utils.get_config_value("local.local_code", self.sagemaker_session.config) if self.sagemaker_session.local_mode and local_code: return "/opt/ml/shared/{}".format(directory) elif mpi: diff --git a/tests/integ/test_local_mode.py b/tests/integ/test_local_mode.py index f8f31c7516..db2225db14 100644 --- a/tests/integ/test_local_mode.py +++ b/tests/integ/test_local_mode.py @@ -85,14 +85,14 @@ def _create_model(output_path): @pytest.mark.local_mode @pytest.mark.skipif(PYTHON_VERSION != "py2", reason="TensorFlow image supports only python 2.") -def test_tf_local_mode(tf_full_version, sagemaker_local_session): +def test_tf_local_mode(sagemaker_local_session): with timeout(minutes=5): script_path = os.path.join(DATA_DIR, "iris", "iris-dnn-classifier.py") estimator = TensorFlow( 
entry_point=script_path, role="SageMakerRole", - framework_version=tf_full_version, + framework_version='1.12', training_steps=1, evaluation_steps=1, hyperparameters={"input_tensor_name": "inputs"}, @@ -135,6 +135,7 @@ def test_tf_distributed_local_mode(sagemaker_local_session): estimator = TensorFlow( entry_point=script_path, role="SageMakerRole", + framework_version='1.12', training_steps=1, evaluation_steps=1, hyperparameters={"input_tensor_name": "inputs"}, @@ -176,6 +177,7 @@ def test_tf_local_data(sagemaker_local_session): estimator = TensorFlow( entry_point=script_path, role="SageMakerRole", + framework_version='1.12', training_steps=1, evaluation_steps=1, hyperparameters={"input_tensor_name": "inputs"}, @@ -216,6 +218,7 @@ def test_tf_local_data_local_script(): estimator = TensorFlow( entry_point=script_path, role="SageMakerRole", + framework_version='1.12', training_steps=1, evaluation_steps=1, hyperparameters={"input_tensor_name": "inputs"}, diff --git a/tests/integ/test_tf.py b/tests/integ/test_tf.py deleted file mode 100644 index 6d04b5026e..0000000000 --- a/tests/integ/test_tf.py +++ /dev/null @@ -1,261 +0,0 @@ -# Copyright 2017-2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"). You -# may not use this file except in compliance with the License. A copy of -# the License is located at -# -# http://aws.amazon.com/apache2.0/ -# -# or in the "license" file accompanying this file. This file is -# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF -# ANY KIND, either express or implied. See the License for the specific -# language governing permissions and limitations under the License. 
-from __future__ import absolute_import - -import os -import time - -import pytest - -import tests.integ -from sagemaker.tensorflow import TensorFlow, TensorFlowModel -from sagemaker.utils import sagemaker_timestamp, unique_name_from_base -from tests.integ import DATA_DIR, TRAINING_DEFAULT_TIMEOUT_MINUTES, PYTHON_VERSION -from tests.integ.timeout import timeout_and_delete_endpoint_by_name, timeout -from tests.integ.vpc_test_utils import ( - get_or_create_vpc_resources, - setup_security_group_for_encryption, -) - -DATA_PATH = os.path.join(DATA_DIR, "iris", "data") - - -@pytest.fixture(scope="module") -@pytest.mark.skipif(PYTHON_VERSION != "py2", reason="TensorFlow image supports only python 2.") -def tf_training_job(sagemaker_session, tf_full_version): - with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES): - script_path = os.path.join(DATA_DIR, "iris", "iris-dnn-classifier.py") - - estimator = TensorFlow( - entry_point=script_path, - role="SageMakerRole", - framework_version=tf_full_version, - training_steps=1, - evaluation_steps=1, - checkpoint_path="/opt/ml/model", - hyperparameters={"input_tensor_name": "inputs"}, - train_instance_count=1, - train_instance_type="ml.c4.xlarge", - sagemaker_session=sagemaker_session, - base_job_name="test-tf", - ) - - inputs = sagemaker_session.upload_data(path=DATA_PATH, key_prefix="integ-test-data/tf_iris") - job_name = unique_name_from_base("test-tf-train") - estimator.fit(inputs, job_name=job_name) - print("job succeeded: {}".format(estimator.latest_training_job.name)) - - return estimator.latest_training_job.name - - -@pytest.mark.canary_quick -@pytest.mark.regional_testing -@pytest.mark.skipif(PYTHON_VERSION != "py2", reason="TensorFlow image supports only python 2.") -def test_deploy_model(sagemaker_session, tf_training_job): - endpoint_name = "test-tf-deploy-model-{}".format(sagemaker_timestamp()) - - with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session): - desc = 
sagemaker_session.sagemaker_client.describe_training_job( - TrainingJobName=tf_training_job - ) - model_data = desc["ModelArtifacts"]["S3ModelArtifacts"] - - script_path = os.path.join(DATA_DIR, "iris", "iris-dnn-classifier.py") - model = TensorFlowModel( - model_data, - "SageMakerRole", - entry_point=script_path, - sagemaker_session=sagemaker_session, - ) - - json_predictor = model.deploy( - initial_instance_count=1, instance_type="ml.c4.xlarge", endpoint_name=endpoint_name - ) - - features = [6.4, 3.2, 4.5, 1.5] - dict_result = json_predictor.predict({"inputs": features}) - print("predict result: {}".format(dict_result)) - list_result = json_predictor.predict(features) - print("predict result: {}".format(list_result)) - - assert dict_result == list_result - - -@pytest.mark.canary_quick -@pytest.mark.regional_testing -@pytest.mark.skipif( - tests.integ.test_region() not in tests.integ.EI_SUPPORTED_REGIONS, - reason="EI isn't supported in that specific region.", -) -@pytest.mark.skipif(PYTHON_VERSION != "py2", reason="TensorFlow image supports only python 2.") -def test_deploy_model_with_accelerator(sagemaker_session, tf_training_job, ei_tf_full_version): - endpoint_name = "test-tf-deploy-model-ei-{}".format(sagemaker_timestamp()) - - with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session): - desc = sagemaker_session.sagemaker_client.describe_training_job( - TrainingJobName=tf_training_job - ) - model_data = desc["ModelArtifacts"]["S3ModelArtifacts"] - - script_path = os.path.join(DATA_DIR, "iris", "iris-dnn-classifier.py") - model = TensorFlowModel( - model_data, - "SageMakerRole", - entry_point=script_path, - framework_version=ei_tf_full_version, - sagemaker_session=sagemaker_session, - ) - - json_predictor = model.deploy( - initial_instance_count=1, - instance_type="ml.c4.xlarge", - endpoint_name=endpoint_name, - accelerator_type="ml.eia1.medium", - ) - - features = [6.4, 3.2, 4.5, 1.5] - dict_result = json_predictor.predict({"inputs": 
features}) - print("predict result: {}".format(dict_result)) - list_result = json_predictor.predict(features) - print("predict result: {}".format(list_result)) - - assert dict_result == list_result - - -@pytest.mark.skipif(PYTHON_VERSION != "py2", reason="TensorFlow image supports only python 2.") -def test_tf_async(sagemaker_session): - with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES): - script_path = os.path.join(DATA_DIR, "iris", "iris-dnn-classifier.py") - - estimator = TensorFlow( - entry_point=script_path, - role="SageMakerRole", - training_steps=1, - evaluation_steps=1, - checkpoint_path="/opt/ml/model", - hyperparameters={"input_tensor_name": "inputs"}, - train_instance_count=1, - train_instance_type="ml.c4.xlarge", - sagemaker_session=sagemaker_session, - base_job_name="test-tf", - ) - - inputs = estimator.sagemaker_session.upload_data( - path=DATA_PATH, key_prefix="integ-test-data/tf_iris" - ) - job_name = unique_name_from_base("test-tf-async") - estimator.fit(inputs, wait=False, job_name=job_name) - training_job_name = estimator.latest_training_job.name - time.sleep(20) - - endpoint_name = training_job_name - with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session): - estimator = TensorFlow.attach( - training_job_name=training_job_name, sagemaker_session=sagemaker_session - ) - json_predictor = estimator.deploy( - initial_instance_count=1, instance_type="ml.c4.xlarge", endpoint_name=endpoint_name - ) - - result = json_predictor.predict([6.4, 3.2, 4.5, 1.5]) - print("predict result: {}".format(result)) - - -@pytest.mark.skipif(PYTHON_VERSION != "py2", reason="TensorFlow image supports only python 2.") -def test_tf_vpc_multi(sagemaker_session, tf_full_version): - """Test Tensorflow multi-instance using the same VpcConfig for training and inference""" - instance_type = "ml.c4.xlarge" - instance_count = 2 - - train_input = sagemaker_session.upload_data( - path=os.path.join(DATA_DIR, "iris", "data"), 
key_prefix="integ-test-data/tf_iris" - ) - script_path = os.path.join(DATA_DIR, "iris", "iris-dnn-classifier.py") - - ec2_client = sagemaker_session.boto_session.client("ec2") - subnet_ids, security_group_id = get_or_create_vpc_resources( - ec2_client, sagemaker_session.boto_session.region_name - ) - - setup_security_group_for_encryption(ec2_client, security_group_id) - - estimator = TensorFlow( - entry_point=script_path, - role="SageMakerRole", - framework_version=tf_full_version, - training_steps=1, - evaluation_steps=1, - hyperparameters={"input_tensor_name": "inputs"}, - train_instance_count=instance_count, - train_instance_type=instance_type, - sagemaker_session=sagemaker_session, - base_job_name="test-vpc-tf", - subnets=subnet_ids, - security_group_ids=[security_group_id], - encrypt_inter_container_traffic=True, - ) - job_name = unique_name_from_base("test-tf-vpc-multi") - - with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES): - estimator.fit(train_input, job_name=job_name) - print("training job succeeded: {}".format(estimator.latest_training_job.name)) - - job_desc = sagemaker_session.sagemaker_client.describe_training_job( - TrainingJobName=estimator.latest_training_job.name - ) - assert set(subnet_ids) == set(job_desc["VpcConfig"]["Subnets"]) - assert [security_group_id] == job_desc["VpcConfig"]["SecurityGroupIds"] - assert job_desc["EnableInterContainerTrafficEncryption"] is True - - endpoint_name = estimator.latest_training_job.name - with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session): - model = estimator.create_model() - json_predictor = model.deploy( - initial_instance_count=instance_count, - instance_type="ml.c4.xlarge", - endpoint_name=endpoint_name, - ) - - features = [6.4, 3.2, 4.5, 1.5] - dict_result = json_predictor.predict({"inputs": features}) - print("predict result: {}".format(dict_result)) - list_result = json_predictor.predict(features) - print("predict result: {}".format(list_result)) - - assert dict_result == 
list_result - - model_desc = sagemaker_session.sagemaker_client.describe_model(ModelName=model.name) - assert set(subnet_ids) == set(model_desc["VpcConfig"]["Subnets"]) - assert [security_group_id] == model_desc["VpcConfig"]["SecurityGroupIds"] - - -@pytest.mark.skipif(PYTHON_VERSION != "py2", reason="TensorFlow image supports only python 2.") -def test_failed_tf_training(sagemaker_session, tf_full_version): - with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES): - script_path = os.path.join(DATA_DIR, "iris", "failure_script.py") - estimator = TensorFlow( - entry_point=script_path, - role="SageMakerRole", - framework_version=tf_full_version, - training_steps=1, - evaluation_steps=1, - hyperparameters={"input_tensor_name": "inputs"}, - train_instance_count=1, - train_instance_type="ml.c4.xlarge", - sagemaker_session=sagemaker_session, - ) - job_name = unique_name_from_base("test-tf-fail") - - with pytest.raises(ValueError) as e: - estimator.fit(job_name=job_name) - assert "This failure is expected" in str(e.value) diff --git a/tests/integ/test_tf_cifar.py b/tests/integ/test_tf_cifar.py index ed7418b77d..36c05beade 100644 --- a/tests/integ/test_tf_cifar.py +++ b/tests/integ/test_tf_cifar.py @@ -44,7 +44,7 @@ def __call__(self, data): or tests.integ.test_region() in tests.integ.TRAINING_NO_P2_REGIONS, reason="no ml.p2 instances in these regions", ) -def test_cifar(sagemaker_session, tf_full_version): +def test_cifar(sagemaker_session): with timeout(minutes=45): script_path = os.path.join(tests.integ.DATA_DIR, "cifar_10", "source") @@ -54,8 +54,8 @@ def test_cifar(sagemaker_session, tf_full_version): entry_point="resnet_cifar_10.py", source_dir=script_path, role="SageMakerRole", - framework_version=tf_full_version, - training_steps=500, + framework_version='1.12', + training_steps=50, evaluation_steps=5, train_instance_count=2, train_instance_type="ml.p2.xlarge", diff --git a/tests/integ/test_tf_keras.py b/tests/integ/test_tf_keras.py index 9fea4a7ca5..ac27816dcd 
100644 --- a/tests/integ/test_tf_keras.py +++ b/tests/integ/test_tf_keras.py @@ -32,7 +32,7 @@ tests.integ.test_region() in tests.integ.HOSTING_NO_P2_REGIONS, reason="no ml.p2 instances in these regions", ) -def test_keras(sagemaker_session, tf_full_version): +def test_keras(sagemaker_session): script_path = os.path.join(tests.integ.DATA_DIR, "cifar_10", "source") dataset_path = os.path.join(tests.integ.DATA_DIR, "cifar_10", "data") @@ -41,6 +41,7 @@ def test_keras(sagemaker_session, tf_full_version): entry_point="keras_cnn_cifar_10.py", source_dir=script_path, role="SageMakerRole", + framework_version='1.12', sagemaker_session=sagemaker_session, hyperparameters={"learning_rate": 1e-4, "decay": 1e-6}, training_steps=50, From f56c4b05932431d5c9c2d1590bc0d9e307f428c2 Mon Sep 17 00:00:00 2001 From: Rui Wang Date: Mon, 24 Jun 2019 13:38:17 -0700 Subject: [PATCH 6/6] reformatting --- tests/integ/test_local_mode.py | 8 ++++---- tests/integ/test_tf_cifar.py | 2 +- tests/integ/test_tf_keras.py | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/integ/test_local_mode.py b/tests/integ/test_local_mode.py index db2225db14..d5cbc67e62 100644 --- a/tests/integ/test_local_mode.py +++ b/tests/integ/test_local_mode.py @@ -92,7 +92,7 @@ def test_tf_local_mode(sagemaker_local_session): estimator = TensorFlow( entry_point=script_path, role="SageMakerRole", - framework_version='1.12', + framework_version="1.12", training_steps=1, evaluation_steps=1, hyperparameters={"input_tensor_name": "inputs"}, @@ -135,7 +135,7 @@ def test_tf_distributed_local_mode(sagemaker_local_session): estimator = TensorFlow( entry_point=script_path, role="SageMakerRole", - framework_version='1.12', + framework_version="1.12", training_steps=1, evaluation_steps=1, hyperparameters={"input_tensor_name": "inputs"}, @@ -177,7 +177,7 @@ def test_tf_local_data(sagemaker_local_session): estimator = TensorFlow( entry_point=script_path, role="SageMakerRole", - framework_version='1.12', + 
framework_version="1.12", training_steps=1, evaluation_steps=1, hyperparameters={"input_tensor_name": "inputs"}, @@ -218,7 +218,7 @@ def test_tf_local_data_local_script(): estimator = TensorFlow( entry_point=script_path, role="SageMakerRole", - framework_version='1.12', + framework_version="1.12", training_steps=1, evaluation_steps=1, hyperparameters={"input_tensor_name": "inputs"}, diff --git a/tests/integ/test_tf_cifar.py b/tests/integ/test_tf_cifar.py index 36c05beade..75a1f8635f 100644 --- a/tests/integ/test_tf_cifar.py +++ b/tests/integ/test_tf_cifar.py @@ -54,7 +54,7 @@ def test_cifar(sagemaker_session): entry_point="resnet_cifar_10.py", source_dir=script_path, role="SageMakerRole", - framework_version='1.12', + framework_version="1.12", training_steps=50, evaluation_steps=5, train_instance_count=2, diff --git a/tests/integ/test_tf_keras.py b/tests/integ/test_tf_keras.py index ac27816dcd..9939e67b2a 100644 --- a/tests/integ/test_tf_keras.py +++ b/tests/integ/test_tf_keras.py @@ -41,7 +41,7 @@ def test_keras(sagemaker_session): entry_point="keras_cnn_cifar_10.py", source_dir=script_path, role="SageMakerRole", - framework_version='1.12', + framework_version="1.12", sagemaker_session=sagemaker_session, hyperparameters={"learning_rate": 1e-4, "decay": 1e-6}, training_steps=50,