From 2fef9cd55b55dc9230e4e85c5ca24d22dbd275f8 Mon Sep 17 00:00:00 2001 From: Ignacio Quintero Date: Mon, 4 Jun 2018 14:12:14 -0700 Subject: [PATCH 1/6] Allow Framework Estimators to use custom image Chainer, Tensorflow and MXNet estimators can now pass an image_name argument to the constructor to use that image instead of the default sagemaker ones. --- CHANGELOG.rst | 1 + src/sagemaker/chainer/estimator.py | 35 ++++++------- src/sagemaker/estimator.py | 27 +++++++++- src/sagemaker/mxnet/estimator.py | 38 +++++++------- src/sagemaker/tensorflow/estimator.py | 35 ++++++------- tests/unit/test_chainer.py | 69 ++++++++++++++++++++++++++ tests/unit/test_mxnet.py | 67 ++++++++++++++++++++++++- tests/unit/test_tf_estimator.py | 71 +++++++++++++++++++++++++++ 8 files changed, 284 insertions(+), 59 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 34e2b715d9..11ae75fbbf 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -7,6 +7,7 @@ CHANGELOG * bug-fix: Unit Tests: Improve unit test runtime * bug-fix: Estimators: Fix attach for LDA +* feature: Allow Chainer, Tensorflow and MXNet estimators to use a custom docker image. 
1.4.1 ===== diff --git a/src/sagemaker/chainer/estimator.py b/src/sagemaker/chainer/estimator.py index c4decba92c..7f7f0fa3d8 100644 --- a/src/sagemaker/chainer/estimator.py +++ b/src/sagemaker/chainer/estimator.py @@ -13,7 +13,7 @@ from __future__ import absolute_import from sagemaker.estimator import Framework -from sagemaker.fw_utils import create_image_uri, framework_name_from_image, framework_version_from_tag +from sagemaker.fw_utils import framework_name_from_image, framework_version_from_tag from sagemaker.chainer.defaults import CHAINER_VERSION from sagemaker.chainer.model import ChainerModel @@ -31,7 +31,7 @@ class Chainer(Framework): def __init__(self, entry_point, use_mpi=None, num_processes=None, process_slots_per_host=None, additional_mpi_options=None, source_dir=None, hyperparameters=None, py_version='py3', - framework_version=CHAINER_VERSION, **kwargs): + framework_version=CHAINER_VERSION, image_name=None, **kwargs): """ This ``Estimator`` executes an Chainer script in a managed Chainer execution environment, within a SageMaker Training Job. The managed Chainer environment is an Amazon-built Docker container that executes functions @@ -67,9 +67,12 @@ def __init__(self, entry_point, use_mpi=None, num_processes=None, process_slots_ One of 'py2' or 'py3'. framework_version (str): Chainer version you want to use for executing your model training code. List of supported versions https://github.com/aws/sagemaker-python-sdk#chainer-sagemaker-estimators + image_name (str): The container image to use for training. This will override py_version and + framework_version. The image is expected to be a modification of the SageMaker Chainer image. **kwargs: Additional kwargs passed to the :class:`~sagemaker.estimator.Framework` constructor. 
""" - super(Chainer, self).__init__(entry_point, source_dir, hyperparameters, **kwargs) + super(Chainer, self).__init__(entry_point, source_dir, hyperparameters, + image_name=image_name, **kwargs) self.py_version = py_version self.framework_version = framework_version self.use_mpi = use_mpi @@ -91,20 +94,6 @@ def hyperparameters(self): hyperparameters.update(Framework._json_encode_hyperparameters(additional_hyperparameters)) return hyperparameters - def train_image(self): - """Return the Docker image to use for training. - - The :meth:`~sagemaker.estimator.EstimatorBase.fit` method, which does the model training, calls this method to - find the image to use for model training. - - Returns: - str: The URI of the Docker image. - """ - - return create_image_uri(self.sagemaker_session.boto_session.region_name, self.__framework_name__, - self.train_instance_type, framework_version=self.framework_version, - py_version=self.py_version) - def create_model(self, model_server_workers=None): """Create a SageMaker ``ChainerModel`` object that can be deployed to an ``Endpoint``. 
@@ -120,7 +109,8 @@ def create_model(self, model_server_workers=None): enable_cloudwatch_metrics=self.enable_cloudwatch_metrics, name=self._current_job_name, container_log_level=self.container_log_level, code_location=self.code_location, py_version=self.py_version, framework_version=self.framework_version, - model_server_workers=model_server_workers, sagemaker_session=self.sagemaker_session) + model_server_workers=model_server_workers, image=self.image_name, + sagemaker_session=self.sagemaker_session) @classmethod def _prepare_init_params_from_job_description(cls, job_details): @@ -142,7 +132,14 @@ def _prepare_init_params_from_job_description(cls, job_details): if value: init_params[argument[len('sagemaker_'):]] = value - framework, py_version, tag = framework_name_from_image(init_params.pop('image')) + image_name = init_params.pop('image') + framework, py_version, tag = framework_name_from_image(image_name) + + if not framework: + # If we were unable to parse the framework name from the image it is not one of our + # officially supported images, in this case just add the image to the init params. 
+ init_params['image_name'] = image_name + return init_params init_params['py_version'] = py_version init_params['framework_version'] = framework_version_from_tag(tag) diff --git a/src/sagemaker/estimator.py b/src/sagemaker/estimator.py index 8e1b7d2281..53d35be394 100644 --- a/src/sagemaker/estimator.py +++ b/src/sagemaker/estimator.py @@ -20,7 +20,8 @@ from six import with_metaclass from sagemaker.analytics import TrainingJobAnalytics -from sagemaker.fw_utils import tar_and_upload_dir, parse_s3_url, UploadedCode, validate_source_dir +from sagemaker.fw_utils import (create_image_uri, tar_and_upload_dir, parse_s3_url, UploadedCode, + validate_source_dir) from sagemaker.job import _Job from sagemaker.local import LocalSession from sagemaker.model import Model @@ -226,6 +227,7 @@ def attach(cls, training_job_name, sagemaker_session=None): job_details = sagemaker_session.sagemaker_client.describe_training_job(TrainingJobName=training_job_name) init_params = cls._prepare_init_params_from_job_description(job_details) + print(init_params) estimator = cls(sagemaker_session=sagemaker_session, **init_params) estimator.latest_training_job = _TrainingJob(sagemaker_session=sagemaker_session, training_job_name=init_params['base_job_name']) @@ -493,7 +495,7 @@ class Framework(EstimatorBase): """ def __init__(self, entry_point, source_dir=None, hyperparameters=None, enable_cloudwatch_metrics=False, - container_log_level=logging.INFO, code_location=None, **kwargs): + container_log_level=logging.INFO, code_location=None, image_name=None, **kwargs): """Base class initializer. Subclasses which override ``__init__`` should invoke ``super()`` Args: @@ -513,6 +515,9 @@ def __init__(self, entry_point, source_dir=None, hyperparameters=None, enable_cl code_location (str): Name of the S3 bucket where custom code is uploaded (default: None). If not specified, default bucket created by ``sagemaker.session.Session`` is used. 
 **kwargs: Additional kwargs passed to the ``EstimatorBase`` constructor. + image_name (str): An alternate image name to use instead of the official SageMaker image + for the framework. This is useful to run one of the SageMaker supported frameworks + with an image containing custom dependencies. """ super(Framework, self).__init__(**kwargs) self.source_dir = source_dir @@ -521,6 +526,9 @@ def __init__(self, entry_point, source_dir=None, hyperparameters=None, enable_cl self.container_log_level = container_log_level self._hyperparameters = hyperparameters or {} self.code_location = code_location + self.image_name = image_name + print(self.image_name) + print(kwargs) def _prepare_for_training(self, job_name=None): """Set hyperparameters needed for training. This method will also validate ``source_dir``. @@ -624,6 +632,21 @@ def _prepare_init_params_from_job_description(cls, job_details): return init_params + def train_image(self): + """Return the Docker image to use for training. + + The :meth:`~sagemaker.estimator.EstimatorBase.fit` method, which does the model training, + calls this method to find the image to use for model training. + + Returns: + str: The URI of the Docker image. + """ + if self.image_name: + return self.image_name + else: + return create_image_uri(self.sagemaker_session.boto_region_name, self.__framework_name__, + self.train_instance_type, self.framework_version, py_version=self.py_version) + @classmethod def attach(cls, training_job_name, sagemaker_session=None): """Attach to an existing training job. 
diff --git a/src/sagemaker/mxnet/estimator.py b/src/sagemaker/mxnet/estimator.py index 71e72e06a1..d7176432a2 100644 --- a/src/sagemaker/mxnet/estimator.py +++ b/src/sagemaker/mxnet/estimator.py @@ -13,7 +13,7 @@ from __future__ import absolute_import from sagemaker.estimator import Framework -from sagemaker.fw_utils import create_image_uri, framework_name_from_image, framework_version_from_tag +from sagemaker.fw_utils import framework_name_from_image, framework_version_from_tag from sagemaker.mxnet.defaults import MXNET_VERSION from sagemaker.mxnet.model import MXNetModel @@ -24,7 +24,7 @@ class MXNet(Framework): __framework_name__ = "mxnet" def __init__(self, entry_point, source_dir=None, hyperparameters=None, py_version='py2', - framework_version=MXNET_VERSION, **kwargs): + framework_version=MXNET_VERSION, image_name=None, **kwargs): """ This ``Estimator`` executes an MXNet script in a managed MXNet execution environment, within a SageMaker Training Job. The managed MXNet environment is an Amazon-built Docker container that executes functions @@ -52,25 +52,15 @@ def __init__(self, entry_point, source_dir=None, hyperparameters=None, py_versio One of 'py2' or 'py3'. framework_version (str): MXNet version you want to use for executing your model training code. List of supported versions https://github.com/aws/sagemaker-python-sdk#mxnet-sagemaker-estimators + image_name (str): The container image to use for training. This will override py_version and + framework_version. The image is expected to be a modification of the SageMaker MXNet image. **kwargs: Additional kwargs passed to the :class:`~sagemaker.estimator.Framework` constructor. 
""" - super(MXNet, self).__init__(entry_point, source_dir, hyperparameters, **kwargs) + super(MXNet, self).__init__(entry_point, source_dir, hyperparameters, + image_name=image_name, **kwargs) self.py_version = py_version self.framework_version = framework_version - def train_image(self): - """Return the Docker image to use for training. - - The :meth:`~sagemaker.estimator.EstimatorBase.fit` method, which does the model training, calls this method to - find the image to use for model training. - - Returns: - str: The URI of the Docker image. - """ - return create_image_uri(self.sagemaker_session.boto_region_name, self.__framework_name__, - self.train_instance_type, framework_version=self.framework_version, - py_version=self.py_version) - def create_model(self, model_server_workers=None): """Create a SageMaker ``MXNetModel`` object that can be deployed to an ``Endpoint``. @@ -82,11 +72,16 @@ def create_model(self, model_server_workers=None): sagemaker.mxnet.model.MXNetModel: A SageMaker ``MXNetModel`` object. See :func:`~sagemaker.mxnet.model.MXNetModel` for full details. """ + kwargs = {} + # pass our custom image if there is one. 
+ if self.image_name: + kwargs['image'] = self.image_name + return MXNetModel(self.model_data, self.role, self.entry_point, source_dir=self.source_dir, enable_cloudwatch_metrics=self.enable_cloudwatch_metrics, name=self._current_job_name, container_log_level=self.container_log_level, code_location=self.code_location, py_version=self.py_version, framework_version=self.framework_version, - model_server_workers=model_server_workers, sagemaker_session=self.sagemaker_session) + model_server_workers=model_server_workers, sagemaker_session=self.sagemaker_session, **kwargs) @classmethod def _prepare_init_params_from_job_description(cls, job_details): @@ -100,7 +95,14 @@ def _prepare_init_params_from_job_description(cls, job_details): """ init_params = super(MXNet, cls)._prepare_init_params_from_job_description(job_details) - framework, py_version, tag = framework_name_from_image(init_params.pop('image')) + image_name = init_params.pop('image') + framework, py_version, tag = framework_name_from_image(image_name) + + if not framework: + # If we were unable to parse the framework name from the image it is not one of our + # officially supported images, in this case just add the image to the init params. 
+ init_params['image_name'] = image_name + return init_params init_params['py_version'] = py_version diff --git a/src/sagemaker/tensorflow/estimator.py b/src/sagemaker/tensorflow/estimator.py index 0c38873026..40eb962464 100644 --- a/src/sagemaker/tensorflow/estimator.py +++ b/src/sagemaker/tensorflow/estimator.py @@ -21,7 +21,7 @@ import threading from sagemaker.estimator import Framework -from sagemaker.fw_utils import create_image_uri, framework_name_from_image, framework_version_from_tag +from sagemaker.fw_utils import framework_name_from_image, framework_version_from_tag from sagemaker.utils import get_config_value from sagemaker.tensorflow.defaults import TF_VERSION @@ -157,7 +157,7 @@ class TensorFlow(Framework): __framework_name__ = 'tensorflow' def __init__(self, training_steps=None, evaluation_steps=None, checkpoint_path=None, py_version='py2', - framework_version=TF_VERSION, requirements_file='', **kwargs): + framework_version=TF_VERSION, requirements_file='', image_name=None, **kwargs): """Initialize an ``TensorFlow`` estimator. Args: training_steps (int): Perform this many steps of training. `None`, the default means train forever. @@ -171,9 +171,11 @@ def __init__(self, training_steps=None, evaluation_steps=None, checkpoint_path=N requirements_file (str): Path to a ``requirements.txt`` file (default: ''). The path should be within and relative to ``source_dir``. Details on the format can be found in the `Pip User Guide `_. + image_name (str): The container image to use for training. This will override py_version and + framework_version. The image is expected to be a modification of the SageMaker TensorFlow image. **kwargs: Additional kwargs passed to the Framework constructor. 
""" - super(TensorFlow, self).__init__(**kwargs) + super(TensorFlow, self).__init__(image_name=image_name, **kwargs) self.checkpoint_path = checkpoint_path self.py_version = py_version self.framework_version = framework_version @@ -257,7 +259,14 @@ def _prepare_init_params_from_job_description(cls, job_details): if value is not None: init_params[argument] = value - framework, py_version, tag = framework_name_from_image(init_params.pop('image')) + image_name = init_params.pop('image') + framework, py_version, tag = framework_name_from_image(image_name) + if not framework: + # If we were unable to parse the framework name from the image it is not one of our + # officially supported images, in this case just add the image to the init params. + init_params['image_name'] = image_name + return init_params + init_params['py_version'] = py_version # We switched image tagging scheme from regular image version (e.g. '1.0') to more expressive @@ -272,18 +281,6 @@ def _prepare_init_params_from_job_description(cls, job_details): return init_params - def train_image(self): - """Return the Docker image to use for training. - - The :meth:`~sagemaker.estimator.EstimatorBase.fit` method, which does the model training, calls this method to - find the image to use for model training. - - Returns: - str: The URI of the Docker image. - """ - return create_image_uri(self.sagemaker_session.boto_region_name, self.__framework_name__, - self.train_instance_type, self.framework_version, py_version=self.py_version) - def create_model(self, model_server_workers=None): """Create a SageMaker ``TensorFlowModel`` object that can be deployed to an ``Endpoint``. @@ -296,9 +293,9 @@ def create_model(self, model_server_workers=None): See :func:`~sagemaker.tensorflow.model.TensorFlowModel` for full details. 
""" env = {'SAGEMAKER_REQUIREMENTS': self.requirements_file} - return TensorFlowModel(self.model_data, self.role, self.entry_point, source_dir=self.source_dir, - enable_cloudwatch_metrics=self.enable_cloudwatch_metrics, env=env, - name=self._current_job_name, container_log_level=self.container_log_level, + return TensorFlowModel(self.model_data, self.role, self.entry_point, image=self.image_name, + source_dir=self.source_dir, enable_cloudwatch_metrics=self.enable_cloudwatch_metrics, + env=env, name=self._current_job_name, container_log_level=self.container_log_level, code_location=self.code_location, py_version=self.py_version, framework_version=self.framework_version, model_server_workers=model_server_workers, sagemaker_session=self.sagemaker_session) diff --git a/tests/unit/test_chainer.py b/tests/unit/test_chainer.py index 49fe59a560..b222bcd60c 100644 --- a/tests/unit/test_chainer.py +++ b/tests/unit/test_chainer.py @@ -248,6 +248,32 @@ def test_create_model(sagemaker_session, chainer_version): assert model.enable_cloudwatch_metrics == enable_cloudwatch_metrics +def test_create_model_with_custom_image(sagemaker_session): + container_log_level = '"logging.INFO"' + source_dir = 's3://mybucket/source' + enable_cloudwatch_metrics = 'true' + custom_image = 'ubuntu:latest' + chainer = Chainer(entry_point=SCRIPT_PATH, role=ROLE, sagemaker_session=sagemaker_session, + train_instance_count=INSTANCE_COUNT, train_instance_type=INSTANCE_TYPE, + image_name=custom_image, container_log_level=container_log_level, + py_version=PYTHON_VERSION, base_job_name='job', source_dir=source_dir, + enable_cloudwatch_metrics=enable_cloudwatch_metrics) + + job_name = 'new_name' + chainer.fit(inputs='s3://mybucket/train', job_name='new_name') + model = chainer.create_model() + chainer.container_log_level + + assert model.sagemaker_session == sagemaker_session + assert model.image == custom_image + assert model.entry_point == SCRIPT_PATH + assert model.role == ROLE + assert model.name == 
job_name + assert model.container_log_level == container_log_level + assert model.source_dir == source_dir + assert model.enable_cloudwatch_metrics == enable_cloudwatch_metrics + + @patch('time.strftime', return_value=TIMESTAMP) def test_chainer(strftime, sagemaker_session, chainer_version): chainer = Chainer(entry_point=SCRIPT_PATH, role=ROLE, sagemaker_session=sagemaker_session, @@ -393,3 +419,46 @@ def test_attach_wrong_framework(sagemaker_session): with pytest.raises(ValueError) as error: Chainer.attach(training_job_name='neo', sagemaker_session=sagemaker_session) assert "didn't use image for requested framework" in str(error) + + +def test_attach_custom_image(sagemaker_session): + training_image = '1.dkr.ecr.us-west-2.amazonaws.com/my_custom_chainer_image:latest' + returned_job_description = {'AlgorithmSpecification': + {'TrainingInputMode': 'File', + 'TrainingImage': training_image}, + 'HyperParameters': + {'sagemaker_submit_directory': '"s3://some/sourcedir.tar.gz"', + 'sagemaker_program': '"iris-dnn-classifier.py"', + 'sagemaker_s3_uri_training': '"sagemaker-3/integ-test-data/tf_iris"', + 'sagemaker_enable_cloudwatch_metrics': 'false', + 'sagemaker_container_log_level': '"logging.INFO"', + 'sagemaker_job_name': '"neo"', + 'training_steps': '100', + 'sagemaker_region': '"us-west-2"'}, + 'RoleArn': 'arn:aws:iam::366:role/SageMakerRole', + 'ResourceConfig': + {'VolumeSizeInGB': 30, + 'InstanceCount': 1, + 'InstanceType': 'ml.c4.xlarge'}, + 'StoppingCondition': {'MaxRuntimeInSeconds': 24 * 60 * 60}, + 'TrainingJobName': 'neo', + 'TrainingJobStatus': 'Completed', + 'OutputDataConfig': {'KmsKeyId': '', + 'S3OutputPath': 's3://place/output/neo'}, + 'TrainingJobOutput': {'S3TrainingJobOutput': 's3://here/output.tar.gz'}} + sagemaker_session.sagemaker_client.describe_training_job = Mock(name='describe_training_job', + return_value=returned_job_description) + + estimator = Chainer.attach(training_job_name='neo', sagemaker_session=sagemaker_session) + assert 
estimator.latest_training_job.job_name == 'neo' + assert estimator.role == 'arn:aws:iam::366:role/SageMakerRole' + assert estimator.train_instance_count == 1 + assert estimator.train_max_run == 24 * 60 * 60 + assert estimator.input_mode == 'File' + assert estimator.base_job_name == 'neo' + assert estimator.output_path == 's3://place/output/neo' + assert estimator.output_kms_key == '' + assert estimator.hyperparameters()['training_steps'] == '100' + assert estimator.source_dir == 's3://some/sourcedir.tar.gz' + assert estimator.entry_point == 'iris-dnn-classifier.py' + assert estimator.train_image() == training_image diff --git a/tests/unit/test_mxnet.py b/tests/unit/test_mxnet.py index f0cea93de6..d77160e621 100644 --- a/tests/unit/test_mxnet.py +++ b/tests/unit/test_mxnet.py @@ -110,7 +110,6 @@ def test_create_model(sagemaker_session, mxnet_version): job_name = 'new_name' mx.fit(inputs='s3://mybucket/train', job_name='new_name') model = mx.create_model() - mx.container_log_level assert model.sagemaker_session == sagemaker_session assert model.framework_version == mxnet_version @@ -123,6 +122,30 @@ def test_create_model(sagemaker_session, mxnet_version): assert model.enable_cloudwatch_metrics == enable_cloudwatch_metrics +def test_create_model_with_custom_image(sagemaker_session): + container_log_level = '"logging.INFO"' + source_dir = 's3://mybucket/source' + enable_cloudwatch_metrics = 'true' + custom_image = 'mxnet:2.0' + mx = MXNet(entry_point=SCRIPT_PATH, role=ROLE, sagemaker_session=sagemaker_session, + train_instance_count=INSTANCE_COUNT, train_instance_type=INSTANCE_TYPE, + image_name=custom_image, container_log_level=container_log_level, + base_job_name='job', source_dir=source_dir, enable_cloudwatch_metrics=enable_cloudwatch_metrics) + + job_name = 'new_name' + mx.fit(inputs='s3://mybucket/train', job_name='new_name') + model = mx.create_model() + + assert model.sagemaker_session == sagemaker_session + assert model.image == custom_image + assert 
model.entry_point == SCRIPT_PATH + assert model.role == ROLE + assert model.name == job_name + assert model.container_log_level == container_log_level + assert model.source_dir == source_dir + assert model.enable_cloudwatch_metrics == enable_cloudwatch_metrics + + @patch('time.strftime', return_value=TIMESTAMP) def test_mxnet(strftime, sagemaker_session, mxnet_version): mx = MXNet(entry_point=SCRIPT_PATH, role=ROLE, sagemaker_session=sagemaker_session, @@ -297,3 +320,45 @@ def test_attach_wrong_framework(sagemaker_session): with pytest.raises(ValueError) as error: MXNet.attach(training_job_name='neo', sagemaker_session=sagemaker_session) assert "didn't use image for requested framework" in str(error) + + +def test_attach_custom_image(sagemaker_session): + training_image = 'ubuntu:latest' + returned_job_description = {'AlgorithmSpecification': { + 'TrainingInputMode': 'File', + 'TrainingImage': training_image}, + 'HyperParameters': { + 'sagemaker_submit_directory': '"s3://some/sourcedir.tar.gz"', + 'sagemaker_program': '"iris-dnn-classifier.py"', + 'sagemaker_s3_uri_training': '"sagemaker-3/integ-test-data/tf_iris"', + 'sagemaker_enable_cloudwatch_metrics': 'false', + 'sagemaker_container_log_level': '"logging.INFO"', + 'sagemaker_job_name': '"neo"', + 'training_steps': '100', + 'sagemaker_region': '"us-west-2"'}, + 'RoleArn': 'arn:aws:iam::366:role/SageMakerRole', + 'ResourceConfig': { + 'VolumeSizeInGB': 30, + 'InstanceCount': 1, + 'InstanceType': 'ml.c4.xlarge'}, + 'StoppingCondition': {'MaxRuntimeInSeconds': 24 * 60 * 60}, + 'TrainingJobName': 'neo', + 'TrainingJobStatus': 'Completed', + 'OutputDataConfig': {'KmsKeyId': '', 'S3OutputPath': 's3://place/output/neo'}, + 'TrainingJobOutput': {'S3TrainingJobOutput': 's3://here/output.tar.gz'}} + sagemaker_session.sagemaker_client.describe_training_job = Mock(name='describe_training_job', + return_value=returned_job_description) + + estimator = MXNet.attach(training_job_name='neo', sagemaker_session=sagemaker_session) 
+ assert estimator.latest_training_job.job_name == 'neo' + assert estimator.image_name == training_image + assert estimator.role == 'arn:aws:iam::366:role/SageMakerRole' + assert estimator.train_instance_count == 1 + assert estimator.train_max_run == 24 * 60 * 60 + assert estimator.input_mode == 'File' + assert estimator.base_job_name == 'neo' + assert estimator.output_path == 's3://place/output/neo' + assert estimator.output_kms_key == '' + assert estimator.hyperparameters()['training_steps'] == '100' + assert estimator.source_dir == 's3://some/sourcedir.tar.gz' + assert estimator.entry_point == 'iris-dnn-classifier.py' diff --git a/tests/unit/test_tf_estimator.py b/tests/unit/test_tf_estimator.py index afd33c960c..ae98be07f1 100644 --- a/tests/unit/test_tf_estimator.py +++ b/tests/unit/test_tf_estimator.py @@ -205,6 +205,31 @@ def test_create_model(sagemaker_session, tf_version): assert model.enable_cloudwatch_metrics == enable_cloudwatch_metrics +def test_create_model_with_custom_image(sagemaker_session): + container_log_level = '"logging.INFO"' + source_dir = 's3://mybucket/source' + enable_cloudwatch_metrics = 'true' + custom_image = 'tensorflow:1.0' + tf = TensorFlow(entry_point=SCRIPT_PATH, role=ROLE, sagemaker_session=sagemaker_session, + training_steps=1000, evaluation_steps=10, train_instance_count=INSTANCE_COUNT, + train_instance_type=INSTANCE_TYPE, image_name=custom_image, + container_log_level=container_log_level, base_job_name='job', + source_dir=source_dir, enable_cloudwatch_metrics=enable_cloudwatch_metrics) + + job_name = 'doing something' + tf.fit(inputs='s3://mybucket/train', job_name=job_name) + model = tf.create_model() + + assert model.sagemaker_session == sagemaker_session + assert model.image == custom_image + assert model.entry_point == SCRIPT_PATH + assert model.role == ROLE + assert model.name == job_name + assert model.container_log_level == container_log_level + assert model.source_dir == source_dir + assert 
model.enable_cloudwatch_metrics == enable_cloudwatch_metrics + + @patch('time.strftime', return_value=TIMESTAMP) @patch('time.time', return_value=TIME) @patch('sagemaker.estimator.tar_and_upload_dir') @@ -557,3 +582,49 @@ def test_attach_wrong_framework(sagemaker_session): with pytest.raises(ValueError) as error: TensorFlow.attach(training_job_name='neo', sagemaker_session=sagemaker_session) assert "didn't use image for requested framework" in str(error) + + +def test_attach_custom_image(sagemaker_session): + training_image = '1.dkr.ecr.us-west-2.amazonaws.com/tensorflow_with_custom_binary:1.0' + rjd = { + 'AlgorithmSpecification': { + 'TrainingInputMode': 'File', + 'TrainingImage': training_image}, + 'HyperParameters': { + 'sagemaker_submit_directory': '"s3://some/sourcedir.tar.gz"', + 'checkpoint_path': '"s3://other/1508872349"', + 'sagemaker_program': '"iris-dnn-classifier.py"', + 'sagemaker_enable_cloudwatch_metrics': 'false', + 'sagemaker_container_log_level': '"logging.INFO"', + 'sagemaker_job_name': '"neo"', + 'training_steps': '100', + 'evaluation_steps': '10'}, + 'RoleArn': 'arn:aws:iam::366:role/SageMakerRole', + 'ResourceConfig': { + 'VolumeSizeInGB': 30, + 'InstanceCount': 1, + 'InstanceType': 'ml.c4.xlarge'}, + 'StoppingCondition': {'MaxRuntimeInSeconds': 24 * 60 * 60}, + 'TrainingJobName': 'neo', + 'TrainingJobStatus': 'Completed', + 'OutputDataConfig': {'KmsKeyId': '', 'S3OutputPath': 's3://place/output/neo'}, + 'TrainingJobOutput': {'S3TrainingJobOutput': 's3://here/output.tar.gz'}} + sagemaker_session.sagemaker_client.describe_training_job = Mock(name='describe_training_job', return_value=rjd) + + estimator = TensorFlow.attach(training_job_name='neo', sagemaker_session=sagemaker_session) + assert estimator.latest_training_job.job_name == 'neo' + assert estimator.role == 'arn:aws:iam::366:role/SageMakerRole' + assert estimator.train_instance_count == 1 + assert estimator.train_max_run == 24 * 60 * 60 + assert estimator.input_mode == 'File' + assert 
estimator.training_steps == 100 + assert estimator.evaluation_steps == 10 + assert estimator.input_mode == 'File' + assert estimator.base_job_name == 'neo' + assert estimator.output_path == 's3://place/output/neo' + assert estimator.output_kms_key == '' + assert estimator.hyperparameters()['training_steps'] == '100' + assert estimator.source_dir == 's3://some/sourcedir.tar.gz' + assert estimator.entry_point == 'iris-dnn-classifier.py' + assert estimator.checkpoint_path == 's3://other/1508872349' + assert estimator.train_image() == training_image From 4926e1095b4f75be8c7890ffdb8d1a43356bf090 Mon Sep 17 00:00:00 2001 From: Ignacio Quintero Date: Thu, 14 Jun 2018 13:53:23 -0700 Subject: [PATCH 2/6] Adding better docs --- src/sagemaker/chainer/README.rst | 9 +++++++++ src/sagemaker/chainer/estimator.py | 6 ++++-- src/sagemaker/estimator.py | 3 --- src/sagemaker/mxnet/README.rst | 7 +++++++ src/sagemaker/mxnet/estimator.py | 6 ++++-- src/sagemaker/tensorflow/README.rst | 9 +++++++++ src/sagemaker/tensorflow/estimator.py | 6 ++++-- 7 files changed, 37 insertions(+), 9 deletions(-) diff --git a/src/sagemaker/chainer/README.rst b/src/sagemaker/chainer/README.rst index f87a2905bf..334269dede 100644 --- a/src/sagemaker/chainer/README.rst +++ b/src/sagemaker/chainer/README.rst @@ -175,6 +175,12 @@ The following are optional arguments. When you create a ``Chainer`` object, you - ``job_name`` Name to assign for the training job that the fit() method launches. If not specified, the estimator generates a default job name, based on the training image name and current timestamp +- ``image_name`` An alternative docker image to use for training and + serving. If specified, the estimator will use this image for training and + hosting, instead of selecting the appropriate SageMaker official image based on + framework_version and py_version. 
 Refer to: `SageMaker Chainer Docker Containers + <#sagemaker-chainer-docker-containers>`_ for details on what the official images support + and where to find the source code to build your custom image. Distributed Chainer Training @@ -656,5 +662,8 @@ Currently supported versions are listed in the above table. You can also set fra minor version, which will cause your training script to be run on the latest supported patch version of that minor version. +Alternatively, you can build your own image by following the instructions in the SageMaker Chainer containers +repository, and passing ``image_name`` to the Chainer Estimator constructor. + You can visit the SageMaker Chainer containers repository here: https://github.com/aws/sagemaker-chainer-containers/ diff --git a/src/sagemaker/chainer/estimator.py b/src/sagemaker/chainer/estimator.py index 7f7f0fa3d8..a9bf391417 100644 --- a/src/sagemaker/chainer/estimator.py +++ b/src/sagemaker/chainer/estimator.py @@ -67,8 +67,10 @@ def __init__(self, entry_point, use_mpi=None, num_processes=None, process_slots_ One of 'py2' or 'py3'. framework_version (str): Chainer version you want to use for executing your model training code. List of supported versions https://github.com/aws/sagemaker-python-sdk#chainer-sagemaker-estimators - image_name (str): The container image to use for training. This will override py_version and - framework_version. The image is expected to be a modification of the SageMaker Chainer image. + image_name (str): If specified, the estimator will use this image for training and hosting, instead of + selecting the appropriate SageMaker official image based on framework_version and py_version. It can + be an ECR URL or Docker Hub image and tag: 123.dkr.ecr.us-west-2.amazonaws.com/my-custom-image:1.0, + custom-image:latest. **kwargs: Additional kwargs passed to the :class:`~sagemaker.estimator.Framework` constructor. 
""" super(Chainer, self).__init__(entry_point, source_dir, hyperparameters, diff --git a/src/sagemaker/estimator.py b/src/sagemaker/estimator.py index 53d35be394..29957b5819 100644 --- a/src/sagemaker/estimator.py +++ b/src/sagemaker/estimator.py @@ -227,7 +227,6 @@ def attach(cls, training_job_name, sagemaker_session=None): job_details = sagemaker_session.sagemaker_client.describe_training_job(TrainingJobName=training_job_name) init_params = cls._prepare_init_params_from_job_description(job_details) - print(init_params) estimator = cls(sagemaker_session=sagemaker_session, **init_params) estimator.latest_training_job = _TrainingJob(sagemaker_session=sagemaker_session, training_job_name=init_params['base_job_name']) @@ -527,8 +526,6 @@ def __init__(self, entry_point, source_dir=None, hyperparameters=None, enable_cl self._hyperparameters = hyperparameters or {} self.code_location = code_location self.image_name = image_name - print(self.image_name) - print(kwargs) def _prepare_for_training(self, job_name=None): """Set hyperparameters needed for training. This method will also validate ``source_dir``. diff --git a/src/sagemaker/mxnet/README.rst b/src/sagemaker/mxnet/README.rst index 9a5dd9e7cc..d3fc89cc31 100644 --- a/src/sagemaker/mxnet/README.rst +++ b/src/sagemaker/mxnet/README.rst @@ -153,6 +153,12 @@ The following are optional arguments. When you create an ``MXNet`` object, you c - ``job_name`` Name to assign for the training job that the fit() method launches. If not specified, the estimator generates a default job name, based on the training image name and current timestamp +- ``image_name`` An alternative docker image to use for training and + serving. If specified, the estimator will use this image for training and + hosting, instead of selecting the appropriate SageMaker official image based on + framework_version and py_version. 
Refer to: `SageMaker MXNet Docker Containers + <#sagemaker-mxnet-docker-containers>`_ for details on what the Official images support + and where to find the source code to build your custom image. Calling fit ^^^^^^^^^^^ @@ -595,5 +601,6 @@ The Docker images have the following dependencies installed: The Docker images extend Ubuntu 16.04. You can select version of MXNet by passing a ``framework_version`` keyword arg to the MXNet Estimator constructor. Currently supported versions are listed in the above table. You can also set ``framework_version`` to only specify major and minor version, e.g ``1.1``, which will cause your training script to be run on the latest supported patch version of that minor version, which in this example would be 1.1.0. +Alternatively, you can build your own image by following the instructions in the SageMaker MXNet containers repository, and passing ``image_name`` to the MXNet Estimator constructor. You can visit the SageMaker MXNet containers repository here: https://github.com/aws/sagemaker-mxnet-containers/ diff --git a/src/sagemaker/mxnet/estimator.py b/src/sagemaker/mxnet/estimator.py index d7176432a2..ef3ddfd5de 100644 --- a/src/sagemaker/mxnet/estimator.py +++ b/src/sagemaker/mxnet/estimator.py @@ -52,8 +52,10 @@ def __init__(self, entry_point, source_dir=None, hyperparameters=None, py_versio One of 'py2' or 'py3'. framework_version (str): MXNet version you want to use for executing your model training code. List of supported versions https://github.com/aws/sagemaker-python-sdk#mxnet-sagemaker-estimators - image_name (str): The container image to use for training. This will override py_version and - framework_version. The image is expected to be a modification of the SageMaker MXNet image. + image_name (str): If specified, the estimator will use this image for training and hosting, instead of + selecting the appropriate SageMaker official image based on framework_version and py_version. 
It can + be an ECR url or dockerhub image and tag: 123.dkr.ecr.us-west-2.amazonaws.com/my-custom-image:1.0, + custom-image:latest. **kwargs: Additional kwargs passed to the :class:`~sagemaker.estimator.Framework` constructor. """ super(MXNet, self).__init__(entry_point, source_dir, hyperparameters, diff --git a/src/sagemaker/tensorflow/README.rst b/src/sagemaker/tensorflow/README.rst index 81f36eb5b4..e22bf98ea2 100644 --- a/src/sagemaker/tensorflow/README.rst +++ b/src/sagemaker/tensorflow/README.rst @@ -433,6 +433,12 @@ you can specify these as keyword arguments. - ``base_job_name`` Name to assign for the training job that the ``fit`` method launches. If not specified, the estimator generates a default job name, based on the training image name and current timestamp. +- ``image_name`` An alternative docker image to use for training and + serving. If specified, the estimator will use this image for training and + hosting, instead of selecting the appropriate SageMaker official image based on + framework_version and py_version. Refer to: `SageMaker TensorFlow Docker Containers + <#sagemaker-tensorflow-docker-containers>`_ for details on what the Official images support + and where to find the source code to build your custom image. Optional Hyperparameters @@ -785,5 +791,8 @@ The TensorFlow Docker images support Python 2.7 and have the following Python mo The Docker images extend Ubuntu 16.04. You can select version of TensorFlow by passing a ``framework_version`` keyword arg to the TensorFlow Estimator constructor. Currently supported versions are listed in the table above. You can also set ``framework_version`` to only specify major and minor version, e.g ``1.6``, which will cause your training script to be run on the latest supported patch version of that minor version, which in this example would be 1.6.0. 
+Alternatively, you can build your own image by following the instructions in the SageMaker TensorFlow containers +repository, and passing ``image_name`` to the TensorFlow Estimator constructor. + You can visit the SageMaker TensorFlow containers repository here: https://github.com/aws/sagemaker-tensorflow-containers/ diff --git a/src/sagemaker/tensorflow/estimator.py b/src/sagemaker/tensorflow/estimator.py index 40eb962464..912d669968 100644 --- a/src/sagemaker/tensorflow/estimator.py +++ b/src/sagemaker/tensorflow/estimator.py @@ -171,8 +171,10 @@ def __init__(self, training_steps=None, evaluation_steps=None, checkpoint_path=N requirements_file (str): Path to a ``requirements.txt`` file (default: ''). The path should be within and relative to ``source_dir``. Details on the format can be found in the `Pip User Guide `_. - image_name (str): The container image to use for training. This will override py_version and - framework_version. The image is expected to be a modification of the SageMaker TensorFlow image. + image_name (str): If specified, the estimator will use this image for training and hosting, instead of + selecting the appropriate SageMaker official image based on framework_version and py_version. It can + be an ECR url or dockerhub image and tag: 123.dkr.ecr.us-west-2.amazonaws.com/my-custom-image:1.0, + custom-image:latest. **kwargs: Additional kwargs passed to the Framework constructor. 
""" super(TensorFlow, self).__init__(image_name=image_name, **kwargs) From 17d14d191c6d031648ee054e6fbecd9e5484b121 Mon Sep 17 00:00:00 2001 From: Ignacio Quintero Date: Thu, 14 Jun 2018 15:01:33 -0700 Subject: [PATCH 3/6] Cleanup tests --- tests/unit/test_chainer.py | 22 +--------------------- tests/unit/test_mxnet.py | 12 +----------- tests/unit/test_tf_estimator.py | 23 +---------------------- 3 files changed, 3 insertions(+), 54 deletions(-) diff --git a/tests/unit/test_chainer.py b/tests/unit/test_chainer.py index b222bcd60c..a07472919c 100644 --- a/tests/unit/test_chainer.py +++ b/tests/unit/test_chainer.py @@ -235,7 +235,6 @@ def test_create_model(sagemaker_session, chainer_version): job_name = 'new_name' chainer.fit(inputs='s3://mybucket/train', job_name='new_name') model = chainer.create_model() - chainer.container_log_level assert model.sagemaker_session == sagemaker_session assert model.framework_version == chainer_version @@ -259,19 +258,10 @@ def test_create_model_with_custom_image(sagemaker_session): py_version=PYTHON_VERSION, base_job_name='job', source_dir=source_dir, enable_cloudwatch_metrics=enable_cloudwatch_metrics) - job_name = 'new_name' chainer.fit(inputs='s3://mybucket/train', job_name='new_name') model = chainer.create_model() - chainer.container_log_level - assert model.sagemaker_session == sagemaker_session assert model.image == custom_image - assert model.entry_point == SCRIPT_PATH - assert model.role == ROLE - assert model.name == job_name - assert model.container_log_level == container_log_level - assert model.source_dir == source_dir - assert model.enable_cloudwatch_metrics == enable_cloudwatch_metrics @patch('time.strftime', return_value=TIMESTAMP) @@ -450,15 +440,5 @@ def test_attach_custom_image(sagemaker_session): return_value=returned_job_description) estimator = Chainer.attach(training_job_name='neo', sagemaker_session=sagemaker_session) - assert estimator.latest_training_job.job_name == 'neo' - assert estimator.role == 
'arn:aws:iam::366:role/SageMakerRole' - assert estimator.train_instance_count == 1 - assert estimator.train_max_run == 24 * 60 * 60 - assert estimator.input_mode == 'File' - assert estimator.base_job_name == 'neo' - assert estimator.output_path == 's3://place/output/neo' - assert estimator.output_kms_key == '' - assert estimator.hyperparameters()['training_steps'] == '100' - assert estimator.source_dir == 's3://some/sourcedir.tar.gz' - assert estimator.entry_point == 'iris-dnn-classifier.py' + assert estimator.image_name == training_image assert estimator.train_image() == training_image diff --git a/tests/unit/test_mxnet.py b/tests/unit/test_mxnet.py index d77160e621..245ad96378 100644 --- a/tests/unit/test_mxnet.py +++ b/tests/unit/test_mxnet.py @@ -350,15 +350,5 @@ def test_attach_custom_image(sagemaker_session): return_value=returned_job_description) estimator = MXNet.attach(training_job_name='neo', sagemaker_session=sagemaker_session) - assert estimator.latest_training_job.job_name == 'neo' assert estimator.image_name == training_image - assert estimator.role == 'arn:aws:iam::366:role/SageMakerRole' - assert estimator.train_instance_count == 1 - assert estimator.train_max_run == 24 * 60 * 60 - assert estimator.input_mode == 'File' - assert estimator.base_job_name == 'neo' - assert estimator.output_path == 's3://place/output/neo' - assert estimator.output_kms_key == '' - assert estimator.hyperparameters()['training_steps'] == '100' - assert estimator.source_dir == 's3://some/sourcedir.tar.gz' - assert estimator.entry_point == 'iris-dnn-classifier.py' + assert estimator.train_image() == training_image diff --git a/tests/unit/test_tf_estimator.py b/tests/unit/test_tf_estimator.py index ae98be07f1..479bda65d7 100644 --- a/tests/unit/test_tf_estimator.py +++ b/tests/unit/test_tf_estimator.py @@ -220,14 +220,7 @@ def test_create_model_with_custom_image(sagemaker_session): tf.fit(inputs='s3://mybucket/train', job_name=job_name) model = tf.create_model() - assert 
model.sagemaker_session == sagemaker_session assert model.image == custom_image - assert model.entry_point == SCRIPT_PATH - assert model.role == ROLE - assert model.name == job_name - assert model.container_log_level == container_log_level - assert model.source_dir == source_dir - assert model.enable_cloudwatch_metrics == enable_cloudwatch_metrics @patch('time.strftime', return_value=TIMESTAMP) @@ -612,19 +605,5 @@ def test_attach_custom_image(sagemaker_session): sagemaker_session.sagemaker_client.describe_training_job = Mock(name='describe_training_job', return_value=rjd) estimator = TensorFlow.attach(training_job_name='neo', sagemaker_session=sagemaker_session) - assert estimator.latest_training_job.job_name == 'neo' - assert estimator.role == 'arn:aws:iam::366:role/SageMakerRole' - assert estimator.train_instance_count == 1 - assert estimator.train_max_run == 24 * 60 * 60 - assert estimator.input_mode == 'File' - assert estimator.training_steps == 100 - assert estimator.evaluation_steps == 10 - assert estimator.input_mode == 'File' - assert estimator.base_job_name == 'neo' - assert estimator.output_path == 's3://place/output/neo' - assert estimator.output_kms_key == '' - assert estimator.hyperparameters()['training_steps'] == '100' - assert estimator.source_dir == 's3://some/sourcedir.tar.gz' - assert estimator.entry_point == 'iris-dnn-classifier.py' - assert estimator.checkpoint_path == 's3://other/1508872349' + assert estimator.image_name == training_image assert estimator.train_image() == training_image From 74775e471fd53b970bd15540c9936e1af2c1f499 Mon Sep 17 00:00:00 2001 From: Ignacio Quintero Date: Fri, 15 Jun 2018 10:43:12 -0700 Subject: [PATCH 4/6] Fix doc formatting --- src/sagemaker/chainer/README.rst | 10 +++++----- src/sagemaker/chainer/estimator.py | 6 ++++-- src/sagemaker/mxnet/README.rst | 10 +++++----- src/sagemaker/mxnet/estimator.py | 6 ++++-- src/sagemaker/tensorflow/README.rst | 10 +++++----- src/sagemaker/tensorflow/estimator.py | 6 ++++-- 6 
files changed, 27 insertions(+), 21 deletions(-) diff --git a/src/sagemaker/chainer/README.rst b/src/sagemaker/chainer/README.rst index 334269dede..912acb4482 100644 --- a/src/sagemaker/chainer/README.rst +++ b/src/sagemaker/chainer/README.rst @@ -176,11 +176,11 @@ The following are optional arguments. When you create a ``Chainer`` object, you method launches. If not specified, the estimator generates a default job name, based on the training image name and current timestamp - ``image_name`` An alternative docker image to use for training and - serving. If specified, the estimator will use this image for training and - hosting, instead of selecting the appropriate SageMaker official image based on - framework_version and py_version. Refer to: `SageMaker Chainer Docker Containers - <#sagemaker-chainer-docker-containers>`_ for details on what the Official images support - and where to find the source code to build your custom image. + serving. If specified, the estimator will use this image for training and + hosting, instead of selecting the appropriate SageMaker official image based on + framework_version and py_version. Refer to: `SageMaker Chainer Docker Containers + <#sagemaker-chainer-docker-containers>`_ for details on what the Official images support + and where to find the source code to build your custom image. Distributed Chainer Training diff --git a/src/sagemaker/chainer/estimator.py b/src/sagemaker/chainer/estimator.py index a9bf391417..cf0420fc63 100644 --- a/src/sagemaker/chainer/estimator.py +++ b/src/sagemaker/chainer/estimator.py @@ -69,8 +69,10 @@ def __init__(self, entry_point, use_mpi=None, num_processes=None, process_slots_ List of supported versions https://github.com/aws/sagemaker-python-sdk#chainer-sagemaker-estimators image_name (str): If specified, the estimator will use this image for training and hosting, instead of selecting the appropriate SageMaker official image based on framework_version and py_version. 
It can - be an ECR url or dockerhub image and tag: 123.dkr.ecr.us-west-2.amazonaws.com/my-custom-image:1.0, - custom-image:latest. + be an ECR url or dockerhub image and tag. + Examples: + 123.dkr.ecr.us-west-2.amazonaws.com/my-custom-image:1.0 + custom-image:latest. **kwargs: Additional kwargs passed to the :class:`~sagemaker.estimator.Framework` constructor. """ super(Chainer, self).__init__(entry_point, source_dir, hyperparameters, diff --git a/src/sagemaker/mxnet/README.rst b/src/sagemaker/mxnet/README.rst index d3fc89cc31..5eae5246bb 100644 --- a/src/sagemaker/mxnet/README.rst +++ b/src/sagemaker/mxnet/README.rst @@ -154,11 +154,11 @@ The following are optional arguments. When you create an ``MXNet`` object, you c method launches. If not specified, the estimator generates a default job name, based on the training image name and current timestamp - ``image_name`` An alternative docker image to use for training and - serving. If specified, the estimator will use this image for training and - hosting, instead of selecting the appropriate SageMaker official image based on - framework_version and py_version. Refer to: `SageMaker MXNet Docker Containers - <#sagemaker-mxnet-docker-containers>`_ for details on what the Official images support - and where to find the source code to build your custom image. + serving. If specified, the estimator will use this image for training and + hosting, instead of selecting the appropriate SageMaker official image based on + framework_version and py_version. Refer to: `SageMaker MXNet Docker Containers + <#sagemaker-mxnet-docker-containers>`_ for details on what the Official images support + and where to find the source code to build your custom image. 
Calling fit ^^^^^^^^^^^ diff --git a/src/sagemaker/mxnet/estimator.py b/src/sagemaker/mxnet/estimator.py index ef3ddfd5de..8fc0811c9e 100644 --- a/src/sagemaker/mxnet/estimator.py +++ b/src/sagemaker/mxnet/estimator.py @@ -54,8 +54,10 @@ def __init__(self, entry_point, source_dir=None, hyperparameters=None, py_versio List of supported versions https://github.com/aws/sagemaker-python-sdk#mxnet-sagemaker-estimators image_name (str): If specified, the estimator will use this image for training and hosting, instead of selecting the appropriate SageMaker official image based on framework_version and py_version. It can - be an ECR url or dockerhub image and tag: 123.dkr.ecr.us-west-2.amazonaws.com/my-custom-image:1.0, - custom-image:latest. + be an ECR url or dockerhub image and tag. + Examples: + 123.dkr.ecr.us-west-2.amazonaws.com/my-custom-image:1.0 + custom-image:latest. **kwargs: Additional kwargs passed to the :class:`~sagemaker.estimator.Framework` constructor. """ super(MXNet, self).__init__(entry_point, source_dir, hyperparameters, diff --git a/src/sagemaker/tensorflow/README.rst b/src/sagemaker/tensorflow/README.rst index e22bf98ea2..83b7cfd78a 100644 --- a/src/sagemaker/tensorflow/README.rst +++ b/src/sagemaker/tensorflow/README.rst @@ -434,11 +434,11 @@ you can specify these as keyword arguments. method launches. If not specified, the estimator generates a default job name, based on the training image name and current timestamp. - ``image_name`` An alternative docker image to use for training and - serving. If specified, the estimator will use this image for training and - hosting, instead of selecting the appropriate SageMaker official image based on - framework_version and py_version. Refer to: `SageMaker TensorFlow Docker Containers - <#sagemaker-tensorflow-docker-containers>`_ for details on what the Official images support - and where to find the source code to build your custom image. + serving. 
If specified, the estimator will use this image for training and + hosting, instead of selecting the appropriate SageMaker official image based on + framework_version and py_version. Refer to: `SageMaker TensorFlow Docker Containers + <#sagemaker-tensorflow-docker-containers>`_ for details on what the Official images support + and where to find the source code to build your custom image. Optional Hyperparameters diff --git a/src/sagemaker/tensorflow/estimator.py b/src/sagemaker/tensorflow/estimator.py index 912d669968..4b100571f1 100644 --- a/src/sagemaker/tensorflow/estimator.py +++ b/src/sagemaker/tensorflow/estimator.py @@ -173,8 +173,10 @@ def __init__(self, training_steps=None, evaluation_steps=None, checkpoint_path=N `Pip User Guide `_. image_name (str): If specified, the estimator will use this image for training and hosting, instead of selecting the appropriate SageMaker official image based on framework_version and py_version. It can - be an ECR url or dockerhub image and tag: 123.dkr.ecr.us-west-2.amazonaws.com/my-custom-image:1.0, - custom-image:latest. + be an ECR url or dockerhub image and tag. + Examples: + 123.dkr.ecr.us-west-2.amazonaws.com/my-custom-image:1.0 + custom-image:latest. **kwargs: Additional kwargs passed to the Framework constructor. 
""" super(TensorFlow, self).__init__(image_name=image_name, **kwargs) From f2cab5f3a5faf03d6d951d00b826b5ac6ac0377d Mon Sep 17 00:00:00 2001 From: Ignacio Quintero Date: Mon, 25 Jun 2018 10:39:12 -0700 Subject: [PATCH 5/6] Add support for PyTorch --- src/sagemaker/mxnet/estimator.py | 11 ++---- src/sagemaker/pytorch/README.rst | 10 ++++- src/sagemaker/pytorch/estimator.py | 36 +++++++++--------- tests/unit/test_pytorch.py | 59 +++++++++++++++++++++++++++++- 4 files changed, 88 insertions(+), 28 deletions(-) diff --git a/src/sagemaker/mxnet/estimator.py b/src/sagemaker/mxnet/estimator.py index 702836fa0c..dc226de199 100644 --- a/src/sagemaker/mxnet/estimator.py +++ b/src/sagemaker/mxnet/estimator.py @@ -76,16 +76,11 @@ def create_model(self, model_server_workers=None): sagemaker.mxnet.model.MXNetModel: A SageMaker ``MXNetModel`` object. See :func:`~sagemaker.mxnet.model.MXNetModel` for full details. """ - kwargs = {} - # pass our custom image if there is one. - if self.image_name: - kwargs['image'] = self.image_name - return MXNetModel(self.model_data, self.role, self.entry_point, source_dir=self._model_source_dir(), enable_cloudwatch_metrics=self.enable_cloudwatch_metrics, name=self._current_job_name, - container_log_level=self.container_log_level, code_location=self.code_location, image=self.image_name, - py_version=self.py_version, framework_version=self.framework_version, - model_server_workers=model_server_workers, sagemaker_session=self.sagemaker_session, **kwargs) + container_log_level=self.container_log_level, code_location=self.code_location, + py_version=self.py_version, framework_version=self.framework_version, image=self.image_name, + model_server_workers=model_server_workers, sagemaker_session=self.sagemaker_session) @classmethod def _prepare_init_params_from_job_description(cls, job_details): diff --git a/src/sagemaker/pytorch/README.rst b/src/sagemaker/pytorch/README.rst index adab4389d3..d6c761e42a 100644 --- a/src/sagemaker/pytorch/README.rst +++ 
b/src/sagemaker/pytorch/README.rst @@ -204,7 +204,12 @@ The following are optional arguments. When you create a ``PyTorch`` object, you - ``job_name`` Name to assign for the training job that the ``fit``` method launches. If not specified, the estimator generates a default job name, based on the training image name and current timestamp - +- ``image_name`` An alternative docker image to use for training and + serving. If specified, the estimator will use this image for training and + hosting, instead of selecting the appropriate SageMaker official image based on + framework_version and py_version. Refer to: `SageMaker PyTorch Docker Containers + <#sagemaker-pytorch-docker-containers>`_ for details on what the Official images support + and where to find the source code to build your custom image. Calling fit ~~~~~~~~~~~ @@ -705,4 +710,7 @@ Currently supported versions are listed in the above table. You can also set ``f minor version, which will cause your training script to be run on the latest supported patch version of that minor version. +Alternatively, you can build your own image by following the instructions in the SageMaker PyTorch containers +repository, and passing ``image_name`` to the PyTorch Estimator constructor. + You can visit `the SageMaker PyTorch containers repository `_. diff --git a/src/sagemaker/pytorch/estimator.py b/src/sagemaker/pytorch/estimator.py index 30dbbd41ce..b5d0120ea3 100644 --- a/src/sagemaker/pytorch/estimator.py +++ b/src/sagemaker/pytorch/estimator.py @@ -12,7 +12,7 @@ # language governing permissions and limitations under the License. 
from __future__ import absolute_import from sagemaker.estimator import Framework -from sagemaker.fw_utils import create_image_uri, framework_name_from_image, framework_version_from_tag +from sagemaker.fw_utils import framework_name_from_image, framework_version_from_tag from sagemaker.pytorch.defaults import PYTORCH_VERSION, PYTHON_VERSION from sagemaker.pytorch.model import PyTorchModel @@ -23,7 +23,7 @@ class PyTorch(Framework): __framework_name__ = "pytorch" def __init__(self, entry_point, source_dir=None, hyperparameters=None, py_version=PYTHON_VERSION, - framework_version=PYTORCH_VERSION, **kwargs): + framework_version=PYTORCH_VERSION, image_name=None, **kwargs): """ This ``Estimator`` executes an PyTorch script in a managed PyTorch execution environment, within a SageMaker Training Job. The managed PyTorch environment is an Amazon-built Docker container that executes functions @@ -51,25 +51,18 @@ def __init__(self, entry_point, source_dir=None, hyperparameters=None, py_versio One of 'py2' or 'py3'. framework_version (str): PyTorch version you want to use for executing your model training code. List of supported versions https://github.com/aws/sagemaker-python-sdk#pytorch-sagemaker-estimators + image_name (str): If specified, the estimator will use this image for training and hosting, instead of + selecting the appropriate SageMaker official image based on framework_version and py_version. It can + be an ECR url or dockerhub image and tag. + Examples: + 123.dkr.ecr.us-west-2.amazonaws.com/my-custom-image:1.0 + custom-image:latest. **kwargs: Additional kwargs passed to the :class:`~sagemaker.estimator.Framework` constructor. """ - super(PyTorch, self).__init__(entry_point, source_dir, hyperparameters, **kwargs) + super(PyTorch, self).__init__(entry_point, source_dir, hyperparameters, image_name=image_name, **kwargs) self.py_version = py_version self.framework_version = framework_version - def train_image(self): - """Return the Docker image to use for training. 
- - The :meth:`~sagemaker.estimator.EstimatorBase.fit` method, which does the model training, calls this method to - find the image to use for model training. - - Returns: - str: The URI of the Docker image. - """ - return create_image_uri(self.sagemaker_session.boto_session.region_name, self.__framework_name__, - self.train_instance_type, framework_version=self.framework_version, - py_version=self.py_version) - def create_model(self, model_server_workers=None): """Create a SageMaker ``PyTorchModel`` object that can be deployed to an ``Endpoint``. @@ -84,7 +77,7 @@ def create_model(self, model_server_workers=None): return PyTorchModel(self.model_data, self.role, self.entry_point, source_dir=self._model_source_dir(), enable_cloudwatch_metrics=self.enable_cloudwatch_metrics, name=self._current_job_name, container_log_level=self.container_log_level, code_location=self.code_location, - py_version=self.py_version, framework_version=self.framework_version, + py_version=self.py_version, framework_version=self.framework_version, image=self.image_name, model_server_workers=model_server_workers, sagemaker_session=self.sagemaker_session) @classmethod @@ -99,7 +92,14 @@ def _prepare_init_params_from_job_description(cls, job_details): """ init_params = super(PyTorch, cls)._prepare_init_params_from_job_description(job_details) - framework, py_version, tag = framework_name_from_image(init_params.pop('image')) + image_name = init_params.pop('image') + framework, py_version, tag = framework_name_from_image(image_name) + + if not framework: + # If we were unable to parse the framework name from the image it is not one of our + # officially supported images, in this case just add the image to the init params. 
+ init_params['image_name'] = image_name + return init_params init_params['py_version'] = py_version init_params['framework_version'] = framework_version_from_tag(tag) diff --git a/tests/unit/test_pytorch.py b/tests/unit/test_pytorch.py index 2b153791d0..92015ce103 100644 --- a/tests/unit/test_pytorch.py +++ b/tests/unit/test_pytorch.py @@ -128,7 +128,6 @@ def test_create_model(sagemaker_session, pytorch_version): job_name = 'new_name' pytorch.fit(inputs='s3://mybucket/train', job_name='new_name') model = pytorch.create_model() - pytorch.container_log_level assert model.sagemaker_session == sagemaker_session assert model.framework_version == pytorch_version @@ -141,6 +140,30 @@ def test_create_model(sagemaker_session, pytorch_version): assert model.enable_cloudwatch_metrics == enable_cloudwatch_metrics +def test_create_model_with_custom_image(sagemaker_session): + container_log_level = '"logging.INFO"' + source_dir = 's3://mybucket/source' + enable_cloudwatch_metrics = 'true' + image = 'pytorch:9000' + pytorch = PyTorch(entry_point=SCRIPT_PATH, role=ROLE, sagemaker_session=sagemaker_session, + train_instance_count=INSTANCE_COUNT, train_instance_type=INSTANCE_TYPE, + container_log_level=container_log_level, image_name=image, + base_job_name='job', source_dir=source_dir, enable_cloudwatch_metrics=enable_cloudwatch_metrics) + + job_name = 'new_name' + pytorch.fit(inputs='s3://mybucket/train', job_name='new_name') + model = pytorch.create_model() + + assert model.sagemaker_session == sagemaker_session + assert model.image == image + assert model.entry_point == SCRIPT_PATH + assert model.role == ROLE + assert model.name == job_name + assert model.container_log_level == container_log_level + assert model.source_dir == source_dir + assert model.enable_cloudwatch_metrics == enable_cloudwatch_metrics + + @patch('time.strftime', return_value=TIMESTAMP) def test_pytorch(strftime, sagemaker_session, pytorch_version): pytorch = PyTorch(entry_point=SCRIPT_PATH, role=ROLE, 
sagemaker_session=sagemaker_session, @@ -286,3 +309,37 @@ def test_attach_wrong_framework(sagemaker_session): with pytest.raises(ValueError) as error: PyTorch.attach(training_job_name='neo', sagemaker_session=sagemaker_session) assert "didn't use image for requested framework" in str(error) + + +def test_attach_custom_image(sagemaker_session): + training_image = 'pytorch:latest' + returned_job_description = {'AlgorithmSpecification': + {'TrainingInputMode': 'File', + 'TrainingImage': training_image}, + 'HyperParameters': + {'sagemaker_submit_directory': '"s3://some/sourcedir.tar.gz"', + 'sagemaker_program': '"iris-dnn-classifier.py"', + 'sagemaker_s3_uri_training': '"sagemaker-3/integ-test-data/tf_iris"', + 'sagemaker_enable_cloudwatch_metrics': 'false', + 'sagemaker_container_log_level': '"logging.INFO"', + 'sagemaker_job_name': '"neo"', + 'training_steps': '100', + 'sagemaker_region': '"us-west-2"'}, + 'RoleArn': 'arn:aws:iam::366:role/SageMakerRole', + 'ResourceConfig': + {'VolumeSizeInGB': 30, + 'InstanceCount': 1, + 'InstanceType': 'ml.c4.xlarge'}, + 'StoppingCondition': {'MaxRuntimeInSeconds': 24 * 60 * 60}, + 'TrainingJobName': 'neo', + 'TrainingJobStatus': 'Completed', + 'OutputDataConfig': {'KmsKeyId': '', + 'S3OutputPath': 's3://place/output/neo'}, + 'TrainingJobOutput': {'S3TrainingJobOutput': 's3://here/output.tar.gz'}} + sagemaker_session.sagemaker_client.describe_training_job = Mock(name='describe_training_job', + return_value=returned_job_description) + + estimator = PyTorch.attach(training_job_name='neo', sagemaker_session=sagemaker_session) + assert estimator.latest_training_job.job_name == 'neo' + assert estimator.image_name == training_image + assert estimator.train_image() == training_image From 31643bd4e75f25dc1d1d01c34a3f65fcc44b8dae Mon Sep 17 00:00:00 2001 From: Ignacio Quintero Date: Mon, 25 Jun 2018 10:40:33 -0700 Subject: [PATCH 6/6] update changelog --- CHANGELOG.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/CHANGELOG.rst b/CHANGELOG.rst index 37f80d5f62..650c3de952 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -8,7 +8,7 @@ CHANGELOG * enhancement: Let Framework models reuse code uploaded by Framework estimators * enhancement: Unify generation of model uploaded code location * feature: Change minimum required scipy from 1.0.0 to 0.19.0 -* feature: Allow Chainer, Tensorflow and MXNet estimators to use a custom docker image. +* feature: Allow all Framework Estimators to use a custom docker image. 1.5.0 =====