From 760e1fef5ab67590d2e28deb38e54b211c9ab5ff Mon Sep 17 00:00:00 2001 From: Dan Choi Date: Wed, 22 May 2019 20:38:11 -0700 Subject: [PATCH 1/7] feature: support MXNet 1.4 with MMS --- doc/using_mxnet.rst | 42 ++++++++----- src/sagemaker/fw_utils.py | 2 +- src/sagemaker/model.py | 29 ++++++--- src/sagemaker/mxnet/README.rst | 35 ++++++----- src/sagemaker/mxnet/estimator.py | 2 +- src/sagemaker/mxnet/model.py | 16 ++++- tests/conftest.py | 2 +- tests/data/mxnet_mnist/mnist.py | 4 +- tests/unit/test_fw_utils.py | 10 ++- tests/unit/test_mxnet.py | 105 ++++++++++++++++++++++++++++--- 10 files changed, 184 insertions(+), 63 deletions(-) diff --git a/doc/using_mxnet.rst b/doc/using_mxnet.rst index 839b2dfe21..0b4923c05c 100644 --- a/doc/using_mxnet.rst +++ b/doc/using_mxnet.rst @@ -6,9 +6,9 @@ Using MXNet with the SageMaker Python SDK With the SageMaker Python SDK, you can train and host MXNet models on Amazon SageMaker. -Supported versions of MXNet: ``1.3.0``, ``1.2.1``, ``1.1.0``, ``1.0.0``, ``0.12.1``. +Supported versions of MXNet: ``1.4.0``, ``1.3.0``, ``1.2.1``, ``1.1.0``, ``1.0.0``, ``0.12.1``. -Supported versions of MXNet for Elastic Inference: ``1.3.0``. +Supported versions of MXNet for Elastic Inference: ``1.4.0``, ``1.3.0``. Training with MXNet ------------------- @@ -38,7 +38,7 @@ Preparing the MXNet training script +----------------------------------------------------------------------------------------------------------------------------------------------------------+ | WARNING | +==========================================================================================================================================================+ -| The structure for training scripts changed with MXNet version 1.3. | +| The structure for training scripts changed starting at MXNet version 1.3. | | Make sure you refer to the correct section of this README when you prepare your script. 
| | For information on how to upgrade an old script to the new format, see `"Updating your MXNet training script" <#updating-your-mxnet-training-script>`__. | +----------------------------------------------------------------------------------------------------------------------------------------------------------+ @@ -700,6 +700,13 @@ Where ``model`` is the model objected loaded by ``model_fn``, ``request_body`` i This one function should handle processing the input, performing a prediction, and processing the output. The return object should be one of the following: +For versions 1.4 and higher: +---------------------------- +- a tuple with two items: the response data and ``accept_type`` (the content type of the response data), or +- the response data: (the content type of the response will be set to either the accept header in the initial request or a default) + +For versions 1.3 and lower: +--------------------------- - a tuple with two items: the response data and ``accept_type`` (the content type of the response data), or - a Flask response object: http://flask.pocoo.org/docs/1.0/api/#response-objects @@ -802,23 +809,24 @@ Your MXNet training script will be run on version 1.2.1 by default. 
(See below f The Docker images have the following dependencies installed: -+-------------------------+--------------+-------------+-------------+-------------+-------------+ -| Dependencies | MXNet 0.12.1 | MXNet 1.0.0 | MXNet 1.1.0 | MXNet 1.2.1 | MXNet 1.3.0 | -+-------------------------+--------------+-------------+-------------+-------------+-------------+ -| Python | 2.7 or 3.5 | 2.7 or 3.5| 2.7 or 3.5| 2.7 or 3.5| 2.7 or 3.5| -+-------------------------+--------------+-------------+-------------+-------------+-------------+ -| CUDA (GPU image only) | 9.0 | 9.0 | 9.0 | 9.0 | 9.0 | -+-------------------------+--------------+-------------+-------------+-------------+-------------+ -| numpy | 1.13.3 | 1.13.3 | 1.13.3 | 1.14.5 | 1.14.6 | -+-------------------------+--------------+-------------+-------------+-------------+-------------+ -| onnx | N/A | N/A | N/A | 1.2.1 | 1.2.1 | -+-------------------------+--------------+-------------+-------------+-------------+-------------+ -| keras-mxnet | N/A | N/A | N/A | N/A | 2.2.2 | -+-------------------------+--------------+-------------+-------------+-------------+-------------+ ++-------------------------+--------------+-------------+-------------+-------------+-------------+-------------+ +| Dependencies | MXNet 0.12.1 | MXNet 1.0.0 | MXNet 1.1.0 | MXNet 1.2.1 | MXNet 1.3.0 | MXNet 1.4.0 | ++-------------------------+--------------+-------------+-------------+-------------+-------------+-------------+ +| Python | 2.7 or 3.5 | 2.7 or 3.5| 2.7 or 3.5| 2.7 or 3.5| 2.7 or 3.5| 2.7 or 3.6| ++-------------------------+--------------+-------------+-------------+-------------+-------------+-------------+ +| CUDA (GPU image only) | 9.0 | 9.0 | 9.0 | 9.0 | 9.0 | 9.2 | ++-------------------------+--------------+-------------+-------------+-------------+-------------+-------------+ +| numpy | 1.13.3 | 1.13.3 | 1.13.3 | 1.14.5 | 1.14.6 | 1.16.3 | 
++-------------------------+--------------+-------------+-------------+-------------+-------------+-------------+ +| onnx | N/A | N/A | N/A | 1.2.1 | 1.2.1 | 1.4.1 | ++-------------------------+--------------+-------------+-------------+-------------+-------------+-------------+ +| keras-mxnet | N/A | N/A | N/A | N/A | 2.2.2 | 2.2.4.1 | ++-------------------------+--------------+-------------+-------------+-------------+-------------+-------------+ The Docker images extend Ubuntu 16.04. You can select version of MXNet by passing a ``framework_version`` keyword arg to the MXNet Estimator constructor. Currently supported versions are listed in the above table. You can also set ``framework_version`` to only specify major and minor version, e.g ``1.2``, which will cause your training script to be run on the latest supported patch version of that minor version, which in this example would be 1.2.1. Alternatively, you can build your own image by following the instructions in the SageMaker MXNet containers repository, and passing ``image_name`` to the MXNet Estimator constructor. -You can visit the SageMaker MXNet containers repository here: https://github.com/aws/sagemaker-mxnet-container +You can visit the SageMaker MXNet training containers repository here: https://github.com/aws/sagemaker-mxnet-container +You can visit the SageMaker MXNet serving containers repository here: https://github.com/aws/sagemaker-mxnet-serving-container diff --git a/src/sagemaker/fw_utils.py b/src/sagemaker/fw_utils.py index daf0b60877..f186fb8780 100644 --- a/src/sagemaker/fw_utils.py +++ b/src/sagemaker/fw_utils.py @@ -45,7 +45,7 @@ 'Please add framework_version={} to your constructor to avoid this error.' 
VALID_PY_VERSIONS = ['py2', 'py3'] -VALID_EIA_FRAMEWORKS = ['tensorflow', 'tensorflow-serving', 'mxnet'] +VALID_EIA_FRAMEWORKS = ['tensorflow', 'tensorflow-serving', 'mxnet', 'mxnet-serving'] VALID_ACCOUNTS_BY_REGION = {'us-gov-west-1': '246785580436', 'us-iso-east-1': '744548109606'} diff --git a/src/sagemaker/model.py b/src/sagemaker/model.py index 80b809363f..8762dbc65c 100644 --- a/src/sagemaker/model.py +++ b/src/sagemaker/model.py @@ -14,9 +14,11 @@ import json import logging +import os import sagemaker from sagemaker import fw_utils, local, session, utils +from sagemaker.fw_utils import UploadedCode from sagemaker.transformer import Transformer LOGGER = logging.getLogger('sagemaker') @@ -408,6 +410,7 @@ def __init__(self, model_data, image, role, entry_point, source_dir=None, predic else: self.bucket, self.key_prefix = None, None self.uploaded_code = None + self.repacked_model_data = None def prepare_container_def(self, instance_type, accelerator_type=None): # pylint disable=unused-argument """Return a container definition with framework configuration set in model environment variables. 
@@ -428,18 +431,28 @@ def prepare_container_def(self, instance_type, accelerator_type=None): # pylint deploy_env.update(self._framework_env_vars()) return sagemaker.container_def(self.image, self.model_data, deploy_env) - def _upload_code(self, key_prefix): + def _upload_code(self, key_prefix, repack=False): local_code = utils.get_config_value('local.local_code', self.sagemaker_session.config) if self.sagemaker_session.local_mode and local_code: self.uploaded_code = None else: - bucket = self.bucket or self.sagemaker_session.default_bucket() - self.uploaded_code = fw_utils.tar_and_upload_dir(session=self.sagemaker_session.boto_session, - bucket=bucket, - s3_key_prefix=key_prefix, - script=self.entry_point, - directory=self.source_dir, - dependencies=self.dependencies) + if repack: + self.repacked_model_data = utils.repack_model(inference_script=self.entry_point, + source_directory=self.source_dir, + model_uri=self.model_data, + sagemaker_session=self.sagemaker_session) + + self.uploaded_code = UploadedCode(s3_prefix=self.repacked_model_data, + script_name=os.path.basename(self.entry_point)) + + else: + bucket = self.bucket or self.sagemaker_session.default_bucket() + self.uploaded_code = fw_utils.tar_and_upload_dir(session=self.sagemaker_session.boto_session, + bucket=bucket, + s3_key_prefix=key_prefix, + script=self.entry_point, + directory=self.source_dir, + dependencies=self.dependencies) def _framework_env_vars(self): if self.uploaded_code: diff --git a/src/sagemaker/mxnet/README.rst b/src/sagemaker/mxnet/README.rst index 3d08ff4463..7654022a56 100644 --- a/src/sagemaker/mxnet/README.rst +++ b/src/sagemaker/mxnet/README.rst @@ -4,9 +4,9 @@ Using MXNet with the SageMaker Python SDK With the SageMaker Python SDK, you can train and host MXNet models on Amazon SageMaker. -Supported versions of MXNet: ``1.3.0``, ``1.2.1``, ``1.1.0``, ``1.0.0``, ``0.12.1``. +Supported versions of MXNet: ``1.4.0``, ``1.3.0``, ``1.2.1``, ``1.1.0``, ``1.0.0``, ``0.12.1``. 
-Supported versions of MXNet for Elastic Inference: ``1.3.0``. +Supported versions of MXNet for Elastic Inference: ``1.3.0``, ``1.4.0``. For information about using MXNet with the SageMaker Python SDK, see https://sagemaker.readthedocs.io/en/stable/using_mxnet.html. @@ -15,29 +15,30 @@ SageMaker MXNet Containers When training and deploying training scripts, SageMaker runs your Python script in a Docker container with several libraries installed. When creating the Estimator and calling deploy to create the SageMaker Endpoint, you can control the environment your script runs in. -SageMaker runs MXNet Estimator scripts in either Python 2.7 or Python 3.5. You can select the Python version by passing a ``py_version`` keyword arg to the MXNet Estimator constructor. Setting this to ``py2`` (the default) will cause your training script to be run on Python 2.7. Setting this to ``py3`` will cause your training script to be run on Python 3.5. This Python version applies to both the Training Job, created by fit, and the Endpoint, created by deploy. +SageMaker runs MXNet Estimator scripts in either Python 2.7 or Python 3.6. You can select the Python version by passing a ``py_version`` keyword arg to the MXNet Estimator constructor. Setting this to ``py2`` (the default) will cause your training script to be run on Python 2.7. Setting this to ``py3`` will cause your training script to be run on Python 3.5. This Python version applies to both the Training Job, created by fit, and the Endpoint, created by deploy. Your MXNet training script will be run on version 1.2.1 by default. (See below for how to choose a different version, and currently supported versions.) The decision to use the GPU or CPU version of MXNet is made by the ``train_instance_type``, set on the MXNet constructor. If you choose a GPU instance type, your training job will be run on a GPU version of MXNet. If you choose a CPU instance type, your training job will be run on a CPU version of MXNet. 
Similarly, when you call deploy, specifying a GPU or CPU deploy_instance_type, will control which MXNet build your Endpoint runs. The Docker images have the following dependencies installed: -+-------------------------+--------------+-------------+-------------+-------------+-------------+ -| Dependencies | MXNet 0.12.1 | MXNet 1.0.0 | MXNet 1.1.0 | MXNet 1.2.1 | MXNet 1.3.0 | -+-------------------------+--------------+-------------+-------------+-------------+-------------+ -| Python | 2.7 or 3.5 | 2.7 or 3.5| 2.7 or 3.5| 2.7 or 3.5| 2.7 or 3.5| -+-------------------------+--------------+-------------+-------------+-------------+-------------+ -| CUDA (GPU image only) | 9.0 | 9.0 | 9.0 | 9.0 | 9.0 | -+-------------------------+--------------+-------------+-------------+-------------+-------------+ -| numpy | 1.13.3 | 1.13.3 | 1.13.3 | 1.14.5 | 1.14.6 | -+-------------------------+--------------+-------------+-------------+-------------+-------------+ -| onnx | N/A | N/A | N/A | 1.2.1 | 1.2.1 | -+-------------------------+--------------+-------------+-------------+-------------+-------------+ -| keras-mxnet | N/A | N/A | N/A | N/A | 2.2.2 | -+-------------------------+--------------+-------------+-------------+-------------+-------------+ ++-------------------------+--------------+-------------+-------------+-------------+-------------+-------------+ +| Dependencies | MXNet 0.12.1 | MXNet 1.0.0 | MXNet 1.1.0 | MXNet 1.2.1 | MXNet 1.3.0 | MXNet 1.4.0 | ++-------------------------+--------------+-------------+-------------+-------------+-------------+-------------+ +| Python | 2.7 or 3.5 | 2.7 or 3.5| 2.7 or 3.5| 2.7 or 3.5| 2.7 or 3.5| 2.7 or 3.6| ++-------------------------+--------------+-------------+-------------+-------------+-------------+-------------+ +| CUDA (GPU image only) | 9.0 | 9.0 | 9.0 | 9.0 | 9.0 | 9.2 | ++-------------------------+--------------+-------------+-------------+-------------+-------------+-------------+ +| numpy | 1.13.3 | 1.13.3 | 
1.13.3 | 1.14.5 | 1.14.6 | 1.16.3 | ++-------------------------+--------------+-------------+-------------+-------------+-------------+-------------+ +| onnx | N/A | N/A | N/A | 1.2.1 | 1.2.1 | 1.4.1 | ++-------------------------+--------------+-------------+-------------+-------------+-------------+-------------+ +| keras-mxnet | N/A | N/A | N/A | N/A | 2.2.2 | 2.2.4.1 | ++-------------------------+--------------+-------------+-------------+-------------+-------------+-------------+ The Docker images extend Ubuntu 16.04. You can select version of MXNet by passing a ``framework_version`` keyword arg to the MXNet Estimator constructor. Currently supported versions are listed in the above table. You can also set ``framework_version`` to only specify major and minor version, e.g ``1.2``, which will cause your training script to be run on the latest supported patch version of that minor version, which in this example would be 1.2.1. Alternatively, you can build your own image by following the instructions in the SageMaker MXNet containers repository, and passing ``image_name`` to the MXNet Estimator constructor. 
-You can visit the SageMaker MXNet containers repository here: https://github.com/aws/sagemaker-mxnet-container +You can visit the SageMaker MXNet training containers repository here: https://github.com/aws/sagemaker-mxnet-container +You can visit the SageMaker MXNet serving containers repository here: https://github.com/aws/sagemaker-mxnet-serving-container diff --git a/src/sagemaker/mxnet/estimator.py b/src/sagemaker/mxnet/estimator.py index e6510e3fdf..43634b1bba 100644 --- a/src/sagemaker/mxnet/estimator.py +++ b/src/sagemaker/mxnet/estimator.py @@ -30,7 +30,7 @@ class MXNet(Framework): __framework_name__ = 'mxnet' _LOWEST_SCRIPT_MODE_VERSION = ['1', '3'] - LATEST_VERSION = '1.3' + LATEST_VERSION = '1.4' """The latest version of MXNet included in the SageMaker pre-built Docker images.""" def __init__(self, entry_point, source_dir=None, hyperparameters=None, py_version='py2', diff --git a/src/sagemaker/mxnet/model.py b/src/sagemaker/mxnet/model.py index ae2084b289..40eea52bf5 100644 --- a/src/sagemaker/mxnet/model.py +++ b/src/sagemaker/mxnet/model.py @@ -14,6 +14,8 @@ import logging +from pkg_resources import parse_version + import sagemaker from sagemaker.fw_utils import create_image_uri, model_code_key_prefix, python_deprecation_warning from sagemaker.model import FrameworkModel, MODEL_SERVER_WORKERS_PARAM_NAME @@ -45,6 +47,7 @@ class MXNetModel(FrameworkModel): """An MXNet SageMaker ``Model`` that can be deployed to a SageMaker ``Endpoint``.""" __framework_name__ = 'mxnet' + _LOWEST_MMS_VERSION = '1.4' def __init__(self, model_data, role, entry_point, image=None, py_version='py2', framework_version=MXNET_VERSION, predictor_cls=MXNetPredictor, model_server_workers=None, **kwargs): @@ -89,17 +92,24 @@ def prepare_container_def(self, instance_type, accelerator_type=None): Returns: dict[str, str]: A container definition object usable with the CreateModel API. 
""" + mms_version = parse_version(self.framework_version) >= parse_version(self._LOWEST_MMS_VERSION) + deploy_image = self.image if not deploy_image: region_name = self.sagemaker_session.boto_session.region_name - deploy_image = create_image_uri(region_name, self.__framework_name__, instance_type, + + framework_name = self.__framework_name__ + if mms_version: + framework_name += '-serving' + + deploy_image = create_image_uri(region_name, framework_name, instance_type, self.framework_version, self.py_version, accelerator_type=accelerator_type) deploy_key_prefix = model_code_key_prefix(self.key_prefix, self.name, deploy_image) - self._upload_code(deploy_key_prefix) + self._upload_code(deploy_key_prefix, mms_version) deploy_env = dict(self.env) deploy_env.update(self._framework_env_vars()) if self.model_server_workers: deploy_env[MODEL_SERVER_WORKERS_PARAM_NAME.upper()] = str(self.model_server_workers) - return sagemaker.container_def(deploy_image, self.model_data, deploy_env) + return sagemaker.container_def(deploy_image, self.repacked_model_data or self.model_data, deploy_env) diff --git a/tests/conftest.py b/tests/conftest.py index dde173b2ec..a110adbd29 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -108,7 +108,7 @@ def chainer_version(request): @pytest.fixture(scope='module', params=['0.12', '0.12.1', '1.0', '1.0.0', '1.1', '1.1.0', '1.2', - '1.2.1', '1.3', '1.3.0']) + '1.2.1', '1.3', '1.3.0', '1.4', '1.4.0']) def mxnet_version(request): return request.param diff --git a/tests/data/mxnet_mnist/mnist.py b/tests/data/mxnet_mnist/mnist.py index 7ee5b65ef3..16ab7c2e98 100644 --- a/tests/data/mxnet_mnist/mnist.py +++ b/tests/data/mxnet_mnist/mnist.py @@ -20,8 +20,6 @@ import mxnet as mx import numpy as np -from sagemaker_mxnet_container.training_utils import scheduler_host - def load_data(path): with gzip.open(find_file(path, "labels.gz")) as flbl: @@ -91,7 +89,7 @@ def train(batch_size, epochs, learning_rate, num_gpus, training_channel, testing 
batch_end_callback=mx.callback.Speedometer(batch_size, 100), num_epoch=epochs) - if len(hosts) == 1 or current_host == scheduler_host(hosts): + if len(hosts) == 1 or current_host == hosts[0]: save(model_dir, mlp_model) diff --git a/tests/unit/test_fw_utils.py b/tests/unit/test_fw_utils.py index 539a75f42f..ed5e97b631 100644 --- a/tests/unit/test_fw_utils.py +++ b/tests/unit/test_fw_utils.py @@ -86,7 +86,7 @@ def test_create_image_uri_gpu(): assert image_uri == '23.dkr.ecr.mars-south-3.amazonaws.com/sagemaker-mlfw:1.0rc-gpu-py3' -def test_create_image_uri_ei(): +def test_create_image_uri_accelerator_tfs(): image_uri = fw_utils.create_image_uri(MOCK_REGION, 'tensorflow-serving', 'ml.c4.large', '1.1.0', accelerator_type='ml.eia1.large', account='23') assert image_uri == '23.dkr.ecr.mars-south-3.amazonaws.com/sagemaker-tensorflow-serving-eia:1.1.0-cpu' @@ -102,12 +102,18 @@ def test_create_image_uri_gov_cloud(): assert image_uri == '246785580436.dkr.ecr.us-gov-west-1.amazonaws.com/sagemaker-mlfw:1.0rc-gpu-py3' -def test_create_image_uri_accelerator(): +def test_create_image_uri_accelerator_tf(): image_uri = fw_utils.create_image_uri(MOCK_REGION, 'tensorflow', 'ml.p3.2xlarge', '1.0rc', 'py3', accelerator_type='ml.eia1.medium') assert image_uri == '520713654638.dkr.ecr.mars-south-3.amazonaws.com/sagemaker-tensorflow-eia:1.0rc-gpu-py3' +def test_create_image_uri_accelerator_mxnet_serving(): + image_uri = fw_utils.create_image_uri(MOCK_REGION, 'mxnet-serving', 'ml.p3.2xlarge', '1.0rc', 'py3', + accelerator_type='ml.eia1.medium') + assert image_uri == '520713654638.dkr.ecr.mars-south-3.amazonaws.com/sagemaker-mxnet-serving-eia:1.0rc-gpu-py3' + + def test_create_image_uri_local_sagemaker_notebook_accelerator(): image_uri = fw_utils.create_image_uri(MOCK_REGION, 'mxnet', 'ml.p3.2xlarge', '1.0rc', 'py3', accelerator_type='local_sagemaker_notebook') diff --git a/tests/unit/test_mxnet.py b/tests/unit/test_mxnet.py index 80c112b1f0..dfb298d47b 100644 --- 
a/tests/unit/test_mxnet.py +++ b/tests/unit/test_mxnet.py @@ -19,7 +19,9 @@ import pytest from mock import MagicMock, Mock from mock import patch +from pkg_resources import parse_version +from sagemaker.fw_utils import UploadedCode from sagemaker.mxnet import defaults from sagemaker.mxnet import MXNet from sagemaker.mxnet import MXNetPredictor, MXNetModel @@ -27,17 +29,19 @@ DATA_DIR = os.path.join(os.path.dirname(__file__), '..', 'data') SCRIPT_PATH = os.path.join(DATA_DIR, 'dummy_script.py') MODEL_DATA = 's3://mybucket/model' +REPACKED_MODEL_DATA = 's3://mybucket/repacked/model' TIMESTAMP = '2017-11-06-14:14:15.672' TIME = 1507167947 BUCKET_NAME = 'mybucket' INSTANCE_COUNT = 1 INSTANCE_TYPE = 'ml.c4.4xlarge' ACCELERATOR_TYPE = 'ml.eia.medium' -IMAGE_CPU_NAME = 'sagemaker-mxnet' -JOB_NAME = '{}-{}'.format(IMAGE_CPU_NAME, TIMESTAMP) +IMAGE_REPO_NAME = 'sagemaker-mxnet' +IMAGE_REPO_SERVING_NAME = 'sagemaker-mxnet-serving' +JOB_NAME = '{}-{}'.format(IMAGE_REPO_NAME, TIMESTAMP) COMPILATION_JOB_NAME = '{}-{}'.format('compilation-sagemaker-mxnet', TIMESTAMP) FRAMEWORK = 'mxnet' -FULL_IMAGE_URI = '520713654638.dkr.ecr.us-west-2.amazonaws.com/{}:{}-cpu-py2' +FULL_IMAGE_URI = '520713654638.dkr.ecr.us-west-2.amazonaws.com/{}:{}-{}-{}' ROLE = 'Dummy' REGION = 'us-west-2' GPU = 'ml.p2.xlarge' @@ -77,12 +81,24 @@ def sagemaker_session(): return session -def _get_full_image_uri(version): - return FULL_IMAGE_URI.format(IMAGE_CPU_NAME, version) +@pytest.fixture() +def skip_if_mms_version(mxnet_version): + if parse_version(MXNetModel._LOWEST_MMS_VERSION) <= parse_version(mxnet_version): + pytest.skip('Skipping because this version uses MMS') + + +@pytest.fixture() +def skip_if_not_mms_version(mxnet_version): + if parse_version(MXNetModel._LOWEST_MMS_VERSION) > parse_version(mxnet_version): + pytest.skip('Skipping because this version does not use MMS') -def _get_full_image_uri_for_cpu_with_ei(version): - return FULL_IMAGE_URI.format('{}-eia'.format(IMAGE_CPU_NAME), version) +def 
_get_full_image_uri(version, repo=IMAGE_REPO_NAME, processor='cpu', py_version='py2'): + return FULL_IMAGE_URI.format(repo, version, processor, py_version) + + +def _get_full_image_uri_with_ei(version, repo=IMAGE_REPO_NAME, processor='cpu', py_version='py2'): + return FULL_IMAGE_URI.format('{}-eia'.format(repo), version, processor, py_version) def _create_train_job(version): @@ -226,7 +242,7 @@ def test_create_model_with_custom_image(sagemaker_session): @patch('sagemaker.utils.create_tar_file', MagicMock()) @patch('time.strftime', return_value=TIMESTAMP) -def test_mxnet(strftime, sagemaker_session, mxnet_version): +def test_mxnet(strftime, sagemaker_session, mxnet_version, skip_if_mms_version): mx = MXNet(entry_point=SCRIPT_PATH, role=ROLE, sagemaker_session=sagemaker_session, train_instance_count=INSTANCE_COUNT, train_instance_type=INSTANCE_TYPE, framework_version=mxnet_version) @@ -264,9 +280,49 @@ def test_mxnet(strftime, sagemaker_session, mxnet_version): assert isinstance(predictor, MXNetPredictor) +@patch('sagemaker.utils.repack_model', return_value=REPACKED_MODEL_DATA) +@patch('time.strftime', return_value=TIMESTAMP) +def test_mxnet_mms_version(strftime, repack_model, sagemaker_session, mxnet_version, skip_if_not_mms_version): + mx = MXNet(entry_point=SCRIPT_PATH, role=ROLE, sagemaker_session=sagemaker_session, + train_instance_count=INSTANCE_COUNT, train_instance_type=INSTANCE_TYPE, + framework_version=mxnet_version) + + inputs = 's3://mybucket/train' + + mx.fit(inputs=inputs) + + sagemaker_call_names = [c[0] for c in sagemaker_session.method_calls] + assert sagemaker_call_names == ['train', 'logs_for_job'] + boto_call_names = [c[0] for c in sagemaker_session.boto_session.method_calls] + assert boto_call_names == ['resource'] + + expected_train_args = _create_train_job(mxnet_version) + expected_train_args['input_config'][0]['DataSource']['S3DataSource']['S3Uri'] = inputs + + actual_train_args = sagemaker_session.method_calls[0][2] + assert actual_train_args 
== expected_train_args + + model = mx.create_model() + + expected_image_base = _get_full_image_uri(mxnet_version, IMAGE_REPO_SERVING_NAME, 'gpu') + environment = { + 'Environment': { + 'SAGEMAKER_SUBMIT_DIRECTORY': REPACKED_MODEL_DATA, + 'SAGEMAKER_PROGRAM': 'dummy_script.py', 'SAGEMAKER_ENABLE_CLOUDWATCH_METRICS': 'false', + 'SAGEMAKER_REGION': 'us-west-2', 'SAGEMAKER_CONTAINER_LOG_LEVEL': '20' + }, + 'Image': expected_image_base.format(mxnet_version), 'ModelDataUrl': REPACKED_MODEL_DATA + } + assert environment == model.prepare_container_def(GPU) + + assert 'cpu' in model.prepare_container_def(CPU)['Image'] + predictor = mx.deploy(1, GPU) + assert isinstance(predictor, MXNetPredictor) + + @patch('sagemaker.utils.create_tar_file', MagicMock()) @patch('time.strftime', return_value=TIMESTAMP) -def test_mxnet_neo(strftime, sagemaker_session, mxnet_version): +def test_mxnet_neo(strftime, sagemaker_session, mxnet_version, skip_if_mms_version): mx = MXNet(entry_point=SCRIPT_PATH, role=ROLE, sagemaker_session=sagemaker_session, train_instance_count=INSTANCE_COUNT, train_instance_type=INSTANCE_TYPE, framework_version=mxnet_version) @@ -310,12 +366,41 @@ def test_model(sagemaker_session): assert isinstance(predictor, MXNetPredictor) +@patch('sagemaker.utils.repack_model', return_value=REPACKED_MODEL_DATA) +def test_model_mms_version(repack_model, sagemaker_session): + model = MXNetModel(MODEL_DATA, role=ROLE, entry_point=SCRIPT_PATH, + framework_version=MXNetModel._LOWEST_MMS_VERSION, + sagemaker_session=sagemaker_session) + predictor = model.deploy(1, GPU) + + repack_model.assert_called_once_with(inference_script=SCRIPT_PATH, + source_directory=None, + model_uri=MODEL_DATA, + sagemaker_session=sagemaker_session) + + assert model.model_data == MODEL_DATA + assert model.repacked_model_data == REPACKED_MODEL_DATA + assert model.uploaded_code == UploadedCode(s3_prefix=REPACKED_MODEL_DATA, + script_name=os.path.basename(SCRIPT_PATH)) + assert isinstance(predictor, 
MXNetPredictor) + + @patch('sagemaker.fw_utils.tar_and_upload_dir', MagicMock()) def test_model_image_accelerator(sagemaker_session): model = MXNetModel(MODEL_DATA, role=ROLE, entry_point=SCRIPT_PATH, sagemaker_session=sagemaker_session) container_def = model.prepare_container_def(INSTANCE_TYPE, accelerator_type=ACCELERATOR_TYPE) - assert container_def['Image'] == _get_full_image_uri_for_cpu_with_ei(defaults.MXNET_VERSION) + assert container_def['Image'] == _get_full_image_uri_with_ei(defaults.MXNET_VERSION) + + +@patch('sagemaker.utils.repack_model', MagicMock()) +def test_model_image_accelerator_mms_version(sagemaker_session): + model = MXNetModel(MODEL_DATA, role=ROLE, entry_point=SCRIPT_PATH, + framework_version=MXNetModel._LOWEST_MMS_VERSION, + sagemaker_session=sagemaker_session) + container_def = model.prepare_container_def(INSTANCE_TYPE, accelerator_type=ACCELERATOR_TYPE) + assert container_def['Image'] == _get_full_image_uri_with_ei(MXNetModel._LOWEST_MMS_VERSION, + IMAGE_REPO_SERVING_NAME) def test_train_image_default(sagemaker_session): From 2a9ab73ce6d740609dc64afe0bb1ffbeda5f12c0 Mon Sep 17 00:00:00 2001 From: Dan Choi Date: Thu, 23 May 2019 01:07:21 -0700 Subject: [PATCH 2/7] change local_model_path to local_code_path --- src/sagemaker/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sagemaker/utils.py b/src/sagemaker/utils.py index e32f2bc09f..d20f3194e0 100644 --- a/src/sagemaker/utils.py +++ b/src/sagemaker/utils.py @@ -344,7 +344,7 @@ def repack_model(inference_script, source_directory, model_uri, sagemaker_sessio local_code_path = os.path.join(tmp, 'local_code.tar.gz') download_file_from_url(source_directory, local_code_path, sagemaker_session) - with tarfile.open(name=local_model_path, mode='r:gz') as t: + with tarfile.open(name=local_code_path, mode='r:gz') as t: t.extractall(path=code_dir) elif source_directory: From 62c02fefc75ca2ca320eec1413e7dc46633a5521 Mon Sep 17 00:00:00 2001 From: Dan Choi Date: Thu, 23 May 
2019 11:11:56 -0700 Subject: [PATCH 3/7] address comments --- doc/using_mxnet.rst | 2 +- src/sagemaker/mxnet/README.rst | 4 ++-- tests/integ/test_mxnet_train.py | 12 ++++++++---- 3 files changed, 11 insertions(+), 7 deletions(-) diff --git a/doc/using_mxnet.rst b/doc/using_mxnet.rst index 0b4923c05c..e2d1f4c73c 100644 --- a/doc/using_mxnet.rst +++ b/doc/using_mxnet.rst @@ -703,7 +703,7 @@ The return object should be one of the following: For versions 1.4 and higher: ---------------------------- - a tuple with two items: the response data and ``accept_type`` (the content type of the response data), or -- the response data: (the content type of the response will be set to either the accept header in the initial request or a default) +- the response data: (the content type of the response will be set to either the accept header in the initial request or default to application/json) For versions 1.3 and lower: --------------------------- diff --git a/src/sagemaker/mxnet/README.rst b/src/sagemaker/mxnet/README.rst index 7654022a56..ec83a09309 100644 --- a/src/sagemaker/mxnet/README.rst +++ b/src/sagemaker/mxnet/README.rst @@ -6,7 +6,7 @@ With the SageMaker Python SDK, you can train and host MXNet models on Amazon Sag Supported versions of MXNet: ``1.4.0``, ``1.3.0``, ``1.2.1``, ``1.1.0``, ``1.0.0``, ``0.12.1``. -Supported versions of MXNet for Elastic Inference: ``1.3.0``, ``1.4.0``. +Supported versions of MXNet for Elastic Inference: ``1.4.0``, ``1.3.0``. For information about using MXNet with the SageMaker Python SDK, see https://sagemaker.readthedocs.io/en/stable/using_mxnet.html. @@ -15,7 +15,7 @@ SageMaker MXNet Containers When training and deploying training scripts, SageMaker runs your Python script in a Docker container with several libraries installed. When creating the Estimator and calling deploy to create the SageMaker Endpoint, you can control the environment your script runs in. -SageMaker runs MXNet Estimator scripts in either Python 2.7 or Python 3.6. 
You can select the Python version by passing a ``py_version`` keyword arg to the MXNet Estimator constructor. Setting this to ``py2`` (the default) will cause your training script to be run on Python 2.7. Setting this to ``py3`` will cause your training script to be run on Python 3.5. This Python version applies to both the Training Job, created by fit, and the Endpoint, created by deploy. +SageMaker runs MXNet scripts in either Python 2.7 or Python 3.6. You can select the Python version by passing a ``py_version`` keyword arg to the MXNet Estimator constructor. Setting this to ``py2`` (the default) will cause your training script to be run on Python 2.7. Setting this to ``py3`` will cause your training script to be run on Python 3.6. This Python version applies to both the Training Job, created by fit, and the Endpoint, created by deploy. Your MXNet training script will be run on version 1.2.1 by default. (See below for how to choose a different version, and currently supported versions.) The decision to use the GPU or CPU version of MXNet is made by the ``train_instance_type``, set on the MXNet constructor. If you choose a GPU instance type, your training job will be run on a GPU version of MXNet. If you choose a CPU instance type, your training job will be run on a CPU version of MXNet. Similarly, when you call deploy, specifying a GPU or CPU deploy_instance_type, will control which MXNet build your Endpoint runs. 
diff --git a/tests/integ/test_mxnet_train.py b/tests/integ/test_mxnet_train.py index 3866ebc793..b801b9252f 100644 --- a/tests/integ/test_mxnet_train.py +++ b/tests/integ/test_mxnet_train.py @@ -55,7 +55,8 @@ def test_attach_deploy(mxnet_training_job, sagemaker_session): estimator = MXNet.attach(mxnet_training_job, sagemaker_session=sagemaker_session) predictor = estimator.deploy(1, 'ml.m4.xlarge', endpoint_name=endpoint_name) data = numpy.zeros(shape=(1, 1, 28, 28)) - predictor.predict(data) + result = predictor.predict(data) + assert result is not None def test_deploy_model(mxnet_training_job, sagemaker_session, mxnet_full_version): @@ -71,7 +72,8 @@ def test_deploy_model(mxnet_training_job, sagemaker_session, mxnet_full_version) predictor = model.deploy(1, 'ml.m4.xlarge', endpoint_name=endpoint_name) data = numpy.zeros(shape=(1, 1, 28, 28)) - predictor.predict(data) + result = predictor.predict(data) + assert result is not None predictor.delete_model() with pytest.raises(Exception) as exception: @@ -175,7 +177,8 @@ def test_deploy_model_with_accelerator(mxnet_training_job, sagemaker_session, ei predictor = model.deploy(1, 'ml.m4.xlarge', endpoint_name=endpoint_name, accelerator_type='ml.eia1.medium') data = numpy.zeros(shape=(1, 1, 28, 28)) - predictor.predict(data) + result = predictor.predict(data) + assert result is not None def test_async_fit(sagemaker_session, mxnet_full_version): @@ -206,7 +209,8 @@ def test_async_fit(sagemaker_session, mxnet_full_version): estimator = MXNet.attach(training_job_name=training_job_name, sagemaker_session=sagemaker_session) predictor = estimator.deploy(1, 'ml.m4.xlarge', endpoint_name=endpoint_name) data = numpy.zeros(shape=(1, 1, 28, 28)) - predictor.predict(data) + result = predictor.predict(data) + assert result is not None def test_failed_training_job(sagemaker_session, mxnet_full_version): From 1ac3a8caccae202a5f01d148c9d6d20c55ea8ac6 Mon Sep 17 00:00:00 2001 From: Dan Choi Date: Thu, 23 May 2019 15:22:33 -0700 Subject: 
[PATCH 4/7] add durations to pytest --- buildspec.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/buildspec.yml b/buildspec.yml index af7e071a4d..4e116f7b01 100644 --- a/buildspec.yml +++ b/buildspec.yml @@ -31,7 +31,7 @@ phases: # run integration tests - | if has-matching-changes "tests/" "src/*.py" "setup.py" "setup.cfg" "buildspec.yml"; then - IGNORE_COVERAGE=- tox -e py36,py27 -- tests/integ -n 24 --boxed --reruns 2 + IGNORE_COVERAGE=- tox -e py36,py27 -- tests/integ -n 24 --boxed --reruns 2 --durations=0 else echo "skipping integration tests" fi From 92d12c7c02c803acb27aae7a79a11749b2f0f896 Mon Sep 17 00:00:00 2001 From: Stephen Leonard <46679837+stephenleonard8@users.noreply.github.com> Date: Thu, 23 May 2019 18:51:25 -0400 Subject: [PATCH 5/7] documentation: update using_sklearn.rst parameter name (#814) Incorrect parameter name in docs. Updated to match what is implemented in the method and what is used in other estimators. --- doc/using_sklearn.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/using_sklearn.rst b/doc/using_sklearn.rst index 9d80f1fbad..bf9c506b96 100644 --- a/doc/using_sklearn.rst +++ b/doc/using_sklearn.rst @@ -170,7 +170,7 @@ The following are optional arguments. When you create a ``SKLearn`` object, you method execution. - ``output_kms_key`` Optional KMS key ID to optionally encrypt training output with. -- ``job_name`` Name to assign for the training job that the fit() +- ``base_job_name`` Name to assign for the training job that the fit() method launches. 
If not specified, the estimator generates a default job name, based on the training image name and current timestamp - ``image_name`` An alternative docker image to use for training and From b779403449415138245df5331cc78192f83c6d82 Mon Sep 17 00:00:00 2001 From: Dan Choi Date: Thu, 23 May 2019 17:40:30 -0700 Subject: [PATCH 6/7] call repack in local mode --- buildspec.yml | 2 +- src/sagemaker/model.py | 19 +++++++++---------- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/buildspec.yml b/buildspec.yml index 4e116f7b01..af7e071a4d 100644 --- a/buildspec.yml +++ b/buildspec.yml @@ -31,7 +31,7 @@ phases: # run integration tests - | if has-matching-changes "tests/" "src/*.py" "setup.py" "setup.cfg" "buildspec.yml"; then - IGNORE_COVERAGE=- tox -e py36,py27 -- tests/integ -n 24 --boxed --reruns 2 --durations=0 + IGNORE_COVERAGE=- tox -e py36,py27 -- tests/integ -n 24 --boxed --reruns 2 else echo "skipping integration tests" fi diff --git a/src/sagemaker/model.py b/src/sagemaker/model.py index 8762dbc65c..bbb78bdfc8 100644 --- a/src/sagemaker/model.py +++ b/src/sagemaker/model.py @@ -436,16 +436,7 @@ def _upload_code(self, key_prefix, repack=False): if self.sagemaker_session.local_mode and local_code: self.uploaded_code = None else: - if repack: - self.repacked_model_data = utils.repack_model(inference_script=self.entry_point, - source_directory=self.source_dir, - model_uri=self.model_data, - sagemaker_session=self.sagemaker_session) - - self.uploaded_code = UploadedCode(s3_prefix=self.repacked_model_data, - script_name=os.path.basename(self.entry_point)) - - else: + if not repack: bucket = self.bucket or self.sagemaker_session.default_bucket() self.uploaded_code = fw_utils.tar_and_upload_dir(session=self.sagemaker_session.boto_session, bucket=bucket, @@ -454,6 +445,14 @@ def _upload_code(self, key_prefix, repack=False): directory=self.source_dir, dependencies=self.dependencies) + if repack: + self.repacked_model_data = 
utils.repack_model(inference_script=self.entry_point, + source_directory=self.source_dir, + model_uri=self.model_data, + sagemaker_session=self.sagemaker_session) + self.uploaded_code = UploadedCode(s3_prefix=self.repacked_model_data, + script_name=os.path.basename(self.entry_point)) + def _framework_env_vars(self): if self.uploaded_code: script_name = self.uploaded_code.script_name From e26eb928386ae1992b7779660dcf2b89cb8c0638 Mon Sep 17 00:00:00 2001 From: Lauren Yu <6631887+laurenyu@users.noreply.github.com> Date: Fri, 24 May 2019 09:40:14 -0700 Subject: [PATCH 7/7] fix README formatting --- src/sagemaker/mxnet/README.rst | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/sagemaker/mxnet/README.rst b/src/sagemaker/mxnet/README.rst index ec83a09309..f0965cd38a 100644 --- a/src/sagemaker/mxnet/README.rst +++ b/src/sagemaker/mxnet/README.rst @@ -40,5 +40,7 @@ The Docker images extend Ubuntu 16.04. You can select version of MXNet by passing a ``framework_version`` keyword arg to the MXNet Estimator constructor. Currently supported versions are listed in the above table. You can also set ``framework_version`` to only specify major and minor version, e.g ``1.2``, which will cause your training script to be run on the latest supported patch version of that minor version, which in this example would be 1.2.1. Alternatively, you can build your own image by following the instructions in the SageMaker MXNet containers repository, and passing ``image_name`` to the MXNet Estimator constructor. -You can visit the SageMaker MXNet training containers repository here: https://github.com/aws/sagemaker-mxnet-container -You can visit the SageMaker MXNet serving containers repository here: https://github.com/aws/sagemaker-mxnet-serving-container +You can visit the SageMaker MXNet container repositories here: + +- training: https://github.com/aws/sagemaker-mxnet-container +- serving: https://github.com/aws/sagemaker-mxnet-serving-container