From 4ac4f987d485f7d9a2841fba9af7a660a63bf056 Mon Sep 17 00:00:00 2001
From: Lianyi Ding
Date: Wed, 26 Jun 2019 15:48:13 -0700
Subject: [PATCH 1/5] Add eu-west-3, eu-north-1, sa-east-1 and ap-east-1 to
 the no-p2 regions and no-p3 regions.

---
 tests/integ/__init__.py | 24 ++++++++++++++++++++++--
 1 file changed, 22 insertions(+), 2 deletions(-)

diff --git a/tests/integ/__init__.py b/tests/integ/__init__.py
index 1a99c29601..d7b5e61c8c 100644
--- a/tests/integ/__init__.py
+++ b/tests/integ/__init__.py
@@ -25,7 +25,16 @@
 PYTHON_VERSION = "py" + str(sys.version_info.major)

 # these regions have some p2 and p3 instances, but not enough for continuous testing
-HOSTING_NO_P2_REGIONS = ["ca-central-1", "eu-central-1", "eu-west-2", "us-west-1"]
+HOSTING_NO_P2_REGIONS = [
+    "ca-central-1",
+    "eu-central-1",
+    "eu-west-2",
+    "us-west-1",
+    "eu-west-3",
+    "eu-north-1",
+    "sa-east-1",
+    "ap-east-1",
+]
 HOSTING_NO_P3_REGIONS = [
     "ap-southeast-1",
     "ap-southeast-2",
@@ -34,8 +43,19 @@
     "eu-central-1",
     "eu-west-2",
     "us-west-1",
+    "eu-west-3",
+    "eu-north-1",
+    "sa-east-1",
+    "ap-east-1",
+]
+TRAINING_NO_P2_REGIONS = [
+    "ap-southeast-1",
+    "ap-southeast-2",
+    "eu-west-3",
+    "eu-north-1",
+    "sa-east-1",
+    "ap-east-1",
 ]
-TRAINING_NO_P2_REGIONS = ["ap-southeast-1", "ap-southeast-2"]

 # EI is currently only supported in the following regions
 # regions were derived from https://aws.amazon.com/machine-learning/elastic-inference/pricing/

From 714e5693a1940afaff1930fa78dbe5dcd2289af1 Mon Sep 17 00:00:00 2001
From: Lianyi Ding
Date: Mon, 22 Jul 2019 17:44:28 -0700
Subject: [PATCH 2/5] Make changes for regions without m4 instances; add
 account for HKG.

---
 src/sagemaker/fw_utils.py                  | 27 ++-----
 tests/conftest.py                          | 33 +++++++++
 tests/integ/__init__.py                    |  3 +
 tests/integ/conftest.py                    |  7 --
 tests/integ/marketplace_utils.py           |  4 ++
 tests/integ/test_byo_estimator.py          | 12 ++--
 tests/integ/test_chainer_train.py          |  6 +-
 tests/integ/test_factorization_machines.py | 12 ++--
 tests/integ/test_horovod.py                | 58 +++++++++------
 tests/integ/test_inference_pipeline.py     | 12 ++--
 tests/integ/test_ipinsights.py             |  6 +-
 tests/integ/test_kmeans.py                 | 12 ++--
 tests/integ/test_knn.py                    | 12 ++--
 tests/integ/test_lda.py                    | 12 +++-
 tests/integ/test_linear_learner.py         | 18 ++---
 tests/integ/test_local_mode.py             |  6 +-
 tests/integ/test_marketplace.py            | 57 ++++++++++-----
 tests/integ/test_mxnet_train.py            | 38 +++++-----
 tests/integ/test_neo_mxnet.py              | 20 +++---
 tests/integ/test_ntm.py                    | 10 +--
 tests/integ/test_object2vec.py             | 12 +---
 tests/integ/test_pca.py                    | 12 ++--
 tests/integ/test_pytorch_train.py          | 15 ++--
 tests/integ/test_randomcutforest.py        |  6 +-
 tests/integ/test_record_set.py             |  4 +-
 tests/integ/test_rl.py                     | 28 ++++----
 tests/integ/test_sklearn_train.py          | 47 ++++++------
 tests/integ/test_sparkml_serving.py        |  8 +--
 tests/integ/test_tf_cifar.py               | 83 ++++++++++++++++++++++
 tests/integ/test_tf_keras.py               |  8 ++-
 tests/integ/test_tf_script_mode.py         |  9 +--
 tests/integ/test_tfs.py                    |  4 +-
 tests/integ/test_transformer.py            | 38 +++++-----
 tests/integ/test_tuner.py                  | 74 ++++++++++---------
 34 files changed, 423 insertions(+), 290 deletions(-)
 create mode 100644 tests/integ/test_tf_cifar.py

diff --git a/src/sagemaker/fw_utils.py b/src/sagemaker/fw_utils.py
index 493151dec0..f9048b0f2a 100644
--- a/src/sagemaker/fw_utils.py
+++ b/src/sagemaker/fw_utils.py
@@ -28,7 +28,6 @@
 UploadedCode = namedtuple("UserCode", ["s3_prefix", "script_name"])
 """sagemaker.fw_utils.UserCode: An object containing the S3 prefix and script name.
-
 This is for the source code used for the entry point with an ``Estimator``. It can be
 instantiated with positional or keyword arguments.
 """

@@ -54,6 +53,8 @@
 VALID_PY_VERSIONS = ["py2", "py3"]
 VALID_EIA_FRAMEWORKS = ["tensorflow", "tensorflow-serving", "mxnet", "mxnet-serving"]
 VALID_ACCOUNTS_BY_REGION = {"us-gov-west-1": "246785580436", "us-iso-east-1": "744548109606"}
+OPT_IN_ACCOUNTS_BY_REGION = {"ap-east-1": "057415533634"}
+ASIMOV_OPT_IN_ACCOUNTS_BY_REGION = {"ap-east-1": "871362719292"}

 MERGED_FRAMEWORKS_REPO_MAP = {
     "tensorflow-scriptmode": "tensorflow-training",
@@ -73,12 +74,10 @@ def is_version_equal_or_higher(lowest_version, framework_version):
     """Determine whether the ``framework_version`` is equal to or higher than ``lowest_version``
-
     Args:
         lowest_version (List[int]): lowest version represented in an integer list
         framework_version (str): framework version string
-
     Returns:
         bool: Whether or not framework_version is equal to or higher than
             lowest_version
@@ -125,7 +124,11 @@ def _registry_id(region, framework, py_version, account, accelerator_type, frame
         framework_version:
     """
     if _using_merged_images(region, framework, py_version, accelerator_type, framework_version):
+        if region in ASIMOV_OPT_IN_ACCOUNTS_BY_REGION:
+            return ASIMOV_OPT_IN_ACCOUNTS_BY_REGION.get(region)
         return "763104351884"
+    if region in OPT_IN_ACCOUNTS_BY_REGION:
+        return OPT_IN_ACCOUNTS_BY_REGION.get(region)
     return VALID_ACCOUNTS_BY_REGION.get(region, account)

@@ -140,7 +143,6 @@ def create_image_uri(
     region,
     framework,
     instance_type,
     framework_version,
     py_version=None,
     account="520713654638",
     accelerator_type=None,
     optimized_families=None,
 ):
     """Return the ECR URI of an image.
-
     Args:
         region (str): AWS region where the image is uploaded.
         framework (str): framework used by the image.
         accelerator_type (str): SageMaker Elastic Inference accelerator type.
         optimized_families (str): Instance families for which there exist specific
             optimized images.
-
     Returns:
         str: The appropriate image URI based on the given parameters.
     """
@@ -249,11 +250,9 @@ def _accelerator_type_valid_for_framework(

 def validate_source_dir(script, directory):
     """Validate that the source directory exists and it contains the user script
-
     Args:
         script (str): Script filename.
         directory (str): Directory containing the source file.
-
     Raises:
         ValueError: If ``directory`` does not exist, is not a directory, or does
             not contain ``script``.
@@ -272,18 +271,14 @@ def tar_and_upload_dir(
 ):
     """Package source files and upload a compress tar file to S3. The S3 location will be
     ``s3://<bucket>/s3_key_prefix/sourcedir.tar.gz``.
-
     If directory is an S3 URI, an UploadedCode object will be returned, but nothing will be
     uploaded to S3 (this allow reuse of code already in S3).
-
     If directory is None, the script will be added to the archive at ``./<script name>``.
-
     If directory is not None, the (recursive) contents of the directory will be added to
     the archive. directory is treated as the base path of the archive, and the script name
     is assumed to be a filename or relative path inside the directory.
-
     Args:
         session (boto3.Session): Boto session used to access S3.
         bucket (str): S3 bucket to which the compressed file is uploaded.
@@ -296,7 +291,6 @@ def tar_and_upload_dir(
             copied into /opt/ml/lib
         kms_key (str): Optional. KMS key ID used to upload objects to the bucket
             (default: None).
-
     Returns:
         sagemaker.fw_utils.UserCode: An object with the S3 bucket and key (S3 prefix) and
             script name.
@@ -343,7 +337,6 @@ def _list_files_to_compress(script, directory):

 def framework_name_from_image(image_name):
     # noinspection LongLine
     """Extract the framework and Python version from the image name.
-
     Args:
         image_name (str): Image URI, which should be one of the following forms:
             legacy:
             '<account>.dkr.ecr.<region>.amazonaws.com/sagemaker-<framework>:<framework_version>-<device>-<py_version>'
             legacy:
             '<account>.dkr.ecr.<region>.amazonaws.com/sagemaker-<framework>:<framework_version>-<device>-<py_version>'
             current:
             '<account>.dkr.ecr.<region>.amazonaws.com/sagemaker-<framework>:<framework_version>-<device>-<py_version>'
             current:
             '<account>.dkr.ecr.<region>.amazonaws.com/sagemaker-rl-<framework>:<toolkit><toolkit_version>-<device>-<py_version>'
-
     Returns:
         tuple: A tuple containing:
             str: The framework name
             str: The Python version
             str: The image tag
@@ -390,11 +382,9 @@ def framework_name_from_image(image_name):

 def framework_version_from_tag(image_tag):
     """Extract the framework version from the image tag.
-
     Args:
         image_tag (str): Image tag, which should take the form
            '<framework_version>-<device>-<py_version>'
-
     Returns:
         str: The framework version.
     """
@@ -406,10 +396,8 @@ def parse_s3_url(url):
     """Returns an (s3 bucket, key name/prefix) tuple from a url with an s3 scheme
-
     Args:
         url (str):
-
     Returns:
         tuple: A tuple containing:
             str: S3 bucket name
             str: S3 key
@@ -422,16 +410,13 @@ def model_code_key_prefix(code_location_key_prefix, model_name, image):
     """Returns the s3 key prefix for uploading code during model deployment
-
     The location returned is a potential concatenation of 2 parts
         1. code_location_key_prefix if it exists
         2. model_name or a name derived from the image
-
     Args:
         code_location_key_prefix (str): the s3 key prefix from code_location
         model_name (str): the name of the model
         image (str): the image from which a default name can be extracted
-
     Returns:
         str: the key prefix to be used in uploading code
     """
diff --git a/tests/conftest.py b/tests/conftest.py
index 588aaf9b97..76b6f783f1 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -17,6 +17,7 @@

 import boto3
 import pytest
+import tests.integ

 from botocore.config import Config
 from sagemaker import Session
@@ -30,6 +31,8 @@

 DEFAULT_REGION = "us-west-2"

+NO_M4_REGIONS = ["eu-west-3", "eu-north-1", "ap-east-1"]
+

 def pytest_addoption(parser):
     parser.addoption("--sagemaker-client-config", action="store", default=None)
@@ -242,3 +245,33 @@ def tf_full_version(request):
 @pytest.fixture(scope="module")
 def ei_tf_full_version(request):
     return request.config.getoption("--ei-tf-full-version")
+
+
+@pytest.fixture(scope="session")
+def cpu_instance_type(sagemaker_session, request):
+    region = sagemaker_session.boto_session.region_name
+    if region in NO_M4_REGIONS:
+        return "ml.m5.xlarge"
+    else:
+        return "ml.m4.xlarge"
+
+
+@pytest.fixture(scope="session")
+def cpu_instance_family(cpu_instance_type):
+    return "_".join(cpu_instance_type.split(".")[0:2])
+
+
+def pytest_generate_tests(metafunc):
+    if "instance_type" in metafunc.fixturenames:
+        boto_config = metafunc.config.getoption("--boto-config")
+        parsed_config = json.loads(boto_config) if boto_config else {}
+        region = parsed_config.get("region_name", DEFAULT_REGION)
+        cpu_instance_type = "ml.m5.xlarge" if region in NO_M4_REGIONS else "ml.m4.xlarge"
+
+        params = [cpu_instance_type]
+        if not (
+            region in tests.integ.HOSTING_NO_P2_REGIONS
+            or region in tests.integ.TRAINING_NO_P2_REGIONS
+        ):
+            params.append("ml.p2.xlarge")
+        metafunc.parametrize("instance_type", params, scope="session")
diff --git a/tests/integ/__init__.py b/tests/integ/__init__.py
index d7b5e61c8c..f080014b32 100644
--- a/tests/integ/__init__.py
+++ b/tests/integ/__init__.py
@@ -68,6 +68,9 @@
     "ap-northeast-2",
 ]

+NO_LDA_REGIONS = ["eu-west-3", "eu-north-1", "sa-east-1", "ap-east-1"]
+NO_MARKET_PLACE_REGIONS = ["eu-west-3", "eu-north-1", "sa-east-1", "ap-east-1"]
+
 logging.getLogger("boto3").setLevel(logging.INFO)
 logging.getLogger("botocore").setLevel(logging.INFO)
diff --git a/tests/integ/conftest.py b/tests/integ/conftest.py
index a0c9f1cb2e..5042870a54 100644
--- a/tests/integ/conftest.py
+++ b/tests/integ/conftest.py
@@ -14,8 +14,6 @@

 import os

-import pytest
-

 def create_sagemaker_local_network():
     """
@@ -28,8 +26,3 @@ def create_sagemaker_local_network():


 create_sagemaker_local_network()
-
-
-@pytest.fixture(scope="session", params=["local", "ml.c4.xlarge"])
-def instance_type(request):
-    return request.param
diff --git a/tests/integ/marketplace_utils.py b/tests/integ/marketplace_utils.py
index 5bfb293914..dcb8ebcef0 100644
--- a/tests/integ/marketplace_utils.py
+++ b/tests/integ/marketplace_utils.py
@@ -26,4 +26,8 @@
     "ca-central-1": "470592106596",
     "eu-west-2": "856760150666",
     "us-west-1": "382657785993",
+    "eu-west-3": "843114510376",
+    "eu-north-1": "136758871317",
+    "sa-east-1": "270155090741",
+    "ap-east-1": "822005858737",
 }
diff --git a/tests/integ/test_byo_estimator.py b/tests/integ/test_byo_estimator.py
index 22b2318e36..a6ae718cac 100644
--- a/tests/integ/test_byo_estimator.py
+++ b/tests/integ/test_byo_estimator.py
@@ -41,7 +41,7 @@ def fm_serializer(data):

 @pytest.mark.canary_quick
-def test_byo_estimator(sagemaker_session, region):
+def test_byo_estimator(sagemaker_session, region, cpu_instance_type):
     """Use Factorization Machines algorithm as an example here.

     First we need to prepare data for training. We take standard data set, convert it to the
@@ -74,7 +74,7 @@ def test_byo_estimator(sagemaker_session, region):
         image_name=image_name,
         role="SageMakerRole",
         train_instance_count=1,
-        train_instance_type="ml.c4.xlarge",
+        train_instance_type=cpu_instance_type,
         sagemaker_session=sagemaker_session,
     )

@@ -87,7 +87,7 @@ def test_byo_estimator(sagemaker_session, region):
     with timeout_and_delete_endpoint_by_name(job_name, sagemaker_session):
         model = estimator.create_model()
-        predictor = model.deploy(1, "ml.m4.xlarge", endpoint_name=job_name)
+        predictor = model.deploy(1, cpu_instance_type, endpoint_name=job_name)
         predictor.serializer = fm_serializer
         predictor.content_type = "application/json"
         predictor.deserializer = sagemaker.predictor.json_deserializer
@@ -99,7 +99,7 @@ def test_byo_estimator(sagemaker_session, region):
     assert prediction["score"] is not None

-def test_async_byo_estimator(sagemaker_session, region):
+def test_async_byo_estimator(sagemaker_session, region, cpu_instance_type):
     image_name = registry(region) + "/factorization-machines:1"
     endpoint_name = unique_name_from_base("byo")
     training_data_path = os.path.join(DATA_DIR, "dummy_tensor")
@@ -123,7 +123,7 @@ def test_async_byo_estimator(sagemaker_session, region):
         image_name=image_name,
         role="SageMakerRole",
         train_instance_count=1,
-        train_instance_type="ml.c4.xlarge",
+        train_instance_type=cpu_instance_type,
         sagemaker_session=sagemaker_session,
     )

@@ -139,7 +139,7 @@ def test_async_byo_estimator(sagemaker_session, region):
             training_job_name=job_name, sagemaker_session=sagemaker_session
         )
         model = estimator.create_model()
-        predictor = model.deploy(1, "ml.m4.xlarge", endpoint_name=endpoint_name)
+        predictor = model.deploy(1, cpu_instance_type, endpoint_name=endpoint_name)
         predictor.serializer = fm_serializer
         predictor.content_type = "application/json"
         predictor.deserializer = sagemaker.predictor.json_deserializer
diff --git a/tests/integ/test_chainer_train.py b/tests/integ/test_chainer_train.py
index 5b036cd432..4701472e76 100644
--- a/tests/integ/test_chainer_train.py
+++ b/tests/integ/test_chainer_train.py
@@ -62,7 +62,7 @@ def test_training_with_additional_hyperparameters(sagemaker_local_session, chain

 @pytest.mark.canary_quick
 @pytest.mark.regional_testing
-def test_attach_deploy(sagemaker_session, chainer_full_version):
+def test_attach_deploy(sagemaker_session, chainer_full_version, cpu_instance_type):
     with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
         script_path = os.path.join(DATA_DIR, "chainer_mnist", "mnist.py")
         data_path = os.path.join(DATA_DIR, "chainer_mnist")
@@ -73,7 +73,7 @@ def test_attach_deploy(sagemaker_session, chainer_full_version):
             framework_version=chainer_full_version,
             py_version=PYTHON_VERSION,
             train_instance_count=1,
-            train_instance_type="ml.c4.xlarge",
+            train_instance_type=cpu_instance_type,
             sagemaker_session=sagemaker_session,
             hyperparameters={"epochs": 1},
         )
@@ -95,7 +95,7 @@ def test_attach_deploy(sagemaker_session, chainer_full_version):
         estimator = Chainer.attach(
             chainer.latest_training_job.name, sagemaker_session=sagemaker_session
         )
-        predictor = estimator.deploy(1, "ml.m4.xlarge", endpoint_name=endpoint_name)
+        predictor = estimator.deploy(1, cpu_instance_type, endpoint_name=endpoint_name)
         _predict_and_assert(predictor)

diff --git a/tests/integ/test_factorization_machines.py b/tests/integ/test_factorization_machines.py
index 55a576b88a..7e13cadfee 100644
--- a/tests/integ/test_factorization_machines.py
+++ b/tests/integ/test_factorization_machines.py
@@ -24,7 +24,7 @@
 from tests.integ.timeout import timeout, timeout_and_delete_endpoint_by_name

-def test_factorization_machines(sagemaker_session):
+def test_factorization_machines(sagemaker_session, cpu_instance_type):
     job_name = unique_name_from_base("fm")

     with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
@@ -38,7 +38,7 @@ def test_factorization_machines(sagemaker_session):
         fm = FactorizationMachines(
             role="SageMakerRole",
             train_instance_count=1,
-            train_instance_type="ml.c4.xlarge",
+            train_instance_type=cpu_instance_type,
             num_factors=10,
             predictor_type="regressor",
             epochs=2,
@@ -58,7 +58,7 @@ def test_factorization_machines(sagemaker_session):
         model = FactorizationMachinesModel(
             fm.model_data, role="SageMakerRole", sagemaker_session=sagemaker_session
         )
-        predictor = model.deploy(1, "ml.c4.xlarge", endpoint_name=job_name)
+        predictor = model.deploy(1, cpu_instance_type, endpoint_name=job_name)
         result = predictor.predict(train_set[0][:10])

         assert len(result) == 10
@@ -66,7 +66,7 @@ def test_factorization_machines(sagemaker_session):
             assert record.label["score"] is not None

-def test_async_factorization_machines(sagemaker_session):
+def test_async_factorization_machines(sagemaker_session, cpu_instance_type):
     job_name = unique_name_from_base("fm")

     with timeout(minutes=5):
@@ -80,7 +80,7 @@ def test_async_factorization_machines(sagemaker_session):
         fm = FactorizationMachines(
             role="SageMakerRole",
             train_instance_count=1,
-            train_instance_type="ml.c4.xlarge",
+            train_instance_type=cpu_instance_type,
             num_factors=10,
             predictor_type="regressor",
             epochs=2,
@@ -108,7 +108,7 @@ def test_async_factorization_machines(sagemaker_session):
         model = FactorizationMachinesModel(
             estimator.model_data, role="SageMakerRole", sagemaker_session=sagemaker_session
         )
-        predictor = model.deploy(1, "ml.c4.xlarge", endpoint_name=job_name)
+        predictor = model.deploy(1, cpu_instance_type, endpoint_name=job_name)
         result = predictor.predict(train_set[0][:10])

         assert len(result) == 10
diff --git a/tests/integ/test_horovod.py b/tests/integ/test_horovod.py
index befc428fa4..556419f735 100644
--- a/tests/integ/test_horovod.py
+++ b/tests/integ/test_horovod.py
@@ -28,34 +28,22 @@
 horovod_dir = os.path.join(os.path.dirname(__file__), "..", "data", "horovod")

-@pytest.fixture(scope="session", params=["ml.c4.xlarge"])
-def instance_type(request):
-    return request.param
+@pytest.fixture(scope="module")
+def gpu_instance_type(request):
+    return "ml.p3.2xlarge"

 @pytest.mark.canary_quick
-def test_horovod(sagemaker_session, instance_type, tmpdir):
-    job_name = sagemaker.utils.unique_name_from_base("tf-horovod")
-    estimator = TensorFlow(
-        entry_point=os.path.join(horovod_dir, "test_hvd_basic.py"),
-        role="SageMakerRole",
-        train_instance_count=2,
-        train_instance_type=instance_type,
-        sagemaker_session=sagemaker_session,
-        py_version=integ.PYTHON_VERSION,
-        script_mode=True,
-        framework_version="1.12",
-        distributions={"mpi": {"enabled": True}},
-    )
-
-    with timeout.timeout(minutes=integ.TRAINING_DEFAULT_TIMEOUT_MINUTES):
-        estimator.fit(job_name=job_name)
+def test_hvd_cpu(sagemaker_session, cpu_instance_type, tmpdir):
+    __create_and_fit_estimator(sagemaker_session, cpu_instance_type, tmpdir)

-    tmp = str(tmpdir)
-    extract_files_from_s3(estimator.model_data, tmp)

-    for rank in range(2):
-        assert read_json("rank-%s" % rank, tmp)["rank"] == rank
+@pytest.mark.canary_quick
+@pytest.mark.skipif(
+    integ.test_region() in integ.HOSTING_NO_P3_REGIONS, reason="no ml.p3 instances in this region"
+)
+def test_hvd_gpu(sagemaker_session, gpu_instance_type, tmpdir):
+    __create_and_fit_estimator(sagemaker_session, gpu_instance_type, tmpdir)

 @pytest.mark.local_mode
@@ -107,3 +95,27 @@ def extract_files_from_s3(s3_url, tmpdir):
         with tarfile.open(model, "r") as tar_file:
             tar_file.extractall(tmpdir)
+
+
+def __create_and_fit_estimator(sagemaker_session, instance_type, tmpdir):
+    job_name = sagemaker.utils.unique_name_from_base("tf-horovod")
+    estimator = TensorFlow(
+        entry_point=os.path.join(horovod_dir, "test_hvd_basic.py"),
+        role="SageMakerRole",
+        train_instance_count=2,
+        train_instance_type=instance_type,
+        sagemaker_session=sagemaker_session,
+        py_version=integ.PYTHON_VERSION,
+        script_mode=True,
+        framework_version="1.12",
+        distributions={"mpi": {"enabled": True}},
+    )
+
+    with timeout.timeout(minutes=integ.TRAINING_DEFAULT_TIMEOUT_MINUTES):
+        estimator.fit(job_name=job_name)
+
+    tmp = str(tmpdir)
+    extract_files_from_s3(estimator.model_data, tmp)
+
+    for rank in range(2):
+        assert read_json("rank-%s" % rank, tmp)["rank"] == rank
diff --git a/tests/integ/test_inference_pipeline.py b/tests/integ/test_inference_pipeline.py
index 5ca265885a..3260619fe4 100644
--- a/tests/integ/test_inference_pipeline.py
+++ b/tests/integ/test_inference_pipeline.py
@@ -52,7 +52,7 @@

 @pytest.mark.continuous_testing
 @pytest.mark.regional_testing
-def test_inference_pipeline_batch_transform(sagemaker_session):
+def test_inference_pipeline_batch_transform(sagemaker_session, cpu_instance_type):
     sparkml_model_data = sagemaker_session.upload_data(
         path=os.path.join(SPARKML_DATA_PATH, "mleap_model.tar.gz"),
         key_prefix="integ-test-data/sparkml/model",
@@ -77,7 +77,7 @@ def test_inference_pipeline_batch_transform(sagemaker_session):
         sagemaker_session=sagemaker_session,
         name=batch_job_name,
     )
-    transformer = model.transformer(1, "ml.m4.xlarge")
+    transformer = model.transformer(1, cpu_instance_type)
     transform_input_key_prefix = "integ-test-data/sparkml_xgboost/transform"
     transform_input = transformer.sagemaker_session.upload_data(
        path=VALID_DATA_PATH, key_prefix=transform_input_key_prefix
    )
@@ -94,11 +94,7 @@ def test_inference_pipeline_batch_transform(sagemaker_session):

 @pytest.mark.canary_quick
 @pytest.mark.regional_testing
-@pytest.mark.skip(
-    reason="This test has always failed, but the failure was masked by a bug. "
-    "This test should be fixed. Details in https://github.com/aws/sagemaker-python-sdk/pull/968"
-)
-def test_inference_pipeline_model_deploy(sagemaker_session):
+def test_inference_pipeline_model_deploy(sagemaker_session, cpu_instance_type):
     sparkml_data_path = os.path.join(DATA_DIR, "sparkml_model")
     xgboost_data_path = os.path.join(DATA_DIR, "xgboost_model")
     endpoint_name = "test-inference-pipeline-deploy-{}".format(sagemaker_timestamp())
@@ -127,7 +123,7 @@ def test_inference_pipeline_model_deploy(sagemaker_session):
         sagemaker_session=sagemaker_session,
         name=endpoint_name,
     )
-    model.deploy(1, "ml.m4.xlarge", endpoint_name=endpoint_name)
+    model.deploy(1, cpu_instance_type, endpoint_name=endpoint_name)
     predictor = RealTimePredictor(
         endpoint=endpoint_name,
         sagemaker_session=sagemaker_session,
diff --git a/tests/integ/test_ipinsights.py b/tests/integ/test_ipinsights.py
index 4c1a47e062..35cbbe4482 100644
--- a/tests/integ/test_ipinsights.py
+++ b/tests/integ/test_ipinsights.py
@@ -24,7 +24,7 @@
 FEATURE_DIM = None

-def test_ipinsights(sagemaker_session):
+def test_ipinsights(sagemaker_session, cpu_instance_type):
     job_name = unique_name_from_base("ipinsights")

     with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
@@ -37,7 +37,7 @@ def test_ipinsights(sagemaker_session):
         ipinsights = IPInsights(
             role="SageMakerRole",
             train_instance_count=1,
-            train_instance_type="ml.c4.xlarge",
+            train_instance_type=cpu_instance_type,
             num_entity_vectors=10,
             vector_dim=100,
             sagemaker_session=sagemaker_session,
@@ -52,7 +52,7 @@ def test_ipinsights(sagemaker_session):
         model = IPInsightsModel(
             ipinsights.model_data, role="SageMakerRole", sagemaker_session=sagemaker_session
         )
-        predictor = model.deploy(1, "ml.c4.xlarge", endpoint_name=job_name)
+        predictor = model.deploy(1, cpu_instance_type, endpoint_name=job_name)
         assert isinstance(predictor, RealTimePredictor)

         predict_input = [["user_1", "1.1.1.1"]]
diff --git a/tests/integ/test_kmeans.py b/tests/integ/test_kmeans.py
index 65f2854179..1b21baa806 100644
--- a/tests/integ/test_kmeans.py
+++ b/tests/integ/test_kmeans.py
@@ -27,7 +27,7 @@
 from tests.integ.timeout import timeout, timeout_and_delete_endpoint_by_name

-def test_kmeans(sagemaker_session):
+def test_kmeans(sagemaker_session, cpu_instance_type):
     job_name = unique_name_from_base("kmeans")
     with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
         data_path = os.path.join(DATA_DIR, "one_p_mnist", "mnist.pkl.gz")
@@ -40,7 +40,7 @@ def test_kmeans(sagemaker_session):
         kmeans = KMeans(
             role="SageMakerRole",
             train_instance_count=1,
-            train_instance_type="ml.c4.xlarge",
+            train_instance_type=cpu_instance_type,
             k=10,
             sagemaker_session=sagemaker_session,
         )
@@ -75,7 +75,7 @@ def test_kmeans(sagemaker_session):
         model = KMeansModel(
             kmeans.model_data, role="SageMakerRole", sagemaker_session=sagemaker_session
         )
-        predictor = model.deploy(1, "ml.c4.xlarge", endpoint_name=job_name)
+        predictor = model.deploy(1, cpu_instance_type, endpoint_name=job_name)
         result = predictor.predict(train_set[0][:10])

         assert len(result) == 10
@@ -89,7 +89,7 @@ def test_kmeans(sagemaker_session):
     assert "Could not find model" in str(exception.value)

-def test_async_kmeans(sagemaker_session):
+def test_async_kmeans(sagemaker_session, cpu_instance_type):
    job_name = unique_name_from_base("kmeans")

     with timeout(minutes=5):
@@ -103,7 +103,7 @@ def test_async_kmeans(sagemaker_session):
         kmeans = KMeans(
             role="SageMakerRole",
             train_instance_count=1,
-            train_instance_type="ml.c4.xlarge",
+            train_instance_type=cpu_instance_type,
             k=10,
             sagemaker_session=sagemaker_session,
         )
@@ -141,7 +141,7 @@ def test_async_kmeans(sagemaker_session):
         model = KMeansModel(
             estimator.model_data, role="SageMakerRole", sagemaker_session=sagemaker_session
         )
-        predictor = model.deploy(1, "ml.c4.xlarge", endpoint_name=job_name)
+        predictor = model.deploy(1, cpu_instance_type, endpoint_name=job_name)
         result = predictor.predict(train_set[0][:10])

         assert len(result) == 10
diff --git a/tests/integ/test_knn.py b/tests/integ/test_knn.py
index beafa002d6..1f9db30b78 100644
--- a/tests/integ/test_knn.py
+++ b/tests/integ/test_knn.py
@@ -24,7 +24,7 @@
 from tests.integ.timeout import timeout, timeout_and_delete_endpoint_by_name

-def test_knn_regressor(sagemaker_session):
+def test_knn_regressor(sagemaker_session, cpu_instance_type):
     job_name = unique_name_from_base("knn")

     with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
@@ -38,7 +38,7 @@ def test_knn_regressor(sagemaker_session):
         knn = KNN(
             role="SageMakerRole",
             train_instance_count=1,
-            train_instance_type="ml.c4.xlarge",
+            train_instance_type=cpu_instance_type,
             k=10,
             predictor_type="regressor",
             sample_size=500,
@@ -53,7 +53,7 @@ def test_knn_regressor(sagemaker_session):
     with timeout_and_delete_endpoint_by_name(job_name, sagemaker_session):
         model = KNNModel(knn.model_data, role="SageMakerRole", sagemaker_session=sagemaker_session)
-        predictor = model.deploy(1, "ml.c4.xlarge", endpoint_name=job_name)
+        predictor = model.deploy(1, cpu_instance_type, endpoint_name=job_name)
         result = predictor.predict(train_set[0][:10])

         assert len(result) == 10
@@ -61,7 +61,7 @@ def test_knn_regressor(sagemaker_session):
             assert record.label["score"] is not None

-def test_async_knn_classifier(sagemaker_session):
+def test_async_knn_classifier(sagemaker_session, cpu_instance_type):
     job_name = unique_name_from_base("knn")

     with timeout(minutes=5):
@@ -75,7 +75,7 @@ def test_async_knn_classifier(sagemaker_session):
         knn = KNN(
             role="SageMakerRole",
             train_instance_count=1,
-            train_instance_type="ml.c4.xlarge",
+            train_instance_type=cpu_instance_type,
             k=10,
             predictor_type="classifier",
             sample_size=500,
@@ -100,7 +100,7 @@ def test_async_knn_classifier(sagemaker_session):
         model = KNNModel(
             estimator.model_data, role="SageMakerRole", sagemaker_session=sagemaker_session
         )
-        predictor = model.deploy(1, "ml.c4.xlarge", endpoint_name=job_name)
+        predictor = model.deploy(1, cpu_instance_type, endpoint_name=job_name)
         result = predictor.predict(train_set[0][:10])

         assert len(result) == 10
diff --git a/tests/integ/test_lda.py b/tests/integ/test_lda.py
index bfc2da792f..b1d159407a 100644
--- a/tests/integ/test_lda.py
+++ b/tests/integ/test_lda.py
@@ -16,6 +16,8 @@

 import numpy as np
+import pytest
+import tests.integ

 from sagemaker import LDA, LDAModel
 from sagemaker.amazon.common import read_records
 from sagemaker.utils import unique_name_from_base
@@ -24,7 +26,11 @@
 from tests.integ.record_set import prepare_record_set_from_local_files

-def test_lda(sagemaker_session):
+@pytest.mark.skipif(
+    tests.integ.test_region() in tests.integ.NO_LDA_REGIONS,
+    reason="LDA image is not supported in certain regions",
+)
+def test_lda(sagemaker_session, cpu_instance_type):
     job_name = unique_name_from_base("lda")

     with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
@@ -39,7 +45,7 @@ def test_lda(sagemaker_session):

         lda = LDA(
             role="SageMakerRole",
-            train_instance_type="ml.c4.xlarge",
+            train_instance_type=cpu_instance_type,
             num_topics=10,
             sagemaker_session=sagemaker_session,
         )
@@ -51,7 +57,7 @@ def test_lda(sagemaker_session):
     with timeout_and_delete_endpoint_by_name(job_name, sagemaker_session):
         model = LDAModel(lda.model_data, role="SageMakerRole", sagemaker_session=sagemaker_session)
-        predictor = model.deploy(1, "ml.c4.xlarge", endpoint_name=job_name)
+        predictor = model.deploy(1, cpu_instance_type, endpoint_name=job_name)

         predict_input = np.random.rand(1, feature_num)
         result = predictor.predict(predict_input)
diff --git a/tests/integ/test_linear_learner.py b/tests/integ/test_linear_learner.py
index e754819e2c..25432715c4 100644
--- a/tests/integ/test_linear_learner.py
+++ b/tests/integ/test_linear_learner.py
@@ -28,7 +28,7 @@

 @pytest.mark.canary_quick
-def test_linear_learner(sagemaker_session):
+def test_linear_learner(sagemaker_session, cpu_instance_type):
     job_name = unique_name_from_base("linear-learner")

     with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
@@ -46,7 +46,7 @@ def test_linear_learner(sagemaker_session):
         ll = LinearLearner(
             "SageMakerRole",
             1,
-            "ml.c4.2xlarge",
+            cpu_instance_type,
             predictor_type="binary_classifier",
             sagemaker_session=sagemaker_session,
         )
@@ -88,7 +88,7 @@ def test_linear_learner(sagemaker_session):
         ll.fit(ll.record_set(train_set[0][:200], train_set[1][:200]), job_name=job_name)

     with timeout_and_delete_endpoint_by_name(job_name, sagemaker_session):
-        predictor = ll.deploy(1, "ml.c4.xlarge", endpoint_name=job_name)
+        predictor = ll.deploy(1, cpu_instance_type, endpoint_name=job_name)

         result = predictor.predict(train_set[0][0:100])
         assert len(result) == 100
@@ -97,7 +97,7 @@ def test_linear_learner(sagemaker_session):
             assert record.label["score"] is not None

-def test_linear_learner_multiclass(sagemaker_session):
+def test_linear_learner_multiclass(sagemaker_session, cpu_instance_type):
     job_name = unique_name_from_base("linear-learner")

     with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
@@ -113,7 +113,7 @@ def test_linear_learner_multiclass(sagemaker_session):
         ll = LinearLearner(
             "SageMakerRole",
             1,
-            "ml.c4.2xlarge",
+            cpu_instance_type,
             predictor_type="multiclass_classifier",
             num_classes=10,
             sagemaker_session=sagemaker_session,
         )
@@ -123,7 +123,7 @@ def test_linear_learner_multiclass(sagemaker_session):
         ll.fit(ll.record_set(train_set[0][:200], train_set[1][:200]), job_name=job_name)

     with timeout_and_delete_endpoint_by_name(job_name, sagemaker_session):
-        predictor = ll.deploy(1, "ml.c4.xlarge", endpoint_name=job_name)
+        predictor = ll.deploy(1, cpu_instance_type, endpoint_name=job_name)

         result = predictor.predict(train_set[0][0:100])
         assert len(result) == 100
@@ -132,7 +132,7 @@ def test_linear_learner_multiclass(sagemaker_session):
             assert record.label["score"] is not None

-def test_async_linear_learner(sagemaker_session):
+def test_async_linear_learner(sagemaker_session, cpu_instance_type):
     job_name = unique_name_from_base("linear-learner")

     with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
@@ -150,7 +150,7 @@ def test_async_linear_learner(sagemaker_session):
         ll = LinearLearner(
             "SageMakerRole",
             1,
-            "ml.c4.2xlarge",
+            cpu_instance_type,
             predictor_type="binary_classifier",
             sagemaker_session=sagemaker_session,
         )
@@ -201,7 +201,7 @@ def test_async_linear_learner(sagemaker_session):
         model = LinearLearnerModel(
             estimator.model_data,
role="SageMakerRole", sagemaker_session=sagemaker_session ) - predictor = model.deploy(1, "ml.c4.xlarge", endpoint_name=job_name) + predictor = model.deploy(1, cpu_instance_type, endpoint_name=job_name) result = predictor.predict(train_set[0][0:100]) assert len(result) == 100 diff --git a/tests/integ/test_local_mode.py b/tests/integ/test_local_mode.py index d5a34d0b3e..a73c9e1e0d 100644 --- a/tests/integ/test_local_mode.py +++ b/tests/integ/test_local_mode.py @@ -372,7 +372,9 @@ def test_mxnet_training_failure(sagemaker_local_session, mxnet_full_version, tmp @pytest.mark.local_mode -def test_local_transform_mxnet(sagemaker_local_session, tmpdir, mxnet_full_version): +def test_local_transform_mxnet( + sagemaker_local_session, tmpdir, mxnet_full_version, cpu_instance_type +): data_path = os.path.join(DATA_DIR, "mxnet_mnist") script_path = os.path.join(data_path, "mnist.py") @@ -380,7 +382,7 @@ def test_local_transform_mxnet(sagemaker_local_session, tmpdir, mxnet_full_versi entry_point=script_path, role="SageMakerRole", train_instance_count=1, - train_instance_type="ml.c4.xlarge", + train_instance_type="local", framework_version=mxnet_full_version, sagemaker_session=sagemaker_local_session, ) diff --git a/tests/integ/test_marketplace.py b/tests/integ/test_marketplace.py index fd51f8a016..ce21c20bb4 100644 --- a/tests/integ/test_marketplace.py +++ b/tests/integ/test_marketplace.py @@ -20,6 +20,7 @@ import pytest import sagemaker +import tests.integ from sagemaker import AlgorithmEstimator, ModelPackage from sagemaker.tuner import IntegerParameter, HyperparameterTuner from sagemaker.utils import sagemaker_timestamp @@ -49,11 +50,11 @@ @pytest.mark.canary_quick -@pytest.mark.skip( - reason="This test has always failed, but the failure was masked by a bug. " - "This test should be fixed. 
-    "This test should be fixed. Details in https://github.com/aws/sagemaker-python-sdk/pull/968"
+@pytest.mark.skipif(
+    tests.integ.test_region() in tests.integ.NO_MARKET_PLACE_REGIONS,
+    reason="Marketplace is not available in {}".format(tests.integ.test_region()),
 )
-def test_marketplace_estimator(sagemaker_session):
+def test_marketplace_estimator(sagemaker_session, cpu_instance_type):
     with timeout(minutes=15):
         data_path = os.path.join(DATA_DIR, "marketplace", "training")
         region = sagemaker_session.boto_region_name
@@ -64,7 +65,7 @@ def test_marketplace_estimator(sagemaker_session):
             algorithm_arn=algorithm_arn,
             role="SageMakerRole",
             train_instance_count=1,
-            train_instance_type="ml.c4.xlarge",
+            train_instance_type=cpu_instance_type,
             sagemaker_session=sagemaker_session,
         )

@@ -76,7 +77,7 @@ def test_marketplace_estimator(sagemaker_session):
     endpoint_name = "test-marketplace-estimator{}".format(sagemaker_timestamp())
     with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session, minutes=20):
-        predictor = algo.deploy(1, "ml.m4.xlarge", endpoint_name=endpoint_name)
+        predictor = algo.deploy(1, cpu_instance_type, endpoint_name=endpoint_name)
         shape = pandas.read_csv(os.path.join(data_path, "iris.csv"), header=None)

         a = [50 * i for i in range(3)]
@@ -89,7 +90,11 @@ def test_marketplace_estimator(sagemaker_session):
         print(predictor.predict(test_x.values).decode("utf-8"))

-def test_marketplace_attach(sagemaker_session):
+@pytest.mark.skipif(
+    tests.integ.test_region() in tests.integ.NO_MARKET_PLACE_REGIONS,
+    reason="Marketplace is not available in {}".format(tests.integ.test_region()),
+)
+def test_marketplace_attach(sagemaker_session, cpu_instance_type):
     with timeout(minutes=15):
         data_path = os.path.join(DATA_DIR, "marketplace", "training")
         region = sagemaker_session.boto_region_name
@@ -100,7 +105,7 @@ def test_marketplace_attach(sagemaker_session):
             algorithm_arn=algorithm_arn,
             role="SageMakerRole",
             train_instance_count=1,
-            train_instance_type="ml.c4.xlarge",
+            train_instance_type=cpu_instance_type,
             sagemaker_session=sagemaker_session,
             base_job_name="test-marketplace",
         )
@@ -123,7 +128,7 @@ def test_marketplace_attach(sagemaker_session):
         )
         predictor = estimator.deploy(
             1,
-            "ml.m4.xlarge",
+            cpu_instance_type,
             endpoint_name=endpoint_name,
             serializer=sagemaker.predictor.csv_serializer,
         )
@@ -139,7 +144,11 @@ def test_marketplace_attach(sagemaker_session):

 @pytest.mark.canary_quick
-def test_marketplace_model(sagemaker_session):
+@pytest.mark.skipif(
+    tests.integ.test_region() in tests.integ.NO_MARKET_PLACE_REGIONS,
+    reason="Marketplace is not available in {}".format(tests.integ.test_region()),
+)
+def test_marketplace_model(sagemaker_session, cpu_instance_type):
     region = sagemaker_session.boto_region_name
     account = REGION_ACCOUNT_MAP[region]
     model_package_arn = MODEL_PACKAGE_ARN % (region, account)
@@ -158,7 +167,7 @@ def test_marketplace_model(sagemaker_session):
     def predict_wrapper(endpoint, session):

     endpoint_name = "test-marketplace-model-endpoint{}".format(sagemaker_timestamp())
     with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session, minutes=20):
-        predictor = model.deploy(1, "ml.m4.xlarge", endpoint_name=endpoint_name)
+        predictor = model.deploy(1, cpu_instance_type, endpoint_name=endpoint_name)
         data_path = os.path.join(DATA_DIR, "marketplace", "training")
         shape = pandas.read_csv(os.path.join(data_path, "iris.csv"), header=None)
         a = [50 * i for i in range(3)]
@@ -171,7 +180,11 @@ def predict_wrapper(endpoint, session):
         print(predictor.predict(test_x.values).decode("utf-8"))

-def test_marketplace_tuning_job(sagemaker_session):
+@pytest.mark.skipif(
+    tests.integ.test_region() in tests.integ.NO_MARKET_PLACE_REGIONS,
+    reason="Marketplace is not available in {}".format(tests.integ.test_region()),
+)
+def test_marketplace_tuning_job(sagemaker_session, cpu_instance_type):
     data_path = os.path.join(DATA_DIR, "marketplace", "training")
     region = sagemaker_session.boto_region_name
     account = REGION_ACCOUNT_MAP[region]
@@ -181,7 +194,7 @@ def test_marketplace_tuning_job(sagemaker_session):
         algorithm_arn=algorithm_arn,
         role="SageMakerRole",
         train_instance_count=1,
-        train_instance_type="ml.c4.xlarge",
+        train_instance_type=cpu_instance_type,
         sagemaker_session=sagemaker_session,
         base_job_name="test-marketplace",
     )
@@ -208,7 +221,11 @@ def test_marketplace_tuning_job(sagemaker_session):
     tuner.wait()

-def test_marketplace_transform_job(sagemaker_session):
+@pytest.mark.skipif(
+    tests.integ.test_region() in tests.integ.NO_MARKET_PLACE_REGIONS,
+    reason="Marketplace is not available in {}".format(tests.integ.test_region()),
+)
+def test_marketplace_transform_job(sagemaker_session, cpu_instance_type):
     data_path = os.path.join(DATA_DIR, "marketplace", "training")
     region = sagemaker_session.boto_region_name
     account = REGION_ACCOUNT_MAP[region]
@@ -218,7 +235,7 @@ def test_marketplace_transform_job(sagemaker_session):
         algorithm_arn=algorithm_arn,
         role="SageMakerRole",
         train_instance_count=1,
-        train_instance_type="ml.c4.xlarge",
+        train_instance_type=cpu_instance_type,
         sagemaker_session=sagemaker_session,
         base_job_name="test-marketplace",
     )
@@ -237,12 +254,16 @@ def test_marketplace_transform_job(sagemaker_session):

     algo.fit({"training": train_input})

-    transformer = algo.transformer(1, "ml.m4.xlarge")
+    transformer = algo.transformer(1, cpu_instance_type)
     transformer.transform(transform_input, content_type="text/csv")
     transformer.wait()

-def test_marketplace_transform_job_from_model_package(sagemaker_session):
+@pytest.mark.skipif(
+    tests.integ.test_region() in tests.integ.NO_MARKET_PLACE_REGIONS,
+    reason="Marketplace is not available in {}".format(tests.integ.test_region()),
+)
+def test_marketplace_transform_job_from_model_package(sagemaker_session, cpu_instance_type):
     data_path = os.path.join(DATA_DIR, "marketplace", "training")
     shape = pandas.read_csv(data_path + "/iris.csv", header=None).drop([0], axis=1)

@@ -262,6 +283,6 @@ def test_marketplace_transform_job_from_model_package(sagemaker_session):
         sagemaker_session=sagemaker_session,
     )

-    transformer = model.transformer(1, "ml.m4.xlarge")
+    transformer = model.transformer(1, cpu_instance_type)
     transformer.transform(transform_input, content_type="text/csv")
     transformer.wait()
diff --git a/tests/integ/test_mxnet_train.py b/tests/integ/test_mxnet_train.py
index 70d82396de..5de8760f72 100644
--- a/tests/integ/test_mxnet_train.py
+++ b/tests/integ/test_mxnet_train.py
@@ -28,7 +28,7 @@

 @pytest.fixture(scope="module")
-def mxnet_training_job(sagemaker_session, mxnet_full_version):
+def mxnet_training_job(sagemaker_session, mxnet_full_version, cpu_instance_type):
     with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
         script_path = os.path.join(DATA_DIR, "mxnet_mnist", "mnist.py")
         data_path = os.path.join(DATA_DIR, "mxnet_mnist")
@@ -39,7 +39,7 @@ def mxnet_training_job(sagemaker_session, mxnet_full_version):
             framework_version=mxnet_full_version,
             py_version=PYTHON_VERSION,
             train_instance_count=1,
-            train_instance_type="ml.c4.xlarge",
+            train_instance_type=cpu_instance_type,
             sagemaker_session=sagemaker_session,
         )

@@ -56,18 +56,18 @@ def mxnet_training_job(sagemaker_session, mxnet_full_version):
 @pytest.mark.canary_quick
 @pytest.mark.regional_testing
-def test_attach_deploy(mxnet_training_job, sagemaker_session):
+def test_attach_deploy(mxnet_training_job, sagemaker_session, cpu_instance_type):
     endpoint_name = "test-mxnet-attach-deploy-{}".format(sagemaker_timestamp())

     with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
         estimator = MXNet.attach(mxnet_training_job, sagemaker_session=sagemaker_session)
-        predictor = estimator.deploy(1, "ml.m4.xlarge", endpoint_name=endpoint_name)
+        predictor = estimator.deploy(1, cpu_instance_type, endpoint_name=endpoint_name)
         data = numpy.zeros(shape=(1, 1, 28, 28))
         result = predictor.predict(data)
         assert result is not None

-def test_deploy_model(mxnet_training_job, sagemaker_session, mxnet_full_version):
+def test_deploy_model(mxnet_training_job, sagemaker_session, mxnet_full_version, cpu_instance_type):
     endpoint_name = "test-mxnet-deploy-model-{}".format(sagemaker_timestamp())

     with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
@@ -84,7 +84,7 @@ def test_deploy_model(mxnet_training_job, sagemaker_session, mxnet_full_version)
             sagemaker_session=sagemaker_session,
             framework_version=mxnet_full_version,
         )
-        predictor = model.deploy(1, "ml.m4.xlarge", endpoint_name=endpoint_name)
+        predictor = model.deploy(1, cpu_instance_type, endpoint_name=endpoint_name)

         data = numpy.zeros(shape=(1, 1, 28, 28))
         result = predictor.predict(data)
@@ -96,7 +96,9 @@ def test_deploy_model(mxnet_training_job, sagemaker_session, mxnet_full_version)
     assert "Could not find model" in str(exception.value)

-def test_deploy_model_with_tags_and_kms(mxnet_training_job, sagemaker_session, mxnet_full_version):
+def test_deploy_model_with_tags_and_kms(
+    mxnet_training_job, sagemaker_session, mxnet_full_version, cpu_instance_type
+):
     endpoint_name = "test-mxnet-deploy-model-{}".format(sagemaker_timestamp())

     with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
@@ -117,7 +119,9 @@ def test_deploy_model_with_tags_and_kms(mxnet_training_job, sagemaker_session, m
         tags = [{"Key": "TagtestKey", "Value": "TagtestValue"}]
         kms_key_arn = get_or_create_kms_key(sagemaker_session)

-        model.deploy(1, "ml.m4.xlarge", endpoint_name=endpoint_name, tags=tags, kms_key=kms_key_arn)
+        model.deploy(
+            1, cpu_instance_type, endpoint_name=endpoint_name, tags=tags, kms_key=kms_key_arn
+        )

         returned_model = sagemaker_session.sagemaker_client.describe_model(ModelName=model.name)
         returned_model_tags = sagemaker_session.sagemaker_client.list_tags(
@@ -141,13 +145,13 @@ def test_deploy_model_with_tags_and_kms(mxnet_training_job, sagemaker_session, m
         assert returned_model_tags == tags
         assert endpoint_config_tags == tags
         assert endpoint_tags == tags
-        assert production_variants[0]["InstanceType"] == "ml.m4.xlarge"
+        assert production_variants[0]["InstanceType"] == cpu_instance_type
         assert production_variants[0]["InitialInstanceCount"] == 1
         assert endpoint_config["KmsKeyId"] == kms_key_arn

 def test_deploy_model_with_update_endpoint(
-    mxnet_training_job, sagemaker_session, mxnet_full_version
+    mxnet_training_job, sagemaker_session, mxnet_full_version, cpu_instance_type
 ):
     endpoint_name = "test-mxnet-deploy-model-{}".format(sagemaker_timestamp())

@@ -198,7 +202,7 @@

 def test_deploy_model_with_update_non_existing_endpoint(
-    mxnet_training_job, sagemaker_session, mxnet_full_version
+    mxnet_training_job, sagemaker_session, mxnet_full_version, cpu_instance_type
 ):
     endpoint_name = "test-mxnet-deploy-model-{}".format(sagemaker_timestamp())
"test-mxnet-deploy-model-{}".format(sagemaker_timestamp()) expected_error_message = ( @@ -225,7 +229,7 @@ def test_deploy_model_with_update_non_existing_endpoint( with pytest.raises(ValueError, message=expected_error_message): model.deploy( - 1, "ml.m4.xlarge", update_endpoint=True, endpoint_name="non-existing-endpoint" + 1, cpu_instance_type, update_endpoint=True, endpoint_name="non-existing-endpoint" ) @@ -236,7 +240,7 @@ def test_deploy_model_with_update_non_existing_endpoint( reason="EI isn't supported in that specific region.", ) def test_deploy_model_with_accelerator( - mxnet_training_job, sagemaker_session, ei_mxnet_full_version + mxnet_training_job, sagemaker_session, ei_mxnet_full_version, cpu_instance_type ): endpoint_name = "test-mxnet-deploy-model-ei-{}".format(sagemaker_timestamp()) @@ -255,7 +259,7 @@ def test_deploy_model_with_accelerator( sagemaker_session=sagemaker_session, ) predictor = model.deploy( - 1, "ml.m4.xlarge", endpoint_name=endpoint_name, accelerator_type="ml.eia1.medium" + 1, cpu_instance_type, endpoint_name=endpoint_name, accelerator_type="ml.eia1.medium" ) data = numpy.zeros(shape=(1, 1, 28, 28)) @@ -263,7 +267,7 @@ def test_deploy_model_with_accelerator( assert result is not None -def test_async_fit(sagemaker_session, mxnet_full_version): +def test_async_fit(sagemaker_session, mxnet_full_version, cpu_instance_type): endpoint_name = "test-mxnet-attach-deploy-{}".format(sagemaker_timestamp()) with timeout(minutes=5): @@ -275,7 +279,7 @@ def test_async_fit(sagemaker_session, mxnet_full_version): role="SageMakerRole", py_version=PYTHON_VERSION, train_instance_count=1, - train_instance_type="ml.c4.xlarge", + train_instance_type=cpu_instance_type, sagemaker_session=sagemaker_session, framework_version=mxnet_full_version, distributions={"parameter_server": {"enabled": True}}, @@ -299,7 +303,7 @@ def test_async_fit(sagemaker_session, mxnet_full_version): estimator = MXNet.attach( training_job_name=training_job_name, sagemaker_session=sagemaker_session ) - predictor = estimator.deploy(1, "ml.m4.xlarge", endpoint_name=endpoint_name) + predictor = estimator.deploy(1, cpu_instance_type, endpoint_name=endpoint_name) data = numpy.zeros(shape=(1, 1, 28, 28)) result = predictor.predict(data) assert result is not None diff --git a/tests/integ/test_neo_mxnet.py b/tests/integ/test_neo_mxnet.py index fed278dc6e..5127bbf9cc 100644 --- a/tests/integ/test_neo_mxnet.py +++ b/tests/integ/test_neo_mxnet.py @@ -24,7 +24,7 @@ @pytest.fixture(scope="module") -def mxnet_training_job(sagemaker_session, mxnet_full_version): +def mxnet_training_job(sagemaker_session, mxnet_full_version, cpu_instance_type): with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES): script_path = os.path.join(DATA_DIR, "mxnet_mnist", "mnist_neo.py") data_path = os.path.join(DATA_DIR, "mxnet_mnist") @@ -35,7 +35,7 @@ def mxnet_training_job(sagemaker_session, mxnet_full_version): framework_version=mxnet_full_version, py_version=PYTHON_VERSION, train_instance_count=1, - train_instance_type="ml.c4.xlarge", + train_instance_type=cpu_instance_type, sagemaker_session=sagemaker_session, ) @@ -55,20 +55,22 @@ def mxnet_training_job(sagemaker_session, mxnet_full_version): @pytest.mark.skip( reason="This should be enabled along with the Boto SDK release for Neo API changes" ) -def test_attach_deploy(mxnet_training_job, sagemaker_session): +def test_attach_deploy( + mxnet_training_job, sagemaker_session, cpu_instance_type, cpu_instance_family +): endpoint_name = "test-mxnet-attach-deploy-{}".format(sagemaker_timestamp()) 
    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
         estimator = MXNet.attach(mxnet_training_job, sagemaker_session=sagemaker_session)

         estimator.compile_model(
-            target_instance_family="ml_m4",
+            target_instance_family=cpu_instance_family,
             input_shape={"data": [1, 1, 28, 28]},
             output_path=estimator.output_path,
         )
         predictor = estimator.deploy(
-            1, "ml.m4.xlarge", use_compiled_model=True, endpoint_name=endpoint_name
+            1, cpu_instance_type, use_compiled_model=True, endpoint_name=endpoint_name
         )
         predictor.content_type = "application/vnd+python.numpy+binary"
         data = numpy.zeros(shape=(1, 1, 28, 28))
@@ -78,7 +80,9 @@ def test_attach_deploy(mxnet_training_job, sagemaker_session):
 @pytest.mark.skip(
     reason="This should be enabled along with the Boto SDK release for Neo API changes"
 )
-def test_deploy_model(mxnet_training_job, sagemaker_session):
+def test_deploy_model(
+    mxnet_training_job, sagemaker_session, cpu_instance_type, cpu_instance_family
+):
     endpoint_name = "test-mxnet-deploy-model-{}".format(sagemaker_timestamp())

     with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
@@ -97,13 +101,13 @@ def test_deploy_model(mxnet_training_job, sagemaker_session):
         )

         model.compile(
-            target_instance_family="ml_m4",
+            target_instance_family=cpu_instance_family,
             input_shape={"data": [1, 1, 28, 28]},
             role=role,
             job_name="test-deploy-model-compilation-job-{}".format(int(time.time())),
             output_path="/".join(model_data.split("/")[:-1]),
         )
-        predictor = model.deploy(1, "ml.m4.xlarge", endpoint_name=endpoint_name)
+        predictor = model.deploy(1, cpu_instance_type, endpoint_name=endpoint_name)
         predictor.content_type = "application/vnd+python.numpy+binary"
         data = numpy.zeros(shape=(1, 1, 28, 28))
diff --git a/tests/integ/test_ntm.py b/tests/integ/test_ntm.py
index 5e5977286c..0579e5064b 100644
--- a/tests/integ/test_ntm.py
+++ b/tests/integ/test_ntm.py
@@ -26,11 +26,7 @@

 @pytest.mark.canary_quick
-@pytest.mark.skip(
-    reason="This test has always failed, but the failure was masked by a bug. "
-    "This test should be fixed. Details in https://github.com/aws/sagemaker-python-sdk/pull/968"
-)
-def test_ntm(sagemaker_session):
+def test_ntm(sagemaker_session, cpu_instance_type):
     job_name = unique_name_from_base("ntm")

     with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
@@ -46,7 +42,7 @@ def test_ntm(sagemaker_session):
         ntm = NTM(
             role="SageMakerRole",
             train_instance_count=1,
-            train_instance_type="ml.c4.xlarge",
+            train_instance_type=cpu_instance_type,
             num_topics=10,
             sagemaker_session=sagemaker_session,
         )
@@ -58,7 +54,7 @@ def test_ntm(sagemaker_session):
     with timeout_and_delete_endpoint_by_name(job_name, sagemaker_session):
         model = NTMModel(ntm.model_data, role="SageMakerRole", sagemaker_session=sagemaker_session)
-        predictor = model.deploy(1, "ml.c4.xlarge", endpoint_name=job_name)
+        predictor = model.deploy(1, cpu_instance_type, endpoint_name=job_name)

         predict_input = np.random.rand(1, feature_num)
         result = predictor.predict(predict_input)
diff --git a/tests/integ/test_object2vec.py b/tests/integ/test_object2vec.py
index f5073caa90..1a5e73d6a1 100644
--- a/tests/integ/test_object2vec.py
+++ b/tests/integ/test_object2vec.py
@@ -14,8 +14,6 @@

 import os

-import pytest
-
 from sagemaker.predictor import RealTimePredictor
 from sagemaker import Object2Vec, Object2VecModel
 from sagemaker.utils import unique_name_from_base
@@ -26,11 +24,7 @@
 FEATURE_NUM = None

-@pytest.mark.skip(
-    reason="This test has always failed, but the failure was masked by a bug. "
" - "This test should be fixed. Details in https://github.com/aws/sagemaker-python-sdk/pull/968" -) -def test_object2vec(sagemaker_session): +def test_object2vec(sagemaker_session, cpu_instance_type): job_name = unique_name_from_base("object2vec") with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES): @@ -43,7 +37,7 @@ def test_object2vec(sagemaker_session): object2vec = Object2Vec( role="SageMakerRole", train_instance_count=1, - train_instance_type="ml.c4.xlarge", + train_instance_type=cpu_instance_type, epochs=3, enc0_max_seq_len=20, enc0_vocab_size=45000, @@ -66,7 +60,7 @@ def test_object2vec(sagemaker_session): model = Object2VecModel( object2vec.model_data, role="SageMakerRole", sagemaker_session=sagemaker_session ) - predictor = model.deploy(1, "ml.c4.xlarge", endpoint_name=job_name) + predictor = model.deploy(1, cpu_instance_type, endpoint_name=job_name) assert isinstance(predictor, RealTimePredictor) predict_input = {"instances": [{"in0": [354, 623], "in1": [16]}]} diff --git a/tests/integ/test_pca.py b/tests/integ/test_pca.py index 15b8da1095..e9b639a3f5 100644 --- a/tests/integ/test_pca.py +++ b/tests/integ/test_pca.py @@ -24,7 +24,7 @@ from tests.integ.timeout import timeout, timeout_and_delete_endpoint_by_name -def test_pca(sagemaker_session): +def test_pca(sagemaker_session, cpu_instance_type): job_name = unique_name_from_base("pca") with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES): @@ -38,7 +38,7 @@ def test_pca(sagemaker_session): pca = sagemaker.amazon.pca.PCA( role="SageMakerRole", train_instance_count=1, - train_instance_type="ml.m4.xlarge", + train_instance_type=cpu_instance_type, num_components=48, sagemaker_session=sagemaker_session, ) @@ -53,7 +53,7 @@ def test_pca(sagemaker_session): model_data=pca.model_data, role="SageMakerRole", sagemaker_session=sagemaker_session ) predictor = pca_model.deploy( - initial_instance_count=1, instance_type="ml.c4.xlarge", endpoint_name=job_name + initial_instance_count=1, instance_type=cpu_instance_type, endpoint_name=job_name ) result = predictor.predict(train_set[0][:5]) @@ -63,7 +63,7 @@ def test_pca(sagemaker_session): assert record.label["projection"] is not None -def test_async_pca(sagemaker_session): +def test_async_pca(sagemaker_session, cpu_instance_type): job_name = unique_name_from_base("pca") with timeout(minutes=5): @@ -77,7 +77,7 @@ def test_async_pca(sagemaker_session): pca = sagemaker.amazon.pca.PCA( role="SageMakerRole", train_instance_count=1, - train_instance_type="ml.m4.xlarge", + train_instance_type=cpu_instance_type, num_components=48, sagemaker_session=sagemaker_session, base_job_name="test-pca", @@ -100,7 +100,7 @@ def test_async_pca(sagemaker_session): estimator.model_data, role="SageMakerRole", sagemaker_session=sagemaker_session ) predictor = model.deploy( - initial_instance_count=1, instance_type="ml.c4.xlarge", endpoint_name=job_name + initial_instance_count=1, instance_type=cpu_instance_type, endpoint_name=job_name ) result = predictor.predict(train_set[0][:5]) diff --git a/tests/integ/test_pytorch_train.py b/tests/integ/test_pytorch_train.py index 9e6d32b9e0..8f430c3665 100644 --- a/tests/integ/test_pytorch_train.py +++ b/tests/integ/test_pytorch_train.py @@ -28,10 +28,9 @@ @pytest.fixture(scope="module", name="pytorch_training_job") -def fixture_training_job(sagemaker_session, pytorch_full_version): - instance_type = "ml.c4.xlarge" +def fixture_training_job(sagemaker_session, pytorch_full_version, cpu_instance_type): with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES): - pytorch = 
+        pytorch = _get_pytorch_estimator(sagemaker_session, pytorch_full_version, cpu_instance_type)

         pytorch.fit({"training": _upload_training_data(pytorch)})
         return pytorch.latest_training_job.name

@@ -39,12 +38,12 @@
 @pytest.mark.canary_quick
 @pytest.mark.regional_testing
-def test_sync_fit_deploy(pytorch_training_job, sagemaker_session):
+def test_sync_fit_deploy(pytorch_training_job, sagemaker_session, cpu_instance_type):
     # TODO: add tests against local mode when it's ready to be used
     endpoint_name = "test-pytorch-sync-fit-attach-deploy{}".format(sagemaker_timestamp())
     with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
         estimator = PyTorch.attach(pytorch_training_job, sagemaker_session=sagemaker_session)
-        predictor = estimator.deploy(1, "ml.c4.xlarge", endpoint_name=endpoint_name)
+        predictor = estimator.deploy(1, cpu_instance_type, endpoint_name=endpoint_name)
         data = numpy.zeros(shape=(1, 1, 28, 28), dtype=numpy.float32)
         predictor.predict(data)

@@ -55,7 +54,7 @@
         assert output.shape == (batch_size, 10)

-def test_deploy_model(pytorch_training_job, sagemaker_session):
+def test_deploy_model(pytorch_training_job, sagemaker_session, cpu_instance_type):
     endpoint_name = "test-pytorch-deploy-model-{}".format(sagemaker_timestamp())

     with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
@@ -69,7 +68,7 @@
             entry_point=MNIST_SCRIPT,
             sagemaker_session=sagemaker_session,
         )
-        predictor = model.deploy(1, "ml.m4.xlarge", endpoint_name=endpoint_name)
+        predictor = model.deploy(1, cpu_instance_type, endpoint_name=endpoint_name)

         batch_size = 100
         data = numpy.random.rand(batch_size, 1, 28, 28).astype(numpy.float32)
@@ -86,7 +85,7 @@ def _upload_training_data(pytorch):

 def _get_pytorch_estimator(
-    sagemaker_session, pytorch_full_version, instance_type="ml.c4.xlarge", entry_point=MNIST_SCRIPT
+    sagemaker_session, pytorch_full_version, instance_type, entry_point=MNIST_SCRIPT
 ):
     return PyTorch(
         entry_point=entry_point,
diff --git a/tests/integ/test_randomcutforest.py b/tests/integ/test_randomcutforest.py
index 2301cca786..04b1de0c12 100644
--- a/tests/integ/test_randomcutforest.py
+++ b/tests/integ/test_randomcutforest.py
@@ -20,7 +20,7 @@
 from tests.integ.timeout import timeout, timeout_and_delete_endpoint_by_name

-def test_randomcutforest(sagemaker_session):
+def test_randomcutforest(sagemaker_session, cpu_instance_type):
     job_name = unique_name_from_base("randomcutforest")

     with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
@@ -31,7 +31,7 @@ def test_randomcutforest(sagemaker_session):
         rcf = RandomCutForest(
             role="SageMakerRole",
             train_instance_count=1,
-            train_instance_type="ml.c4.xlarge",
+            train_instance_type=cpu_instance_type,
             num_trees=50,
             num_samples_per_tree=20,
             eval_metrics=["accuracy", "precision_recall_fscore"],
@@ -44,7 +44,7 @@ def test_randomcutforest(sagemaker_session):
         model = RandomCutForestModel(
             rcf.model_data, role="SageMakerRole", sagemaker_session=sagemaker_session
         )
-        predictor = model.deploy(1, "ml.c4.xlarge", endpoint_name=job_name)
+        predictor = model.deploy(1, cpu_instance_type, endpoint_name=job_name)

         predict_input = np.random.rand(1, feature_num)
         result = predictor.predict(predict_input)
diff --git a/tests/integ/test_record_set.py b/tests/integ/test_record_set.py
index 9dbeb51e2d..b1c4060f84 100644 --- a/tests/integ/test_record_set.py +++ b/tests/integ/test_record_set.py @@ -23,7 +23,7 @@ from tests.integ import DATA_DIR -def test_record_set(sagemaker_session): +def test_record_set(sagemaker_session, cpu_instance_type): """Test the method ``AmazonAlgorithmEstimatorBase.record_set``. In particular, test that the objects uploaded to the S3 bucket are encrypted. @@ -35,7 +35,7 @@ def test_record_set(sagemaker_session): kmeans = KMeans( role="SageMakerRole", train_instance_count=1, - train_instance_type="ml.c4.xlarge", + train_instance_type=cpu_instance_type, k=10, sagemaker_session=sagemaker_session, ) diff --git a/tests/integ/test_rl.py b/tests/integ/test_rl.py index 9288f49b16..18736610ea 100644 --- a/tests/integ/test_rl.py +++ b/tests/integ/test_rl.py @@ -22,13 +22,13 @@ from tests.integ import DATA_DIR, PYTHON_VERSION from tests.integ.timeout import timeout, timeout_and_delete_endpoint_by_name -CPU_INSTANCE = "ml.m4.xlarge" - @pytest.mark.canary_quick @pytest.mark.skipif(PYTHON_VERSION != "py3", reason="RL images supports only Python 3.") -def test_coach_mxnet(sagemaker_session, rl_coach_mxnet_full_version): - estimator = _test_coach(sagemaker_session, RLFramework.MXNET, rl_coach_mxnet_full_version) +def test_coach_mxnet(sagemaker_session, rl_coach_mxnet_full_version, cpu_instance_type): + estimator = _test_coach( + sagemaker_session, RLFramework.MXNET, rl_coach_mxnet_full_version, cpu_instance_type + ) job_name = unique_name_from_base("test-coach-mxnet") with timeout(minutes=15): @@ -42,7 +42,7 @@ def test_coach_mxnet(sagemaker_session, rl_coach_mxnet_full_version): with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session): predictor = estimator.deploy( - 1, CPU_INSTANCE, entry_point="mxnet_deploy.py", endpoint_name=endpoint_name + 1, cpu_instance_type, entry_point="mxnet_deploy.py", endpoint_name=endpoint_name ) observation = numpy.asarray([0, 0, 0, 0]) @@ -53,8 +53,10 @@ def test_coach_mxnet(sagemaker_session, rl_coach_mxnet_full_version): @pytest.mark.skipif(PYTHON_VERSION != "py3", reason="RL images supports only Python 3.") -def test_coach_tf(sagemaker_session, rl_coach_tf_full_version): - estimator = _test_coach(sagemaker_session, RLFramework.TENSORFLOW, rl_coach_tf_full_version) +def test_coach_tf(sagemaker_session, rl_coach_tf_full_version, cpu_instance_type): + estimator = _test_coach( + sagemaker_session, RLFramework.TENSORFLOW, rl_coach_tf_full_version, cpu_instance_type + ) job_name = unique_name_from_base("test-coach-tf") with timeout(minutes=15): @@ -63,14 +65,14 @@ def test_coach_tf(sagemaker_session, rl_coach_tf_full_version): endpoint_name = "test-tf-coach-deploy-{}".format(sagemaker_timestamp()) with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session): - predictor = estimator.deploy(1, CPU_INSTANCE) + predictor = estimator.deploy(1, cpu_instance_type) observation = numpy.asarray([0, 0, 0, 0]) action = predictor.predict(observation) assert action == {"predictions": [[0.5, 0.5]]} -def _test_coach(sagemaker_session, rl_framework, rl_coach_version): +def _test_coach(sagemaker_session, rl_framework, rl_coach_version, cpu_instance_type): source_dir = os.path.join(DATA_DIR, "coach_cartpole") dependencies = [os.path.join(DATA_DIR, "sagemaker_rl")] cartpole = "train_coach.py" @@ -83,7 +85,7 @@ def _test_coach(sagemaker_session, rl_framework, rl_coach_version): source_dir=source_dir, role="SageMakerRole", train_instance_count=1, - train_instance_type=CPU_INSTANCE, + train_instance_type=cpu_instance_type, 
sagemaker_session=sagemaker_session, dependencies=dependencies, hyperparameters={ @@ -97,7 +99,7 @@ def _test_coach(sagemaker_session, rl_framework, rl_coach_version): @pytest.mark.canary_quick @pytest.mark.skipif(PYTHON_VERSION != "py3", reason="RL images supports only Python 3.") -def test_ray_tf(sagemaker_session, rl_ray_full_version): +def test_ray_tf(sagemaker_session, rl_ray_full_version, cpu_instance_type): source_dir = os.path.join(DATA_DIR, "ray_cartpole") cartpole = "train_ray.py" @@ -109,7 +111,7 @@ def test_ray_tf(sagemaker_session, rl_ray_full_version): toolkit_version=rl_ray_full_version, sagemaker_session=sagemaker_session, role="SageMakerRole", - train_instance_type=CPU_INSTANCE, + train_instance_type=cpu_instance_type, train_instance_count=1, ) job_name = unique_name_from_base("test-ray-tf") @@ -118,5 +120,5 @@ def test_ray_tf(sagemaker_session, rl_ray_full_version): estimator.fit(job_name=job_name) with pytest.raises(NotImplementedError) as e: - estimator.deploy(1, CPU_INSTANCE) + estimator.deploy(1, cpu_instance_type) assert "Automatic deployment of Ray models is not currently available" in str(e.value) diff --git a/tests/integ/test_sklearn_train.py b/tests/integ/test_sklearn_train.py index f1a98ae3ac..45ce95a53d 100644 --- a/tests/integ/test_sklearn_train.py +++ b/tests/integ/test_sklearn_train.py @@ -27,12 +27,15 @@ @pytest.fixture(scope="module") -def sklearn_training_job(sagemaker_session, sklearn_full_version): - return _run_mnist_training_job(sagemaker_session, "ml.c4.xlarge", sklearn_full_version) +def sklearn_training_job(sagemaker_session, sklearn_full_version, cpu_instance_type): + return _run_mnist_training_job(sagemaker_session, cpu_instance_type, sklearn_full_version) + sagemaker_session.boto_region_name @pytest.mark.skipif(PYTHON_VERSION != "py3", reason="Scikit-learn image supports only python 3.") -def test_training_with_additional_hyperparameters(sagemaker_session, sklearn_full_version): +def test_training_with_additional_hyperparameters( + sagemaker_session, sklearn_full_version, cpu_instance_type +): with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES): script_path = os.path.join(DATA_DIR, "sklearn_mnist", "mnist.py") data_path = os.path.join(DATA_DIR, "sklearn_mnist") @@ -40,7 +43,7 @@ def test_training_with_additional_hyperparameters(sagemaker_session, sklearn_ful sklearn = SKLearn( entry_point=script_path, role="SageMakerRole", - train_instance_type="ml.c4.xlarge", + train_instance_type=cpu_instance_type, framework_version=sklearn_full_version, py_version=PYTHON_VERSION, sagemaker_session=sagemaker_session, @@ -60,7 +63,9 @@ def test_training_with_additional_hyperparameters(sagemaker_session, sklearn_ful @pytest.mark.skipif(PYTHON_VERSION != "py3", reason="Scikit-learn image supports only python 3.") -def test_training_with_network_isolation(sagemaker_session, sklearn_full_version): +def test_training_with_network_isolation( + sagemaker_session, sklearn_full_version, cpu_instance_type +): with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES): script_path = os.path.join(DATA_DIR, "sklearn_mnist", "mnist.py") data_path = os.path.join(DATA_DIR, "sklearn_mnist") @@ -68,7 +73,7 @@ def test_training_with_network_isolation(sagemaker_session, sklearn_full_version sklearn = SKLearn( entry_point=script_path, role="SageMakerRole", - train_instance_type="ml.c4.xlarge", + train_instance_type=cpu_instance_type, framework_version=sklearn_full_version, py_version=PYTHON_VERSION, sagemaker_session=sagemaker_session, @@ -94,25 +99,17 @@ def 
test_training_with_network_isolation(sagemaker_session, sklearn_full_version @pytest.mark.canary_quick @pytest.mark.regional_testing @pytest.mark.skipif(PYTHON_VERSION != "py3", reason="Scikit-learn image supports only python 3.") -@pytest.mark.skip( - reason="This test has always failed, but the failure was masked by a bug. " - "This test should be fixed. Details in https://github.com/aws/sagemaker-python-sdk/pull/968" -) -def test_attach_deploy(sklearn_training_job, sagemaker_session): +def test_attach_deploy(sklearn_training_job, sagemaker_session, cpu_instance_type): endpoint_name = "test-sklearn-attach-deploy-{}".format(sagemaker_timestamp()) with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session): estimator = SKLearn.attach(sklearn_training_job, sagemaker_session=sagemaker_session) - predictor = estimator.deploy(1, "ml.m4.xlarge", endpoint_name=endpoint_name) + predictor = estimator.deploy(1, cpu_instance_type, endpoint_name=endpoint_name) _predict_and_assert(predictor) @pytest.mark.skipif(PYTHON_VERSION != "py3", reason="Scikit-learn image supports only python 3.") -@pytest.mark.skip( - reason="This test has always failed, but the failure was masked by a bug. " - "This test should be fixed. Details in https://github.com/aws/sagemaker-python-sdk/pull/968" -) -def test_deploy_model(sklearn_training_job, sagemaker_session): +def test_deploy_model(sklearn_training_job, sagemaker_session, cpu_instance_type): endpoint_name = "test-sklearn-deploy-model-{}".format(sagemaker_timestamp()) with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session): desc = sagemaker_session.sagemaker_client.describe_training_job( @@ -126,21 +123,17 @@ def test_deploy_model(sklearn_training_job, sagemaker_session): entry_point=script_path, sagemaker_session=sagemaker_session, ) - predictor = model.deploy(1, "ml.m4.xlarge", endpoint_name=endpoint_name) + predictor = model.deploy(1, cpu_instance_type, endpoint_name=endpoint_name) _predict_and_assert(predictor) @pytest.mark.skipif(PYTHON_VERSION != "py3", reason="Scikit-learn image supports only python 3.") -@pytest.mark.skip( - reason="This test has always failed, but the failure was masked by a bug. " - "This test should be fixed. 
Details in https://github.com/aws/sagemaker-python-sdk/pull/968" -) -def test_async_fit(sagemaker_session): +def test_async_fit(sagemaker_session, cpu_instance_type): endpoint_name = "test-sklearn-attach-deploy-{}".format(sagemaker_timestamp()) with timeout(minutes=5): training_job_name = _run_mnist_training_job( - sagemaker_session, "ml.c4.xlarge", sklearn_full_version=SKLEARN_VERSION, wait=False + sagemaker_session, cpu_instance_type, sklearn_full_version=SKLEARN_VERSION, wait=False ) print("Waiting to re-attach to the training job: %s" % training_job_name) @@ -151,12 +144,12 @@ def test_async_fit(sagemaker_session): estimator = SKLearn.attach( training_job_name=training_job_name, sagemaker_session=sagemaker_session ) - predictor = estimator.deploy(1, "ml.c4.xlarge", endpoint_name=endpoint_name) + predictor = estimator.deploy(1, cpu_instance_type, endpoint_name=endpoint_name) _predict_and_assert(predictor) @pytest.mark.skipif(PYTHON_VERSION != "py3", reason="Scikit-learn image supports only python 3.") -def test_failed_training_job(sagemaker_session, sklearn_full_version): +def test_failed_training_job(sagemaker_session, sklearn_full_version, cpu_instance_type): with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES): script_path = os.path.join(DATA_DIR, "sklearn_mnist", "failure_script.py") data_path = os.path.join(DATA_DIR, "sklearn_mnist") @@ -167,7 +160,7 @@ def test_failed_training_job(sagemaker_session, sklearn_full_version): framework_version=sklearn_full_version, py_version=PYTHON_VERSION, train_instance_count=1, - train_instance_type="ml.c4.xlarge", + train_instance_type=cpu_instance_type, sagemaker_session=sagemaker_session, ) diff --git a/tests/integ/test_sparkml_serving.py b/tests/integ/test_sparkml_serving.py index a64b550233..fe0521650a 100644 --- a/tests/integ/test_sparkml_serving.py +++ b/tests/integ/test_sparkml_serving.py @@ -25,11 +25,7 @@ @pytest.mark.canary_quick @pytest.mark.regional_testing -@pytest.mark.skip( - reason="This test has always failed, but the failure was masked by a bug. " - "This test should be fixed. Details in https://github.com/aws/sagemaker-python-sdk/pull/968" -) -def test_sparkml_model_deploy(sagemaker_session): +def test_sparkml_model_deploy(sagemaker_session, cpu_instance_type): # Uploads an MLeap serialized MLeap model to S3 and use that to deploy a SparkML model to perform inference data_path = os.path.join(DATA_DIR, "sparkml_model") endpoint_name = "test-sparkml-deploy-{}".format(sagemaker_timestamp()) @@ -57,7 +53,7 @@ def test_sparkml_model_deploy(sagemaker_session): sagemaker_session=sagemaker_session, env={"SAGEMAKER_SPARKML_SCHEMA": schema}, ) - predictor = model.deploy(1, "ml.m4.xlarge", endpoint_name=endpoint_name) + predictor = model.deploy(1, cpu_instance_type, endpoint_name=endpoint_name) valid_data = "1.0,C,38.0,71.5,1.0,female" assert predictor.predict(valid_data) == "1.0,0.0,38.0,1.0,71.5,0.0,1.0" diff --git a/tests/integ/test_tf_cifar.py b/tests/integ/test_tf_cifar.py new file mode 100644 index 0000000000..75a1f8635f --- /dev/null +++ b/tests/integ/test_tf_cifar.py @@ -0,0 +1,83 @@ +# Copyright 2017-2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You +# may not use this file except in compliance with the License. A copy of +# the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. 
This file is +# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific +# language governing permissions and limitations under the License. +from __future__ import absolute_import + +import os +import pickle + +import numpy as np +import pytest + +import tests.integ +from tests.integ.timeout import timeout_and_delete_endpoint_by_name, timeout + +from sagemaker.tensorflow import TensorFlow +from sagemaker.utils import unique_name_from_base + +PICKLE_CONTENT_TYPE = "application/python-pickle" + + +class PickleSerializer(object): + def __init__(self): + self.content_type = PICKLE_CONTENT_TYPE + + def __call__(self, data): + return pickle.dumps(data, protocol=2) + + +@pytest.mark.canary_quick +@pytest.mark.skipif( + tests.integ.PYTHON_VERSION != "py2", reason="TensorFlow image supports only python 2." +) +@pytest.mark.skipif( + tests.integ.test_region() in tests.integ.HOSTING_NO_P2_REGIONS + or tests.integ.test_region() in tests.integ.TRAINING_NO_P2_REGIONS, + reason="no ml.p2 instances in these regions", +) +def test_cifar(sagemaker_session): + with timeout(minutes=45): + script_path = os.path.join(tests.integ.DATA_DIR, "cifar_10", "source") + + dataset_path = os.path.join(tests.integ.DATA_DIR, "cifar_10", "data") + + estimator = TensorFlow( + entry_point="resnet_cifar_10.py", + source_dir=script_path, + role="SageMakerRole", + framework_version="1.12", + training_steps=50, + evaluation_steps=5, + train_instance_count=2, + train_instance_type="ml.p2.xlarge", + sagemaker_session=sagemaker_session, + train_max_run=45 * 60, + base_job_name="test-cifar", + ) + + inputs = estimator.sagemaker_session.upload_data( + path=dataset_path, key_prefix="data/cifar10" + ) + job_name = unique_name_from_base("test-tf-cifar") + + estimator.fit(inputs, logs=False, job_name=job_name) + print("job succeeded: {}".format(estimator.latest_training_job.name)) + + endpoint_name = estimator.latest_training_job.name + with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session): + predictor = estimator.deploy(initial_instance_count=1, instance_type="ml.p2.xlarge") + predictor.serializer = PickleSerializer() + predictor.content_type = PICKLE_CONTENT_TYPE + + data = np.random.randn(32, 32, 3) + predict_response = predictor.predict(data) + assert len(predict_response["outputs"]["probabilities"]["floatVal"]) == 10 diff --git a/tests/integ/test_tf_keras.py b/tests/integ/test_tf_keras.py index 34d4d49238..f76792a453 100644 --- a/tests/integ/test_tf_keras.py +++ b/tests/integ/test_tf_keras.py @@ -28,7 +28,11 @@ @pytest.mark.skipif( tests.integ.PYTHON_VERSION != "py2", reason="TensorFlow image supports only python 2." 
) -def test_keras(sagemaker_session): +@pytest.mark.skipif( + tests.integ.test_region() in tests.integ.HOSTING_NO_P2_REGIONS, + reason="no ml.p2 instances in these regions", +) +def test_keras(sagemaker_session, cpu_instance_type): script_path = os.path.join(tests.integ.DATA_DIR, "cifar_10", "source") dataset_path = os.path.join(tests.integ.DATA_DIR, "cifar_10", "data") @@ -43,7 +47,7 @@ def test_keras(sagemaker_session): training_steps=50, evaluation_steps=5, train_instance_count=1, - train_instance_type="ml.c4.xlarge", + train_instance_type=cpu_instance_type, train_max_run=45 * 60, ) diff --git a/tests/integ/test_tf_script_mode.py b/tests/integ/test_tf_script_mode.py index f75e82199f..55b880d4cc 100644 --- a/tests/integ/test_tf_script_mode.py +++ b/tests/integ/test_tf_script_mode.py @@ -38,11 +38,6 @@ TAGS = [{"Key": "some-key", "Value": "some-value"}] -@pytest.fixture(scope="session", params=["ml.c4.xlarge"]) -def instance_type(request): - return request.param - - def test_mnist(sagemaker_session, instance_type): estimator = TensorFlow( entry_point=SCRIPT, @@ -127,7 +122,7 @@ def test_mnist_distributed(sagemaker_session, instance_type): ) -def test_mnist_async(sagemaker_session): +def test_mnist_async(sagemaker_session, cpu_instance_type): estimator = TensorFlow( entry_point=SCRIPT, role=ROLE, @@ -156,7 +151,7 @@ def test_mnist_async(sagemaker_session): model_name = "model-mnist-async" predictor = estimator.deploy( initial_instance_count=1, - instance_type="ml.c4.xlarge", + instance_type=cpu_instance_type, endpoint_name=endpoint_name, model_name=model_name, ) diff --git a/tests/integ/test_tfs.py b/tests/integ/test_tfs.py index 7589a53b4f..3c4ae76ef9 100644 --- a/tests/integ/test_tfs.py +++ b/tests/integ/test_tfs.py @@ -111,7 +111,7 @@ def tfs_predictor_with_model_and_entry_point_and_dependencies( @pytest.fixture(scope="module") -def tfs_predictor_with_accelerator(sagemaker_session): +def tfs_predictor_with_accelerator(sagemaker_session, tf_full_version, cpu_instance_type): endpoint_name = sagemaker.utils.unique_name_from_base("sagemaker-tensorflow-serving") model_data = sagemaker_session.upload_data( path=os.path.join(tests.integ.DATA_DIR, "tensorflow-serving-test-model.tar.gz"), @@ -125,7 +125,7 @@ def tfs_predictor_with_accelerator(sagemaker_session): sagemaker_session=sagemaker_session, ) predictor = model.deploy( - 1, "ml.c4.large", endpoint_name=endpoint_name, accelerator_type="ml.eia1.medium" + 1, cpu_instance_type, endpoint_name=endpoint_name, accelerator_type="ml.eia1.medium" ) yield predictor diff --git a/tests/integ/test_transformer.py b/tests/integ/test_transformer.py index ad3fd65c2d..18e7ec926a 100644 --- a/tests/integ/test_transformer.py +++ b/tests/integ/test_transformer.py @@ -35,7 +35,7 @@ @pytest.mark.canary_quick -def test_transform_mxnet(sagemaker_session, mxnet_full_version): +def test_transform_mxnet(sagemaker_session, mxnet_full_version, cpu_instance_type): data_path = os.path.join(DATA_DIR, "mxnet_mnist") script_path = os.path.join(data_path, "mnist.py") @@ -43,7 +43,7 @@ def test_transform_mxnet(sagemaker_session, mxnet_full_version): entry_point=script_path, role="SageMakerRole", train_instance_count=1, - train_instance_type="ml.c4.xlarge", + train_instance_type=cpu_instance_type, sagemaker_session=sagemaker_session, framework_version=mxnet_full_version, ) @@ -72,6 +72,7 @@ def test_transform_mxnet(sagemaker_session, mxnet_full_version): transformer = _create_transformer_and_transform_job( mx, transform_input, + cpu_instance_type, kms_key_arn, 
input_filter=input_filter, output_filter=output_filter, @@ -91,7 +92,7 @@ def test_transform_mxnet(sagemaker_session, mxnet_full_version): @pytest.mark.canary_quick -def test_attach_transform_kmeans(sagemaker_session): +def test_attach_transform_kmeans(sagemaker_session, cpu_instance_type): data_path = os.path.join(DATA_DIR, "one_p_mnist") pickle_args = {} if sys.version_info.major == 2 else {"encoding": "latin1"} @@ -103,7 +104,7 @@ def test_attach_transform_kmeans(sagemaker_session): kmeans = KMeans( role="SageMakerRole", train_instance_count=1, - train_instance_type="ml.c4.xlarge", + train_instance_type=cpu_instance_type, k=10, sagemaker_session=sagemaker_session, output_path="s3://{}/".format(sagemaker_session.default_bucket()), @@ -131,7 +132,7 @@ def test_attach_transform_kmeans(sagemaker_session): path=transform_input_path, key_prefix=transform_input_key_prefix ) - transformer = _create_transformer_and_transform_job(kmeans, transform_input) + transformer = _create_transformer_and_transform_job(kmeans, transform_input, cpu_instance_type) attached_transformer = Transformer.attach( transformer.latest_transform_job.name, sagemaker_session=sagemaker_session @@ -142,7 +143,7 @@ def test_attach_transform_kmeans(sagemaker_session): attached_transformer.wait() -def test_transform_mxnet_vpc(sagemaker_session, mxnet_full_version): +def test_transform_mxnet_vpc(sagemaker_session, mxnet_full_version, cpu_instance_type): data_path = os.path.join(DATA_DIR, "mxnet_mnist") script_path = os.path.join(data_path, "mnist.py") @@ -155,7 +156,7 @@ def test_transform_mxnet_vpc(sagemaker_session, mxnet_full_version): entry_point=script_path, role="SageMakerRole", train_instance_count=1, - train_instance_type="ml.c4.xlarge", + train_instance_type=cpu_instance_type, sagemaker_session=sagemaker_session, framework_version=mxnet_full_version, subnets=subnet_ids, @@ -185,7 +186,7 @@ def test_transform_mxnet_vpc(sagemaker_session, mxnet_full_version): path=transform_input_path, key_prefix=transform_input_key_prefix ) - transformer = _create_transformer_and_transform_job(mx, transform_input) + transformer = _create_transformer_and_transform_job(mx, transform_input, cpu_instance_type) with timeout_and_delete_model_with_transformer( transformer, sagemaker_session, minutes=TRANSFORM_DEFAULT_TIMEOUT_MINUTES ): @@ -197,7 +198,7 @@ def test_transform_mxnet_vpc(sagemaker_session, mxnet_full_version): assert [security_group_id] == model_desc["VpcConfig"]["SecurityGroupIds"] -def test_transform_mxnet_tags(sagemaker_session, mxnet_full_version): +def test_transform_mxnet_tags(sagemaker_session, mxnet_full_version, cpu_instance_type): data_path = os.path.join(DATA_DIR, "mxnet_mnist") script_path = os.path.join(data_path, "mnist.py") tags = [{"Key": "some-tag", "Value": "value-for-tag"}] @@ -206,7 +207,7 @@ def test_transform_mxnet_tags(sagemaker_session, mxnet_full_version): entry_point=script_path, role="SageMakerRole", train_instance_count=1, - train_instance_type="ml.c4.xlarge", + train_instance_type=cpu_instance_type, sagemaker_session=sagemaker_session, framework_version=mxnet_full_version, ) @@ -228,7 +229,7 @@ def test_transform_mxnet_tags(sagemaker_session, mxnet_full_version): path=transform_input_path, key_prefix=transform_input_key_prefix ) - transformer = mx.transformer(1, "ml.m4.xlarge", tags=tags) + transformer = mx.transformer(1, cpu_instance_type, tags=tags) transformer.transform(transform_input, content_type="text/csv") with timeout_and_delete_model_with_transformer( @@ -244,7 +245,7 @@ def 
test_transform_mxnet_tags(sagemaker_session, mxnet_full_version): assert tags == model_tags -def test_transform_byo_estimator(sagemaker_session): +def test_transform_byo_estimator(sagemaker_session, cpu_instance_type): data_path = os.path.join(DATA_DIR, "one_p_mnist") pickle_args = {} if sys.version_info.major == 2 else {"encoding": "latin1"} tags = [{"Key": "some-tag", "Value": "value-for-tag"}] @@ -257,7 +258,7 @@ def test_transform_byo_estimator(sagemaker_session): kmeans = KMeans( role="SageMakerRole", train_instance_count=1, - train_instance_type="ml.c4.xlarge", + train_instance_type=cpu_instance_type, k=10, sagemaker_session=sagemaker_session, output_path="s3://{}/".format(sagemaker_session.default_bucket()), @@ -287,7 +288,7 @@ def test_transform_byo_estimator(sagemaker_session): estimator = Estimator.attach(training_job_name=job_name, sagemaker_session=sagemaker_session) - transformer = estimator.transformer(1, "ml.m4.xlarge", tags=tags) + transformer = estimator.transformer(1, cpu_instance_type, tags=tags) transformer.transform(transform_input, content_type="text/csv") with timeout_and_delete_model_with_transformer( @@ -303,7 +304,7 @@ def test_transform_byo_estimator(sagemaker_session): assert tags == model_tags -def test_single_transformer_multiple_jobs(sagemaker_session, mxnet_full_version): +def test_single_transformer_multiple_jobs(sagemaker_session, mxnet_full_version, cpu_instance_type): data_path = os.path.join(DATA_DIR, "mxnet_mnist") script_path = os.path.join(data_path, "mnist.py") @@ -311,7 +312,7 @@ def test_single_transformer_multiple_jobs(sagemaker_session, mxnet_full_version) entry_point=script_path, role="SageMakerRole", train_instance_count=1, - train_instance_type="ml.c4.xlarge", + train_instance_type=cpu_instance_type, sagemaker_session=sagemaker_session, framework_version=mxnet_full_version, ) @@ -333,7 +334,7 @@ def test_single_transformer_multiple_jobs(sagemaker_session, mxnet_full_version) path=transform_input_path, key_prefix=transform_input_key_prefix ) - transformer = mx.transformer(1, "ml.m4.xlarge") + transformer = mx.transformer(1, cpu_instance_type) job_name = unique_name_from_base("test-mxnet-transform") transformer.transform(transform_input, content_type="text/csv", job_name=job_name) @@ -353,12 +354,13 @@ def test_single_transformer_multiple_jobs(sagemaker_session, mxnet_full_version) def _create_transformer_and_transform_job( estimator, transform_input, + instance_type, volume_kms_key=None, input_filter=None, output_filter=None, join_source=None, ): - transformer = estimator.transformer(1, "ml.m4.xlarge", volume_kms_key=volume_kms_key) + transformer = estimator.transformer(1, instance_type, volume_kms_key=volume_kms_key) transformer.transform( transform_input, content_type="text/csv", diff --git a/tests/integ/test_tuner.py b/tests/integ/test_tuner.py index 106b252095..8b57300bc4 100644 --- a/tests/integ/test_tuner.py +++ b/tests/integ/test_tuner.py @@ -21,6 +21,7 @@ import numpy as np import pytest +import tests.integ from botocore.exceptions import ClientError from tests.integ import DATA_DIR, PYTHON_VERSION, TUNING_DEFAULT_TIMEOUT_MINUTES from tests.integ.record_set import prepare_record_set_from_local_files @@ -63,11 +64,11 @@ def kmeans_train_set(sagemaker_session): @pytest.fixture(scope="module") -def kmeans_estimator(sagemaker_session): +def kmeans_estimator(sagemaker_session, cpu_instance_type): kmeans = KMeans( role="SageMakerRole", train_instance_count=1, - train_instance_type="ml.c4.xlarge", + train_instance_type=cpu_instance_type, 
k=10, sagemaker_session=sagemaker_session, output_path="s3://{}/".format(sagemaker_session.default_bucket()), @@ -98,6 +99,7 @@ def _tune_and_deploy( kmeans_estimator, kmeans_train_set, sagemaker_session, + cpu_instance_type, hyperparameter_ranges=None, job_name=None, warm_start_config=None, @@ -111,14 +113,14 @@ def _tune_and_deploy( job_name=job_name, early_stopping_type=early_stopping_type, ) - _deploy(kmeans_train_set, sagemaker_session, tuner, early_stopping_type) + _deploy(kmeans_train_set, sagemaker_session, tuner, early_stopping_type, cpu_instance_type) -def _deploy(kmeans_train_set, sagemaker_session, tuner, early_stopping_type): +def _deploy(kmeans_train_set, sagemaker_session, tuner, early_stopping_type, cpu_instance_type): best_training_job = tuner.best_training_job() assert tuner.early_stopping_type == early_stopping_type with timeout_and_delete_endpoint_by_name(best_training_job, sagemaker_session): - predictor = tuner.deploy(1, "ml.c4.xlarge") + predictor = tuner.deploy(1, cpu_instance_type) result = predictor.predict(kmeans_train_set[0][:10]) @@ -168,13 +170,14 @@ def _tune( @pytest.mark.canary_quick def test_tuning_kmeans( - sagemaker_session, kmeans_train_set, kmeans_estimator, hyperparameter_ranges + sagemaker_session, kmeans_train_set, kmeans_estimator, hyperparameter_ranges, cpu_instance_type ): job_name = unique_name_from_base("test-tune-kmeans") _tune_and_deploy( kmeans_estimator, kmeans_train_set, sagemaker_session, + cpu_instance_type, hyperparameter_ranges=hyperparameter_ranges, job_name=job_name, ) @@ -407,7 +410,11 @@ def test_tuning_kmeans_identical_dataset_algorithm_tuner_from_non_terminal_paren ) -def test_tuning_lda(sagemaker_session): +@pytest.mark.skipif( + tests.integ.test_region() in tests.integ.NO_LDA_REGIONS, + reason="LDA image is not supported in certain regions", +) +def test_tuning_lda(sagemaker_session, cpu_instance_type): with timeout(minutes=TUNING_DEFAULT_TIMEOUT_MINUTES): data_path = os.path.join(DATA_DIR, "lda") data_filename = "nips-train_1.pbr" @@ -420,7 +427,7 @@ def test_tuning_lda(sagemaker_session): lda = LDA( role="SageMakerRole", - train_instance_type="ml.c4.xlarge", + train_instance_type=cpu_instance_type, num_topics=10, sagemaker_session=sagemaker_session, ) @@ -470,7 +477,7 @@ def test_tuning_lda(sagemaker_session): best_training_job = attached_tuner.best_training_job() with timeout_and_delete_endpoint_by_name(best_training_job, sagemaker_session): - predictor = tuner.deploy(1, "ml.c4.xlarge") + predictor = tuner.deploy(1, cpu_instance_type) predict_input = np.random.rand(1, feature_num) result = predictor.predict(predict_input) @@ -479,14 +486,14 @@ def test_tuning_lda(sagemaker_session): assert record.label["topic_mixture"] is not None -def test_stop_tuning_job(sagemaker_session): +def test_stop_tuning_job(sagemaker_session, cpu_instance_type): feature_num = 14 train_input = np.random.rand(1000, feature_num) rcf = RandomCutForest( role="SageMakerRole", train_instance_count=1, - train_instance_type="ml.c4.xlarge", + train_instance_type=cpu_instance_type, num_trees=50, num_samples_per_tree=20, sagemaker_session=sagemaker_session, @@ -531,7 +538,7 @@ def test_stop_tuning_job(sagemaker_session): @pytest.mark.canary_quick -def test_tuning_mxnet(sagemaker_session, mxnet_full_version): +def test_tuning_mxnet(sagemaker_session, mxnet_full_version, cpu_instance_type): with timeout(minutes=TUNING_DEFAULT_TIMEOUT_MINUTES): script_path = os.path.join(DATA_DIR, "mxnet_mnist", "mnist.py") data_path = os.path.join(DATA_DIR, "mxnet_mnist") @@ 
-541,7 +548,7 @@ def test_tuning_mxnet(sagemaker_session, mxnet_full_version): role="SageMakerRole", py_version=PYTHON_VERSION, train_instance_count=1, - train_instance_type="ml.m4.xlarge", + train_instance_type=cpu_instance_type, framework_version=mxnet_full_version, sagemaker_session=sagemaker_session, ) @@ -577,13 +584,13 @@ def test_tuning_mxnet(sagemaker_session, mxnet_full_version): best_training_job = tuner.best_training_job() with timeout_and_delete_endpoint_by_name(best_training_job, sagemaker_session): - predictor = tuner.deploy(1, "ml.c4.xlarge") + predictor = tuner.deploy(1, cpu_instance_type) data = np.zeros(shape=(1, 1, 28, 28)) predictor.predict(data) @pytest.mark.canary_quick -def test_tuning_tf_script_mode(sagemaker_session): +def test_tuning_tf_script_mode(sagemaker_session, cpu_instance_type): resource_path = os.path.join(DATA_DIR, "tensorflow_mnist") script_path = os.path.join(resource_path, "mnist.py") @@ -591,7 +598,7 @@ def test_tuning_tf_script_mode(sagemaker_session): entry_point=script_path, role="SageMakerRole", train_instance_count=1, - train_instance_type="ml.m4.xlarge", + train_instance_type=cpu_instance_type, script_mode=True, sagemaker_session=sagemaker_session, py_version=PYTHON_VERSION, @@ -627,7 +634,7 @@ def test_tuning_tf_script_mode(sagemaker_session): @pytest.mark.canary_quick @pytest.mark.skipif(PYTHON_VERSION != "py2", reason="TensorFlow image supports only python 2.") -def test_tuning_tf(sagemaker_session): +def test_tuning_tf(sagemaker_session, cpu_instance_type): with timeout(minutes=TUNING_DEFAULT_TIMEOUT_MINUTES): script_path = os.path.join(DATA_DIR, "iris", "iris-dnn-classifier.py") @@ -638,7 +645,7 @@ def test_tuning_tf(sagemaker_session): evaluation_steps=1, hyperparameters={"input_tensor_name": "inputs"}, train_instance_count=1, - train_instance_type="ml.c4.xlarge", + train_instance_type=cpu_instance_type, sagemaker_session=sagemaker_session, ) @@ -668,7 +675,7 @@ def test_tuning_tf(sagemaker_session): best_training_job = tuner.best_training_job() with timeout_and_delete_endpoint_by_name(best_training_job, sagemaker_session): - predictor = tuner.deploy(1, "ml.c4.xlarge") + predictor = tuner.deploy(1, cpu_instance_type) features = [6.4, 3.2, 4.5, 1.5] dict_result = predictor.predict({"inputs": features}) @@ -680,9 +687,9 @@ def test_tuning_tf(sagemaker_session): @pytest.mark.skipif(PYTHON_VERSION != "py2", reason="TensorFlow image supports only python 2.") -def test_tuning_tf_vpc_multi(sagemaker_session): +def test_tuning_tf_vpc_multi(sagemaker_session, cpu_instance_type): """Test Tensorflow multi-instance using the same VpcConfig for training and inference""" - instance_type = "ml.c4.xlarge" + instance_type = cpu_instance_type instance_count = 2 script_path = os.path.join(DATA_DIR, "iris", "iris-dnn-classifier.py") @@ -735,7 +742,7 @@ def test_tuning_tf_vpc_multi(sagemaker_session): @pytest.mark.canary_quick -def test_tuning_chainer(sagemaker_session): +def test_tuning_chainer(sagemaker_session, cpu_instance_type): with timeout(minutes=TUNING_DEFAULT_TIMEOUT_MINUTES): script_path = os.path.join(DATA_DIR, "chainer_mnist", "mnist.py") data_path = os.path.join(DATA_DIR, "chainer_mnist") @@ -745,7 +752,7 @@ def test_tuning_chainer(sagemaker_session): role="SageMakerRole", py_version=PYTHON_VERSION, train_instance_count=1, - train_instance_type="ml.c4.xlarge", + train_instance_type=cpu_instance_type, sagemaker_session=sagemaker_session, hyperparameters={"epochs": 1}, ) @@ -786,7 +793,7 @@ def test_tuning_chainer(sagemaker_session): 
    best_training_job = tuner.best_training_job()
     with timeout_and_delete_endpoint_by_name(best_training_job, sagemaker_session):
-        predictor = tuner.deploy(1, "ml.c4.xlarge")
+        predictor = tuner.deploy(1, cpu_instance_type)

         batch_size = 100
         data = np.zeros((batch_size, 784), dtype="float32")
@@ -803,11 +810,7 @@ def test_tuning_chainer(sagemaker_session, cpu_instance_type):


 @pytest.mark.canary_quick
-@pytest.mark.skip(
-    reason="This test has always failed, but the failure was masked by a bug. "
-    "This test should be fixed. Details in https://github.com/aws/sagemaker-python-sdk/pull/968"
-)
-def test_attach_tuning_pytorch(sagemaker_session):
+def test_attach_tuning_pytorch(sagemaker_session, cpu_instance_type):
     mnist_dir = os.path.join(DATA_DIR, "pytorch_mnist")
     mnist_script = os.path.join(mnist_dir, "mnist.py")

@@ -816,7 +819,7 @@ def test_attach_tuning_pytorch(sagemaker_session, cpu_instance_type):
         role="SageMakerRole",
         train_instance_count=1,
         py_version=PYTHON_VERSION,
-        train_instance_type="ml.c4.xlarge",
+        train_instance_type=cpu_instance_type,
         sagemaker_session=sagemaker_session,
     )

@@ -859,8 +862,8 @@ def test_attach_tuning_pytorch(sagemaker_session, cpu_instance_type):
     with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
         predictor = attached_tuner.deploy(
-            1, "ml.c4.xlarge", endpoint_name=endpoint_name, model_name=model_name
+            1, cpu_instance_type, endpoint_name=endpoint_name, model_name=model_name
         )
         data = np.zeros(shape=(1, 1, 28, 28), dtype=np.float32)
         predictor.predict(data)

@@ -873,7 +879,7 @@


 @pytest.mark.canary_quick
-def test_tuning_byo_estimator(sagemaker_session):
+def test_tuning_byo_estimator(sagemaker_session, cpu_instance_type):
     """Use Factorization Machines algorithm as an example here.

     First we need to prepare data for training.
We take standard data set, convert it to the
@@ -903,7 +909,7 @@ def test_tuning_byo_estimator(sagemaker_session, cpu_instance_type):
         image_name=image_name,
         role="SageMakerRole",
         train_instance_count=1,
-        train_instance_type="ml.c4.xlarge",
+        train_instance_type=cpu_instance_type,
         sagemaker_session=sagemaker_session,
     )

@@ -934,7 +940,7 @@ def test_tuning_byo_estimator(sagemaker_session, cpu_instance_type):
     best_training_job = tuner.best_training_job()
     with timeout_and_delete_endpoint_by_name(best_training_job, sagemaker_session):
-        predictor = tuner.deploy(1, "ml.m4.xlarge", endpoint_name=best_training_job)
+        predictor = tuner.deploy(1, cpu_instance_type, endpoint_name=best_training_job)
         predictor.serializer = _fm_serializer
         predictor.content_type = "application/json"
         predictor.deserializer = json_deserializer

From 1ca4223a0756ebb5a85599f680fb2fed56b5877c Mon Sep 17 00:00:00 2001
From: Chuyang Deng
Date: Tue, 13 Aug 2019 12:18:23 -0700
Subject: [PATCH 3/5] unit test for asimov hkg account image uri

---
 tests/unit/test_fw_utils.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/tests/unit/test_fw_utils.py b/tests/unit/test_fw_utils.py
index e0226f34cf..4e31e5821b 100644
--- a/tests/unit/test_fw_utils.py
+++ b/tests/unit/test_fw_utils.py
@@ -33,6 +33,7 @@
 MOCK_FRAMEWORK = "mlfw"
 MOCK_REGION = "mars-south-3"
 MOCK_ACCELERATOR = "eia1.medium"
+MOCK_HKG_REGION = "ap-east-1"


 @contextmanager
@@ -136,6 +137,15 @@ def test_create_image_uri_gov_cloud():
     )


+def test_create_image_uri_hkg():
+    image_uri = fw_utils.create_image_uri(
+        MOCK_HKG_REGION, MOCK_FRAMEWORK, "ml.p3.2xlarge", "1.0rc", "py3"
+    )
+    assert (
+        image_uri == "871362719292.dkr.ecr.ap-east-1.amazonaws.com/sagemaker-mlfw:1.0rc-gpu-py3"
+    )
+
+
 def test_create_image_uri_merged():
     image_uri = fw_utils.create_image_uri(
         "us-west-2", "tensorflow-scriptmode", "ml.p3.2xlarge", "1.14", "py3"

From 765f523c8e50b0f3c38680e202df95d4ed8cbab6 Mon Sep 17 00:00:00 2001
From: Chuyang Deng
Date: Tue, 13 Aug 2019 13:34:16 -0700
Subject: [PATCH 4/5] keep skipping broken tests

---
 tests/integ/test_inference_pipeline.py |  4 ++++
 tests/integ/test_marketplace.py        |  4 ++++
 tests/integ/test_ntm.py                |  4 ++++
 tests/integ/test_object2vec.py         |  6 ++++++
 tests/integ/test_sklearn_train.py      | 16 ++++++++++++++++
 tests/integ/test_sparkml_serving.py    |  4 ++++
 tests/integ/test_tuner.py              |  4 ++++
 7 files changed, 42 insertions(+)

diff --git a/tests/integ/test_inference_pipeline.py b/tests/integ/test_inference_pipeline.py
index 3260619fe4..af053c67e6 100644
--- a/tests/integ/test_inference_pipeline.py
+++ b/tests/integ/test_inference_pipeline.py
@@ -94,6 +94,10 @@ def test_inference_pipeline_batch_transform(sagemaker_session, cpu_instance_type

 @pytest.mark.canary_quick
 @pytest.mark.regional_testing
+@pytest.mark.skip(
+    reason="This test has always failed, but the failure was masked by a bug. "
+    "This test should be fixed.
Details in https://github.com/aws/sagemaker-python-sdk/pull/968" +) def test_inference_pipeline_model_deploy(sagemaker_session, cpu_instance_type): sparkml_data_path = os.path.join(DATA_DIR, "sparkml_model") xgboost_data_path = os.path.join(DATA_DIR, "xgboost_model") diff --git a/tests/integ/test_marketplace.py b/tests/integ/test_marketplace.py index ce21c20bb4..64b3ef4871 100644 --- a/tests/integ/test_marketplace.py +++ b/tests/integ/test_marketplace.py @@ -54,6 +54,10 @@ tests.integ.test_region() in tests.integ.NO_MARKET_PLACE_REGIONS, reason="Marketplace is not available in {}".format(tests.integ.test_region()), ) +@pytest.mark.skip( + reason="This test has always failed, but the failure was masked by a bug. " + "This test should be fixed. Details in https://github.com/aws/sagemaker-python-sdk/pull/968" +) def test_marketplace_estimator(sagemaker_session, cpu_instance_type): with timeout(minutes=15): data_path = os.path.join(DATA_DIR, "marketplace", "training") diff --git a/tests/integ/test_ntm.py b/tests/integ/test_ntm.py index 0579e5064b..3530715204 100644 --- a/tests/integ/test_ntm.py +++ b/tests/integ/test_ntm.py @@ -26,6 +26,10 @@ @pytest.mark.canary_quick +@pytest.mark.skip( + reason="This test has always failed, but the failure was masked by a bug. " + "This test should be fixed. Details in https://github.com/aws/sagemaker-python-sdk/pull/968" +) def test_ntm(sagemaker_session, cpu_instance_type): job_name = unique_name_from_base("ntm") diff --git a/tests/integ/test_object2vec.py b/tests/integ/test_object2vec.py index 1a5e73d6a1..1a80e50c6d 100644 --- a/tests/integ/test_object2vec.py +++ b/tests/integ/test_object2vec.py @@ -14,6 +14,8 @@ import os +import pytest + from sagemaker.predictor import RealTimePredictor from sagemaker import Object2Vec, Object2VecModel from sagemaker.utils import unique_name_from_base @@ -24,6 +26,10 @@ FEATURE_NUM = None +@pytest.mark.skip( + reason="This test has always failed, but the failure was masked by a bug. " + "This test should be fixed. Details in https://github.com/aws/sagemaker-python-sdk/pull/968" +) def test_object2vec(sagemaker_session, cpu_instance_type): job_name = unique_name_from_base("object2vec") diff --git a/tests/integ/test_sklearn_train.py b/tests/integ/test_sklearn_train.py index 45ce95a53d..d9771ec30a 100644 --- a/tests/integ/test_sklearn_train.py +++ b/tests/integ/test_sklearn_train.py @@ -27,6 +27,10 @@ @pytest.fixture(scope="module") +@pytest.mark.skip( + reason="This test has always failed, but the failure was masked by a bug. " + "This test should be fixed. Details in https://github.com/aws/sagemaker-python-sdk/pull/968" +) def sklearn_training_job(sagemaker_session, sklearn_full_version, cpu_instance_type): return _run_mnist_training_job(sagemaker_session, cpu_instance_type, sklearn_full_version) sagemaker_session.boto_region_name @@ -99,6 +103,10 @@ def test_training_with_network_isolation( @pytest.mark.canary_quick @pytest.mark.regional_testing @pytest.mark.skipif(PYTHON_VERSION != "py3", reason="Scikit-learn image supports only python 3.") +@pytest.mark.skip( + reason="This test has always failed, but the failure was masked by a bug. " + "This test should be fixed. 
Details in https://github.com/aws/sagemaker-python-sdk/pull/968" +) def test_attach_deploy(sklearn_training_job, sagemaker_session, cpu_instance_type): endpoint_name = "test-sklearn-attach-deploy-{}".format(sagemaker_timestamp()) @@ -109,6 +117,10 @@ def test_attach_deploy(sklearn_training_job, sagemaker_session, cpu_instance_typ @pytest.mark.skipif(PYTHON_VERSION != "py3", reason="Scikit-learn image supports only python 3.") +@pytest.mark.skip( + reason="This test has always failed, but the failure was masked by a bug. " + "This test should be fixed. Details in https://github.com/aws/sagemaker-python-sdk/pull/968" +) def test_deploy_model(sklearn_training_job, sagemaker_session, cpu_instance_type): endpoint_name = "test-sklearn-deploy-model-{}".format(sagemaker_timestamp()) with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session): @@ -128,6 +140,10 @@ def test_deploy_model(sklearn_training_job, sagemaker_session, cpu_instance_type @pytest.mark.skipif(PYTHON_VERSION != "py3", reason="Scikit-learn image supports only python 3.") +@pytest.mark.skip( + reason="This test has always failed, but the failure was masked by a bug. " + "This test should be fixed. Details in https://github.com/aws/sagemaker-python-sdk/pull/968" +) def test_async_fit(sagemaker_session, cpu_instance_type): endpoint_name = "test-sklearn-attach-deploy-{}".format(sagemaker_timestamp()) diff --git a/tests/integ/test_sparkml_serving.py b/tests/integ/test_sparkml_serving.py index fe0521650a..6211ee0909 100644 --- a/tests/integ/test_sparkml_serving.py +++ b/tests/integ/test_sparkml_serving.py @@ -25,6 +25,10 @@ @pytest.mark.canary_quick @pytest.mark.regional_testing +@pytest.mark.skip( + reason="This test has always failed, but the failure was masked by a bug. " + "This test should be fixed. Details in https://github.com/aws/sagemaker-python-sdk/pull/968" +) def test_sparkml_model_deploy(sagemaker_session, cpu_instance_type): # Uploads an MLeap serialized MLeap model to S3 and use that to deploy a SparkML model to perform inference data_path = os.path.join(DATA_DIR, "sparkml_model") diff --git a/tests/integ/test_tuner.py b/tests/integ/test_tuner.py index 8b57300bc4..a14560c8ce 100644 --- a/tests/integ/test_tuner.py +++ b/tests/integ/test_tuner.py @@ -810,6 +810,10 @@ def test_tuning_chainer(sagemaker_session, cpu_instance_type): @pytest.mark.canary_quick +@pytest.mark.skip( + reason="This test has always failed, but the failure was masked by a bug. " + "This test should be fixed. 
Details in https://github.com/aws/sagemaker-python-sdk/pull/968" +) def test_attach_tuning_pytorch(sagemaker_session, cpu_instance_type): mnist_dir = os.path.join(DATA_DIR, "pytorch_mnist") mnist_script = os.path.join(mnist_dir, "mnist.py") From 8b3159b2a46d45a172204ec289b144fb697237a1 Mon Sep 17 00:00:00 2001 From: Chuyang Deng Date: Tue, 13 Aug 2019 14:24:55 -0700 Subject: [PATCH 5/5] add new region support --- src/sagemaker/amazon/amazon_estimator.py | 12 ++++++++++++ src/sagemaker/fw_registry.py | 4 ++++ 2 files changed, 16 insertions(+) diff --git a/src/sagemaker/amazon/amazon_estimator.py b/src/sagemaker/amazon/amazon_estimator.py index e8ee5ce22e..9336ca66d4 100644 --- a/src/sagemaker/amazon/amazon_estimator.py +++ b/src/sagemaker/amazon/amazon_estimator.py @@ -386,6 +386,10 @@ def registry(region_name, algorithm=None): "eu-west-2": "644912444149", "us-west-1": "632365934929", "us-iso-east-1": "490574956308", + "ap-east-1": "286214385809", + "eu-north-1": "669576153137", + "eu-west-3": "749696950732", + "sa-east-1": "855470959533", }[region_name] elif algorithm in ["lda"]: account_id = { @@ -422,6 +426,10 @@ def registry(region_name, algorithm=None): "eu-west-2": "644912444149", "us-west-1": "632365934929", "us-iso-east-1": "490574956308", + "ap-east-1": "286214385809", + "eu-north-1": "669576153137", + "eu-west-3": "749696950732", + "sa-east-1": "855470959533", }[region_name] elif algorithm in [ "xgboost", @@ -447,6 +455,10 @@ def registry(region_name, algorithm=None): "eu-west-2": "644912444149", "us-west-1": "632365934929", "us-iso-east-1": "490574956308", + "ap-east-1": "286214385809", + "eu-north-1": "669576153137", + "eu-west-3": "749696950732", + "sa-east-1": "855470959533", }[region_name] elif algorithm in ["image-classification-neo", "xgboost-neo"]: account_id = NEO_IMAGE_ACCOUNT[region_name] diff --git a/src/sagemaker/fw_registry.py b/src/sagemaker/fw_registry.py index 71403f7101..ef71dff79a 100644 --- a/src/sagemaker/fw_registry.py +++ b/src/sagemaker/fw_registry.py @@ -32,6 +32,10 @@ "ca-central-1": {"sparkml-serving": "341280168497", "scikit-learn": "341280168497"}, "us-gov-west-1": {"sparkml-serving": "414596584902", "scikit-learn": "414596584902"}, "us-iso-east-1": {"sparkml-serving": "833128469047", "scikit-learn": "833128469047"}, + "ap-east-1": {"sparkml-serving": "651117190479", "scikit-learn": "651117190479"}, + "sa-east-1": {"sparkml-serving": "737474898029", "scikit-learn": "737474898029"}, + "eu-north-1": {"sparkml-serving": "662702820516", "scikit-learn": "662702820516"}, + "eu-west-3": {"sparkml-serving": "659782779980", "scikit-learn": "659782779980"}, }
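
---

Reviewer note: a minimal, hedged sanity check for the amazon_estimator.registry()
additions in patch 5/5. The signature registry(region_name, algorithm=None) comes
from the hunk context above; the one assumption here is that the helper keeps
returning an ECR registry prefix of the form "<account>.dkr.ecr.<region>.amazonaws.com",
as it already does for the regions listed before this change. The account IDs are
taken verbatim from the patch; the rest is illustrative, not part of the change:

    from sagemaker.amazon.amazon_estimator import registry

    # First-party algorithm accounts added for the new regions in this patch.
    expected = {
        "ap-east-1": "286214385809",
        "eu-north-1": "669576153137",
        "eu-west-3": "749696950732",
        "sa-east-1": "855470959533",
    }

    for region, account in expected.items():
        # Both the default (algorithm=None) mapping and the xgboost-family
        # mapping gained these regions; the LDA mapping did not, which matches
        # the NO_LDA_REGIONS skip added to test_tuning_lda in patch 2/5.
        assert registry(region).startswith(account)
        assert registry(region, algorithm="xgboost").startswith(account)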
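A brief design observation on the fw_registry.py entries: sparkml-serving and
scikit-learn share a single account per region, so each new opt-in region is a
one-line addition with the same account ID repeated under both keys, mirroring
the existing rows rather than factoring the ID out into a shared constant.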