From a877ca9c724de9a168d741cfbc81f2b56dd33b73 Mon Sep 17 00:00:00 2001 From: Xiaohua Date: Fri, 23 Aug 2019 11:48:41 -0700 Subject: [PATCH 1/8] update: change directory_path for EFS usage example in overview doc --- doc/overview.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/doc/overview.rst b/doc/overview.rst index a05a233f45..d3347b543c 100644 --- a/doc/overview.rst +++ b/doc/overview.rst @@ -327,7 +327,7 @@ Here are examples of how to use Amazon EFS as input for training: file_system_input = FileSystemInput(file_system_id='fs-1', file_system_type='EFS', - directory_path='tensorflow', + directory_path='/tensorflow', file_system_access_mode='ro') # Start an Amazon SageMaker training job with EFS using the FileSystemInput class @@ -347,7 +347,7 @@ Here are examples of how to use Amazon EFS as input for training: records = FileSystemRecordSet(file_system_id='fs-1, file_system_type='EFS', - directory_path='kmeans', + directory_path='/kmeans', num_records=784, feature_dim=784) @@ -372,7 +372,7 @@ Here are examples of how to use Amazon FSx for Lustre as input for training: file_system_input = FileSystemInput(file_system_id='fs-2', file_system_type='FSxLustre', - directory_path='tensorflow', + directory_path='/fsx/tensorflow', file_system_access_mode='ro') # Start an Amazon SageMaker training job with FSx using the FileSystemInput class @@ -392,7 +392,7 @@ Here are examples of how to use Amazon FSx for Lustre as input for training: records = FileSystemRecordSet(file_system_id='fs-=2, file_system_type='FSxLustre', - directory_path='kmeans', + directory_path='/fsx/kmeans', num_records=784, feature_dim=784) From 2ca2c9ea98b714f686afa68589d4235121498d64 Mon Sep 17 00:00:00 2001 From: Xiaohua Date: Fri, 23 Aug 2019 12:13:00 -0700 Subject: [PATCH 2/8] update: changed EFS directory path instructions in doc and docstring to Absolute or normalized path --- src/sagemaker/amazon/amazon_estimator.py | 2 +- src/sagemaker/inputs.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/sagemaker/amazon/amazon_estimator.py b/src/sagemaker/amazon/amazon_estimator.py index 83e7b7c56f..4375922f4a 100644 --- a/src/sagemaker/amazon/amazon_estimator.py +++ b/src/sagemaker/amazon/amazon_estimator.py @@ -303,7 +303,7 @@ def __init__( file_system_id (str): An Amazon file system ID starting with 'fs-'. file_system_type (str): The type of file system used for the input. Valid values: 'EFS', 'FSxLustre'. - directory_path (str): Relative path to the root directory (mount point) in + directory_path (str): Absolute or normalized path to the root directory (mount point) in the file system. Reference: https://docs.aws.amazon.com/efs/latest/ug/mounting-fs.html and https://docs.aws.amazon.com/efs/latest/ug/wt1-test.html diff --git a/src/sagemaker/inputs.py b/src/sagemaker/inputs.py index 856612353d..ba1b9c3f66 100644 --- a/src/sagemaker/inputs.py +++ b/src/sagemaker/inputs.py @@ -114,7 +114,7 @@ def __init__( file_system_id (str): An Amazon file system ID starting with 'fs-'. file_system_type (str): The type of file system used for the input. Valid values: 'EFS', 'FSxLustre'. - directory_path (str): Relative path to the root directory (mount point) in + directory_path (str): Absolute or normalized path to the root directory (mount point) in the file system. Reference: https://docs.aws.amazon.com/efs/latest/ug/mounting-fs.html and https://docs.aws.amazon.com/fsx/latest/LustreGuide/mount-fs-auto-mount-onreboot.html From a4ec855574321c2e2f2cab1e7681b265d9fbc34c Mon Sep 17 00:00:00 2001 From: Xiaohua Date: Tue, 27 Aug 2019 01:17:08 -0700 Subject: [PATCH 3/8] update: changed AMI ids to be dynamic based on regions --- tests/integ/file_system_input_utils.py | 32 ++++++++++++++++++++++---- tests/integ/test_kmeans_efs_fsx.py | 20 ++++++++-------- tests/integ/test_tf_efs_fsx.py | 19 +++++++-------- tests/integ/vpc_test_utils.py | 12 ++++------ tests/scripts/fs_mount_setup.sh | 4 ++-- 5 files changed, 51 insertions(+), 36 deletions(-) diff --git a/tests/integ/file_system_input_utils.py b/tests/integ/file_system_input_utils.py index deb8ff8569..15f9882357 100644 --- a/tests/integ/file_system_input_utils.py +++ b/tests/integ/file_system_input_utils.py @@ -31,7 +31,6 @@ PREFIX = "ec2_fs_key_" KEY_NAME = PREFIX + str(uuid.uuid4().hex.upper()[0:8]) ROLE_NAME = "SageMakerRole" -REGION = "us-west-2" EC2_INSTANCE_TYPE = "t2.micro" AMI_ID = "ami-082b5a644766e0e6f" MIN_COUNT = 1 @@ -50,6 +49,28 @@ KEY_PATH = os.path.join(tempfile.gettempdir(), FILE_NAME) STORAGE_CAPACITY_IN_BYTES = 3600 +AWSRegionArch2AMI = { + "us-east-1": "ami-0ff8a91507f77f867", + "us-west-2": "ami-a0cfeed8", + "us-west-1": "ami-0bdb828fd58c52235", + "eu-west-1": "ami-047bb4163c506cd98", + "eu-west-2": "ami-f976839e", + "eu-west-3": "ami-0ebc281c20e89ba4b", + "eu-central-1": "ami-0233214e13e500f77", + "ap-northeast-1": "ami-06cd52961ce9f0d85", + "ap-northeast-2": "ami-0a10b2721688ce9d2", + "ap-northeast-3": "ami-0d98120a9fb693f07", + "ap-southeast-1": "ami-08569b978cc4dfa10", + "ap-southeast-2": "ami-09b42976632b27e9b", + "ap-south-1": "ami-0912f71e06545ad88", + "us-east-2": "ami-0b59bfac6be064b78", + "ca-central-1": "ami-0b18956f", + "sa-east-1": "ami-07b14488da8ea02a0", + "cn-north-1": "ami-0a4eaf6c4454eda75", + "cn-northwest-1": "ami-6b6a7d09", + "us-gov-west-1": "ami-906cf0f1", +} + FsResources = collections.namedtuple( "FsResources", [ @@ -70,12 +91,13 @@ def set_up_efs_fsx(sagemaker_session): _check_or_create_key_pair(sagemaker_session) _check_or_create_iam_profile_and_attach_role(sagemaker_session) subnet_ids, security_group_ids = check_or_create_vpc_resources_efs_fsx( - sagemaker_session, REGION, VPC_NAME + sagemaker_session, VPC_NAME ) + region = sagemaker_session.boto_region_name ec2_instance = _create_ec2_instance( sagemaker_session, - AMI_ID, + AWSRegionArch2AMI[region], EC2_INSTANCE_TYPE, KEY_NAME, MIN_COUNT, @@ -169,7 +191,7 @@ def _check_or_create_efs(sagemaker_session): def _create_efs_mount(sagemaker_session, file_system_id): subnet_ids, security_group_ids = check_or_create_vpc_resources_efs_fsx( - sagemaker_session, REGION, VPC_NAME + sagemaker_session, VPC_NAME ) efs_client = sagemaker_session.boto_session.client("efs") mount_response = efs_client.create_mount_target( @@ -189,7 +211,7 @@ def _create_efs_mount(sagemaker_session, file_system_id): def _check_or_create_fsx(sagemaker_session): fsx_client = sagemaker_session.boto_session.client("fsx") subnet_ids, security_group_ids = check_or_create_vpc_resources_efs_fsx( - sagemaker_session, REGION, VPC_NAME + sagemaker_session, VPC_NAME ) create_response = fsx_client.create_file_system( FileSystemType="LUSTRE", diff --git a/tests/integ/test_kmeans_efs_fsx.py b/tests/integ/test_kmeans_efs_fsx.py index c30c6fdbcb..7a34f0451b 100644 --- a/tests/integ/test_kmeans_efs_fsx.py +++ b/tests/integ/test_kmeans_efs_fsx.py @@ -24,7 +24,6 @@ from tests.integ.file_system_input_utils import set_up_efs_fsx, tear_down from tests.integ.timeout import timeout -TRAIN_INSTANCE_TYPE = "ml.c4.xlarge" TRAIN_INSTANCE_COUNT = 1 OBJECTIVE_METRIC_NAME = "test:msd" EFS_DIR_PATH = "/one_p_mnist" @@ -45,8 +44,8 @@ def efs_fsx_setup(sagemaker_session): tear_down(sagemaker_session, fs_resources) -@pytest.mark.canary_quick -def test_kmeans_efs(efs_fsx_setup, sagemaker_session): +def test_kmeans_efs(efs_fsx_setup, sagemaker_session, cpu_instance_type): + print("cpu_instance_type = ", cpu_instance_type) with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES): subnets = [efs_fsx_setup.subnet_id] security_group_ids = efs_fsx_setup.security_group_ids @@ -54,7 +53,7 @@ def test_kmeans_efs(efs_fsx_setup, sagemaker_session): kmeans = KMeans( role=role, train_instance_count=TRAIN_INSTANCE_COUNT, - train_instance_type=TRAIN_INSTANCE_TYPE, + train_instance_type=cpu_instance_type, k=K, sagemaker_session=sagemaker_session, subnets=subnets, @@ -76,8 +75,7 @@ def test_kmeans_efs(efs_fsx_setup, sagemaker_session): assert_s3_files_exist(sagemaker_session, model_path, ["model.tar.gz"]) -@pytest.mark.canary_quick -def test_kmeans_fsx(efs_fsx_setup, sagemaker_session): +def test_kmeans_fsx(efs_fsx_setup, sagemaker_session, cpu_instance_type): with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES): subnets = [efs_fsx_setup.subnet_id] security_group_ids = efs_fsx_setup.security_group_ids @@ -85,7 +83,7 @@ def test_kmeans_fsx(efs_fsx_setup, sagemaker_session): kmeans = KMeans( role=role, train_instance_count=TRAIN_INSTANCE_COUNT, - train_instance_type=TRAIN_INSTANCE_TYPE, + train_instance_type=cpu_instance_type, k=K, sagemaker_session=sagemaker_session, subnets=subnets, @@ -107,14 +105,14 @@ def test_kmeans_fsx(efs_fsx_setup, sagemaker_session): assert_s3_files_exist(sagemaker_session, model_path, ["model.tar.gz"]) -def test_tuning_kmeans_efs(efs_fsx_setup, sagemaker_session): +def test_tuning_kmeans_efs(efs_fsx_setup, sagemaker_session, cpu_instance_type): subnets = [efs_fsx_setup.subnet_id] security_group_ids = efs_fsx_setup.security_group_ids role = efs_fsx_setup.role_name kmeans = KMeans( role=role, train_instance_count=TRAIN_INSTANCE_COUNT, - train_instance_type=TRAIN_INSTANCE_TYPE, + train_instance_type=cpu_instance_type, k=K, sagemaker_session=sagemaker_session, subnets=subnets, @@ -163,14 +161,14 @@ def test_tuning_kmeans_efs(efs_fsx_setup, sagemaker_session): assert best_training_job -def test_tuning_kmeans_fsx(efs_fsx_setup, sagemaker_session): +def test_tuning_kmeans_fsx(efs_fsx_setup, sagemaker_session, cpu_instance_type): subnets = [efs_fsx_setup.subnet_id] security_group_ids = efs_fsx_setup.security_group_ids role = efs_fsx_setup.role_name kmeans = KMeans( role=role, train_instance_count=TRAIN_INSTANCE_COUNT, - train_instance_type=TRAIN_INSTANCE_TYPE, + train_instance_type=cpu_instance_type, k=K, sagemaker_session=sagemaker_session, subnets=subnets, diff --git a/tests/integ/test_tf_efs_fsx.py b/tests/integ/test_tf_efs_fsx.py index 02c4dd95bc..2d142c436e 100644 --- a/tests/integ/test_tf_efs_fsx.py +++ b/tests/integ/test_tf_efs_fsx.py @@ -31,7 +31,6 @@ MNIST_RESOURCE_PATH = os.path.join(RESOURCE_PATH, "tensorflow_mnist") SCRIPT = os.path.join(MNIST_RESOURCE_PATH, "mnist.py") TFS_RESOURCE_PATH = os.path.join(RESOURCE_PATH, "tfs", "tfs-test-entrypoint-with-handler") -INSTANCE_TYPE = "ml.c4.xlarge" EFS_DIR_PATH = "/tensorflow" FSX_DIR_PATH = "/fsx/tensorflow" MAX_JOBS = 2 @@ -48,8 +47,7 @@ def efs_fsx_setup(sagemaker_session): tear_down(sagemaker_session, fs_resources) -@pytest.mark.canary_quick -def test_mnist_efs(efs_fsx_setup, sagemaker_session): +def test_mnist_efs(efs_fsx_setup, sagemaker_session, cpu_instance_type): role = efs_fsx_setup.role_name subnets = [efs_fsx_setup.subnet_id] security_group_ids = efs_fsx_setup.security_group_ids @@ -58,7 +56,7 @@ def test_mnist_efs(efs_fsx_setup, sagemaker_session): entry_point=SCRIPT, role=role, train_instance_count=1, - train_instance_type=INSTANCE_TYPE, + train_instance_type=cpu_instance_type, sagemaker_session=sagemaker_session, script_mode=True, framework_version=TensorFlow.LATEST_VERSION, @@ -81,8 +79,7 @@ def test_mnist_efs(efs_fsx_setup, sagemaker_session): ) -@pytest.mark.canary_quick -def test_mnist_lustre(efs_fsx_setup, sagemaker_session): +def test_mnist_lustre(efs_fsx_setup, sagemaker_session, cpu_instance_type): role = efs_fsx_setup.role_name subnets = [efs_fsx_setup.subnet_id] security_group_ids = efs_fsx_setup.security_group_ids @@ -91,7 +88,7 @@ def test_mnist_lustre(efs_fsx_setup, sagemaker_session): entry_point=SCRIPT, role=role, train_instance_count=1, - train_instance_type=INSTANCE_TYPE, + train_instance_type=cpu_instance_type, sagemaker_session=sagemaker_session, script_mode=True, framework_version=TensorFlow.LATEST_VERSION, @@ -114,7 +111,7 @@ def test_mnist_lustre(efs_fsx_setup, sagemaker_session): ) -def test_tuning_tf_script_mode_efs(efs_fsx_setup, sagemaker_session): +def test_tuning_tf_script_mode_efs(efs_fsx_setup, sagemaker_session, cpu_instance_type): role = efs_fsx_setup.role_name subnets = [efs_fsx_setup.subnet_id] security_group_ids = efs_fsx_setup.security_group_ids @@ -123,7 +120,7 @@ def test_tuning_tf_script_mode_efs(efs_fsx_setup, sagemaker_session): entry_point=SCRIPT, role=role, train_instance_count=1, - train_instance_type=INSTANCE_TYPE, + train_instance_type=cpu_instance_type, script_mode=True, sagemaker_session=sagemaker_session, py_version=PY_VERSION, @@ -158,7 +155,7 @@ def test_tuning_tf_script_mode_efs(efs_fsx_setup, sagemaker_session): assert best_training_job -def test_tuning_tf_script_mode_lustre(efs_fsx_setup, sagemaker_session): +def test_tuning_tf_script_mode_lustre(efs_fsx_setup, sagemaker_session, cpu_instance_type): role = efs_fsx_setup.role_name subnets = [efs_fsx_setup.subnet_id] security_group_ids = efs_fsx_setup.security_group_ids @@ -167,7 +164,7 @@ def test_tuning_tf_script_mode_lustre(efs_fsx_setup, sagemaker_session): entry_point=SCRIPT, role=role, train_instance_count=1, - train_instance_type=INSTANCE_TYPE, + train_instance_type=cpu_instance_type, script_mode=True, sagemaker_session=sagemaker_session, py_version=PY_VERSION, diff --git a/tests/integ/vpc_test_utils.py b/tests/integ/vpc_test_utils.py index ec3f01a51e..301833c3a4 100644 --- a/tests/integ/vpc_test_utils.py +++ b/tests/integ/vpc_test_utils.py @@ -62,7 +62,7 @@ def _route_table_id(ec2_client, vpc_id): return desc["RouteTables"][0]["RouteTableId"] -def check_or_create_vpc_resources_efs_fsx(sagemaker_session, region, name=VPC_NAME): +def check_or_create_vpc_resources_efs_fsx(sagemaker_session, name=VPC_NAME): # use lock to prevent race condition when tests are running concurrently with lock.lock(LOCK_PATH): ec2_client = sagemaker_session.boto_session.client("ec2") @@ -74,13 +74,11 @@ def check_or_create_vpc_resources_efs_fsx(sagemaker_session, region, name=VPC_NA _security_group_ids_by_vpc_id(sagemaker_session, vpc_id), ) else: - return _create_vpc_with_name_efs_fsx(ec2_client, region, name) + return _create_vpc_with_name_efs_fsx(ec2_client, name) -def _create_vpc_with_name_efs_fsx(ec2_client, region, name): - vpc_id, [subnet_id_a, subnet_id_b], security_group_id = _create_vpc_resources( - ec2_client, region, name - ) +def _create_vpc_with_name_efs_fsx(ec2_client, name): + vpc_id, [subnet_id_a, subnet_id_b], security_group_id = _create_vpc_resources(ec2_client, name) ec2_client.modify_vpc_attribute(EnableDnsHostnames={"Value": True}, VpcId=vpc_id) ig = ec2_client.create_internet_gateway() @@ -121,7 +119,7 @@ def _create_vpc_with_name_efs_fsx(ec2_client, region, name): return [subnet_id_a], [security_group_id] -def _create_vpc_resources(ec2_client, region, name): +def _create_vpc_resources(ec2_client, name): vpc_id = ec2_client.create_vpc(CidrBlock="10.0.0.0/16")["Vpc"]["VpcId"] print("created vpc: {}".format(vpc_id)) diff --git a/tests/scripts/fs_mount_setup.sh b/tests/scripts/fs_mount_setup.sh index a5e5eaa051..1c3877309b 100644 --- a/tests/scripts/fs_mount_setup.sh +++ b/tests/scripts/fs_mount_setup.sh @@ -18,14 +18,14 @@ FILE_SYSTEM_EFS_ID=$1 FILE_SYSTEM_FSX_ID=$2 echo "Mounting EFS File Systems" -sudo yum install -y amazon-efs-utils.noarch 0:1.10-1.amzn2 +sudo yum install -y amazon-efs-utils sudo mkdir efs sudo mount -t efs "$FILE_SYSTEM_EFS_ID":/ efs sudo mkdir efs/tensorflow sudo mkdir efs/one_p_mnist echo "Mounting FSx for Lustre File System" -sudo amazon-linux-extras install -y lustre2.10 +sudo yum install -y lustre-client sudo mkdir -p /mnt/fsx sudo mount -t lustre -o noatime,flock "$FILE_SYSTEM_FSX_ID".fsx.us-west-2.amazonaws.com@tcp:/fsx /mnt/fsx sudo mkdir /mnt/fsx/tensorflow From c43ff3808a49dde29cba978e3cd38d6ef938169c Mon Sep 17 00:00:00 2001 From: Xiaohua Date: Tue, 27 Aug 2019 13:39:11 -0700 Subject: [PATCH 4/8] fix: change Amazon AMI ids to be dynamic based on regions using searching Amazon Linux AMI --- tests/integ/file_system_input_utils.py | 51 ++++++++++++-------------- tests/scripts/fs_mount_setup.sh | 3 +- 2 files changed, 26 insertions(+), 28 deletions(-) diff --git a/tests/integ/file_system_input_utils.py b/tests/integ/file_system_input_utils.py index 90e70ac45a..019ecc915a 100644 --- a/tests/integ/file_system_input_utils.py +++ b/tests/integ/file_system_input_utils.py @@ -14,6 +14,7 @@ import collections import logging +from operator import itemgetter import os from os import path import stat @@ -32,7 +33,6 @@ KEY_NAME = PREFIX + str(uuid.uuid4().hex.upper()[0:8]) ROLE_NAME = "SageMakerRole" EC2_INSTANCE_TYPE = "t2.micro" -AMI_ID = "ami-082b5a644766e0e6f" MIN_COUNT = 1 MAX_COUNT = 1 @@ -48,28 +48,6 @@ KEY_PATH = os.path.join(tempfile.gettempdir(), FILE_NAME) STORAGE_CAPACITY_IN_BYTES = 3600 -AWSRegionArch2AMI = { - "us-east-1": "ami-0ff8a91507f77f867", - "us-west-2": "ami-a0cfeed8", - "us-west-1": "ami-0bdb828fd58c52235", - "eu-west-1": "ami-047bb4163c506cd98", - "eu-west-2": "ami-f976839e", - "eu-west-3": "ami-0ebc281c20e89ba4b", - "eu-central-1": "ami-0233214e13e500f77", - "ap-northeast-1": "ami-06cd52961ce9f0d85", - "ap-northeast-2": "ami-0a10b2721688ce9d2", - "ap-northeast-3": "ami-0d98120a9fb693f07", - "ap-southeast-1": "ami-08569b978cc4dfa10", - "ap-southeast-2": "ami-09b42976632b27e9b", - "ap-south-1": "ami-0912f71e06545ad88", - "us-east-2": "ami-0b59bfac6be064b78", - "ca-central-1": "ami-0b18956f", - "sa-east-1": "ami-07b14488da8ea02a0", - "cn-north-1": "ami-0a4eaf6c4454eda75", - "cn-northwest-1": "ami-6b6a7d09", - "us-gov-west-1": "ami-906cf0f1", -} - FsResources = collections.namedtuple( "FsResources", [ @@ -93,10 +71,11 @@ def set_up_efs_fsx(sagemaker_session): sagemaker_session, VPC_NAME ) + ami_id = _dynamic_ami_id(sagemaker_session) region = sagemaker_session.boto_region_name ec2_instance = _create_ec2_instance( sagemaker_session, - AWSRegionArch2AMI[region], + ami_id, EC2_INSTANCE_TYPE, KEY_NAME, MIN_COUNT, @@ -124,7 +103,9 @@ def set_up_efs_fsx(sagemaker_session): try: connected_instance = _connect_ec2_instance(ec2_instance) - _upload_data_and_mount_fs(connected_instance, file_system_efs_id, file_system_fsx_id) + _upload_data_and_mount_fs( + connected_instance, file_system_efs_id, file_system_fsx_id, region + ) except Exception: tear_down(sagemaker_session, fs_resources) raise @@ -132,6 +113,22 @@ def set_up_efs_fsx(sagemaker_session): return fs_resources +def _dynamic_ami_id(sagemaker_session): + ec2_client = sagemaker_session.boto_session.client("ec2") + filters = [ + {"Name": "name", "Values": ["amzn-ami-hvm-????.??.?.????????-x86_64-gp2"]}, + {"Name": "state", "Values": ["available"]}, + ] + response = ec2_client.describe_images(Filters=filters) + + image_details = sorted(response["Images"], key=itemgetter("CreationDate"), reverse=True) + if len(image_details) > 0: + ami_id = image_details[0]["ImageId"] + return ami_id + else: + raise Exception("AMI was not found based on current search criteria: {}".format(filters)) + + def _connect_ec2_instance(ec2_instance): public_ip_address = ec2_instance.public_ip_address connected_instance = Connection( @@ -140,7 +137,7 @@ def _connect_ec2_instance(ec2_instance): return connected_instance -def _upload_data_and_mount_fs(connected_instance, file_system_efs_id, file_system_fsx_id): +def _upload_data_and_mount_fs(connected_instance, file_system_efs_id, file_system_fsx_id, region): connected_instance.put(FS_MOUNT_SCRIPT, ".") connected_instance.run("mkdir temp_tf; mkdir temp_one_p", in_stream=False) for dir_name, subdir_list, file_list in os.walk(MNIST_LOCAL_DATA): @@ -149,7 +146,7 @@ def _upload_data_and_mount_fs(connected_instance, file_system_efs_id, file_syste connected_instance.put(local_file, "temp_tf/") connected_instance.put(ONE_P_LOCAL_DATA, "temp_one_p/") connected_instance.run( - "sudo sh fs_mount_setup.sh {} {}".format(file_system_efs_id, file_system_fsx_id), + "sudo sh fs_mount_setup.sh {} {} {}".format(file_system_efs_id, file_system_fsx_id, region), in_stream=False, ) diff --git a/tests/scripts/fs_mount_setup.sh b/tests/scripts/fs_mount_setup.sh index 1c3877309b..111c360498 100644 --- a/tests/scripts/fs_mount_setup.sh +++ b/tests/scripts/fs_mount_setup.sh @@ -16,6 +16,7 @@ # Mounting EFS and FSx for Lustre file systems for integration Tests FILE_SYSTEM_EFS_ID=$1 FILE_SYSTEM_FSX_ID=$2 +REGION=$3 echo "Mounting EFS File Systems" sudo yum install -y amazon-efs-utils @@ -27,7 +28,7 @@ sudo mkdir efs/one_p_mnist echo "Mounting FSx for Lustre File System" sudo yum install -y lustre-client sudo mkdir -p /mnt/fsx -sudo mount -t lustre -o noatime,flock "$FILE_SYSTEM_FSX_ID".fsx.us-west-2.amazonaws.com@tcp:/fsx /mnt/fsx +sudo mount -t lustre -o noatime,flock "$FILE_SYSTEM_FSX_ID".fsx."$REGION".amazonaws.com@tcp:/fsx /mnt/fsx sudo mkdir /mnt/fsx/tensorflow sudo mkdir /mnt/fsx/one_p_mnist From f41d3c52b75d38f2f5b105dd3856aa42ffa80e7a Mon Sep 17 00:00:00 2001 From: Xiaohua Date: Tue, 27 Aug 2019 13:41:47 -0700 Subject: [PATCH 5/8] update: move region closer to its usage --- tests/integ/file_system_input_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integ/file_system_input_utils.py b/tests/integ/file_system_input_utils.py index 019ecc915a..98a2cd4cc3 100644 --- a/tests/integ/file_system_input_utils.py +++ b/tests/integ/file_system_input_utils.py @@ -72,7 +72,6 @@ def set_up_efs_fsx(sagemaker_session): ) ami_id = _dynamic_ami_id(sagemaker_session) - region = sagemaker_session.boto_region_name ec2_instance = _create_ec2_instance( sagemaker_session, ami_id, @@ -101,6 +100,7 @@ def set_up_efs_fsx(sagemaker_session): mount_efs_target_id, ) + region = sagemaker_session.boto_region_name try: connected_instance = _connect_ec2_instance(ec2_instance) _upload_data_and_mount_fs( From 039338479e5892d71afb471f418033c9a06b2626 Mon Sep 17 00:00:00 2001 From: Xiaohua Date: Tue, 27 Aug 2019 14:24:48 -0700 Subject: [PATCH 6/8] update: make filter as constant variable --- tests/integ/file_system_input_utils.py | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/tests/integ/file_system_input_utils.py b/tests/integ/file_system_input_utils.py index 98a2cd4cc3..02e92704a8 100644 --- a/tests/integ/file_system_input_utils.py +++ b/tests/integ/file_system_input_utils.py @@ -48,6 +48,11 @@ KEY_PATH = os.path.join(tempfile.gettempdir(), FILE_NAME) STORAGE_CAPACITY_IN_BYTES = 3600 +AMI_FILTERS = [ + {"Name": "name", "Values": ["amzn-ami-hvm-????.??.?.????????-x86_64-gp2"]}, + {"Name": "state", "Values": ["available"]}, +] + FsResources = collections.namedtuple( "FsResources", [ @@ -71,7 +76,7 @@ def set_up_efs_fsx(sagemaker_session): sagemaker_session, VPC_NAME ) - ami_id = _dynamic_ami_id(sagemaker_session) + ami_id = _ami_id_for_region(sagemaker_session) ec2_instance = _create_ec2_instance( sagemaker_session, ami_id, @@ -113,20 +118,17 @@ def set_up_efs_fsx(sagemaker_session): return fs_resources -def _dynamic_ami_id(sagemaker_session): +def _ami_id_for_region(sagemaker_session): ec2_client = sagemaker_session.boto_session.client("ec2") - filters = [ - {"Name": "name", "Values": ["amzn-ami-hvm-????.??.?.????????-x86_64-gp2"]}, - {"Name": "state", "Values": ["available"]}, - ] - response = ec2_client.describe_images(Filters=filters) - + response = ec2_client.describe_images(Filters=AMI_FILTERS) image_details = sorted(response["Images"], key=itemgetter("CreationDate"), reverse=True) - if len(image_details) > 0: - ami_id = image_details[0]["ImageId"] - return ami_id + + if len(image_details) == 0: + raise Exception( + "AMI was not found based on current search criteria: {}".format(AMI_FILTERS) + ) else: - raise Exception("AMI was not found based on current search criteria: {}".format(filters)) + return image_details[0]["ImageId"] def _connect_ec2_instance(ec2_instance): From 6b6e070f39aa861b8e980e90027b245f4a6f29b3 Mon Sep 17 00:00:00 2001 From: Xiaohua Date: Tue, 27 Aug 2019 14:29:24 -0700 Subject: [PATCH 7/8] update: remove else --- tests/integ/file_system_input_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/integ/file_system_input_utils.py b/tests/integ/file_system_input_utils.py index 02e92704a8..cfdab8d556 100644 --- a/tests/integ/file_system_input_utils.py +++ b/tests/integ/file_system_input_utils.py @@ -127,8 +127,8 @@ def _ami_id_for_region(sagemaker_session): raise Exception( "AMI was not found based on current search criteria: {}".format(AMI_FILTERS) ) - else: - return image_details[0]["ImageId"] + + return image_details[0]["ImageId"] def _connect_ec2_instance(ec2_instance): From b80ccbc507019c5113d8858899c7731e340503b6 Mon Sep 17 00:00:00 2001 From: Xiaohua Date: Tue, 27 Aug 2019 16:39:58 -0700 Subject: [PATCH 8/8] update: add constant variable for search AMI name --- tests/integ/file_system_input_utils.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/tests/integ/file_system_input_utils.py b/tests/integ/file_system_input_utils.py index cfdab8d556..cf29919813 100644 --- a/tests/integ/file_system_input_utils.py +++ b/tests/integ/file_system_input_utils.py @@ -28,6 +28,7 @@ from tests.integ.vpc_test_utils import check_or_create_vpc_resources_efs_fsx VPC_NAME = "sagemaker-efs-fsx-vpc" +ALINUX_AMI_NAME_FILTER = "amzn-ami-hvm-????.??.?.????????-x86_64-gp2" EFS_CREATION_TOKEN = str(uuid.uuid4()) PREFIX = "ec2_fs_key_" KEY_NAME = PREFIX + str(uuid.uuid4().hex.upper()[0:8]) @@ -48,11 +49,6 @@ KEY_PATH = os.path.join(tempfile.gettempdir(), FILE_NAME) STORAGE_CAPACITY_IN_BYTES = 3600 -AMI_FILTERS = [ - {"Name": "name", "Values": ["amzn-ami-hvm-????.??.?.????????-x86_64-gp2"]}, - {"Name": "state", "Values": ["available"]}, -] - FsResources = collections.namedtuple( "FsResources", [ @@ -120,13 +116,15 @@ def set_up_efs_fsx(sagemaker_session): def _ami_id_for_region(sagemaker_session): ec2_client = sagemaker_session.boto_session.client("ec2") - response = ec2_client.describe_images(Filters=AMI_FILTERS) + filters = [ + {"Name": "name", "Values": [ALINUX_AMI_NAME_FILTER]}, + {"Name": "state", "Values": ["available"]}, + ] + response = ec2_client.describe_images(Filters=filters) image_details = sorted(response["Images"], key=itemgetter("CreationDate"), reverse=True) if len(image_details) == 0: - raise Exception( - "AMI was not found based on current search criteria: {}".format(AMI_FILTERS) - ) + raise Exception("AMI was not found based on current search criteria: {}".format(filters)) return image_details[0]["ImageId"]