diff --git a/tests/integ/__init__.py b/tests/integ/__init__.py index 8326b1cbd8..f080014b32 100644 --- a/tests/integ/__init__.py +++ b/tests/integ/__init__.py @@ -71,8 +71,6 @@ NO_LDA_REGIONS = ["eu-west-3", "eu-north-1", "sa-east-1", "ap-east-1"] NO_MARKET_PLACE_REGIONS = ["eu-west-3", "eu-north-1", "sa-east-1", "ap-east-1"] -EFS_TEST_ENABLED_REGION = ["us-west-2"] - logging.getLogger("boto3").setLevel(logging.INFO) logging.getLogger("botocore").setLevel(logging.INFO) diff --git a/tests/integ/file_system_input_utils.py b/tests/integ/file_system_input_utils.py index 94e2f8c4d3..cf29919813 100644 --- a/tests/integ/file_system_input_utils.py +++ b/tests/integ/file_system_input_utils.py @@ -14,6 +14,7 @@ import collections import logging +from operator import itemgetter import os from os import path import stat @@ -27,13 +28,12 @@ from tests.integ.vpc_test_utils import check_or_create_vpc_resources_efs_fsx VPC_NAME = "sagemaker-efs-fsx-vpc" +ALINUX_AMI_NAME_FILTER = "amzn-ami-hvm-????.??.?.????????-x86_64-gp2" EFS_CREATION_TOKEN = str(uuid.uuid4()) PREFIX = "ec2_fs_key_" KEY_NAME = PREFIX + str(uuid.uuid4().hex.upper()[0:8]) ROLE_NAME = "SageMakerRole" -REGION = "us-west-2" EC2_INSTANCE_TYPE = "t2.micro" -AMI_ID = "ami-082b5a644766e0e6f" MIN_COUNT = 1 MAX_COUNT = 1 @@ -69,12 +69,13 @@ def set_up_efs_fsx(sagemaker_session): _check_or_create_key_pair(sagemaker_session) _check_or_create_iam_profile_and_attach_role(sagemaker_session) subnet_ids, security_group_ids = check_or_create_vpc_resources_efs_fsx( - sagemaker_session, REGION, VPC_NAME + sagemaker_session, VPC_NAME ) + ami_id = _ami_id_for_region(sagemaker_session) ec2_instance = _create_ec2_instance( sagemaker_session, - AMI_ID, + ami_id, EC2_INSTANCE_TYPE, KEY_NAME, MIN_COUNT, @@ -100,9 +101,12 @@ def set_up_efs_fsx(sagemaker_session): mount_efs_target_id, ) + region = sagemaker_session.boto_region_name try: connected_instance = _connect_ec2_instance(ec2_instance) - _upload_data_and_mount_fs(connected_instance, 
def _ami_id_for_region(sagemaker_session):
    """Look up the newest available Amazon Linux AMI in the session's region.

    The EC2 client is created from ``sagemaker_session.boto_session``, so the
    lookup runs against whichever region that boto session is configured for.

    Args:
        sagemaker_session: Session object exposing a ``boto_session`` whose
            ``client("ec2")`` call returns an EC2 client.

    Returns:
        str: The ``ImageId`` of the matching image with the greatest
        ``CreationDate``.

    Raises:
        LookupError: If no image matches the search criteria.
    """
    ec2_client = sagemaker_session.boto_session.client("ec2")
    filters = [
        {"Name": "name", "Values": [ALINUX_AMI_NAME_FILTER]},
        {"Name": "state", "Values": ["available"]},
    ]
    # A name filter on its own also matches public images published by other
    # accounts under a look-alike name; restrict the search to Amazon-owned AMIs
    # so the test instance never boots from a third-party image.
    response = ec2_client.describe_images(Owners=["amazon"], Filters=filters)
    images = response["Images"]

    if not images:
        raise LookupError(
            "AMI was not found based on current search criteria: {}".format(filters)
        )

    # CreationDate values are compared as returned by EC2 (presumably ISO-8601
    # strings, which order chronologically when compared lexicographically).
    newest_image = max(images, key=itemgetter("CreationDate"))
    return newest_image["ImageId"]
= check_or_create_vpc_resources_efs_fsx( - sagemaker_session, REGION, VPC_NAME + sagemaker_session, VPC_NAME ) efs_client = sagemaker_session.boto_session.client("efs") mount_response = efs_client.create_mount_target( @@ -188,7 +207,7 @@ def _create_efs_mount(sagemaker_session, file_system_id): def _check_or_create_fsx(sagemaker_session): fsx_client = sagemaker_session.boto_session.client("fsx") subnet_ids, security_group_ids = check_or_create_vpc_resources_efs_fsx( - sagemaker_session, REGION, VPC_NAME + sagemaker_session, VPC_NAME ) create_response = fsx_client.create_file_system( FileSystemType="LUSTRE", diff --git a/tests/integ/test_kmeans_efs_fsx.py b/tests/integ/test_kmeans_efs_fsx.py index 9f4b647c44..5c5d2dd57c 100644 --- a/tests/integ/test_kmeans_efs_fsx.py +++ b/tests/integ/test_kmeans_efs_fsx.py @@ -14,7 +14,6 @@ import pytest -import tests.integ from sagemaker import KMeans from sagemaker.amazon.amazon_estimator import FileSystemRecordSet from sagemaker.parameter import IntegerParameter, CategoricalParameter @@ -25,7 +24,6 @@ from tests.integ.s3_utils import assert_s3_files_exist from tests.integ.timeout import timeout -TRAIN_INSTANCE_TYPE = "ml.c4.xlarge" TRAIN_INSTANCE_COUNT = 1 OBJECTIVE_METRIC_NAME = "test:msd" EFS_DIR_PATH = "/one_p_mnist" @@ -46,11 +44,7 @@ def efs_fsx_setup(sagemaker_session): tear_down(sagemaker_session, fs_resources) -@pytest.mark.skipif( - tests.integ.test_region() not in tests.integ.EFS_TEST_ENABLED_REGION, - reason="EFS integration tests need to be fixed before running in all regions.", -) -def test_kmeans_efs(efs_fsx_setup, sagemaker_session): +def test_kmeans_efs(efs_fsx_setup, sagemaker_session, cpu_instance_type): with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES): subnets = [efs_fsx_setup.subnet_id] security_group_ids = efs_fsx_setup.security_group_ids @@ -58,7 +52,7 @@ def test_kmeans_efs(efs_fsx_setup, sagemaker_session): kmeans = KMeans( role=role, train_instance_count=TRAIN_INSTANCE_COUNT, - 
train_instance_type=TRAIN_INSTANCE_TYPE, + train_instance_type=cpu_instance_type, k=K, sagemaker_session=sagemaker_session, subnets=subnets, @@ -80,11 +74,7 @@ def test_kmeans_efs(efs_fsx_setup, sagemaker_session): assert_s3_files_exist(sagemaker_session, model_path, ["model.tar.gz"]) -@pytest.mark.skipif( - tests.integ.test_region() not in tests.integ.EFS_TEST_ENABLED_REGION, - reason="EFS integration tests need to be fixed before running in all regions.", -) -def test_kmeans_fsx(efs_fsx_setup, sagemaker_session): +def test_kmeans_fsx(efs_fsx_setup, sagemaker_session, cpu_instance_type): with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES): subnets = [efs_fsx_setup.subnet_id] security_group_ids = efs_fsx_setup.security_group_ids @@ -92,7 +82,7 @@ def test_kmeans_fsx(efs_fsx_setup, sagemaker_session): kmeans = KMeans( role=role, train_instance_count=TRAIN_INSTANCE_COUNT, - train_instance_type=TRAIN_INSTANCE_TYPE, + train_instance_type=cpu_instance_type, k=K, sagemaker_session=sagemaker_session, subnets=subnets, @@ -114,18 +104,14 @@ def test_kmeans_fsx(efs_fsx_setup, sagemaker_session): assert_s3_files_exist(sagemaker_session, model_path, ["model.tar.gz"]) -@pytest.mark.skipif( - tests.integ.test_region() not in tests.integ.EFS_TEST_ENABLED_REGION, - reason="EFS integration tests need to be fixed before running in all regions.", -) -def test_tuning_kmeans_efs(efs_fsx_setup, sagemaker_session): +def test_tuning_kmeans_efs(efs_fsx_setup, sagemaker_session, cpu_instance_type): subnets = [efs_fsx_setup.subnet_id] security_group_ids = efs_fsx_setup.security_group_ids role = efs_fsx_setup.role_name kmeans = KMeans( role=role, train_instance_count=TRAIN_INSTANCE_COUNT, - train_instance_type=TRAIN_INSTANCE_TYPE, + train_instance_type=cpu_instance_type, k=K, sagemaker_session=sagemaker_session, subnets=subnets, @@ -174,18 +160,14 @@ def test_tuning_kmeans_efs(efs_fsx_setup, sagemaker_session): assert best_training_job -@pytest.mark.skipif( - tests.integ.test_region() not 
in tests.integ.EFS_TEST_ENABLED_REGION, - reason="EFS integration tests need to be fixed before running in all regions.", -) -def test_tuning_kmeans_fsx(efs_fsx_setup, sagemaker_session): +def test_tuning_kmeans_fsx(efs_fsx_setup, sagemaker_session, cpu_instance_type): subnets = [efs_fsx_setup.subnet_id] security_group_ids = efs_fsx_setup.security_group_ids role = efs_fsx_setup.role_name kmeans = KMeans( role=role, train_instance_count=TRAIN_INSTANCE_COUNT, - train_instance_type=TRAIN_INSTANCE_TYPE, + train_instance_type=cpu_instance_type, k=K, sagemaker_session=sagemaker_session, subnets=subnets, diff --git a/tests/integ/test_tf_efs_fsx.py b/tests/integ/test_tf_efs_fsx.py index 12b52e5850..2186bb2a2e 100644 --- a/tests/integ/test_tf_efs_fsx.py +++ b/tests/integ/test_tf_efs_fsx.py @@ -17,7 +17,6 @@ import pytest -import tests.integ from sagemaker.inputs import FileSystemInput from sagemaker.parameter import IntegerParameter from sagemaker.tensorflow import TensorFlow @@ -32,7 +31,6 @@ MNIST_RESOURCE_PATH = os.path.join(RESOURCE_PATH, "tensorflow_mnist") SCRIPT = os.path.join(MNIST_RESOURCE_PATH, "mnist.py") TFS_RESOURCE_PATH = os.path.join(RESOURCE_PATH, "tfs", "tfs-test-entrypoint-with-handler") -INSTANCE_TYPE = "ml.c4.xlarge" EFS_DIR_PATH = "/tensorflow" FSX_DIR_PATH = "/fsx/tensorflow" MAX_JOBS = 2 @@ -49,11 +47,7 @@ def efs_fsx_setup(sagemaker_session): tear_down(sagemaker_session, fs_resources) -@pytest.mark.skipif( - tests.integ.test_region() not in tests.integ.EFS_TEST_ENABLED_REGION, - reason="EFS integration tests need to be fixed before running in all regions.", -) -def test_mnist_efs(efs_fsx_setup, sagemaker_session): +def test_mnist_efs(efs_fsx_setup, sagemaker_session, cpu_instance_type): role = efs_fsx_setup.role_name subnets = [efs_fsx_setup.subnet_id] security_group_ids = efs_fsx_setup.security_group_ids @@ -62,7 +56,7 @@ def test_mnist_efs(efs_fsx_setup, sagemaker_session): entry_point=SCRIPT, role=role, train_instance_count=1, - 
train_instance_type=INSTANCE_TYPE, + train_instance_type=cpu_instance_type, sagemaker_session=sagemaker_session, script_mode=True, framework_version=TensorFlow.LATEST_VERSION, @@ -85,11 +79,7 @@ def test_mnist_efs(efs_fsx_setup, sagemaker_session): ) -@pytest.mark.skipif( - tests.integ.test_region() not in tests.integ.EFS_TEST_ENABLED_REGION, - reason="EFS integration tests need to be fixed before running in all regions.", -) -def test_mnist_lustre(efs_fsx_setup, sagemaker_session): +def test_mnist_lustre(efs_fsx_setup, sagemaker_session, cpu_instance_type): role = efs_fsx_setup.role_name subnets = [efs_fsx_setup.subnet_id] security_group_ids = efs_fsx_setup.security_group_ids @@ -98,7 +88,7 @@ def test_mnist_lustre(efs_fsx_setup, sagemaker_session): entry_point=SCRIPT, role=role, train_instance_count=1, - train_instance_type=INSTANCE_TYPE, + train_instance_type=cpu_instance_type, sagemaker_session=sagemaker_session, script_mode=True, framework_version=TensorFlow.LATEST_VERSION, @@ -121,11 +111,7 @@ def test_mnist_lustre(efs_fsx_setup, sagemaker_session): ) -@pytest.mark.skipif( - tests.integ.test_region() not in tests.integ.EFS_TEST_ENABLED_REGION, - reason="EFS integration tests need to be fixed before running in all regions.", -) -def test_tuning_tf_script_mode_efs(efs_fsx_setup, sagemaker_session): +def test_tuning_tf_script_mode_efs(efs_fsx_setup, sagemaker_session, cpu_instance_type): role = efs_fsx_setup.role_name subnets = [efs_fsx_setup.subnet_id] security_group_ids = efs_fsx_setup.security_group_ids @@ -134,7 +120,7 @@ def test_tuning_tf_script_mode_efs(efs_fsx_setup, sagemaker_session): entry_point=SCRIPT, role=role, train_instance_count=1, - train_instance_type=INSTANCE_TYPE, + train_instance_type=cpu_instance_type, script_mode=True, sagemaker_session=sagemaker_session, py_version=PY_VERSION, @@ -169,11 +155,7 @@ def test_tuning_tf_script_mode_efs(efs_fsx_setup, sagemaker_session): assert best_training_job -@pytest.mark.skipif( - 
tests.integ.test_region() not in tests.integ.EFS_TEST_ENABLED_REGION, - reason="EFS integration tests need to be fixed before running in all regions.", -) -def test_tuning_tf_script_mode_lustre(efs_fsx_setup, sagemaker_session): +def test_tuning_tf_script_mode_lustre(efs_fsx_setup, sagemaker_session, cpu_instance_type): role = efs_fsx_setup.role_name subnets = [efs_fsx_setup.subnet_id] security_group_ids = efs_fsx_setup.security_group_ids @@ -182,7 +164,7 @@ def test_tuning_tf_script_mode_lustre(efs_fsx_setup, sagemaker_session): entry_point=SCRIPT, role=role, train_instance_count=1, - train_instance_type=INSTANCE_TYPE, + train_instance_type=cpu_instance_type, script_mode=True, sagemaker_session=sagemaker_session, py_version=PY_VERSION, diff --git a/tests/integ/vpc_test_utils.py b/tests/integ/vpc_test_utils.py index ec3f01a51e..301833c3a4 100644 --- a/tests/integ/vpc_test_utils.py +++ b/tests/integ/vpc_test_utils.py @@ -62,7 +62,7 @@ def _route_table_id(ec2_client, vpc_id): return desc["RouteTables"][0]["RouteTableId"] -def check_or_create_vpc_resources_efs_fsx(sagemaker_session, region, name=VPC_NAME): +def check_or_create_vpc_resources_efs_fsx(sagemaker_session, name=VPC_NAME): # use lock to prevent race condition when tests are running concurrently with lock.lock(LOCK_PATH): ec2_client = sagemaker_session.boto_session.client("ec2") @@ -74,13 +74,11 @@ def check_or_create_vpc_resources_efs_fsx(sagemaker_session, region, name=VPC_NA _security_group_ids_by_vpc_id(sagemaker_session, vpc_id), ) else: - return _create_vpc_with_name_efs_fsx(ec2_client, region, name) + return _create_vpc_with_name_efs_fsx(ec2_client, name) -def _create_vpc_with_name_efs_fsx(ec2_client, region, name): - vpc_id, [subnet_id_a, subnet_id_b], security_group_id = _create_vpc_resources( - ec2_client, region, name - ) +def _create_vpc_with_name_efs_fsx(ec2_client, name): + vpc_id, [subnet_id_a, subnet_id_b], security_group_id = _create_vpc_resources(ec2_client, name) 
ec2_client.modify_vpc_attribute(EnableDnsHostnames={"Value": True}, VpcId=vpc_id) ig = ec2_client.create_internet_gateway() @@ -121,7 +119,7 @@ def _create_vpc_with_name_efs_fsx(ec2_client, region, name): return [subnet_id_a], [security_group_id] -def _create_vpc_resources(ec2_client, region, name): +def _create_vpc_resources(ec2_client, name): vpc_id = ec2_client.create_vpc(CidrBlock="10.0.0.0/16")["Vpc"]["VpcId"] print("created vpc: {}".format(vpc_id)) diff --git a/tests/scripts/fs_mount_setup.sh b/tests/scripts/fs_mount_setup.sh index a5e5eaa051..111c360498 100644 --- a/tests/scripts/fs_mount_setup.sh +++ b/tests/scripts/fs_mount_setup.sh @@ -16,18 +16,19 @@ # Mounting EFS and FSx for Lustre file systems for integration Tests FILE_SYSTEM_EFS_ID=$1 FILE_SYSTEM_FSX_ID=$2 +REGION=$3 echo "Mounting EFS File Systems" -sudo yum install -y amazon-efs-utils.noarch 0:1.10-1.amzn2 +sudo yum install -y amazon-efs-utils sudo mkdir efs sudo mount -t efs "$FILE_SYSTEM_EFS_ID":/ efs sudo mkdir efs/tensorflow sudo mkdir efs/one_p_mnist echo "Mounting FSx for Lustre File System" -sudo amazon-linux-extras install -y lustre2.10 +sudo yum install -y lustre-client sudo mkdir -p /mnt/fsx -sudo mount -t lustre -o noatime,flock "$FILE_SYSTEM_FSX_ID".fsx.us-west-2.amazonaws.com@tcp:/fsx /mnt/fsx +sudo mount -t lustre -o noatime,flock "$FILE_SYSTEM_FSX_ID".fsx."$REGION".amazonaws.com@tcp:/fsx /mnt/fsx sudo mkdir /mnt/fsx/tensorflow sudo mkdir /mnt/fsx/one_p_mnist