From f3329c77731eee5f6b7fc0beea6dba61995f5bfb Mon Sep 17 00:00:00 2001 From: Dipankar Patro Date: Sat, 22 Apr 2023 10:53:09 -0700 Subject: [PATCH 1/6] documentation: Adding Remote Function updates * Add remote function related classes and method specifications * Update Configuration File specification to include remote function function as supported SageMaker capability. --- doc/overview.rst | 74 ++++++++++++++++++- .../sagemaker.remote_function.rst | 26 +++++++ 2 files changed, 97 insertions(+), 3 deletions(-) create mode 100644 doc/remote_function/sagemaker.remote_function.rst diff --git a/doc/overview.rst b/doc/overview.rst index 7f6490a58b..2a5397abca 100644 --- a/doc/overview.rst +++ b/doc/overview.rst @@ -2021,6 +2021,31 @@ set default values for. For the full schema, see ``sagemaker.config.config_schem     Tags:     - Key: 'tag_key'       Value: 'tag_value' + PythonSDK: + Modules: + RemoteFunction: + Dependencies: 'path/to/requirements.txt' + EnableInterContainerTrafficEncryption: true + EnvironmentVariables: {'EnvVarKey': 'EnvVarValue'} + ImageUri: '366666666666.dkr.ecr.us-west-2.amazonaws.com/my-image:latest' + IncludeLocalWorkDir: true + InstanceType: 'ml.m5.large' + JobCondaEnvironment: 'your_conda_env' + PreExecutionCommands: + - 'command_1' + - 'command_2' + PreExecutionScript: 'path/to/script.sh' + RoleArn: 'arn:aws:iam::366666666666:role/MyRole' + S3KmsKeyId: 'yourkmskeyid' + S3RootUri: 's3://my-bucket/my-project' + VpcConfig: + SecurityGroupIds: + - 'sg123' + Subnets: + - 'subnet-1234' + Tags: + - {'Key': 'yourTagKey', 'Value': 'yourTagValue'} + VolumeKmsKeyId: 'yourkmskeyid' Configuration file locations ============================ @@ -2170,8 +2195,8 @@ types support setting defaults with a configuration file. 
- Tags - Enable inter-container traffic encryption -List of APIs supported ----------------------- +List of APIs and SDK capabilities supported +------------------------------------------- Default values for the supported parameters of these APIs apply to all create and update calls for that API. For example, if a supported @@ -2202,6 +2227,10 @@ configuration file. Hyperparameter Tuning Job: Supported indirectly via ``TrainingJob`` API. While this API is not directly supported, it includes the training job definition as a parameter. If you provide defaults for this parameter as part of the ``TrainingJob`` API, these defaults are also used for Hyperparameter Tuning Job. +The following groups of SDK capabilities support defaults with a configuration file. + +- Remote Function ``@remote`` decorator, ``RemoteExecutor`` + Configuration file resolution ============================= @@ -2411,7 +2440,46 @@ specifically the contents of ``'body': b'{...}`` . botocore.endpoint [DEBUG] Making request for OperationModel(name=) with params: {'url_path': ..., 'query_string': ..., 'method': 'POST', 'headers': {...}, 'body': b'{...}', 'url': 'https://api.sagemaker.us-west-2.amazonaws.com/', - 'context': {...}} + 'context': {...}}cd + + +************************************************************ +Run Machine Learning code on SageMaker using remote function +************************************************************ + +You can seamlessly integrate your local machine learning (ML) code to run in an Amazon SageMaker Training job by wrapping +your code inside a @remote decorator as shown in the following code example. + +.. 
code-block:: python + + from sagemaker.remote_function import remote + import numpy as np + + @remote(instance_type="ml.m5.large") + def matrix_multiply(a, b): + return np.matmul(a, b) + + a = np.array([[1, 0], + [0, 1]]) + b = np.array([1, 2]) + + assert (matrix_multiply(a, b) == np.array([1,2])).all() + +The SageMaker Python SDK will automatically translate your existing workspace environment and any associated data +processing code and datasets into a SageMaker Training job that runs on the SageMaker Training platform. +You can also activate a persistent cache feature, which will further reduce job start up latency by caching previously +downloaded dependency packages. This reduction in job latency is greater than the reduction in latency from using +SageMaker managed warm pools alone. The following sections show you how to wrap your local ML code and tailor your +experience for your use case including customizing your environment and integrating with SageMaker Experiments. + +See the `Run your local code as a SageMaker Training job `__ for the detailed developer guide. + +The following is the API specification for methods and classes related to the remote function feature. + +.. toctree:: + :maxdepth: 1 + + remote_function/sagemaker.remote_function.rst *** FAQ diff --git a/doc/remote_function/sagemaker.remote_function.rst b/doc/remote_function/sagemaker.remote_function.rst new file mode 100644 index 0000000000..266e4e6c1f --- /dev/null +++ b/doc/remote_function/sagemaker.remote_function.rst @@ -0,0 +1,26 @@ +Remote function classes and methods specification +================================================= + + +@remote decorator +----------------- + +.. automethod:: sagemaker.remote_function.client.remote + + +RemoteExecutor +-------------- + +.. autoclass:: sagemaker.remote_function.RemoteExecutor + :members: + + +Future +------ + +.. autoclass:: sagemaker.remote_function.client.Future + :members: + +.. automethod:: sagemaker.remote_function.client.list_futures + +.. 
automethod:: sagemaker.remote_function.client.get_future From ef8a9ecd72b811b9a8fc667d0b54eaa08981767d Mon Sep 17 00:00:00 2001 From: Dipankar Patro Date: Sat, 22 Apr 2023 10:58:36 -0700 Subject: [PATCH 2/6] Add missed remote_function/client.py --- src/sagemaker/remote_function/client.py | 468 ++++++++++++++++++------ 1 file changed, 360 insertions(+), 108 deletions(-) diff --git a/src/sagemaker/remote_function/client.py b/src/sagemaker/remote_function/client.py index a07f7baeb0..f7de0b7c33 100644 --- a/src/sagemaker/remote_function/client.py +++ b/src/sagemaker/remote_function/client.py @@ -82,49 +82,174 @@ def remote( volume_size: int = 30, encrypt_inter_container_traffic: bool = None, ): - """Function that starts a new SageMaker job synchronously with overridden runtime settings. + """Decorator for running the annotated function as a SageMaker training job. + + This decorator wraps the annotated code and runs it in a new SageMaker job synchronously + with the provided runtime settings. + + Unless mentioned otherwise, the decorator first looks up the value from the SageMaker + configuration file. If no value is specified in the configuration file or no configuration file + is found, the decorator selects the default as specified below. For more information, see + `Configuring and using defaults with the SageMaker Python SDK `_. Args: - _func (Optional): Python function to be executed on the SageMaker job runtime environment. - dependencies (str): Path to dependencies file or a reserved keyword - ``auto_capture``. Defaults to None. + _func (Optional): A Python function to run as a SageMaker training job. + + dependencies (str): Either the path to a dependencies file or the reserved keyword + ``auto_capture``. Defaults to ``None``. + If dependencies is provided, the value must be one of the following: + + * A path to a conda environment.yml file. The following conditions apply. 
+ + * If job_conda_env is set, then the conda environment is updated by installing + dependencies from the yaml file and the function is invoked within that + conda environment. For this to succeed, the specified conda environment must + already exist in the image. + * If the environment variable ``SAGEMAKER_JOB_CONDA_ENV`` is set in the image, then the + conda environment is updated by installing dependencies from the yaml file and the + function is invoked within that conda environment. For this to succeed, the + conda environment name must already be set in ``SAGEMAKER_JOB_CONDA_ENV``, and + ``SAGEMAKER_JOB_CONDA_ENV`` must already exist in the image. + * If none of the previous conditions are met, a new conda environment named + ``sagemaker-runtime-env`` is created and the function annotated with the remote + decorator is invoked in that conda environment. + + * A path to a requirements.txt file. The following conditions apply. + + * If ``job_conda_env`` is set in the remote decorator, dependencies are installed + within that conda environment and the function annotated with the remote decorator + is invoked in the same conda environment. For this to succeed, the specified + conda environment must already exist in the image. + * If an environment variable ``SAGEMAKER_JOB_CONDA_ENV`` is set in the image, + dependencies are installed within that conda environment and the function annotated + with the remote decorator is invoked in the same. For this to succeed, the conda + environment name must already be set in ``SAGEMAKER_JOB_CONDA_ENV``, and + ``SAGEMAKER_JOB_CONDA_ENV`` must already exist in the image. + * If none of the above conditions are met, conda is not used. Dependencies are + installed at the system level, without any virtual environment, and the function + annotated with the remote decorator is invoked using the python runtime available + in the system path. + + * The parameter dependencies is set to auto_capture. 
SageMaker will automatically + generate a env_snapshot.yml corresponding to the current active conda environment’s + snapshot. You do not need to provide a dependencies file. The following conditions + apply: + + * You must run the remote function within an active conda environment. + * When installing the dependencies on the training job, the same conditions as when + dependencies is set to a path to a conda environment file apply. These conditions are + as follows: + + * If job_conda_env is set, then the conda environment is updated by installing + dependencies from the yaml file and the function is invoked within that + conda environment. For this to succeed, the specified conda environment must + already exist in the image. + * If the environment variable ``SAGEMAKER_JOB_CONDA_ENV`` is set in the image, then + the conda environment is updated by installing dependencies from the yaml file + and the function is invoked within that conda environment. For this to + succeed, the conda environment name must already be set in + ``SAGEMAKER_JOB_CONDA_ENV``, and ``SAGEMAKER_JOB_CONDA_ENV`` must already exist + in the image. + * If none of the previous conditions are met, a new conda environment with name + ``sagemaker-runtime-env`` is created and the function annotated with the + remote decorator is invoked in that conda environment. + + * ``None``. SageMaker will assume that there are no dependencies to install while + executing the remote annotated function in the training job. + pre_execution_commands (List[str]): List of commands to be executed prior to executing - remote function. Only one of ``pre_execution_commands`` or ``pre_execution_script`` - can be specified at the same time. Defaults to None. + remote function. Only one of ``pre_execution_commands`` or ``pre_execution_script`` + can be specified at the same time. Defaults to None. + pre_execution_script (str): Path to script file to be executed prior to executing - remote function. 
Only one of ``pre_execution_commands`` or ``pre_execution_script`` - can be specified at the same time. Defaults to None. - environment_variables (Dict): environment variables - image_uri (str): Docker image URI on ECR. - include_local_workdir (bool): Set to ``True`` if the remote function code imports local - modules and methods that are not available via PyPI or conda. Default value is ``False``. - instance_count (int): Number of instance to use. Default is 1. - instance_type (str): EC2 instance type. - job_conda_env (str): Name of the conda environment to activate during execution of the job. - Default is None. - job_name_prefix (str): Prefix used to identify the underlying sagemaker job. - keep_alive_period_in_seconds (int): The duration of time in seconds to retain configured - resources in a warm pool for subsequent training jobs. Default is 0. - max_retry_attempts (int): Max number of times the job is retried on InternalServerFailure. - Default is 1. - max_runtime_in_seconds (int): Timeout in seconds for training. After this amount of time - Amazon SageMaker terminates the job regardless of its current status. - Default is 86400 seconds (1 day). - role (str): IAM role used for SageMaker execution. - s3_kms_key (str): The encryption key used for storing serialized data. - s3_root_uri (str): The root S3 folder where the code archives and data are uploaded to. - sagemaker_session (sagemaker.session.Session): The underlying SageMaker session which - AWS service calls are delegated to (default: None). If not provided, one is created - with default AWS configuration chain. - security_group_ids (List[str]): List of security group IDs. - subnets (List[str]): List of subnet IDs. - tags (List[Tuple[str, str]]): List of tags attached to the job. - volume_kms_key (str): KMS key used for encrypting EBS volume attached to the training - instance. - volume_size (int): Size in GB of the storage volume to use for storing input and output - data. Default is 30. 
- encrypt_inter_container_traffic (bool): Specifies whether traffic between training - containers is encrypted for the training job. (default: ``False``). + remote function. Only one of ``pre_execution_commands`` or ``pre_execution_script`` + can be specified at the same time. Defaults to None. + + environment_variables (Dict): The environment variables used inside the decorator function. + Defaults to ``None``. + + image_uri (str): The uniform resource identifier (URI) location of a Docker image on + Amazon Elastic Container Registry (ECR). Defaults to the following based on where the SDK + is running: + + * For SageMaker Studio notebook cases, the image used as the kernel image for the + notebook is used. + * For other cases, it is resolved to base python image with the same python version + as the environment running the local code. + + If no compatible image is found, a ValueError is thrown. + + include_local_workdir (bool): A flag to indicate that the remote function should include + local directories. Set to ``True`` if the remote function code imports local modules and + methods that are not available via PyPI or conda. Default value is ``False``. + + instance_count (int): The number of instances to use. Defaults to 1. + + instance_type (str): The Amazon Elastic Compute Cloud (EC2) instance type to use to run + the SageMaker job. Valid values include ml.c4.xlarge. If not provided, ValueError is + thrown. + + job_conda_env (str): The name of the conda environment to activate during job's runtime. + Defaults to ``None``. + + job_name_prefix (str): The prefix used to create the underlying SageMaker job. + + keep_alive_period_in_seconds (int): The duration in seconds to retain and reuse provisioned + infrastructure after the completion of a training job, also known as SageMaker managed + warm pools. The use of warm pools reduces the latency time spent to provision new + resources. The default value for ``keep_alive_period_in_seconds`` is 0. 
+ NOTE: Additional charges associated with warm pools may apply. Using this parameter will + also activate a new persistent cache feature, which will further reduce job start up + latency beyond using SageMaker managed warm pools alone by caching the package source + downloaded in the previous runs. + + max_retry_attempts (int): The max number of times the job is retried after an + ``InternalServerFailure`` error. Defaults to 1. + + max_runtime_in_seconds (int): The upper limit in seconds to be used for training. After + this specified amount of time, SageMaker terminates the job regardless of its current + status. The limit defaults to 1 day (86400 seconds). + + role (str): The IAM role (either name or full ARN) used to run your SageMaker training + job. Defaults to: + + * the SageMaker default IAM role if the SDK is running in SageMaker Notebooks or + SageMaker Studio Notebooks. + * if not above, a ValueError is thrown. + + s3_kms_key (str): The key used to encrypt the input and output data. Defaults to ``None``. + + s3_root_uri (str): The root S3 folder to which the code archives and data are + uploaded. Defaults to ``s3://``. + + sagemaker_session (sagemaker.session.Session): The underlying SageMaker session to which + SageMaker service calls are delegated (default: None). If not provided, one is created + using a default configuration chain. + + security_group_ids (List[str]): A list of security group IDs. Defaults to ``None`` and the + training job is created without VPC config. + + subnets (List[str]): A list of subnet IDs. Defaults to ``None`` and the job is created + without VPC config. + + tags (List[Tuple[str, str]]): A list of tags attached to the job. Defaults to ``None`` and + the training job is created without tags. + + volume_kms_key (str): An AWS Key Management Service (KMS) key used to encrypt an + Amazon Elastic Block Store (EBS) volume attached to the training instance. Defaults to + ``None``. 
+ + volume_size (int): The size in GB of the storage volume for storing input and output data + during training. Defaults to ``30``. + + encrypt_inter_container_traffic (bool): A flag that specifies whether traffic between + training containers is encrypted for the training job. Defaults to ``False``. + + enable_network_isolation (bool): A flag that specifies whether container will run in + network isolation mode. Defaults to ``False``. Network isolation mode restricts the + container access to outside networks (such as the Internet). The container does not + make any inbound or outbound network calls. Also known as Internet-free mode. """ def _remote(func): @@ -349,55 +474,175 @@ def __init__( volume_size: int = 30, encrypt_inter_container_traffic: bool = None, ): - """Initiates a ``RemoteExecutor`` instance. + """Constructor for RemoteExecutor. + + Unless mentioned otherwise, the constructor first looks up the value from the SageMaker + configuration file. If no value is specified in the configuration file or no configuration + file is found, the constructor selects the default as specified below. For more + information, see `Configuring and using defaults with the SageMaker Python SDK + `_. Args: - dependencies (str): Path to dependencies file or a reserved keyword - ``auto_capture``. Defaults to None. + _func (Optional): A Python function to run as a SageMaker training job. + + dependencies (str): Either the path to a dependencies file or the reserved keyword + ``auto_capture``. Defaults to ``None``. + If dependencies is provided, the value must be one of the following: + + * A path to a conda environment.yml file. The following conditions apply. + + * If job_conda_env is set, then the conda environment is updated by installing + dependencies from the yaml file and the function is invoked within that + conda environment. For this to succeed, the specified conda environment must + already exist in the image. 
+ * If the environment variable ``SAGEMAKER_JOB_CONDA_ENV`` is set in the image, then + the conda environment is updated by installing dependencies from the yaml file and + the function is invoked within that conda environment. For this to succeed, the + conda environment name must already be set in ``SAGEMAKER_JOB_CONDA_ENV``, and + ``SAGEMAKER_JOB_CONDA_ENV`` must already exist in the image. + * If none of the previous conditions are met, a new conda environment named + ``sagemaker-runtime-env`` is created and the function annotated with the remote + decorator is invoked in that conda environment. + + * A path to a requirements.txt file. The following conditions apply. + + * If ``job_conda_env`` is set in the remote decorator, dependencies are installed + within that conda environment and the function annotated with the remote decorator + is invoked in the same conda environment. For this to succeed, the specified + conda environment must already exist in the image. + * If an environment variable ``SAGEMAKER_JOB_CONDA_ENV`` is set in the image, + dependencies are installed within that conda environment and the function annotated + with the remote decorator is invoked in the same. For this to succeed, the + conda environment name must already be set in ``SAGEMAKER_JOB_CONDA_ENV``, and + ``SAGEMAKER_JOB_CONDA_ENV`` must already exist in the image. + * If none of the above conditions are met, conda is not used. Dependencies are + installed at the system level, without any virtual environment, and the function + annotated with the remote decorator is invoked using the python runtime available + in the system path. + + * The parameter dependencies is set to auto_capture. SageMaker will automatically + generate a env_snapshot.yml corresponding to the current active conda environment’s + snapshot. You do not need to provide a dependencies file. The following conditions + apply: + + * You must run the remote function within an active conda environment. 
+ * When installing the dependencies on the training job, the same conditions as when + dependencies is set to a path to a conda environment file apply. These conditions + are as follows: + + * If job_conda_env is set, then the conda environment is updated by installing + dependencies from the yaml file and the function is invoked within that + conda environment. For this to succeed, the specified conda environment must + already exist in the image. + * If the environment variable ``SAGEMAKER_JOB_CONDA_ENV`` is set in the image, + then the conda environment is updated by installing dependencies from the yaml + file and the function is invoked within that conda environment. For this to + succeed, the conda environment name must already be set in + ``SAGEMAKER_JOB_CONDA_ENV``, and ``SAGEMAKER_JOB_CONDA_ENV`` must already exist + in the image. + * If none of the previous conditions are met, a new conda environment with name + ``sagemaker-runtime-env`` is created and the function annotated with the + remote decorator is invoked in that conda environment. + + * ``None``. SageMaker will assume that there are no dependencies to install while + executing the remote annotated function in the training job. + pre_execution_commands (List[str]): List of commands to be executed prior to executing - remote function. Only one of ``pre_execution_commands`` or ``pre_execution_script`` - can be specified at the same time. Defaults to None. + remote function. Only one of ``pre_execution_commands`` or ``pre_execution_script`` + can be specified at the same time. Defaults to None. + pre_execution_script (str): Path to script file to be executed prior to executing - remote function. Only one of ``pre_execution_commands`` or ``pre_execution_script`` - can be specified at the same time. Defaults to None. - environment_variables (Dict): Environment variables passed to the underlying sagemaker - job. Defaults to None - image_uri (str): Docker image URI on ECR. Defaults to base Python image. 
- include_local_workdir (bool): Set to ``True`` if the remote function code imports local - modules and methods that are not available via PyPI or conda. Default value is - ``False``. - instance_count (int): Number of instance to use. Defaults to 1. - instance_type (str): EC2 instance type. - job_conda_env (str): Name of the conda environment to activate during execution - of the job. Default is None. - job_name_prefix (str): Prefix used to identify the underlying sagemaker job. - keep_alive_period_in_seconds (int): The duration of time in seconds to retain configured - resources in a warm pool for subsequent training jobs. Defaults to 0. - max_parallel_jobs (int): Maximal number of jobs that run in parallel. Default to 1. - max_retry_attempts (int): Max number of times the job is retried on - InternalServerFailure.Defaults to 1. - max_runtime_in_seconds (int): Timeout in seconds for training. After this amount of - time Amazon SageMaker terminates the job regardless of its current status. - Defaults to 86400 seconds (1 day). - role (str): IAM role used for SageMaker execution. Defaults to SageMaker default - execution role. - s3_kms_key (str): The encryption key used for storing serialized data. Defaults to S3 - managed key. - s3_root_uri (str): The root S3 folder where the code archives and data are uploaded to. - This parameter is autogenerated using information regarding the image uri if not - provided. - sagemaker_session (sagemaker.session.Session): The underlying SageMaker session which - AWS service calls are delegated to (default: None). If not provided, one is created - with default AWS configuration chain. - security_group_ids (List[str]): List of security group IDs. Defaults to None. - subnets (List[str]): List of subnet IDs. Defaults to None. - tags (List[Tuple[str, str]]): List of tags attached to the job. Defaults to None. - volume_kms_key (str): KMS key used for encrypting EBS volume attached to the training - instance. 
- volume_size (int): Size in GB of the storage volume to use for storing input and output - data. Defaults to 30. - encrypt_inter_container_traffic (bool): Specifies whether traffic between training - containers is encrypted for the training job. (default: ``False``). + remote function. Only one of ``pre_execution_commands`` or ``pre_execution_script`` + can be specified at the same time. Defaults to None. + + environment_variables (Dict): The environment variables used inside the decorator + function. Defaults to ``None``. + + image_uri (str): The uniform resource identifier (URI) location of a Docker image on + Amazon Elastic Container Registry (ECR). Defaults to the following based on where the + SDK is running: + + * For SageMaker Studio notebook cases, the image used as the kernel image for the + notebook is used. + * For other cases, it is resolved to base python image with the same python + version as the environment running the local code. + + If no compatible image is found, a ValueError is thrown. + + include_local_workdir (bool): A flag to indicate that the remote function should include + local directories. Set to ``True`` if the remote function code imports local modules + and methods that are not available via PyPI or conda. Default value is ``False``. + + instance_count (int): The number of instances to use. Defaults to 1. + + instance_type (str): The Amazon Elastic Compute Cloud (EC2) instance type to use to run + the SageMaker job. Valid values include ml.c4.xlarge. If not provided, ValueError is + thrown. + + job_conda_env (str): The name of the conda environment to activate during job's runtime. + Defaults to ``None``. + + job_name_prefix (str): The prefix used to create the underlying SageMaker job. + + keep_alive_period_in_seconds (int): The duration in seconds to retain and reuse + provisioned infrastructure after the completion of a training job, also known as + SageMaker managed warm pools. 
The use of warm pools reduces the latency time spent to + provision new resources. The default value for ``keep_alive_period_in_seconds`` is 0. + Additional charges associated with warm pools may apply. Using this parameter will + also activate a new persistent cache feature, which will further reduce job start up + latency beyond using SageMaker managed warm pools alone by caching the package + source downloaded in the previous runs. + + max_parallel_jobs (int): Maximum number of jobs that run in parallel. Defaults to 1. + + max_retry_attempts (int): The max number of times the job is retried after an + ``InternalServerFailure`` error. Defaults to 1. + + max_runtime_in_seconds (int): The upper limit in seconds to be used for training. After + this specified amount of time, SageMaker terminates the job regardless of its current + status. The limit defaults to 1 day (86400 seconds). + + role (str): The IAM role (either name or full ARN) used to run your SageMaker training + job. Defaults to: + + * the SageMaker default IAM role if the SDK is running in SageMaker Notebooks or + SageMaker Studio Notebooks. + * if not above, a ValueError is thrown. + + s3_kms_key (str): The key used to encrypt the input and output data. + Defaults to ``None``. + + s3_root_uri (str): The root S3 folder to which the code archives and data are + uploaded. Defaults to ``s3://``. + + sagemaker_session (sagemaker.session.Session): The underlying SageMaker session to which + SageMaker service calls are delegated (default: None). If not provided, one is + created using a default configuration chain. + + security_group_ids (List[str]): A list of security group IDs. Defaults to ``None`` and + the training job is created without VPC config. + + subnets (List[str]): A list of subnet IDs. Defaults to ``None`` and the job is + created without VPC config. + + tags (List[Tuple[str, str]]): A list of tags attached to the job. 
Defaults to ``None`` + and the training job is created without tags. + + volume_kms_key (str): An Amazon Key Management Service (KMS) key used to encrypt an + Amazon Elastic Block Storage (EBS) volume attached to the training instance. + Defaults to ``None``. + + volume_size (int): The size in GB of the storage volume for storing input and output + data during training. Defaults to ``30``. + + encrypt_inter_container_traffic (bool): A flag that specifies whether traffic between + training containers is encrypted for the training job. Defaults to ``False``. + + enable_network_isolation (bool): A flag that specifies whether container will run in + network isolation mode. Defaults to ``False``. Network isolation mode restricts the + container access to outside networks (such as the Internet). The container does not + make any inbound or outbound network calls. Also known as Internet-free mode. """ self.max_parallel_jobs = max_parallel_jobs @@ -575,9 +820,10 @@ def _validate_submit_args(func, *args, **kwargs): class Future(object): - """Class representing a reference to a sagemaker job result. + """Class representing a reference to a SageMaker job result. - The sagemaker job represented may or may not have finished running. + Reference to the SageMaker job created as a result of the remote function execution. The job may + or may not have finished running. """ def __init__(self): @@ -657,7 +903,7 @@ def _start_and_notify( """Start and record the newly created job in the future object. The job is recorded if one is successfully started. Otherwise, the exception is - recorded. The state update will be broadcast to other waiting threads. + recorded. The state update is broadcast to other waiting threads. """ with self._condition: if self._state in [_PENDING]: @@ -676,16 +922,18 @@ def _start_and_notify( return None def result(self, timeout: float = None) -> Any: - """Returns the function result. + """Returns the SageMaker job result. 
+ + This method waits for the SageMaker job created from the remote function execution to + complete for up to the timeout value (if specified). If timeout is ``None``, + this method will wait until the SageMaker job completes. - This method blocks on the sagemaker job completing for up to the timeout value (if - specified). If timeout is ``None``, this method will block until the job is completed. Args: timeout (float): Timeout in seconds to wait until the job is completed. ``None`` by - default. + default. Returns: - The Python object returned by the function + The Python object returned by the remote function execution. """ try: self.wait(timeout) @@ -756,13 +1004,15 @@ def wait( self, timeout: int = None, ) -> None: - """Wait for the underlying sagemaker job to complete. + """Wait for the underlying SageMaker job to complete. + + This method waits for the SageMaker job created as a result of the remote function execution + to complete for up to the timeout value (if specified). If timeout is ``None``, this method + will block until the job is completed. - This method blocks on the sagemaker job completing for up to the timeout value (if - specified). If timeout is ``None``, this method will block until the job is completed. Args: - timeout (int): Timeout in seconds to wait until the job is completed. ``None`` by - default. + timeout (int): Timeout in seconds to wait for the job to complete before it is + stopped. Defaults to ``None``. Returns: None """ @@ -777,10 +1027,11 @@ def wait( def cancel(self): """Cancel the function execution. - It prevents the SageMaker job being created or stops the underlying sagemaker job early - if it is already in progress. + This method prevents the SageMaker job being created or stops the underlying SageMaker job + early if it is already in progress. - Returns: ``True`` if the underlying sagemaker job is cancelled. 
+ Returns: ``True`` if the underlying SageMaker job created as a result of the remote function + execution is cancelled. """ with self._condition: if self._state == _FINISHED: @@ -823,10 +1074,11 @@ def get_future(job_name, sagemaker_session=None): """Get a future object with information about a job with the given job_name. Args: - job_name (str): name of the underlying SageMaker job. - sagemaker_session (sagemaker.session.Session): Session object which - manages interactions with Amazon SageMaker APIs and any other - AWS services needed. + job_name (str): name of the underlying SageMaker job created as a result of the remote + function execution. + + sagemaker_session (sagemaker.session.Session): A session object which manages interactions + with Amazon SageMaker APIs and any other AWS services needed. Returns: A `sagemaker.remote_function.client.Future` instance. @@ -843,10 +1095,10 @@ def list_futures(job_name_prefix, sagemaker_session=None): """Generates Future objects with information about jobs with given job_name_prefix. Args: - job_name_prefix (str): prefix used to identify relevant SageMaker jobs. - sagemaker_session (sagemaker.session.Session): Session object which - manages interactions with Amazon SageMaker APIs and any other - AWS services needed. + job_name_prefix (str): A prefix used to identify the SageMaker jobs associated with remote + function execution. + sagemaker_session (sagemaker.session.Session): A session object which manages interactions + with Amazon SageMaker APIs and any other AWS services needed. Yields: A `sagemaker.remote_function.client.Future` instance. 
From 11cfd6f2b7651d62808234d82d2917a5418ca77a Mon Sep 17 00:00:00 2001 From: Dipankar Patro Date: Sat, 22 Apr 2023 16:35:20 -0700 Subject: [PATCH 3/6] Address PR feedback --- doc/overview.rst | 9 +++++---- src/sagemaker/remote_function/client.py | 26 ++++++++++++------------- 2 files changed, 17 insertions(+), 18 deletions(-) diff --git a/doc/overview.rst b/doc/overview.rst index 2a5397abca..dfee53b750 100644 --- a/doc/overview.rst +++ b/doc/overview.rst @@ -2027,7 +2027,7 @@ set default values for. For the full schema, see ``sagemaker.config.config_schem Dependencies: 'path/to/requirements.txt' EnableInterContainerTrafficEncryption: true EnvironmentVariables: {'EnvVarKey': 'EnvVarValue'} - ImageUri: '366666666666.dkr.ecr.us-west-2.amazonaws.com/my-image:latest' + ImageUri: '555555555555.dkr.ecr.us-west-2.amazonaws.com/my-image:latest' IncludeLocalWorkDir: true InstanceType: 'ml.m5.large' JobCondaEnvironment: 'your_conda_env' @@ -2035,7 +2035,7 @@ set default values for. For the full schema, see ``sagemaker.config.config_schem - 'command_1' - 'command_2' PreExecutionScript: 'path/to/script.sh' - RoleArn: 'arn:aws:iam::366666666666:role/MyRole' + RoleArn: 'arn:aws:iam::555555555555:role/MyRole' S3KmsKeyId: 'yourkmskeyid' S3RootUri: 's3://my-bucket/my-project' VpcConfig: @@ -2044,7 +2044,8 @@ set default values for. For the full schema, see ``sagemaker.config.config_schem Subnets: - 'subnet-1234' Tags: - - {'Key': 'yourTagKey', 'Value': 'yourTagValue'} + - Key: 'tag_key' + Value: 'tag_value' VolumeKmsKeyId: 'yourkmskeyid' Configuration file locations @@ -2440,7 +2441,7 @@ specifically the contents of ``'body': b'{...}`` . 
botocore.endpoint [DEBUG] Making request for OperationModel(name=) with params: {'url_path': ..., 'query_string': ..., 'method': 'POST', 'headers': {...}, 'body': b'{...}', 'url': 'https://api.sagemaker.us-west-2.amazonaws.com/', - 'context': {...}}cd + 'context': {...}} ************************************************************ diff --git a/src/sagemaker/remote_function/client.py b/src/sagemaker/remote_function/client.py index f7de0b7c33..427182d624 100644 --- a/src/sagemaker/remote_function/client.py +++ b/src/sagemaker/remote_function/client.py @@ -84,7 +84,7 @@ def remote( ): """Decorator for running the annotated function as a SageMaker training job. - This decorator wraps the annotated code and runs it is a new SageMaker job synchronously + This decorator wraps the annotated code and runs it as a new SageMaker job synchronously with the provided runtime settings. Unless mentioned otherwise, the decorator first looks up the value from the SageMaker @@ -187,8 +187,7 @@ def remote( instance_count (int): The number of instance to use. Defaults to 1. instance_type (str): The Amazon Elastic Compute Cloud (EC2) instance type to use to run - the SageMaker job. Valid values include ml.c4.xlarge. If not provided, ValueError is - thrown. + the SageMaker job. e.g. ml.c4.xlarge. If not provided, ValueError is thrown. job_conda_env (str): The name of the conda environment to activate during job's runtime. Defaults to ``None``. @@ -204,12 +203,12 @@ def remote( latency than over using SageMaker managed warm pools alone by caching the package source downloaded in the previous runs. - max_retry_attempts (int): The max number of times the job is retried after an on - ```InternalServerFailure``` Error. Defaults to 1. + max_retry_attempts (int): The max number of times the job is retried on + ```InternalServerFailure``` Error from SageMaker service. Defaults to 1. max_runtime_in_seconds (int): The upper limit in seconds to be used for training. 
After this specified amount of time, SageMaker terminates the job regardless of its current - status. The max_run_duration time defaults to 1 day or (86400 seconds). + status. Defaults to 1 day or (86400 seconds). role (str): The IAM role (either name or full ARN) used to run your SageMaker training job. Defaults to: @@ -220,7 +219,7 @@ def remote( s3_kms_key (str): The key used to encrypt the input and output data. Default to ``None``. - s3_root_uri (str): The root S3 folder to which where the code archives and data are + s3_root_uri (str): The root S3 folder to which the code archives and data are uploaded to. Defaults to ``s3://``. sagemaker_session (sagemaker.session.Session): The underlying SageMaker session to which @@ -577,8 +576,7 @@ def __init__( instance_count (int): The number of instance to use. Defaults to 1. instance_type (str): The Amazon Elastic Compute Cloud (EC2) instance type to use to run - the SageMaker job. Valid values include ml.c4.xlarge. If not provided, ValueError is - thrown. + the SageMaker job. e.g. ml.c4.xlarge. If not provided, ValueError is thrown. job_conda_env (str): The name of the conda environment to activate during job's runtime. Defaults to ``None``. @@ -596,12 +594,12 @@ def __init__( max_parallel_jobs (int): Maximum number of jobs that run in parallel. Defaults to 1. - max_retry_attempts (int): The max number of times the job is retried after an on - ```InternalServerFailure``` Error. Defaults to 1. + max_retry_attempts (int): The max number of times the job is retried on + ```InternalServerFailure``` Error from SageMaker service. Defaults to 1. max_runtime_in_seconds (int): The upper limit in seconds to be used for training. After this specified amount of time, SageMaker terminates the job regardless of its current - status. The max_run_duration time defaults to 1 day or (86400 seconds). + status. Defaults to 1 day or (86400 seconds). 
role (str): The IAM role (either name or full ARN) used to run your SageMaker training job. Defaults to: @@ -613,7 +611,7 @@ def __init__( s3_kms_key (str): The key used to encrypt the input and output data. Default to ``None``. - s3_root_uri (str): The root S3 folder to which where the code archives and data are + s3_root_uri (str): The root S3 folder to which the code archives and data are uploaded to. Defaults to ``s3://``. sagemaker_session (sagemaker.session.Session): The underlying SageMaker session to which @@ -1011,7 +1009,7 @@ def wait( will block until the job is completed. Args: - timeout (int): Timeout in seconds to wait for until the job is to completed before it is + timeout (int): Timeout in seconds to wait for until the job is completed before it is stopped. Defaults to ``None``. Returns: None From 6fcef3477a533827f702c20f95ce63d90ba22f40 Mon Sep 17 00:00:00 2001 From: Dipankar Patro Date: Mon, 24 Apr 2023 12:39:19 -0700 Subject: [PATCH 4/6] Address PR feedback --- doc/overview.rst | 2 +- src/sagemaker/remote_function/client.py | 38 ++++++++++++------------- 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/doc/overview.rst b/doc/overview.rst index dfee53b750..82411f4e71 100644 --- a/doc/overview.rst +++ b/doc/overview.rst @@ -2448,7 +2448,7 @@ specifically the contents of ``'body': b'{...}`` . Run Machine Learning code on SageMaker using remote function ************************************************************ -You can seamlessly integrate your local machine language (ML) code to run in a Amazon SageMaker Training job by wrapping +You can integrate your local machine learning (ML) code to run in an Amazon SageMaker Training job by wrapping your code inside a @remote decorator as shown in the following code example. ..
code-block:: python diff --git a/src/sagemaker/remote_function/client.py b/src/sagemaker/remote_function/client.py index 427182d624..d342fa8fda 100644 --- a/src/sagemaker/remote_function/client.py +++ b/src/sagemaker/remote_function/client.py @@ -132,7 +132,7 @@ def remote( in the system path. * The parameter dependencies is set to auto_capture. SageMaker will automatically - generate a env_snapshot.yml corresponding to the current active conda environment’s + generate an env_snapshot.yml corresponding to the current active conda environment’s snapshot. You do not need to provide a dependencies file. The following conditions apply: @@ -173,9 +173,9 @@ def remote( Amazon Elastic Container Registry (ECR). Defaults to the following based on where the SDK is running: - * For SageMaker Studio notebook cases, the image used as the kernel image for the + * For users on SageMaker Studio notebooks, the image used as the kernel image for the notebook is used. - * For other cases, it is resolved to base python image with the same python version + * For other users, it is resolved to base python image with the same python version as the environment running the local code. If no compatible image is found, a ValueError is thrown. @@ -184,7 +184,7 @@ def remote( local directories. Set to ``True`` if the remote function code imports local modules and methods that are not available via PyPI or conda. Default value is ``False``. - instance_count (int): The number of instance to use. Defaults to 1. + instance_count (int): The number of instances to use. Defaults to 1. instance_type (str): The Amazon Elastic Compute Cloud (EC2) instance type to use to run the SageMaker job. e.g. ml.c4.xlarge. If not provided, ValueError is thrown. @@ -199,7 +199,7 @@ def remote( warm pools. The use of warmpools reduces the latency time spent to provision new resources. The default value for ``keep_alive_period_in_seconds`` is 0. NOTE: Additional charges associated with warm pools may apply. 
Using this parameter will - also activate a new persistent cache feature, which will further reduce job start up + also activate a new Persistent Cache feature, which will further reduce job start up latency than over using SageMaker managed warm pools alone by caching the package source downloaded in the previous runs. @@ -521,7 +521,7 @@ def __init__( in the system path. * The parameter dependencies is set to auto_capture. SageMaker will automatically - generate a env_snapshot.yml corresponding to the current active conda environment’s + generate an env_snapshot.yml corresponding to the current active conda environment’s snapshot. You do not need to provide a dependencies file. The following conditions apply: @@ -562,9 +562,9 @@ def __init__( Amazon Elastic Container Registry (ECR). Defaults to the following based on where the SDK is running: - * For SageMaker Studio notebook cases, the image used as the kernel image for the - notebook is used. - * For other cases, it is resolved to base python image with the same python + * For users on SageMaker Studio notebooks, the image used as the kernel image for + the notebook is used. + * For other users, it is resolved to base python image with the same python version as the environment running the local code. If no compatible image is found, a ValueError is thrown. @@ -573,7 +573,7 @@ def __init__( local directories. Set to ``True`` if the remote function code imports local modules and methods that are not available via PyPI or conda. Default value is ``False``. - instance_count (int): The number of instance to use. Defaults to 1. + instance_count (int): The number of instances to use. Defaults to 1. instance_type (str): The Amazon Elastic Compute Cloud (EC2) instance type to use to run the SageMaker job. e.g. ml.c4.xlarge. If not provided, ValueError is thrown. @@ -820,7 +820,7 @@ def _validate_submit_args(func, *args, **kwargs): class Future(object): """Class representing a reference to a SageMaker job result. 
- Reference to the SageMaker job created as a result of the remote function execution. The job may + Reference to the SageMaker job created as a result of the remote function run. The job may or may not have finished running. """ @@ -931,7 +931,7 @@ def result(self, timeout: float = None) -> Any: default. Returns: - The Python object returned by the remote function execution. + The Python object returned by the remote function. """ try: self.wait(timeout) @@ -1004,12 +1004,12 @@ def wait( ) -> None: """Wait for the underlying SageMaker job to complete. - This method waits for the SageMaker job created as a result of the remote function execution + This method waits for the SageMaker job created as a result of the remote function run to complete for up to the timeout value (if specified). If timeout is ``None``, this method will block until the job is completed. Args: - timeout (int): Timeout in seconds to wait for until the job is completed before it is + timeout (int): Timeout in seconds to wait until the job is completed before it is stopped. Defaults to ``None``. Returns: None @@ -1029,7 +1029,7 @@ def cancel(self): early if it is already in progress. Returns: ``True`` if the underlying SageMaker job created as a result of the remote function - execution is cancelled. + run is cancelled. """ with self._condition: if self._state == _FINISHED: @@ -1073,9 +1073,9 @@ def get_future(job_name, sagemaker_session=None): Args: job_name (str): name of the underlying SageMaker job created as a result of the remote - function execution. + function run. - sagemaker_session (sagemaker.session.Session): A session object which manages interactions + sagemaker_session (sagemaker.session.Session): A session object that manages interactions with Amazon SageMaker APIs and any other AWS services needed. 
Returns: @@ -1094,8 +1094,8 @@ def list_futures(job_name_prefix, sagemaker_session=None): Args: job_name_prefix (str): A prefix used to identify the SageMaker jobs associated with remote - function execution. - sagemaker_session (sagemaker.session.Session): A session object which manages interactions + function run. + sagemaker_session (sagemaker.session.Session): A session object that manages interactions with Amazon SageMaker APIs and any other AWS services needed. Yields: From 34959f8f8f39f20e7c4761facd3d8ec830c2ddd0 Mon Sep 17 00:00:00 2001 From: Dipankar Patro Date: Mon, 24 Apr 2023 13:50:48 -0700 Subject: [PATCH 5/6] persistent cache reference updates --- src/sagemaker/remote_function/client.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/sagemaker/remote_function/client.py b/src/sagemaker/remote_function/client.py index d342fa8fda..0b53d57202 100644 --- a/src/sagemaker/remote_function/client.py +++ b/src/sagemaker/remote_function/client.py @@ -198,8 +198,8 @@ def remote( infrastructure after the completion of a training job, also known as SageMaker managed warm pools. The use of warmpools reduces the latency time spent to provision new resources. The default value for ``keep_alive_period_in_seconds`` is 0. - NOTE: Additional charges associated with warm pools may apply. Using this parameter will - also activate a new Persistent Cache feature, which will further reduce job start up + NOTE: Additional charges associated with warm pools may apply. Using this parameter also + activates a new persistent cache feature, which will further reduce job start up latency than over using SageMaker managed warm pools alone by caching the package source downloaded in the previous runs. @@ -587,9 +587,9 @@ def __init__( provisioned infrastructure after the completion of a training job, also known as SageMaker managed warm pools. The use of warmpools reduces the latency time spent to provision new resources.
The default value for ``keep_alive_period_in_seconds`` is 0. - Additional charges associated with warm pools may apply. Using this parameter will - also activate a new persistent cache feature, which will further reduce job start up - latency than over using SageMaker managed warm pools alone by caching the package + NOTE: Additional charges associated with warm pools may apply. Using this parameter + also activates a new persistent cache feature, which will further reduce job start + up latency than over using SageMaker managed warm pools alone by caching the package source downloaded in the previous runs. max_parallel_jobs (int): Maximum number of jobs that run in parallel. Defaults to 1. From 83095a32040e51837433091d97eedf0250f5a225 Mon Sep 17 00:00:00 2001 From: Dipankar Patro Date: Mon, 24 Apr 2023 14:21:57 -0700 Subject: [PATCH 6/6] addressing some more feedback --- src/sagemaker/remote_function/client.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/sagemaker/remote_function/client.py b/src/sagemaker/remote_function/client.py index 0b53d57202..d6da8a054a 100644 --- a/src/sagemaker/remote_function/client.py +++ b/src/sagemaker/remote_function/client.py @@ -98,7 +98,7 @@ def remote( dependencies (str): Either the path to a dependencies file or the reserved keyword ``auto_capture``. Defaults to ``None``. - If dependencies is provided, the value must be one of the following: + If ``dependencies`` is provided, the value must be one of the following: * A path to a conda environment.yml file. The following conditions apply. @@ -128,7 +128,7 @@ def remote( ``SAGEMAKER_JOB_CONDA_ENV`` must already exist in the image. * If none of the above conditions are met, conda is not used.
Dependencies are installed at the system level, without any virtual environment, and the function - annotated with the remote decorator is invoked using the python runtime available + annotated with the remote decorator is invoked using the Python runtime available in the system path. * The parameter dependencies is set to auto_capture. SageMaker will automatically @@ -187,7 +187,7 @@ def remote( instance_count (int): The number of instances to use. Defaults to 1. instance_type (str): The Amazon Elastic Compute Cloud (EC2) instance type to use to run - the SageMaker job. e.g. ml.c4.xlarge. If not provided, ValueError is thrown. + the SageMaker job. e.g. ml.c4.xlarge. If not provided, a ValueError is thrown. job_conda_env (str): The name of the conda environment to activate during job's runtime. Defaults to ``None``. @@ -475,7 +475,7 @@ def __init__( ): """Constructor for RemoteExecutor - Unless mentioned otherwise, the construcutor first looks up the value from the SageMaker + Unless mentioned otherwise, the constructor first looks up the value from the SageMaker configuration file. If no value is specified in the configuration file or no configuration file is found, the constructor selects the default as specified below. For more information, see `Configuring and using defaults with the SageMaker Python SDK @@ -487,7 +487,7 @@ def __init__( dependencies (str): Either the path to a dependencies file or the reserved keyword ``auto_capture``. Defaults to ``None``. - If dependencies is provided, the value must be one of the following: + If ``dependencies`` is provided, the value must be one of the following: * A path to a conda environment.yml file. The following conditions apply. @@ -517,7 +517,7 @@ def __init__( ``SAGEMAKER_JOB_CONDA_ENV`` must already exist in the image. * If none of the above conditions are met, conda is not used. 
Dependencies are installed at the system level, without any virtual environment, and the function - annotated with the remote decorator is invoked using the python runtime available + annotated with the remote decorator is invoked using the Python runtime available in the system path. * The parameter dependencies is set to auto_capture. SageMaker will automatically @@ -576,7 +576,7 @@ def __init__( instance_count (int): The number of instances to use. Defaults to 1. instance_type (str): The Amazon Elastic Compute Cloud (EC2) instance type to use to run - the SageMaker job. e.g. ml.c4.xlarge. If not provided, ValueError is thrown. + the SageMaker job. e.g. ml.c4.xlarge. If not provided, a ValueError is thrown. job_conda_env (str): The name of the conda environment to activate during job's runtime. Defaults to ``None``.