From 1f30e8bb7260c3d4afbdd6249327772cd2cd5867 Mon Sep 17 00:00:00 2001
From: Malte <malterei@amazon.com>
Date: Mon, 4 Aug 2025 08:19:07 +0000
Subject: [PATCH 1/3] fix: moves local mode check out of sagemaker studio
 enable docker access function

---
 03_finetune_swift.ipynb | 28 +++++++++++---------
 utils/training_image.py | 57 ++++++++++++++++++++---------------------
 2 files changed, 44 insertions(+), 41 deletions(-)
diff --git a/03_finetune_swift.ipynb b/03_finetune_swift.ipynb
index da0bbb0..d0bdafb 100644
--- a/03_finetune_swift.ipynb
+++ b/03_finetune_swift.ipynb
@@ -63,7 +63,7 @@
     "    get_aws_account_id_for_region,\n",
     "    is_docker_installed,\n",
     "    is_docker_compose_installed,\n",
-    "    check_and_enable_docker_access_sagemaker_studio\n",
+    "    enable_docker_access_sagemaker_studio\n",
     ")"
    ]
   },
@@ -340,7 +340,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "use_local_mode = True # Set to true to run on local instance\n",
+    "use_local_mode = False # Set to true to run on local instance\n",
     "instance_type = \"local_gpu\" if use_local_mode else \"ml.g6.8xlarge\" # \"ml.g6.12xlarge\" "
    ]
   },
@@ -354,7 +354,9 @@
     "    border-radius: 5px; \n",
     "    max-width: 100%;\n",
     "    background: #f0fbff;\">\n",
-    "    <b>Note:</b> If you run into out of memory errors during training use a larger instance type. For example if you are training a larger model or with more data. If you are using local mode then you will need a GPU on your local machine, for example running inside a SageMaker Studio JuypterLab on a ml.g6.8xlarge instance.\n",
+    "    <b>Note:</b> If you run into out of memory errors during training use a larger instance type. For example if you are training a larger model or with more data. \n",
+    "<br><br>\n",
+    "If you are using local mode then you will need a GPU on your local machine, for example running inside a SageMaker Studio JuypterLab on a ml.g6.8xlarge instance.\n",
     "</div>"
    ]
   },
@@ -373,7 +375,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "use_spot = True"
+    "use_spot = True # Set to False to use on-demand instances instead of spot instances."
    ]
   },
   {
@@ -423,7 +425,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "checkpoint_loc = None"
+    "checkpoint_loc = None # Default: None - No checkpointing. Re-run cell above \"Note box\" to set checkpoint_loc  "
    ]
   },
   {
@@ -724,24 +726,26 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "2d40f9f3-ee0d-4ba0-8472-f6bfde86503c",
+   "id": "39563e42-1a02-4c14-af64-68101a055633",
    "metadata": {},
    "outputs": [],
    "source": [
-    "if use_local_mode and (not is_docker_installed() or not is_docker_compose_installed()): \n",
-    "    # we need docker and docker-compose for LocalMode execution\n",
-    "    !bash docker-artifacts/01_docker_install.sh"
+    "if use_local_mode:\n",
+    "    # for local mode we need docker enabled\n",
+    "    # if in SageMaker Studio the following function ensures that docker access is enabled\n",
+    "    enable_docker_access_sagemaker_studio(session)"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "39563e42-1a02-4c14-af64-68101a055633",
+   "id": "2d40f9f3-ee0d-4ba0-8472-f6bfde86503c",
    "metadata": {},
    "outputs": [],
    "source": [
-    "# if in SageMaker Studio check if docker access is enabled if it is not enable it\n",
-    "check_and_enable_docker_access_sagemaker_studio(use_local_mode, session)"
+    "if use_local_mode and (not is_docker_installed() or not is_docker_compose_installed()): \n",
+    "    # we need docker and docker-compose for LocalMode execution\n",
+    "    !bash docker-artifacts/01_docker_install.sh"
    ]
   },
   {
diff --git a/utils/training_image.py b/utils/training_image.py
index 1370622..38c99ee 100644
--- a/utils/training_image.py
+++ b/utils/training_image.py
@@ -129,33 +129,32 @@ def load_json_file(file_path):
 DOCKER_ENABLED = "ENABLED"
 DOCKER_DISABLED = "DISABLED"
 
-def check_and_enable_docker_access_sagemaker_studio(use_local_mode, session):
-    if use_local_mode:
-        resource_metadata = load_json_file(SAGEMAKER_STUDIO_METADATA)
-        if resource_metadata:
-            docker_access_disabled = True
-            docker_access = None
-            try:
-                domain_id = resource_metadata["DomainId"]
-                sm_client = session.boto_session.client("sagemaker")
-                domain = sm_client.describe_domain(DomainId=domain_id)
-                if docker_access_disabled := ((docker_access := domain["DomainSettings"]["DockerSettings"]["EnableDockerAccess"]) == DOCKER_DISABLED):
-                    print("Docker disabled on SageMaker Studio domain. Trying to enable docker access...")
-                    sm_client.update_domain(
-                        DomainId=domain_id,
-                        DomainSettingsForUpdate={
-                            'DockerSettings': {
-                                'EnableDockerAccess': DOCKER_ENABLED
-                            }
+def enable_docker_access_sagemaker_studio(session):
+    resource_metadata = load_json_file(SAGEMAKER_STUDIO_METADATA)
+    if resource_metadata:
+        docker_access_disabled = True
+        docker_access = None
+        try:
+            domain_id = resource_metadata["DomainId"]
+            sm_client = session.boto_session.client("sagemaker")
+            domain = sm_client.describe_domain(DomainId=domain_id)
+            if docker_access_disabled := ((docker_access := domain["DomainSettings"]["DockerSettings"]["EnableDockerAccess"]) == DOCKER_DISABLED):
+                print("Docker disabled on SageMaker Studio domain. Trying to enable docker access...")
+                sm_client.update_domain(
+                    DomainId=domain_id,
+                    DomainSettingsForUpdate={
+                        'DockerSettings': {
+                            'EnableDockerAccess': DOCKER_ENABLED
                         }
-                    )
-                    time.sleep(4)
-                    domain = sm_client.describe_domain(DomainId=domain_id)
-                    docker_access = domain["DomainSettings"]["DockerSettings"]["EnableDockerAccess"]
-                    docker_access_disabled = (docker_access == DOCKER_DISABLED)
-            except Exception as e: 
-                print(e)
-                
-            print(f"SageMaker Studio domain ({domain_id}) docker access: {docker_access}")
-            if docker_access_disabled:
-                print("Failed to enable Docker Access on SageMaker Studio domain. Please enable it manually or ask your administrator. Docker access is required to run in local mode. https://docs.aws.amazon.com/sagemaker/latest/dg/studio-updated-local-get-started.html#studio-updated-local-enable")
\ No newline at end of file
+                    }
+                )
+                time.sleep(4)
+                domain = sm_client.describe_domain(DomainId=domain_id)
+                docker_access = domain["DomainSettings"]["DockerSettings"]["EnableDockerAccess"]
+                docker_access_disabled = (docker_access == DOCKER_DISABLED)
+        except Exception as e: 
+            print(e)
+            
+        print(f"SageMaker Studio domain ({domain_id}) docker access: {docker_access}")
+        if docker_access_disabled:
+            print("Failed to enable Docker Access on SageMaker Studio domain. Please enable it manually or ask your administrator. Docker access is required to run in local mode. https://docs.aws.amazon.com/sagemaker/latest/dg/studio-updated-local-get-started.html#studio-updated-local-enable")
\ No newline at end of file

From 0257dffe45e7ad46e851068beafc5c169ba46c04 Mon Sep 17 00:00:00 2001
From: Malte <malterei@amazon.com>
Date: Mon, 4 Aug 2025 08:20:12 +0000
Subject: [PATCH 2/3] docs: adds additional inference notes

---
 04_run_batch_inference.ipynb | 69 +++++++++++++++++++++++-------------
 1 file changed, 45 insertions(+), 24 deletions(-)

diff --git a/04_run_batch_inference.ipynb b/04_run_batch_inference.ipynb
index 00e5ddf..c34def0 100644
--- a/04_run_batch_inference.ipynb
+++ b/04_run_batch_inference.ipynb
@@ -65,10 +65,7 @@
     "    get_sagemaker_distribution, \n",
     "    SageMakerDistribution, \n",
     "    get_python_version, \n",
-    "    get_aws_account_id_for_region,\n",
-    "    is_docker_installed,\n",
-    "    is_docker_compose_installed,\n",
-    "    check_and_enable_docker_access_sagemaker_studio\n",
+    "    get_aws_account_id_for_region\n",
     ")"
    ]
   },
@@ -389,6 +386,30 @@
     "sagemaker_dist_uri = f\"{sagemaker_distr_account_id}.dkr.ecr.{region}.amazonaws.com/sagemaker-distribution-prod:{sm_distro_version.image_version}-gpu\""
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "04305c64-7ae5-4ffc-be2f-fe0e6e1a398b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "instance_type = \"ml.g6e.xlarge\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d0bb0ade-103c-4746-bd48-524a0fe54de5",
+   "metadata": {},
+   "source": [
+    "<div style=\"border: 2px solid #006CE0; \n",
+    "    padding: 10px; \n",
+    "    border-radius: 5px; \n",
+    "    max-width: 100%;\n",
+    "    background: #f0fbff;\">\n",
+    "    <b>Note:</b> If you run into out of memory errors during inference use a larger instance type. For example if you have trained a larger model. \n",
+    "</div>"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "202dfb1b-322c-470d-9191-29490a7ffee6",
@@ -521,7 +542,7 @@
     "        # RoleArn: <replace the role arn here>\n",
     "        S3RootUri: {s3_root_uri}\n",
     "        ImageUri: {sagemaker_dist_uri}        \n",
-    "        InstanceType: ml.g6e.4xlarge\n",
+    "        InstanceType: ml.g6e.4xlarge # default instance type to use\n",
     "        Dependencies: requirements.txt\n",
     "        IncludeLocalWorkDir: true\n",
     "        PreExecutionCommands:\n",
@@ -610,6 +631,22 @@
     "## Batch Inference Function"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e9050ae0-98eb-48c9-8d76-320a5b4ee86d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "inference_kwargs = {\n",
+    "    \"model_id\":model_config.model_id,\n",
+    "    \"model_type\":model_config.model_type,\n",
+    "    \"dataset_s3\":dataset_s3_uri,\n",
+    "    \"test_data_path\":\"conversations_test_swift_format.json\",\n",
+    "    \"guided_decoding\":guided_decoding\n",
+    "}"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -618,11 +655,11 @@
    "outputs": [],
    "source": [
     "@remote(\n",
-    "    instance_type=\"ml.g6e.xlarge\",  # Powerful GPU for fast inference\n",
+    "    instance_type=instance_type,  # Powerful GPU for fast inference\n",
     "    instance_count=1,  # Single instance for cost efficiency\n",
     "    volume_size=200, # Large volume for model and data storage\n",
     "    job_name_prefix=job_name_prefix,\n",
-    "    # use_spot_instances=True, # Cost efficient inference. Inference can be restarted if no spot capacity. \n",
+    "    # use_spot_instances=True, # Enable cost efficient inference. Inference can be restarted if no spot capacity. \n",
     "    max_wait_time_in_seconds=172800, # 48 hours max wait\n",
     "    max_runtime_in_seconds=172800, # 48 hours max runtime\n",
     ")\n",
@@ -710,28 +747,12 @@
   },
   {
    "cell_type": "markdown",
-   "id": "6eb0b491-7f9f-4b83-98d1-daedabe3f311",
+   "id": "259ae020-c35d-4470-95ce-0043a1b61893",
    "metadata": {},
    "source": [
     "## Run Batch Inference"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "e9050ae0-98eb-48c9-8d76-320a5b4ee86d",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "inference_kwargs = {\n",
-    "    \"model_id\":model_config.model_id,\n",
-    "    \"model_type\":model_config.model_type,\n",
-    "    \"dataset_s3\":dataset_s3_uri,\n",
-    "    \"test_data_path\":\"conversations_test_swift_format.json\",\n",
-    "    \"guided_decoding\":guided_decoding\n",
-    "}"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": null,

From 307968b4145fedd4d6b84036e2d1bf2e3a6675db Mon Sep 17 00:00:00 2001
From: Malte <malterei@amazon.com>
Date: Mon, 4 Aug 2025 08:25:48 +0000
Subject: [PATCH 3/3] fix: adds sagemaker to repo name to fix #3

---
 06_deploy_model_endpoint.ipynb | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/06_deploy_model_endpoint.ipynb b/06_deploy_model_endpoint.ipynb
index 55884b4..8dfe450 100644
--- a/06_deploy_model_endpoint.ipynb
+++ b/06_deploy_model_endpoint.ipynb
@@ -241,7 +241,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "REPO_NAME = \"swift-json-vlm-container-finetuned\"\n",
+    "REPO_NAME = \"sagemaker-json-vlm-finetuned\"\n",
     "os.environ['REPO_NAME'] = REPO_NAME\n",
     "os.environ[\"S3_MODEL_URI\"]=s3_model_uri"
    ]