Skip to content

Commit ee3dbf7

Browse files
authored
CI-817 - Allow updating a samplesheet from SDK (#169)
* allow updating a samplesheet
* lint
* bump version
1 parent 88e8cad commit ee3dbf7

File tree

5 files changed

+124
-13
lines changed

5 files changed

+124
-13
lines changed

cirro/models/file.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,7 @@ def upload_sample_sheet(cls, project_id: str, dataset_id: str, base_url: str):
8585
access_type=ProjectAccessType.SAMPLESHEET_UPLOAD,
8686
dataset_id=dataset_id
8787
),
88-
base_url=base_url,
88+
base_url=f'{base_url}/data',
8989
project_id=project_id
9090
)
9191

cirro/sdk/dataset.py

Lines changed: 53 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,14 @@
11
import datetime
2+
from pathlib import Path
23
from typing import Union, List, Optional
34

5+
from cirro_api_client.v1.api.processes import validate_file_requirements
46
from cirro_api_client.v1.models import Dataset, DatasetDetail, RunAnalysisRequest, ProcessDetail, Status, \
5-
RunAnalysisRequestParams, Tag, ArtifactType, NamedItem
7+
RunAnalysisRequestParams, Tag, ArtifactType, NamedItem, Executor, ValidateFileRequirementsRequest
68

79
from cirro.cirro_client import CirroApi
810
from cirro.models.assets import DatasetAssets
11+
from cirro.models.file import PathLike
912
from cirro.sdk.asset import DataPortalAssets, DataPortalAsset
1013
from cirro.sdk.exceptions import DataPortalAssetNotFound
1114
from cirro.sdk.exceptions import DataPortalInputError
@@ -302,6 +305,55 @@ def run_analysis(
302305
)
303306
return resp.id
304307

308+
def update_samplesheet(self,
309+
contents: str = None,
310+
file_path: PathLike = None):
311+
"""
312+
Updates the samplesheet metadata of a dataset.
313+
Provide either the contents (as a string) or a file path.
314+
Both must be in the format of a CSV.
315+
316+
Args:
317+
contents (str): Samplesheet contents to update (should be a CSV string)
318+
file_path (PathLike): Path of file to update (should be a CSV file)
319+
320+
Example:
321+
```python
322+
dataset.update_samplesheet(
323+
file_path=Path('~/samplesheet.csv')
324+
)
325+
```
326+
"""
327+
328+
if contents is None and file_path is None:
329+
raise DataPortalInputError("Must specify either 'contents' or 'file_path' when updating samplesheet")
330+
331+
if self.process.executor != Executor.INGEST:
332+
raise DataPortalInputError("Cannot update a samplesheet on a non-ingest dataset")
333+
334+
samplesheet_contents = contents
335+
if file_path is not None:
336+
samplesheet_contents = Path(file_path).expanduser().read_text()
337+
338+
# Validate samplesheet
339+
file_names = [f.file_name for f in self.list_files()]
340+
request = ValidateFileRequirementsRequest(
341+
file_names=file_names,
342+
sample_sheet=samplesheet_contents,
343+
)
344+
requirements = validate_file_requirements.sync(process_id=self.process_id,
345+
body=request,
346+
client=self._client.api_client)
347+
if error_msg := requirements.error_msg:
348+
raise DataPortalInputError(error_msg)
349+
350+
# Update the samplesheet if everything looks ok
351+
self._client.datasets.update_samplesheet(
352+
project_id=self.project_id,
353+
dataset_id=self.id,
354+
samplesheet=samplesheet_contents
355+
)
356+
305357

306358
class DataPortalDatasets(DataPortalAssets[DataPortalDataset]):
307359
"""Collection of multiple DataPortalDataset objects."""

cirro/services/dataset.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -339,3 +339,30 @@ def download_files(
339339
base_url=dataset.s3)
340340

341341
self._file_service.download_files(access_context, download_location, files)
342+
343+
def update_samplesheet(
344+
self,
345+
project_id: str,
346+
dataset_id: str,
347+
samplesheet: str
348+
):
349+
"""
350+
Updates a samplesheet on a dataset
351+
352+
Args:
353+
project_id (str): ID of the Project
354+
dataset_id (str): ID of the Dataset
355+
samplesheet (str): Samplesheet contents to update (should be a CSV string)
356+
"""
357+
dataset = self.get(project_id, dataset_id)
358+
access_context = FileAccessContext.upload_sample_sheet(project_id=project_id,
359+
dataset_id=dataset_id,
360+
base_url=dataset.s3)
361+
362+
samplesheet_key = f'{access_context.prefix}/samplesheet.csv'
363+
self._file_service.create_file(
364+
access_context=access_context,
365+
key=samplesheet_key,
366+
contents=samplesheet,
367+
content_type='text/csv'
368+
)

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[tool.poetry]
22
name = "cirro"
3-
version = "1.7.2"
3+
version = "1.8.0"
44
description = "CLI tool and SDK for interacting with the Cirro platform"
55
authors = ["Cirro Bio <[email protected]>"]
66
license = "MIT"

samples/Uploading_a_dataset.ipynb

Lines changed: 42 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -243,12 +243,12 @@
243243
"evalue": "Files do not match dataset type. Expected file type requirements: \nPaired FASTQ (Illumina Format) *_S*_L???_{I,R}{1,2}_001.fastq.gz",
244244
"output_type": "error",
245245
"traceback": [
246-
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
247-
"\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)",
248-
"\u001b[0;32m/var/folders/ck/j40906kx3mj90bcc8qs7gyxm0000gp/T/ipykernel_83747/2225702019.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[0;31m# Try to upload the data (which will cause an error)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 7\u001b[0;31m project.upload_dataset(\n\u001b[0m\u001b[1;32m 8\u001b[0m \u001b[0mname\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m'Test dataset'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 9\u001b[0m \u001b[0mdescription\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m''\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
249-
"\u001b[0;32m~/Documents/GitHub/Cirro-client/cirro/sdk/project.py\u001b[0m in \u001b[0;36mupload_dataset\u001b[0;34m(self, name, description, process, upload_folder, files)\u001b[0m\n\u001b[1;32m 126\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 127\u001b[0m \u001b[0;31m# Make sure that the files match the expected pattern\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 128\u001b[0;31m \u001b[0mcheck_dataset_files\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfiles\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mprocess\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfile_mapping_rules\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mupload_folder\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 129\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 130\u001b[0m \u001b[0;31m# Create the ingest process request\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
250-
"\u001b[0;32m~/Documents/GitHub/Cirro-client/cirro/file_utils.py\u001b[0m in \u001b[0;36mcheck_dataset_files\u001b[0;34m(files, file_mapping_rules, directory)\u001b[0m\n\u001b[1;32m 148\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 149\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0many\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmap\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfunctools\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpartial\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmatch_pattern\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfiles\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfile_mapping_rules\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 150\u001b[0;31m raise ValueError(\"Files do not match dataset type. Expected file type requirements: \\n\" + \"\\n\".join(\n\u001b[0m\u001b[1;32m 151\u001b[0m [f\"{rule.get('description', '')} {rule.get('glob')}\" for rule in file_mapping_rules]))\n",
251-
"\u001b[0;31mValueError\u001b[0m: Files do not match dataset type. Expected file type requirements: \nPaired FASTQ (Illumina Format) *_S*_L???_{I,R}{1,2}_001.fastq.gz"
246+
"\u001B[0;31m---------------------------------------------------------------------------\u001B[0m",
247+
"\u001B[0;31mValueError\u001B[0m Traceback (most recent call last)",
248+
"\u001B[0;32m/var/folders/ck/j40906kx3mj90bcc8qs7gyxm0000gp/T/ipykernel_83747/2225702019.py\u001B[0m in \u001B[0;36m<module>\u001B[0;34m\u001B[0m\n\u001B[1;32m 5\u001B[0m \u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 6\u001B[0m \u001B[0;31m# Try to upload the data (which will cause an error)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0;32m----> 7\u001B[0;31m project.upload_dataset(\n\u001B[0m\u001B[1;32m 8\u001B[0m \u001B[0mname\u001B[0m \u001B[0;34m=\u001B[0m \u001B[0;34m'Test dataset'\u001B[0m\u001B[0;34m,\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 9\u001B[0m \u001B[0mdescription\u001B[0m \u001B[0;34m=\u001B[0m \u001B[0;34m''\u001B[0m\u001B[0;34m,\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n",
249+
"\u001B[0;32m~/Documents/GitHub/Cirro-client/cirro/sdk/project.py\u001B[0m in \u001B[0;36mupload_dataset\u001B[0;34m(self, name, description, process, upload_folder, files)\u001B[0m\n\u001B[1;32m 126\u001B[0m \u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 127\u001B[0m \u001B[0;31m# Make sure that the files match the expected pattern\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0;32m--> 128\u001B[0;31m \u001B[0mcheck_dataset_files\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mfiles\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mprocess\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mfile_mapping_rules\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mupload_folder\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0m\u001B[1;32m 129\u001B[0m \u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 130\u001B[0m \u001B[0;31m# Create the ingest process request\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n",
250+
"\u001B[0;32m~/Documents/GitHub/Cirro-client/cirro/file_utils.py\u001B[0m in \u001B[0;36mcheck_dataset_files\u001B[0;34m(files, file_mapping_rules, directory)\u001B[0m\n\u001B[1;32m 148\u001B[0m \u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 149\u001B[0m \u001B[0;32mif\u001B[0m \u001B[0;32mnot\u001B[0m \u001B[0many\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mmap\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mfunctools\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mpartial\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mmatch_pattern\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mfiles\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mfile_mapping_rules\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0;32m--> 150\u001B[0;31m raise ValueError(\"Files do not match dataset type. Expected file type requirements: \\n\" + \"\\n\".join(\n\u001B[0m\u001B[1;32m 151\u001B[0m [f\"{rule.get('description', '')} {rule.get('glob')}\" for rule in file_mapping_rules]))\n",
251+
"\u001B[0;31mValueError\u001B[0m: Files do not match dataset type. Expected file type requirements: \nPaired FASTQ (Illumina Format) *_S*_L???_{I,R}{1,2}_001.fastq.gz"
252252
]
253253
}
254254
],
@@ -259,7 +259,7 @@
259259
"print(json.dumps(ingest_10X.file_mapping_rules, indent=3))\n",
260260
"\n",
261261
"# Try to upload the data (which will cause an error)\n",
262-
"project.upload_dataset(\n",
262+
"dataset = project.upload_dataset(\n",
263263
" name = 'Test dataset',\n",
264264
" description = '',\n",
265265
" upload_folder = '/tmp',\n",
@@ -269,11 +269,43 @@
269269
]
270270
},
271271
{
272-
"cell_type": "code",
273-
"execution_count": null,
274272
"metadata": {},
273+
"cell_type": "markdown",
274+
"source": [
275+
"You can update a sample sheet on an existing dataset by using the `update_samplesheet` method.\n",
276+
"\n",
277+
"You may provide either the CSV contents or a file path."
278+
]
279+
},
280+
{
281+
"metadata": {},
282+
"cell_type": "code",
275283
"outputs": [],
276-
"source": []
284+
"execution_count": null,
285+
"source": [
286+
"from pathlib import Path\n",
287+
"import pandas as pd\n",
288+
"\n",
289+
"samplesheet = pd.DataFrame.from_records([\n",
290+
" {\n",
291+
" 'sample': 'test',\n",
292+
" 'fastq_1': 'test.R1.fastq.gz',\n",
293+
" 'fastq_2': 'test.R2.fastq.gz',\n",
294+
" 'status': 'Normal'\n",
295+
" }\n",
296+
"])\n",
297+
"\n",
298+
"dataset.update_samplesheet(\n",
299+
" contents=samplesheet.to_csv(index=False),\n",
300+
")\n",
301+
"\n",
302+
"\n",
303+
"# OR\n",
304+
"\n",
305+
"dataset.update_samplesheet(\n",
306+
" file_path=Path('~/samplesheet.csv')\n",
307+
")"
308+
]
277309
}
278310
],
279311
"metadata": {

0 commit comments

Comments (0)