diff --git a/cirro/models/file.py b/cirro/models/file.py
index aa5c65ba..f35ae518 100644
--- a/cirro/models/file.py
+++ b/cirro/models/file.py
@@ -85,7 +85,7 @@ def upload_sample_sheet(cls, project_id: str, dataset_id: str, base_url: str):
             access_type=ProjectAccessType.SAMPLESHEET_UPLOAD,
             dataset_id=dataset_id
         ),
-        base_url=base_url,
+        base_url=f'{base_url}/data',
         project_id=project_id
     )
diff --git a/cirro/sdk/dataset.py b/cirro/sdk/dataset.py
index eb5ac063..10a76aa1 100644
--- a/cirro/sdk/dataset.py
+++ b/cirro/sdk/dataset.py
@@ -1,11 +1,14 @@
 import datetime
+from pathlib import Path
 from typing import Union, List, Optional
 
+from cirro_api_client.v1.api.processes import validate_file_requirements
 from cirro_api_client.v1.models import Dataset, DatasetDetail, RunAnalysisRequest, ProcessDetail, Status, \
-    RunAnalysisRequestParams, Tag, ArtifactType, NamedItem
+    RunAnalysisRequestParams, Tag, ArtifactType, NamedItem, Executor, ValidateFileRequirementsRequest
 
 from cirro.cirro_client import CirroApi
 from cirro.models.assets import DatasetAssets
+from cirro.models.file import PathLike
 from cirro.sdk.asset import DataPortalAssets, DataPortalAsset
 from cirro.sdk.exceptions import DataPortalAssetNotFound
 from cirro.sdk.exceptions import DataPortalInputError
@@ -302,6 +305,55 @@ def run_analysis(
         )
         return resp.id
 
+    def update_samplesheet(self,
+                           contents: str = None,
+                           file_path: PathLike = None):
+        """
+        Updates the samplesheet metadata of a dataset.
+        Provide either the contents (as a string) or a file path.
+        Both must be in CSV format.
+
+        Args:
+            contents (str): Samplesheet contents to update (should be a CSV string)
+            file_path (PathLike): Path of the file to read the samplesheet from (should be a CSV file)
+
+        Example:
+        ```python
+        dataset.update_samplesheet(
+            file_path=Path('~/samplesheet.csv')
+        )
+        ```
+        """
+
+        if contents is None and file_path is None:
+            raise DataPortalInputError("Must specify either 'contents' or 'file_path' when updating samplesheet")
+
+        if self.process.executor != Executor.INGEST:
+            raise DataPortalInputError("Cannot update a samplesheet on a non-ingest dataset")
+
+        samplesheet_contents = contents
+        if file_path is not None:
+            samplesheet_contents = Path(file_path).expanduser().read_text()
+
+        # Validate the samplesheet against the files already in the dataset
+        file_names = [f.file_name for f in self.list_files()]
+        request = ValidateFileRequirementsRequest(
+            file_names=file_names,
+            sample_sheet=samplesheet_contents,
+        )
+        requirements = validate_file_requirements.sync(process_id=self.process_id,
+                                                       body=request,
+                                                       client=self._client.api_client)
+        if error_msg := requirements.error_msg:
+            raise DataPortalInputError(error_msg)
+
+        # Update the samplesheet if everything looks ok
+        self._client.datasets.update_samplesheet(
+            project_id=self.project_id,
+            dataset_id=self.id,
+            samplesheet=samplesheet_contents
+        )
+
 
 class DataPortalDatasets(DataPortalAssets[DataPortalDataset]):
     """Collection of multiple DataPortalDataset objects."""
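
For reference, a minimal usage sketch of the new SDK method (the portal and
project lookups are existing Data Portal SDK calls, shown here only for
illustration; the project and dataset names are placeholders):

    from cirro import DataPortal

    portal = DataPortal()
    project = portal.get_project_by_name('My Project')       # placeholder name
    dataset = project.get_dataset_by_name('Test dataset')    # placeholder name

    # '~' is expanded by update_samplesheet() itself via Path.expanduser()
    dataset.update_samplesheet(file_path='~/samplesheet.csv')
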
diff --git a/cirro/services/dataset.py b/cirro/services/dataset.py
index f21981e3..4407f956 100644
--- a/cirro/services/dataset.py
+++ b/cirro/services/dataset.py
@@ -339,3 +339,30 @@ def download_files(
                                  base_url=dataset.s3)
 
         self._file_service.download_files(access_context, download_location, files)
+
+    def update_samplesheet(
+            self,
+            project_id: str,
+            dataset_id: str,
+            samplesheet: str
+    ):
+        """
+        Updates a samplesheet on a dataset.
+
+        Args:
+            project_id (str): ID of the Project
+            dataset_id (str): ID of the Dataset
+            samplesheet (str): Samplesheet contents to update (should be a CSV string)
+        """
+        dataset = self.get(project_id, dataset_id)
+        access_context = FileAccessContext.upload_sample_sheet(project_id=project_id,
+                                                               dataset_id=dataset_id,
+                                                               base_url=dataset.s3)
+
+        samplesheet_key = f'{access_context.prefix}/samplesheet.csv'
+        self._file_service.create_file(
+            access_context=access_context,
+            key=samplesheet_key,
+            contents=samplesheet,
+            content_type='text/csv'
+        )
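
The same operation is exposed at the service layer; a sketch assuming an
authenticated API client, with placeholder IDs (note that, unlike the SDK
method above, this call does not validate the sheet against the dataset's
files before writing):

    from cirro.cirro_client import CirroApi

    client = CirroApi()
    client.datasets.update_samplesheet(
        project_id='<project-uuid>',    # placeholder
        dataset_id='<dataset-uuid>',    # placeholder
        samplesheet='sample,fastq_1,fastq_2\ns1,s1.R1.fastq.gz,s1.R2.fastq.gz\n'
    )

With the '/data' suffix added to base_url in cirro/models/file.py above, the
sheet is written under the dataset's data folder as samplesheet.csv.
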
diff --git a/pyproject.toml b/pyproject.toml
index a2ff5e92..a7d2aac1 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "cirro"
-version = "1.7.2"
+version = "1.8.0"
 description = "CLI tool and SDK for interacting with the Cirro platform"
 authors = ["Cirro Bio "]
 license = "MIT"
diff --git a/samples/Uploading_a_dataset.ipynb b/samples/Uploading_a_dataset.ipynb
index a90cde6f..cbde3e39 100644
--- a/samples/Uploading_a_dataset.ipynb
+++ b/samples/Uploading_a_dataset.ipynb
@@ -243,12 +243,12 @@
      "evalue": "Files do not match dataset type. Expected file type requirements: \nPaired FASTQ (Illumina Format) *_S*_L???_{I,R}{1,2}_001.fastq.gz",
      "output_type": "error",
      "traceback": [
-      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)",
-      "\u001b[0;32m/var/folders/ck/j40906kx3mj90bcc8qs7gyxm0000gp/T/ipykernel_83747/2225702019.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[0;31m# Try to upload the data (which will cause an error)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 7\u001b[0;31m project.upload_dataset(\n\u001b[0m\u001b[1;32m 8\u001b[0m \u001b[0mname\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m'Test dataset'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 9\u001b[0m \u001b[0mdescription\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m''\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
-      "\u001b[0;32m~/Documents/GitHub/Cirro-client/cirro/sdk/project.py\u001b[0m in \u001b[0;36mupload_dataset\u001b[0;34m(self, name, description, process, upload_folder, files)\u001b[0m\n\u001b[1;32m 126\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 127\u001b[0m \u001b[0;31m# Make sure that the files match the expected pattern\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 128\u001b[0;31m \u001b[0mcheck_dataset_files\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfiles\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mprocess\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfile_mapping_rules\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mupload_folder\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 129\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 130\u001b[0m \u001b[0;31m# Create the ingest process request\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
-      "\u001b[0;32m~/Documents/GitHub/Cirro-client/cirro/file_utils.py\u001b[0m in \u001b[0;36mcheck_dataset_files\u001b[0;34m(files, file_mapping_rules, directory)\u001b[0m\n\u001b[1;32m 148\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 149\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0many\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmap\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfunctools\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpartial\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmatch_pattern\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfiles\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfile_mapping_rules\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 150\u001b[0;31m raise ValueError(\"Files do not match dataset type. Expected file type requirements: \\n\" + \"\\n\".join(\n\u001b[0m\u001b[1;32m 151\u001b[0m [f\"{rule.get('description', '')} {rule.get('glob')}\" for rule in file_mapping_rules]))\n",
-      "\u001b[0;31mValueError\u001b[0m: Files do not match dataset type. Expected file type requirements: \nPaired FASTQ (Illumina Format) *_S*_L???_{I,R}{1,2}_001.fastq.gz"
+      "\u001B[0;31m---------------------------------------------------------------------------\u001B[0m",
+      "\u001B[0;31mValueError\u001B[0m Traceback (most recent call last)",
+      "\u001B[0;32m/var/folders/ck/j40906kx3mj90bcc8qs7gyxm0000gp/T/ipykernel_83747/2225702019.py\u001B[0m in \u001B[0;36m<module>\u001B[0;34m\u001B[0m\n\u001B[1;32m 5\u001B[0m \u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 6\u001B[0m \u001B[0;31m# Try to upload the data (which will cause an error)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0;32m----> 7\u001B[0;31m project.upload_dataset(\n\u001B[0m\u001B[1;32m 8\u001B[0m \u001B[0mname\u001B[0m \u001B[0;34m=\u001B[0m \u001B[0;34m'Test dataset'\u001B[0m\u001B[0;34m,\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 9\u001B[0m \u001B[0mdescription\u001B[0m \u001B[0;34m=\u001B[0m \u001B[0;34m''\u001B[0m\u001B[0;34m,\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n",
+      "\u001B[0;32m~/Documents/GitHub/Cirro-client/cirro/sdk/project.py\u001B[0m in \u001B[0;36mupload_dataset\u001B[0;34m(self, name, description, process, upload_folder, files)\u001B[0m\n\u001B[1;32m 126\u001B[0m \u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 127\u001B[0m \u001B[0;31m# Make sure that the files match the expected pattern\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0;32m--> 128\u001B[0;31m \u001B[0mcheck_dataset_files\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mfiles\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mprocess\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mfile_mapping_rules\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mupload_folder\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0m\u001B[1;32m 129\u001B[0m \u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 130\u001B[0m \u001B[0;31m# Create the ingest process request\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n",
+      "\u001B[0;32m~/Documents/GitHub/Cirro-client/cirro/file_utils.py\u001B[0m in \u001B[0;36mcheck_dataset_files\u001B[0;34m(files, file_mapping_rules, directory)\u001B[0m\n\u001B[1;32m 148\u001B[0m \u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 149\u001B[0m \u001B[0;32mif\u001B[0m \u001B[0;32mnot\u001B[0m \u001B[0many\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mmap\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mfunctools\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mpartial\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mmatch_pattern\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mfiles\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mfile_mapping_rules\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0;32m--> 150\u001B[0;31m raise ValueError(\"Files do not match dataset type. Expected file type requirements: \\n\" + \"\\n\".join(\n\u001B[0m\u001B[1;32m 151\u001B[0m [f\"{rule.get('description', '')} {rule.get('glob')}\" for rule in file_mapping_rules]))\n",
+      "\u001B[0;31mValueError\u001B[0m: Files do not match dataset type. Expected file type requirements: \nPaired FASTQ (Illumina Format) *_S*_L???_{I,R}{1,2}_001.fastq.gz"
      ]
     }
    ],
@@ -259,7 +259,7 @@
     "print(json.dumps(ingest_10X.file_mapping_rules, indent=3))\n",
     "\n",
     "# Try to upload the data (which will cause an error)\n",
-    "project.upload_dataset(\n",
+    "dataset = project.upload_dataset(\n",
     "    name = 'Test dataset',\n",
     "    description = '',\n",
     "    upload_folder = '/tmp',\n",
@@ -269,11 +269,43 @@
    ]
   },
   {
-   "cell_type": "code",
-   "execution_count": null,
    "metadata": {},
+   "cell_type": "markdown",
+   "source": [
+    "You can update a sample sheet on an existing dataset by using the `update_samplesheet` method.\n",
+    "\n",
+    "You may provide either the CSV contents or a file path."
+   ]
+  },
+  {
+   "metadata": {},
+   "cell_type": "code",
    "outputs": [],
-   "source": []
+   "execution_count": null,
+   "source": [
+    "from pathlib import Path\n",
+    "import pandas as pd\n",
+    "\n",
+    "samplesheet = pd.DataFrame.from_records([\n",
+    "    {\n",
+    "        'sample': 'test',\n",
+    "        'fastq_1': 'test.R1.fastq.gz',\n",
+    "        'fastq_2': 'test.R2.fastq.gz',\n",
+    "        'status': 'Normal'\n",
+    "    }\n",
+    "])\n",
+    "\n",
+    "dataset.update_samplesheet(\n",
+    "    contents=samplesheet.to_csv(index=False),\n",
+    ")\n",
+    "\n",
+    "\n",
+    "# OR\n",
+    "\n",
+    "dataset.update_samplesheet(\n",
+    "    file_path=Path('~/samplesheet.csv')\n",
+    ")"
+   ]
   }
  ],
 "metadata": {
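
Because the SDK method validates the new sheet against the dataset's files
before writing anything, a mismatched sheet is rejected client-side with a
DataPortalInputError; a short defensive sketch (the CSV contents here are a
deliberately invalid placeholder):

    from cirro.sdk.exceptions import DataPortalInputError

    try:
        dataset.update_samplesheet(
            contents='sample,fastq_1\nmissing,missing.R1.fastq.gz\n'  # placeholder CSV
        )
    except DataPortalInputError as err:
        # Raised when the process-level file requirements check fails,
        # or when the target is not an ingest dataset
        print(f'Samplesheet rejected: {err}')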