Skip to content

Commit ee3dbf7

Browse files
authored
CI-817 - Allow updating a samplesheet from SDK (#169)
* allow updating a samplesheet
* lint
* bump version
1 parent 88e8cad commit ee3dbf7

File tree

5 files changed

+124
-13
lines changed

5 files changed

+124
-13
lines changed

cirro/models/file.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,7 @@ def upload_sample_sheet(cls, project_id: str, dataset_id: str, base_url: str):
8585
access_type=ProjectAccessType.SAMPLESHEET_UPLOAD,
8686
dataset_id=dataset_id
8787
),
88-
base_url=base_url,
88+
base_url=f'{base_url}/data',
8989
project_id=project_id
9090
)
9191

cirro/sdk/dataset.py

Lines changed: 53 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,14 @@
11
import datetime
2+
from pathlib import Path
23
from typing import Union, List, Optional
34

5+
from cirro_api_client.v1.api.processes import validate_file_requirements
46
from cirro_api_client.v1.models import Dataset, DatasetDetail, RunAnalysisRequest, ProcessDetail, Status, \
5-
RunAnalysisRequestParams, Tag, ArtifactType, NamedItem
7+
RunAnalysisRequestParams, Tag, ArtifactType, NamedItem, Executor, ValidateFileRequirementsRequest
68

79
from cirro.cirro_client import CirroApi
810
from cirro.models.assets import DatasetAssets
11+
from cirro.models.file import PathLike
912
from cirro.sdk.asset import DataPortalAssets, DataPortalAsset
1013
from cirro.sdk.exceptions import DataPortalAssetNotFound
1114
from cirro.sdk.exceptions import DataPortalInputError
@@ -302,6 +305,55 @@ def run_analysis(
302305
)
303306
return resp.id
304307

308+
def update_samplesheet(self,
309+
contents: str = None,
310+
file_path: PathLike = None):
311+
"""
312+
Updates the samplesheet metadata of a dataset.
313+
Provide either the contents (as a string) or a file path.
314+
Both must be in the format of a CSV.
315+
316+
Args:
317+
contents (str): Samplesheet contents to update (should be a CSV string)
318+
file_path (PathLike): Path of file to update (should be a CSV file)
319+
320+
Example:
321+
```python
322+
dataset.update_samplesheet(
323+
file_path=Path('~/samplesheet.csv')
324+
)
325+
```
326+
"""
327+
328+
if contents is None and file_path is None:
329+
raise DataPortalInputError("Must specify either 'contents' or 'file_path' when updating samplesheet")
330+
331+
if self.process.executor != Executor.INGEST:
332+
raise DataPortalInputError("Cannot update a samplesheet on a non-ingest dataset")
333+
334+
samplesheet_contents = contents
335+
if file_path is not None:
336+
samplesheet_contents = Path(file_path).expanduser().read_text()
337+
338+
# Validate samplesheet
339+
file_names = [f.file_name for f in self.list_files()]
340+
request = ValidateFileRequirementsRequest(
341+
file_names=file_names,
342+
sample_sheet=samplesheet_contents,
343+
)
344+
requirements = validate_file_requirements.sync(process_id=self.process_id,
345+
body=request,
346+
client=self._client.api_client)
347+
if error_msg := requirements.error_msg:
348+
raise DataPortalInputError(error_msg)
349+
350+
# Update the samplesheet if everything looks ok
351+
self._client.datasets.update_samplesheet(
352+
project_id=self.project_id,
353+
dataset_id=self.id,
354+
samplesheet=samplesheet_contents
355+
)
356+
305357

306358
class DataPortalDatasets(DataPortalAssets[DataPortalDataset]):
307359
"""Collection of multiple DataPortalDataset objects."""

cirro/services/dataset.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -339,3 +339,30 @@ def download_files(
339339
base_url=dataset.s3)
340340

341341
self._file_service.download_files(access_context, download_location, files)
342+
343+
def update_samplesheet(
344+
self,
345+
project_id: str,
346+
dataset_id: str,
347+
samplesheet: str
348+
):
349+
"""
350+
Updates a samplesheet on a dataset
351+
352+
Args:
353+
project_id (str): ID of the Project
354+
dataset_id (str): ID of the Dataset
355+
samplesheet (str): Samplesheet contents to update (should be a CSV string)
356+
"""
357+
dataset = self.get(project_id, dataset_id)
358+
access_context = FileAccessContext.upload_sample_sheet(project_id=project_id,
359+
dataset_id=dataset_id,
360+
base_url=dataset.s3)
361+
362+
samplesheet_key = f'{access_context.prefix}/samplesheet.csv'
363+
self._file_service.create_file(
364+
access_context=access_context,
365+
key=samplesheet_key,
366+
contents=samplesheet,
367+
content_type='text/csv'
368+
)

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[tool.poetry]
22
name = "cirro"
3-
version = "1.7.2"
3+
version = "1.8.0"
44
description = "CLI tool and SDK for interacting with the Cirro platform"
55
authors = ["Cirro Bio <[email protected]>"]
66
license = "MIT"

samples/Uploading_a_dataset.ipynb

Lines changed: 42 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -243,12 +243,12 @@
243243
"evalue": "Files do not match dataset type. Expected file type requirements: \nPaired FASTQ (Illumina Format) *_S*_L???_{I,R}{1,2}_001.fastq.gz",
244244
"output_type": "error",
245245
"traceback": [
246-
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
247-
"\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)",
248-
"\u001b[0;32m/var/folders/ck/j40906kx3mj90bcc8qs7gyxm0000gp/T/ipykernel_83747/2225702019.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[0;31m# Try to upload the data (which will cause an error)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 7\u001b[0;31m project.upload_dataset(\n\u001b[0m\u001b[1;32m 8\u001b[0m \u001b[0mname\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m'Test dataset'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 9\u001b[0m \u001b[0mdescription\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m''\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
249-
"\u001b[0;32m~/Documents/GitHub/Cirro-client/cirro/sdk/project.py\u001b[0m in \u001b[0;36mupload_dataset\u001b[0;34m(self, name, description, process, upload_folder, files)\u001b[0m\n\u001b[1;32m 126\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 127\u001b[0m \u001b[0;31m# Make sure that the files match the expected pattern\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 128\u001b[0;31m \u001b[0mcheck_dataset_files\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfiles\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mprocess\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfile_mapping_rules\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mupload_folder\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 129\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 130\u001b[0m \u001b[0;31m# Create the ingest process request\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
250-
"\u001b[0;32m~/Documents/GitHub/Cirro-client/cirro/file_utils.py\u001b[0m in \u001b[0;36mcheck_dataset_files\u001b[0;34m(files, file_mapping_rules, directory)\u001b[0m\n\u001b[1;32m 148\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 149\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0many\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmap\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfunctools\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpartial\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmatch_pattern\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfiles\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfile_mapping_rules\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 150\u001b[0;31m raise ValueError(\"Files do not match dataset type. Expected file type requirements: \\n\" + \"\\n\".join(\n\u001b[0m\u001b[1;32m 151\u001b[0m [f\"{rule.get('description', '')} {rule.get('glob')}\" for rule in file_mapping_rules]))\n",
251-
"\u001b[0;31mValueError\u001b[0m: Files do not match dataset type. Expected file type requirements: \nPaired FASTQ (Illumina Format) *_S*_L???_{I,R}{1,2}_001.fastq.gz"
246+
"\u001B[0;31m---------------------------------------------------------------------------\u001B[0m",
247+
"\u001B[0;31mValueError\u001B[0m Traceback (most recent call last)",
248+
"\u001B[0;32m/var/folders/ck/j40906kx3mj90bcc8qs7gyxm0000gp/T/ipykernel_83747/2225702019.py\u001B[0m in \u001B[0;36m<module>\u001B[0;34m\u001B[0m\n\u001B[1;32m 5\u001B[0m \u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 6\u001B[0m \u001B[0;31m# Try to upload the data (which will cause an error)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0;32m----> 7\u001B[0;31m project.upload_dataset(\n\u001B[0m\u001B[1;32m 8\u001B[0m \u001B[0mname\u001B[0m \u001B[0;34m=\u001B[0m \u001B[0;34m'Test dataset'\u001B[0m\u001B[0;34m,\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 9\u001B[0m \u001B[0mdescription\u001B[0m \u001B[0;34m=\u001B[0m \u001B[0;34m''\u001B[0m\u001B[0;34m,\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n",
249+
"\u001B[0;32m~/Documents/GitHub/Cirro-client/cirro/sdk/project.py\u001B[0m in \u001B[0;36mupload_dataset\u001B[0;34m(self, name, description, process, upload_folder, files)\u001B[0m\n\u001B[1;32m 126\u001B[0m \u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 127\u001B[0m \u001B[0;31m# Make sure that the files match the expected pattern\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0;32m--> 128\u001B[0;31m \u001B[0mcheck_dataset_files\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mfiles\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mprocess\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mfile_mapping_rules\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mupload_folder\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0m\u001B[1;32m 129\u001B[0m \u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 130\u001B[0m \u001B[0;31m# Create the ingest process request\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n",
250+
"\u001B[0;32m~/Documents/GitHub/Cirro-client/cirro/file_utils.py\u001B[0m in \u001B[0;36mcheck_dataset_files\u001B[0;34m(files, file_mapping_rules, directory)\u001B[0m\n\u001B[1;32m 148\u001B[0m \u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 149\u001B[0m \u001B[0;32mif\u001B[0m \u001B[0;32mnot\u001B[0m \u001B[0many\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mmap\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mfunctools\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mpartial\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mmatch_pattern\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mfiles\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mfile_mapping_rules\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0;32m--> 150\u001B[0;31m raise ValueError(\"Files do not match dataset type. Expected file type requirements: \\n\" + \"\\n\".join(\n\u001B[0m\u001B[1;32m 151\u001B[0m [f\"{rule.get('description', '')} {rule.get('glob')}\" for rule in file_mapping_rules]))\n",
251+
"\u001B[0;31mValueError\u001B[0m: Files do not match dataset type. Expected file type requirements: \nPaired FASTQ (Illumina Format) *_S*_L???_{I,R}{1,2}_001.fastq.gz"
252252
]
253253
}
254254
],
@@ -259,7 +259,7 @@
259259
"print(json.dumps(ingest_10X.file_mapping_rules, indent=3))\n",
260260
"\n",
261261
"# Try to upload the data (which will cause an error)\n",
262-
"project.upload_dataset(\n",
262+
"dataset = project.upload_dataset(\n",
263263
" name = 'Test dataset',\n",
264264
" description = '',\n",
265265
" upload_folder = '/tmp',\n",
@@ -269,11 +269,43 @@
269269
]
270270
},
271271
{
272-
"cell_type": "code",
273-
"execution_count": null,
274272
"metadata": {},
273+
"cell_type": "markdown",
274+
"source": [
275+
"You can update a sample sheet on an existing dataset by using the `update_samplesheet` method.\n",
276+
"\n",
277+
"You may provide either the CSV contents or a file path."
278+
]
279+
},
280+
{
281+
"metadata": {},
282+
"cell_type": "code",
275283
"outputs": [],
276-
"source": []
284+
"execution_count": null,
285+
"source": [
286+
"from pathlib import Path\n",
287+
"import pandas as pd\n",
288+
"\n",
289+
"samplesheet = pd.DataFrame.from_records([\n",
290+
" {\n",
291+
" 'sample': 'test',\n",
292+
" 'fastq_1': 'test.R1.fastq.gz',\n",
293+
" 'fastq_2': 'test.R2.fastq.gz',\n",
294+
" 'status': 'Normal'\n",
295+
" }\n",
296+
"])\n",
297+
"\n",
298+
"dataset.update_samplesheet(\n",
299+
" contents=samplesheet.to_csv(index=False),\n",
300+
")\n",
301+
"\n",
302+
"\n",
303+
"# OR\n",
304+
"\n",
305+
"dataset.update_samplesheet(\n",
306+
" file_path=Path('~/samplesheet.csv')\n",
307+
")"
308+
]
277309
}
278310
],
279311
"metadata": {

0 commit comments

Comments (0)