Skip to content

Commit 554a82b

Browse files
committed
Add CLI method to validate that a local folder matches a dataset in Cirro
1 parent 688beb6 commit 554a82b

File tree

9 files changed

+337
-48
lines changed

9 files changed

+337
-48
lines changed

README.md

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,26 @@ Options:
8181
$ cirro upload --project "Test Project 1" --name "test" --file "sample1.fastq.gz" --file "sample2.fastq.gz" --data-directory "~/data" --data-type "Paired DNAseq (FASTQ)"
8282
```
8383

84+
#### Validating that a dataset matches a local folder
85+
86+
```bash
87+
Usage: cirro validate-folder [OPTIONS]
88+
89+
Validate that the contents of a local folder match those of a dataset in Cirro
90+
91+
Options:
92+
--dataset TEXT Name or ID of the dataset
93+
--project TEXT Name or ID of the project
94+
--data-directory TEXT Local directory you wish to validate
95+
-i, --interactive Gather arguments interactively
96+
--help Show this message and exit.
97+
98+
```
99+
100+
```bash
101+
$ cirro validate-folder --project "Test Project 1" --dataset "test" --data-directory "~/data"
102+
```
103+
84104
#### Uploading a reference
85105

86106
```bash

cirro/cli/__init__.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,11 @@
1-
from cirro.cli.controller import run_ingest, run_download, run_configure, run_list_datasets, run_create_pipeline_config
1+
from cirro.cli.controller import run_ingest, run_download, run_configure, run_list_datasets
2+
from cirro.cli.controller import run_create_pipeline_config, run_validate_folder
23

34
__all__ = [
45
'run_ingest',
56
'run_download',
67
'run_configure',
78
'run_list_datasets',
8-
'run_create_pipeline_config'
9+
'run_create_pipeline_config',
10+
'run_validate_folder'
911
]

cirro/cli/cli.py

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,8 @@
44
import requests
55
from cirro_api_client.v1.errors import CirroException
66

7-
from cirro.cli import run_ingest, run_download, run_configure, run_list_datasets, run_create_pipeline_config
7+
from cirro.cli import run_ingest, run_download, run_configure, run_list_datasets
8+
from cirro.cli import run_create_pipeline_config, run_validate_folder
89
from cirro.cli.controller import handle_error, run_upload_reference
910
from cirro.cli.interactive.utils import InputError
1011

@@ -81,6 +82,21 @@ def upload(**kwargs):
8182
run_ingest(kwargs, interactive=kwargs.get('interactive'))
8283

8384

85+
@run.command(help='Validate a dataset exactly matches a local folder', no_args_is_help=True)
86+
@click.option('--dataset',
87+
help='Name or ID of the dataset')
88+
@click.option('--project',
89+
help='Name or ID of the project')
90+
@click.option('--data-directory',
91+
help='Local directory you wish to validate')
92+
@click.option('-i', '--interactive',
93+
help='Gather arguments interactively',
94+
is_flag=True, default=False)
95+
def validate_folder(**kwargs):
96+
check_required_args(kwargs)
97+
run_validate_folder(kwargs, interactive=kwargs.get('interactive'))
98+
99+
84100
@run.command(help='Upload a reference to a project', no_args_is_help=True)
85101
@click.option('--name',
86102
help='Name of the reference')

cirro/cli/controller.py

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
from cirro.cli.interactive.download_args import gather_download_arguments_dataset
1414
from cirro.cli.interactive.list_dataset_args import gather_list_arguments
1515
from cirro.cli.interactive.upload_args import gather_upload_arguments
16+
from cirro.cli.interactive.validate_args import gather_validate_arguments, gather_validate_arguments_dataset
1617
from cirro.cli.interactive.upload_reference_args import gather_reference_upload_arguments
1718
from cirro.cli.interactive.utils import get_id_from_name, get_item_from_name_or_id, InputError, validate_files
1819
from cirro.cli.models import ListArguments, UploadArguments, DownloadArguments, CreatePipelineConfigArguments, \
@@ -119,6 +120,57 @@ def run_ingest(input_params: UploadArguments, interactive=False):
119120
logger.info(f"File content validated by {cirro.configuration.checksum_method_display}")
120121

121122

123+
def run_validate_folder(input_params: UploadArguments, interactive=False):
124+
_check_configure()
125+
cirro = CirroApi()
126+
logger.info(f"Collecting data from {cirro.configuration.base_url}")
127+
128+
logger.info("Listing available projects")
129+
projects = cirro.projects.list()
130+
131+
if len(projects) == 0:
132+
raise InputError(NO_PROJECTS)
133+
134+
if interactive:
135+
input_params = gather_validate_arguments(input_params, projects)
136+
137+
input_params['project'] = get_id_from_name(projects, input_params['project'])
138+
datasets = list_all_datasets(project_id=input_params['project'], client=cirro)
139+
# Filter out datasets that are not complete
140+
datasets = [d for d in datasets if d.status == Status.COMPLETED]
141+
input_params = gather_validate_arguments_dataset(input_params, datasets)
142+
files = cirro.datasets.get_assets_listing(input_params['project'], input_params['dataset']).files
143+
144+
if len(files) == 0:
145+
raise InputError('There are no files in this dataset to validate against')
146+
147+
project_id = input_params['project']
148+
dataset_id = input_params['dataset']
149+
150+
else:
151+
project_id = get_id_from_name(projects, input_params['project'])
152+
datasets = cirro.datasets.list(project_id)
153+
dataset_id = get_id_from_name(datasets, input_params['dataset'])
154+
155+
logger.info("Validating files")
156+
157+
validation_results = cirro.datasets.validate_folder(
158+
project_id=project_id,
159+
dataset_id=dataset_id,
160+
local_folder=input_params['data_directory']
161+
)
162+
163+
for file_list, label in [
164+
[validation_results['ds_files_matching'], "Files exactly matching in Cirro and locally"],
165+
[validation_results['ds_files_notmatching'], "Files with differing checksums in Cirro and locally"],
166+
[validation_results['ds_files_missing'], "Files present in Cirro but not locally"],
167+
[validation_results['local_only_files'], "Files present locally but not in Cirro"]
168+
]:
169+
logger.info(f"{label}: {len(file_list):,}")
170+
for file in file_list:
171+
logger.info(f" - {file}")
172+
173+
122174
def run_download(input_params: DownloadArguments, interactive=False):
123175
_check_configure()
124176
cirro = CirroApi()

cirro/cli/interactive/common_args.py

Lines changed: 42 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,15 @@
11
from typing import List
22

3-
from cirro_api_client.v1.models import Project
3+
from cirro_api_client.v1.models import Project, Dataset
4+
from cirro.cli.interactive.utils import ask, prompt_wrapper, InputError
5+
from cirro.utils import format_date
6+
from cirro.models.dataset import DatasetWithShare
47

5-
from cirro.cli.interactive.utils import ask
8+
9+
def _format_share(dataset: Dataset | DatasetWithShare) -> str:
10+
if isinstance(dataset, DatasetWithShare) and dataset.share:
11+
return f'({dataset.share.name})'
12+
return ''
613

714

815
def ask_project(projects: List[Project], input_value: str) -> str:
@@ -21,3 +28,36 @@ def ask_project(projects: List[Project], input_value: str) -> str:
2128
choices=project_names,
2229
default=input_value if input_value in project_names else ''
2330
)
31+
32+
33+
def ask_dataset(datasets: List[Dataset], input_value: str, msg_action: str) -> str:
34+
if len(datasets) == 0:
35+
raise InputError("No datasets available")
36+
sorted_datasets = sorted(datasets, key=lambda d: d.created_at, reverse=True)
37+
dataset_prompt = {
38+
'type': 'autocomplete',
39+
'name': 'dataset',
40+
'message': f'What dataset would you like to {msg_action}? (Press Tab to see all options)',
41+
'choices': [f'{dataset.name} - {dataset.id}' for dataset in sorted_datasets],
42+
'meta_information': {
43+
f'{dataset.name} - {dataset.id}': f'{format_date(dataset.created_at)} {_format_share(dataset)}'
44+
for dataset in datasets
45+
},
46+
'ignore_case': True
47+
}
48+
answers = prompt_wrapper(dataset_prompt)
49+
choice = answers['dataset']
50+
# Map the answer to a dataset
51+
for dataset in datasets:
52+
if f'{dataset.name} - {dataset.id}' == choice:
53+
return dataset.id
54+
55+
# The user has made a selection which does not match
56+
# any of the options available.
57+
# This is most likely because there was a typo
58+
if ask(
59+
'confirm',
60+
'The selection does match an option available - try again?'
61+
):
62+
return ask_dataset(datasets, input_value)
63+
raise InputError("Exiting - no dataset selected")

cirro/cli/interactive/download_args.py

Lines changed: 2 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -4,51 +4,10 @@
44

55
from cirro_api_client.v1.models import Dataset, Project
66

7-
from cirro.cli.interactive.common_args import ask_project
7+
from cirro.cli.interactive.common_args import ask_project, ask_dataset
88
from cirro.cli.interactive.utils import ask, prompt_wrapper, InputError
99
from cirro.cli.models import DownloadArguments
10-
from cirro.models.dataset import DatasetWithShare
1110
from cirro.models.file import File
12-
from cirro.utils import format_date
13-
14-
15-
def _format_share(dataset: Dataset | DatasetWithShare) -> str:
16-
if isinstance(dataset, DatasetWithShare) and dataset.share:
17-
return f'({dataset.share.name})'
18-
return ''
19-
20-
21-
def ask_dataset(datasets: List[Dataset], input_value: str) -> str:
22-
if len(datasets) == 0:
23-
raise InputError("No datasets available")
24-
sorted_datasets = sorted(datasets, key=lambda d: d.created_at, reverse=True)
25-
dataset_prompt = {
26-
'type': 'autocomplete',
27-
'name': 'dataset',
28-
'message': 'What dataset would you like to download? (Press Tab to see all options)',
29-
'choices': [f'{dataset.name} - {dataset.id}' for dataset in sorted_datasets],
30-
'meta_information': {
31-
f'{dataset.name} - {dataset.id}': f'{format_date(dataset.created_at)} {_format_share(dataset)}'
32-
for dataset in datasets
33-
},
34-
'ignore_case': True
35-
}
36-
answers = prompt_wrapper(dataset_prompt)
37-
choice = answers['dataset']
38-
# Map the answer to a dataset
39-
for dataset in datasets:
40-
if f'{dataset.name} - {dataset.id}' == choice:
41-
return dataset.id
42-
43-
# The user has made a selection which does not match
44-
# any of the options available.
45-
# This is most likely because there was a typo
46-
if ask(
47-
'confirm',
48-
'The selection does match an option available - try again?'
49-
):
50-
return ask_dataset(datasets, input_value)
51-
raise InputError("Exiting - no dataset selected")
5211

5312

5413
def ask_dataset_files(files: List[File]) -> List[File]:
@@ -172,6 +131,6 @@ def gather_download_arguments(input_params: DownloadArguments, projects: List[Pr
172131

173132

174133
def gather_download_arguments_dataset(input_params: DownloadArguments, datasets: List[Dataset]):
175-
input_params['dataset'] = ask_dataset(datasets, input_params.get('dataset'))
134+
input_params['dataset'] = ask_dataset(datasets, input_params.get('dataset'), 'download')
176135
input_params['data_directory'] = ask_directory(input_params.get('data_directory'))
177136
return input_params
Lines changed: 136 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,136 @@
1+
from fnmatch import fnmatch
2+
from pathlib import Path
3+
from typing import List
4+
5+
from cirro_api_client.v1.models import Dataset, Project
6+
7+
from cirro.cli.interactive.common_args import ask_project, ask_dataset
8+
from cirro.cli.interactive.utils import ask, prompt_wrapper, InputError
9+
from cirro.cli.models import DownloadArguments
10+
from cirro.models.file import File
11+
12+
13+
def ask_dataset_files(files: List[File]) -> List[File]:
14+
"""Get the list of files which the user would like to download from the dataset."""
15+
16+
choices = [
17+
"Download all files",
18+
"Select files from a list",
19+
"Select files with a naming pattern (glob)"
20+
]
21+
22+
selection_mode_prompt = {
23+
'type': 'select',
24+
'name': 'mode',
25+
'message': 'Which files would you like to download from this dataset?',
26+
'choices': choices
27+
}
28+
29+
answers = prompt_wrapper(selection_mode_prompt)
30+
31+
if answers['mode'] == choices[0]:
32+
return files
33+
elif answers['mode'] == choices[1]:
34+
return ask_dataset_files_list(files)
35+
else:
36+
return ask_dataset_files_glob(files)
37+
38+
39+
def strip_prefix(fp: str, prefix: str):
40+
assert fp.startswith(prefix), f"Expected {fp} to start with {prefix}"
41+
return fp[len(prefix):]
42+
43+
44+
def ask_dataset_files_list(files: List[File]) -> List[File]:
45+
answers = prompt_wrapper({
46+
'type': 'checkbox',
47+
'name': 'files',
48+
'message': 'Select the files to download',
49+
'choices': [
50+
strip_prefix(file.relative_path, "data/")
51+
for file in files
52+
]
53+
})
54+
55+
selected_files = [
56+
file
57+
for file in files
58+
if strip_prefix(file.relative_path, "data/") in set(answers['files'])
59+
]
60+
61+
if len(selected_files) == 0:
62+
if ask(
63+
"confirm",
64+
"No files were selected - try again?"
65+
):
66+
return ask_dataset_files_list(files)
67+
else:
68+
raise InputError("No files selected")
69+
else:
70+
return selected_files
71+
72+
73+
def ask_dataset_files_glob(files: List[File]) -> List[File]:
74+
75+
confirmed = False
76+
while not confirmed:
77+
selected_files = ask_dataset_files_glob_single(files)
78+
confirmed = ask(
79+
"confirm",
80+
f'Number of files selected: {len(selected_files):} / {len(files):,}'
81+
)
82+
83+
if len(selected_files) == 0:
84+
raise InputError("No files selected")
85+
86+
return selected_files
87+
88+
89+
def ask_dataset_files_glob_single(files: List[File]) -> List[File]:
90+
91+
print("All Files:")
92+
for file in files:
93+
print(f" - {strip_prefix(file.relative_path, 'data/')}")
94+
95+
answers = prompt_wrapper({
96+
'type': 'text',
97+
'name': 'glob',
98+
'message': 'Select files by naming pattern (using the * wildcard)',
99+
'default': '*'
100+
})
101+
102+
selected_files = [
103+
file
104+
for file in files
105+
if fnmatch(strip_prefix(file.relative_path, "data/"), answers['glob'])
106+
]
107+
108+
print("Selected Files:")
109+
for file in selected_files:
110+
print(f" - {strip_prefix(file.relative_path, 'data/')}")
111+
112+
return selected_files
113+
114+
115+
def ask_directory(input_value: str) -> str:
116+
directory_prompt = {
117+
'type': 'path',
118+
'name': 'directory',
119+
'only_directories': True,
120+
'message': 'What local folder would you like to compare data contents for?',
121+
'default': input_value or str(Path.cwd())
122+
}
123+
124+
answers = prompt_wrapper(directory_prompt)
125+
return answers['directory']
126+
127+
128+
def gather_validate_arguments(input_params: DownloadArguments, projects: List[Project]):
129+
input_params['project'] = ask_project(projects, input_params.get('project'))
130+
return input_params
131+
132+
133+
def gather_validate_arguments_dataset(input_params: DownloadArguments, datasets: List[Dataset]):
134+
input_params['dataset'] = ask_dataset(datasets, input_params.get('dataset'), 'validate')
135+
input_params['data_directory'] = ask_directory(input_params.get('data_directory'))
136+
return input_params

0 commit comments

Comments
 (0)