Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,26 @@ Options:
$ cirro upload --project "Test Project 1" --name "test" --file "sample1.fastq.gz" --file "sample2.fastq.gz" --data-directory "~/data" --data-type "Paired DNAseq (FASTQ)"
```

#### Validating that a dataset matches a local folder

```bash
Usage: cirro validate [OPTIONS]

Validate that the contents of a local folder match those of a dataset in Cirro

Options:
--dataset TEXT Name or ID of the dataset
--project TEXT Name or ID of the project
--data-directory TEXT Local directory you wish to validate
-i, --interactive Gather arguments interactively
--help Show this message and exit.

```

```bash
$ cirro validate --project "Test Project 1" --dataset "test" --data-directory "~/data"
```

#### Uploading a reference

```bash
Expand Down
6 changes: 4 additions & 2 deletions cirro/cli/__init__.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
from cirro.cli.controller import run_ingest, run_download, run_configure, run_list_datasets, run_create_pipeline_config
from cirro.cli.controller import run_ingest, run_download, run_configure, run_list_datasets
from cirro.cli.controller import run_create_pipeline_config, run_validate_folder

__all__ = [
'run_ingest',
'run_download',
'run_configure',
'run_list_datasets',
'run_create_pipeline_config'
'run_create_pipeline_config',
'run_validate_folder'
]
18 changes: 17 additions & 1 deletion cirro/cli/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@
import requests
from cirro_api_client.v1.errors import CirroException

from cirro.cli import run_ingest, run_download, run_configure, run_list_datasets, run_create_pipeline_config
from cirro.cli import run_create_pipeline_config, run_validate_folder
from cirro.cli import run_ingest, run_download, run_configure, run_list_datasets
from cirro.cli.controller import handle_error, run_upload_reference
from cirro.cli.interactive.utils import InputError

Expand Down Expand Up @@ -81,6 +82,21 @@ def upload(**kwargs):
run_ingest(kwargs, interactive=kwargs.get('interactive'))


@run.command(help='Validate a dataset exactly matches a local folder', no_args_is_help=True)
@click.option('--dataset', help='Name or ID of the dataset')
@click.option('--project', help='Name or ID of the project')
@click.option('--data-directory', help='Local directory you wish to validate')
@click.option('-i', '--interactive', is_flag=True, default=False, help='Gather arguments interactively')
def validate(**kwargs):
    # CLI entry point for the `validate` command: the collected options are
    # passed through check_required_args before the folder comparison runs.
    check_required_args(kwargs)
    interactive = kwargs.get('interactive')
    run_validate_folder(kwargs, interactive=interactive)


@run.command(help='Upload a reference to a project', no_args_is_help=True)
@click.option('--name',
help='Name of the reference')
Expand Down
55 changes: 54 additions & 1 deletion cirro/cli/controller.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,9 @@
from cirro.cli.interactive.upload_args import gather_upload_arguments
from cirro.cli.interactive.upload_reference_args import gather_reference_upload_arguments
from cirro.cli.interactive.utils import get_id_from_name, get_item_from_name_or_id, InputError, validate_files
from cirro.cli.interactive.validate_args import gather_validate_arguments, gather_validate_arguments_dataset
from cirro.cli.models import ListArguments, UploadArguments, DownloadArguments, CreatePipelineConfigArguments, \
UploadReferenceArguments
UploadReferenceArguments, ValidateArguments
from cirro.config import UserConfig, save_user_config, load_user_config
from cirro.file_utils import get_files_in_directory
from cirro.models.process import PipelineDefinition, ConfigAppStatus, CONFIG_APP_URL
Expand Down Expand Up @@ -119,6 +120,58 @@ def run_ingest(input_params: UploadArguments, interactive=False):
logger.info(f"File content validated by {cirro.configuration.checksum_method_display}")


def run_validate_folder(input_params: ValidateArguments, interactive=False):
    """
    Compare the contents of a local folder with a dataset and log the result.

    In interactive mode the project, dataset and local directory are gathered
    through prompts (only completed datasets are offered, and a dataset with no
    files is rejected). Otherwise the project and dataset given in
    ``input_params`` are resolved to IDs directly.

    Each category of the validation result is logged as a heading with a file
    count, followed by one line per file.

    Raises:
        InputError: if no projects are available, or (interactive mode) the
            selected dataset has no files.
    """
    _check_configure()
    cirro = CirroApi()
    logger.info(f"Collecting data from {cirro.configuration.base_url}")

    logger.info("Listing available projects")
    projects = cirro.projects.list()
    if len(projects) == 0:
        raise InputError(NO_PROJECTS)

    if not interactive:
        # Resolve the supplied names/IDs without prompting
        project_id = get_id_from_name(projects, input_params['project'])
        project_datasets = cirro.datasets.list(project_id)
        dataset_id = get_id_from_name(project_datasets, input_params['dataset'])
    else:
        input_params = gather_validate_arguments(input_params, projects)
        input_params['project'] = get_id_from_name(projects, input_params['project'])

        # Only datasets that have completed are offered for selection
        completed_datasets = [
            dataset
            for dataset in list_all_datasets(project_id=input_params['project'], client=cirro)
            if dataset.status == Status.COMPLETED
        ]
        input_params = gather_validate_arguments_dataset(input_params, completed_datasets)

        asset_listing = cirro.datasets.get_assets_listing(input_params['project'], input_params['dataset'])
        if len(asset_listing.files) == 0:
            raise InputError('There are no files in this dataset to validate against')

        project_id = input_params['project']
        dataset_id = input_params['dataset']

    logger.info("Validating files")

    validation_results = cirro.datasets.validate_folder(
        project_id=project_id,
        dataset_id=dataset_id,
        local_folder=input_params['data_directory']
    )

    # (heading, log level, files) for each result category, in reporting order
    report_sections = (
        ("✅ Matched Files (identical in Cirro and locally)",
         logging.INFO, validation_results.files_matching),
        ("⚠️ Checksum Mismatches (same file name, different content)",
         logging.WARNING, validation_results.files_not_matching),
        ("⚠️ Missing Locally (present in system but not found locally)",
         logging.WARNING, validation_results.files_missing),
        ("⚠️ Unexpected Local Files (present locally but not in system)",
         logging.WARNING, validation_results.local_only_files),
        ("⚠️ Validation Failed (checksums may not be available)",
         logging.WARNING, validation_results.validate_errors),
    )
    for heading, severity, paths in report_sections:
        logger.log(level=severity, msg=f"{heading}: {len(paths):,}")
        for path in paths:
            logger.log(level=severity, msg=f" - {path}")


def run_download(input_params: DownloadArguments, interactive=False):
_check_configure()
cirro = CirroApi()
Expand Down
45 changes: 43 additions & 2 deletions cirro/cli/interactive/common_args.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,16 @@
from typing import List

from cirro_api_client.v1.models import Project
from cirro_api_client.v1.models import Project, Dataset

from cirro.cli.interactive.utils import ask
from cirro.cli.interactive.utils import ask, prompt_wrapper, InputError
from cirro.models.dataset import DatasetWithShare
from cirro.utils import format_date


def _format_share(dataset: Dataset | DatasetWithShare) -> str:
    """Return the share name in parentheses, or '' when the dataset carries no share."""
    has_share = isinstance(dataset, DatasetWithShare) and dataset.share
    return f'({dataset.share.name})' if has_share else ''


def ask_project(projects: List[Project], input_value: str) -> str:
Expand All @@ -21,3 +29,36 @@ def ask_project(projects: List[Project], input_value: str) -> str:
choices=project_names,
default=input_value if input_value in project_names else ''
)


def ask_dataset(datasets: List[Dataset], input_value: str, msg_action: str) -> str:
    """
    Prompt the user to pick one of the given datasets and return its ID.

    Args:
        datasets: datasets to choose from.
        input_value: not used for the prompt itself; only forwarded when the
            prompt is repeated.
        msg_action: verb inserted into the prompt message
            ("What dataset would you like to <msg_action>?").

    Returns:
        The ID of the selected dataset.

    Raises:
        InputError: if there are no datasets, or the user's entry matches no
            option and they decline to try again.
    """
    if not datasets:
        raise InputError("No datasets available")

    def _label(dataset) -> str:
        # Display text for a dataset; also used to map the answer back to it
        return f'{dataset.name} - {dataset.id}'

    # Most recently created datasets are listed first
    sorted_datasets = sorted(datasets, key=lambda d: d.created_at, reverse=True)
    dataset_prompt = {
        'type': 'autocomplete',
        'name': 'dataset',
        'message': f'What dataset would you like to {msg_action}? (Press Tab to see all options)',
        'choices': [_label(dataset) for dataset in sorted_datasets],
        'meta_information': {
            _label(dataset): f'{format_date(dataset.created_at)} {_format_share(dataset)}'
            for dataset in datasets
        },
        'ignore_case': True
    }
    answers = prompt_wrapper(dataset_prompt)
    choice = answers['dataset']
    # Map the answer to a dataset
    for dataset in datasets:
        if _label(dataset) == choice:
            return dataset.id

    # The user has made a selection which does not match
    # any of the options available.
    # This is most likely because there was a typo
    if ask(
        'confirm',
        'The selection does not match an option available - try again?'
    ):
        return ask_dataset(datasets, input_value, msg_action)
    raise InputError("Exiting - no dataset selected")
45 changes: 2 additions & 43 deletions cirro/cli/interactive/download_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,51 +4,10 @@

from cirro_api_client.v1.models import Dataset, Project

from cirro.cli.interactive.common_args import ask_project
from cirro.cli.interactive.common_args import ask_project, ask_dataset
from cirro.cli.interactive.utils import ask, prompt_wrapper, InputError
from cirro.cli.models import DownloadArguments
from cirro.models.dataset import DatasetWithShare
from cirro.models.file import File
from cirro.utils import format_date


def _format_share(dataset: Dataset | DatasetWithShare) -> str:
    """Return the share name in parentheses, or '' when the dataset carries no share."""
    has_share = isinstance(dataset, DatasetWithShare) and dataset.share
    return f'({dataset.share.name})' if has_share else ''


def ask_dataset(datasets: List[Dataset], input_value: str) -> str:
    """
    Prompt the user to pick one of the given datasets to download and return its ID.

    Args:
        datasets: datasets to choose from.
        input_value: not used for the prompt itself; only forwarded when the
            prompt is repeated.

    Returns:
        The ID of the selected dataset.

    Raises:
        InputError: if there are no datasets, or the user's entry matches no
            option and they decline to try again.
    """
    if not datasets:
        raise InputError("No datasets available")

    def _label(dataset) -> str:
        # Display text for a dataset; also used to map the answer back to it
        return f'{dataset.name} - {dataset.id}'

    # Most recently created datasets are listed first
    sorted_datasets = sorted(datasets, key=lambda d: d.created_at, reverse=True)
    dataset_prompt = {
        'type': 'autocomplete',
        'name': 'dataset',
        'message': 'What dataset would you like to download? (Press Tab to see all options)',
        'choices': [_label(dataset) for dataset in sorted_datasets],
        'meta_information': {
            _label(dataset): f'{format_date(dataset.created_at)} {_format_share(dataset)}'
            for dataset in datasets
        },
        'ignore_case': True
    }
    answers = prompt_wrapper(dataset_prompt)
    choice = answers['dataset']
    # Map the answer to a dataset
    for dataset in datasets:
        if _label(dataset) == choice:
            return dataset.id

    # The user has made a selection which does not match
    # any of the options available.
    # This is most likely because there was a typo
    if ask(
        'confirm',
        'The selection does not match an option available - try again?'
    ):
        return ask_dataset(datasets, input_value)
    raise InputError("Exiting - no dataset selected")


def ask_dataset_files(files: List[File]) -> List[File]:
Expand Down Expand Up @@ -172,6 +131,6 @@ def gather_download_arguments(input_params: DownloadArguments, projects: List[Pr


def gather_download_arguments_dataset(input_params: DownloadArguments, datasets: List[Dataset]):
input_params['dataset'] = ask_dataset(datasets, input_params.get('dataset'))
input_params['dataset'] = ask_dataset(datasets, input_params.get('dataset'), msg_action='download')
input_params['data_directory'] = ask_directory(input_params.get('data_directory'))
return input_params
3 changes: 2 additions & 1 deletion cirro/cli/interactive/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,8 @@ def get_id_from_name(items: List[T], name_or_id: str) -> Optional[str]:
matched = get_item_from_name_or_id(items, name_or_id)
if not matched:
item_type = type(items[0]).__name__
raise InputError(f"Could not find {item_type} {name_or_id}")
item_names = ", ".join([i.id for i in items])
raise InputError(f"Could not find {item_type} {name_or_id} - options: {item_names}")
return matched.id


Expand Down
31 changes: 31 additions & 0 deletions cirro/cli/interactive/validate_args.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
from pathlib import Path

from cirro_api_client.v1.models import Dataset, Project

from cirro.cli.interactive.common_args import ask_project, ask_dataset
from cirro.cli.interactive.utils import prompt_wrapper
from cirro.cli.models import ValidateArguments


def ask_directory(input_value: str) -> str:
    """
    Prompt for the local folder to compare against a dataset.

    The prompt defaults to ``input_value`` when it is set, otherwise to the
    current working directory. Returns the answer given for the prompt.
    """
    default_directory = input_value or str(Path.cwd())
    answers = prompt_wrapper({
        'type': 'path',
        'name': 'directory',
        'only_directories': True,
        'message': 'What local folder would you like to compare data contents for?',
        'default': default_directory
    })
    return answers['directory']


def gather_validate_arguments(input_params: ValidateArguments, projects: list[Project]):
    """Ask which project to use and store the answer in ``input_params['project']``.

    Any project already present in ``input_params`` is passed to the prompt.
    The same (mutated) ``input_params`` mapping is returned.
    """
    current_project = input_params.get('project')
    input_params['project'] = ask_project(projects, current_project)
    return input_params


def gather_validate_arguments_dataset(input_params: ValidateArguments, datasets: list[Dataset]):
    """Ask for the dataset and then the local directory to validate.

    The answers are stored under ``'dataset'`` and ``'data_directory'`` in
    ``input_params``, which is returned after being updated in place.
    """
    current_dataset = input_params.get('dataset')
    input_params['dataset'] = ask_dataset(datasets, current_dataset, msg_action='validate')

    current_directory = input_params.get('data_directory')
    input_params['data_directory'] = ask_directory(current_directory)
    return input_params
7 changes: 7 additions & 0 deletions cirro/cli/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,13 @@ class UploadArguments(TypedDict):
file: Optional[list[str]]


class ValidateArguments(TypedDict):
    """Arguments accepted by the ``validate`` command."""
    # What to validate against
    project: str
    dataset: str
    # Local folder whose contents are compared
    data_directory: str
    # Whether arguments are gathered through prompts
    interactive: bool


class ListArguments(TypedDict):
project: str
interactive: bool
Expand Down
9 changes: 6 additions & 3 deletions cirro/file_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,8 +53,11 @@ def generate_flattened_file_map(files: List[PathLike]) -> Dict[PathLike, str]:
}


def _is_hidden_file(file_path: Path):
# Remove hidden files from listing, desktop.ini .DS_Store, etc.
def is_hidden_file(file_path: Path):
"""
Check if a file path is hidden
Such as desktop.ini, .DS_Store, etc.
"""
if os.name == 'nt':
attributes = win32api.GetFileAttributes(str(file_path))
return attributes & (win32con.FILE_ATTRIBUTE_HIDDEN | win32con.FILE_ATTRIBUTE_SYSTEM)
Expand Down Expand Up @@ -86,7 +89,7 @@ def get_files_in_directory(
if file_path.is_dir():
continue

if not include_hidden and _is_hidden_file(file_path):
if not include_hidden and is_hidden_file(file_path):
continue

if not file_path.exists():
Expand Down
9 changes: 9 additions & 0 deletions cirro/models/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,3 +22,12 @@ def from_dataset(cls, dataset: Dataset, share: Share) -> 'DatasetWithShare':
updated_at=dataset.updated_at,
share=share
)


@_attrs_define
class DatasetValidationResponse:
    """
    Result of validating a local folder against a dataset.

    Each attribute is a list of file entries for one outcome category; the
    descriptions below follow the labels the CLI uses when reporting them.
    """
    # Identical in Cirro and locally
    files_matching: list[str]
    # Same file name, different content (checksum mismatch)
    files_not_matching: list[str]
    # Present in the dataset but not found locally
    files_missing: list[str]
    # Present locally but not in the dataset
    local_only_files: list[str]
    # Could not be validated (checksums may not be available)
    validate_errors: list[str]
5 changes: 5 additions & 0 deletions cirro/models/file.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,11 @@ def from_file_entry(cls, file: FileEntry, project_id: str, dataset: DatasetDetai
access_context=access_context
)

@property
def normalized_path(self) -> str:
    """
    Relative path without the leading ``data/`` prefix.

    The prefix is removed only when it is actually present; a path that does
    not start with ``data/`` is returned unchanged instead of losing its
    first five characters.
    """
    return self.relative_path.removeprefix("data/")

@property
def absolute_path(self):
return f'{self.access_context.base_url}/{self.relative_path.strip("/")}'
Expand Down
6 changes: 1 addition & 5 deletions cirro/sdk/file.py
Original file line number Diff line number Diff line change
Expand Up @@ -212,11 +212,7 @@ def is_valid(self, local_path: PathLike) -> bool:
if not local_path:
raise DataPortalInputError("Must provide local path to validate file")

try:
self.validate(local_path)
return True
except ValueError:
return False
return self._client.file.is_valid_file(self._file, local_path)


class DataPortalFiles(DataPortalAssets[DataPortalFile]):
Expand Down
Loading
Loading