diff --git a/README.md b/README.md index 676a21c..b0b07e4 100644 --- a/README.md +++ b/README.md @@ -81,6 +81,26 @@ Options: $ cirro upload --project "Test Project 1" --name "test" --file "sample1.fastq.gz" --file "sample2.fastq.gz" --data-directory "~/data" --data-type "Paired DNAseq (FASTQ)" ``` +#### Validating that a dataset matches a local folder + +```bash +Usage: cirro validate [OPTIONS] + + Validate that the contents of a local folder match those of a dataset in Cirro + +Options: + --dataset TEXT Name or ID of the dataset + --project TEXT Name or ID of the project + --data-directory TEXT Local directory you wish to validate + -i, --interactive Gather arguments interactively + --help Show this message and exit. + +``` + +```bash +$ cirro validate --project "Test Project 1" --dataset "test" --data-directory "~/data" +``` + #### Uploading a reference ```bash diff --git a/cirro/cli/__init__.py b/cirro/cli/__init__.py index dc5b06a..cd7004a 100644 --- a/cirro/cli/__init__.py +++ b/cirro/cli/__init__.py @@ -1,9 +1,11 @@ -from cirro.cli.controller import run_ingest, run_download, run_configure, run_list_datasets, run_create_pipeline_config +from cirro.cli.controller import run_ingest, run_download, run_configure, run_list_datasets +from cirro.cli.controller import run_create_pipeline_config, run_validate_folder __all__ = [ 'run_ingest', 'run_download', 'run_configure', 'run_list_datasets', - 'run_create_pipeline_config' + 'run_create_pipeline_config', + 'run_validate_folder' ] diff --git a/cirro/cli/cli.py b/cirro/cli/cli.py index 7554aa0..91cc6f7 100644 --- a/cirro/cli/cli.py +++ b/cirro/cli/cli.py @@ -4,7 +4,8 @@ import requests from cirro_api_client.v1.errors import CirroException -from cirro.cli import run_ingest, run_download, run_configure, run_list_datasets, run_create_pipeline_config +from cirro.cli import run_create_pipeline_config, run_validate_folder +from cirro.cli import run_ingest, run_download, run_configure, run_list_datasets from 
# CLI entry point for ``cirro validate``.
# Checks the supplied arguments, then hands the raw keyword arguments to
# ``run_validate_folder``; the ``--interactive`` flag decides whether the
# remaining values are gathered through prompts.
@run.command(help='Validate a dataset exactly matches a local folder', no_args_is_help=True)
@click.option('--dataset', help='Name or ID of the dataset')
@click.option('--project', help='Name or ID of the project')
@click.option('--data-directory', help='Local directory you wish to validate')
@click.option('-i', '--interactive', help='Gather arguments interactively', is_flag=True, default=False)
def validate(**kwargs):
    check_required_args(kwargs)
    use_prompts = kwargs.get('interactive')
    run_validate_folder(kwargs, interactive=use_prompts)
def run_validate_folder(input_params: ValidateArguments, interactive=False):
    """
    Compare the files of a Cirro dataset against a local folder and log the outcome.

    Resolves the project and dataset (prompting for them when ``interactive``
    is set), asks the dataset service to validate the folder, then logs one
    section per result category with a count followed by the affected paths.

    Args:
        input_params (ValidateArguments): Values gathered from the command line;
            the ``project``, ``dataset`` and ``data_directory`` keys are read.
        interactive (bool): When True, prompt for the project, dataset and folder.

    Raises:
        InputError: If there are no projects, the project has no datasets,
            the project or dataset cannot be resolved, or (interactive mode only)
            the selected dataset contains no files.
    """
    _check_configure()
    cirro = CirroApi()
    logger.info(f"Collecting data from {cirro.configuration.base_url}")

    logger.info("Listing available projects")
    projects = cirro.projects.list()

    if not projects:
        raise InputError(NO_PROJECTS)

    if interactive:
        input_params = gather_validate_arguments(input_params, projects)

    # The project is resolved the same way in both modes, so do it once.
    project_id = get_id_from_name(projects, input_params['project'])

    if interactive:
        input_params['project'] = project_id
        datasets = list_all_datasets(project_id=project_id, client=cirro)
        # Filter out datasets that are not complete
        datasets = [d for d in datasets if d.status == Status.COMPLETED]
        input_params = gather_validate_arguments_dataset(input_params, datasets)
        dataset_id = input_params['dataset']

        files = cirro.datasets.get_assets_listing(project_id, dataset_id).files
        if not files:
            raise InputError('There are no files in this dataset to validate against')
    else:
        datasets = cirro.datasets.list(project_id)
        if not datasets:
            # get_id_from_name inspects items[0] to build its error message,
            # so an empty list would surface as an IndexError rather than a
            # readable input error.
            raise InputError("No datasets available")
        dataset_id = get_id_from_name(datasets, input_params['dataset'])

    logger.info("Validating files")

    validation_results = cirro.datasets.validate_folder(
        project_id=project_id,
        dataset_id=dataset_id,
        local_folder=input_params['data_directory']
    )

    # One (files, heading, log level) entry per result category, in report order.
    report_sections = (
        (validation_results.files_matching,
         "✅ Matched Files (identical in Cirro and locally)", logging.INFO),
        (validation_results.files_not_matching,
         "⚠️ Checksum Mismatches (same file name, different content)", logging.WARNING),
        (validation_results.files_missing,
         "⚠️ Missing Locally (present in system but not found locally)", logging.WARNING),
        (validation_results.local_only_files,
         "⚠️ Unexpected Local Files (present locally but not in system)", logging.WARNING),
        (validation_results.validate_errors,
         "⚠️ Validation Failed (checksums may not be available)", logging.WARNING),
    )
    for file_list, label, log_level in report_sections:
        logger.log(level=log_level, msg=f"{label}: {len(file_list):,}")
        for file in file_list:
            logger.log(level=log_level, msg=f" - {file}")
(Press Tab to see all options)', + 'choices': [f'{dataset.name} - {dataset.id}' for dataset in sorted_datasets], + 'meta_information': { + f'{dataset.name} - {dataset.id}': f'{format_date(dataset.created_at)} {_format_share(dataset)}' + for dataset in datasets + }, + 'ignore_case': True + } + answers = prompt_wrapper(dataset_prompt) + choice = answers['dataset'] + # Map the answer to a dataset + for dataset in datasets: + if f'{dataset.name} - {dataset.id}' == choice: + return dataset.id + + # The user has made a selection which does not match + # any of the options available. + # This is most likely because there was a typo + if ask( + 'confirm', + 'The selection does match an option available - try again?' + ): + return ask_dataset(datasets, input_value, msg_action) + raise InputError("Exiting - no dataset selected") diff --git a/cirro/cli/interactive/download_args.py b/cirro/cli/interactive/download_args.py index fad0fa3..fe12642 100644 --- a/cirro/cli/interactive/download_args.py +++ b/cirro/cli/interactive/download_args.py @@ -4,51 +4,10 @@ from cirro_api_client.v1.models import Dataset, Project -from cirro.cli.interactive.common_args import ask_project +from cirro.cli.interactive.common_args import ask_project, ask_dataset from cirro.cli.interactive.utils import ask, prompt_wrapper, InputError from cirro.cli.models import DownloadArguments -from cirro.models.dataset import DatasetWithShare from cirro.models.file import File -from cirro.utils import format_date - - -def _format_share(dataset: Dataset | DatasetWithShare) -> str: - if isinstance(dataset, DatasetWithShare) and dataset.share: - return f'({dataset.share.name})' - return '' - - -def ask_dataset(datasets: List[Dataset], input_value: str) -> str: - if len(datasets) == 0: - raise InputError("No datasets available") - sorted_datasets = sorted(datasets, key=lambda d: d.created_at, reverse=True) - dataset_prompt = { - 'type': 'autocomplete', - 'name': 'dataset', - 'message': 'What dataset would you like 
def gather_download_arguments_dataset(input_params: DownloadArguments, datasets: List[Dataset]):
    """
    Prompt for the dataset to download and the local directory to use.

    The current ``dataset`` and ``data_directory`` values (if any) are passed
    to the prompts; both keys are then overwritten with the answers and the
    same mapping is returned.
    """
    current_dataset = input_params.get('dataset')
    input_params['dataset'] = ask_dataset(datasets, current_dataset, msg_action='download')

    current_directory = input_params.get('data_directory')
    input_params['data_directory'] = ask_directory(current_directory)

    return input_params
def ask_directory(input_value: str) -> str:
    """
    Prompt for the local folder whose contents should be compared.

    The prompt only offers directories and is pre-filled with ``input_value``
    or, when that is empty, the current working directory.
    """
    starting_dir = input_value or str(Path.cwd())
    answers = prompt_wrapper({
        'type': 'path',
        'name': 'directory',
        'only_directories': True,
        'message': 'What local folder would you like to compare data contents for?',
        'default': starting_dir
    })
    return answers['directory']


def gather_validate_arguments(input_params: ValidateArguments, projects: list[Project]):
    """Prompt for the project and store the answer under ``project``."""
    current_project = input_params.get('project')
    input_params['project'] = ask_project(projects, current_project)
    return input_params


def gather_validate_arguments_dataset(input_params: ValidateArguments, datasets: list[Dataset]):
    """Prompt for the dataset and the local folder, storing both answers in ``input_params``."""
    current_dataset = input_params.get('dataset')
    input_params['dataset'] = ask_dataset(datasets, current_dataset, msg_action='validate')

    current_directory = input_params.get('data_directory')
    input_params['data_directory'] = ask_directory(current_directory)

    return input_params
@property
def normalized_path(self) -> str:
    """
    Relative path of the file without the leading ``data/`` prefix.

    Only an actual ``data/`` prefix is removed. A path that does not start
    with it is returned unchanged, instead of losing its first five characters.
    """
    return self.relative_path.removeprefix("data/")
def validate_folder(
        self,
        project_id: str,
        dataset_id: str,
        local_folder: PathLike
) -> DatasetValidationResponse:
    """
    Validates that the contents of a dataset match that of a local folder.

    Every file in the dataset is looked up under ``local_folder`` by its
    normalized path and compared by checksum. Local files that have no
    counterpart in the dataset are reported as well (hidden files are ignored).

    Args:
        project_id (str): ID of the project
        dataset_id (str): ID of the dataset
        local_folder (PathLike): Local folder to compare against

    Returns:
        DatasetValidationResponse: Paths grouped into matching, not matching,
            missing locally, local-only, and files that could not be validated.

    Raises:
        ValueError: If ``local_folder`` is not an existing directory
    """
    ds_files = self.get_assets_listing(project_id, dataset_id).files

    local_folder = Path(local_folder)
    if not local_folder.is_dir():
        raise ValueError(f"{local_folder} is not a valid local folder")

    # Keep track of files from the dataset which match by checksum, don't match, or are missing
    ds_files_matching = []
    ds_files_not_matching = []
    ds_files_missing = []
    ds_validate_failed = []
    # Collected as a set so the local-only check below is a constant-time lookup
    dataset_file_paths = set()

    for ds_file in ds_files:
        ds_file_path = ds_file.normalized_path
        dataset_file_paths.add(ds_file_path)

        # Get the corresponding local file; a directory with the same name
        # cannot be checksummed as a file, so it counts as missing.
        local_file = local_folder / ds_file_path
        if not local_file.is_file():
            ds_files_missing.append(ds_file_path)
            continue

        try:
            is_match = self._file_service.is_valid_file(ds_file, local_file)
        except RuntimeWarning as e:
            logger.warning(f"File validation failed: {e}")
            ds_validate_failed.append(ds_file_path)
            continue

        if is_match:
            ds_files_matching.append(ds_file_path)
        else:
            ds_files_not_matching.append(ds_file_path)

    # Find local files that are not in the dataset
    local_file_paths = (
        file.relative_to(local_folder).as_posix()
        for file in local_folder.rglob("*")
        if not file.is_dir() and not is_hidden_file(file)
    )
    local_only_files = [
        path
        for path in local_file_paths
        if path not in dataset_file_paths
    ]

    return DatasetValidationResponse(
        files_matching=ds_files_matching,
        files_not_matching=ds_files_not_matching,
        files_missing=ds_files_missing,
        local_only_files=local_only_files,
        validate_errors=ds_validate_failed,
    )
def is_valid_file(self, file: File, local_file: PathLike) -> bool:
    """
    Validates the checksum of a file against a local file
    See ``validate_file`` method for details.

    Unlike ``validate_file``, a checksum mismatch is reported through the
    return value instead of a ``ValueError``.

    Args:
        file (File): Cirro file to validate
        local_file (PathLike): Local file path to compare against

    Returns:
        bool: True if file integrity matches, False otherwise

    Raises:
        RuntimeWarning: If the remote checksum is not available or not supported
    """
    try:
        self.validate_file(file, local_file)
    except ValueError:
        # validate_file signals a mismatch with ValueError; anything else propagates
        return False
    return True