Commit 3a189f7

Refactor file upload methods to accept Path-like objects (#116)

* refactor file upload services to use Path-like objects
* fix
* lint
* remove reference to PathBase
* fix code coverage, change KiB to KB

1 parent 60f65c4 commit 3a189f7
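
For context, a minimal sketch of the caller-facing change. The `cirro` client object, `project_id`, and `dataset_id` are placeholders, assumed to be set up as in cirro/cli/controller.py; the paths are hypothetical.

    from pathlib import Path

    # Before this commit the parameter was named `local_directory` and took a str:
    #   cirro.datasets.upload_files(project_id=project_id,
    #                               dataset_id=dataset_id,
    #                               local_directory="/data/dataset1",
    #                               files=["sample1.fastq.gz"])

    # After this commit the parameter is `directory` and accepts str or pathlib.Path,
    # as long as every entry in `files` uses the same type as `directory`.
    directory = Path("/data/dataset1")        # hypothetical local path
    files = [directory / "sample1.fastq.gz"]  # hypothetical file

    cirro.datasets.upload_files(project_id=project_id,  # `cirro`, `project_id`, and
                                dataset_id=dataset_id,  # `dataset_id` are placeholders
                                directory=directory,
                                files=files)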

File tree

10 files changed (+158, -39 lines)

.github/workflows/lint.yml

Lines changed: 0 additions & 1 deletion
@@ -43,7 +43,6 @@ jobs:
           poetry run flake8 cirro

       - name: Run Tests
-        working-directory: tests
         run: |
           poetry run pytest --cov --cov-report=xml --cov-branch -o junit_family=xunit2 --junitxml=junit/test-results.xml

cirro/cli/controller.py

Lines changed: 1 addition & 1 deletion
@@ -98,7 +98,7 @@ def run_ingest(input_params: UploadArguments, interactive=False):
     logger.info("Uploading files")
     cirro.datasets.upload_files(project_id=project_id,
                                 dataset_id=create_resp.id,
-                                local_directory=directory,
+                                directory=directory,
                                 files=files)

     if cirro.configuration.enable_additional_checksum:

cirro/cli/interactive/upload_args.py

Lines changed: 1 addition & 1 deletion
@@ -71,7 +71,7 @@ def confirm_data_files(data_directory: str, files: List[str]):

     if not ask(
         "confirm",
-        f'Please confirm that you wish to upload {stats["numberOfFiles"]} files ({stats["sizeFriendly"]})'
+        f'Please confirm that you wish to upload {stats.number_of_files} files ({stats.size_friendly})'
     ):
         sys.exit(1)

cirro/file_utils.py

Lines changed: 47 additions & 15 deletions
@@ -8,7 +8,7 @@
 from botocore.exceptions import ConnectionError

 from cirro.clients import S3Client
-from cirro.models.file import DirectoryStatistics, File
+from cirro.models.file import DirectoryStatistics, File, PathLike

 if os.name == 'nt':
     import win32api
@@ -82,26 +82,58 @@ def get_files_in_directory(
     return paths


-def get_files_stats(files: List[Path]) -> DirectoryStatistics:
+def _bytes_to_human_readable(num_bytes: int) -> str:
+    for unit in ['B', 'KB', 'MB', 'GB', 'TB', 'PB']:
+        if num_bytes < 1000.0 or unit == 'PB':
+            break
+        num_bytes /= 1000.0
+    return f"{num_bytes:,.2f} {unit}"
+
+
+def get_files_stats(files: List[PathLike]) -> DirectoryStatistics:
     """
-    @private
+    Returns information about the list of files provided, such as the total size and number of files.
     """
     sizes = [f.stat().st_size for f in files]
-    total_size = sum(sizes) / float(1 << 30)
-    return {
-        'sizeFriendly': f'{total_size:,.3f} GB',
-        'size': total_size,
-        'numberOfFiles': len(sizes)
-    }
-
-
-def upload_directory(directory: str, files: List[str], s3_client: S3Client, bucket: str, prefix: str, max_retries=10):
+    total_size = sum(sizes)
+    return DirectoryStatistics(
+        size_friendly=_bytes_to_human_readable(total_size),
+        size=total_size,
+        number_of_files=len(sizes)
+    )
+
+
+def upload_directory(directory: PathLike,
+                     files: List[PathLike],
+                     s3_client: S3Client,
+                     bucket: str,
+                     prefix: str,
+                     max_retries=10):
     """
     @private
+
+    Uploads a list of files from the specified directory
+    Args:
+        directory (str|Path): Path to directory
+        files (typing.List[str|Path]): List of paths to files within the directory
+            must be the same type as directory.
+        s3_client (cirro.clients.S3Client): S3 client
+        bucket (str): S3 bucket
+        prefix (str): S3 prefix
+        max_retries (int): Number of retries
     """
+    # Ensure all files are of the same type as the directory
+    if not all(isinstance(file, type(directory)) for file in files):
+        raise ValueError("All files must be of the same type as the directory (str or Path)")
+
     for file in files:
-        key = f'{prefix}/{file}'
-        local_path = Path(directory, file)
+        if isinstance(file, str):
+            file_path = Path(directory, file)
+        else:
+            file_path = file
+
+        file_relative = file_path.relative_to(directory).as_posix()
+        key = f'{prefix}/{file_relative}'
         success = False

         # Retry up to max_retries times
@@ -110,7 +142,7 @@ def upload_directory(directory: str, files: List[str], s3_client: S3Client, bucket: str, prefix: str, max_retries=10):
             # Try the upload
             try:
                 s3_client.upload_file(
-                    file_path=local_path,
+                    file_path=file_path,
                     bucket=bucket,
                     key=key
                 )
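
To illustrate the two helpers above, a small standalone sketch (hypothetical paths and prefix) showing the decimal-unit size formatting and how the S3 key is derived from the file path relative to the upload directory:

    from pathlib import Path


    def bytes_to_human_readable(num_bytes: float) -> str:
        # Mirrors _bytes_to_human_readable above: divide by 1000 (KB, not KiB)
        # until the value drops below 1000 or the largest unit is reached.
        for unit in ['B', 'KB', 'MB', 'GB', 'TB', 'PB']:
            if num_bytes < 1000.0 or unit == 'PB':
                break
            num_bytes /= 1000.0
        return f"{num_bytes:,.2f} {unit}"


    # Key derivation as in upload_directory(): the S3 key is the file's path
    # relative to the upload directory, converted to POSIX form.
    directory = Path('/data/dataset1')                    # hypothetical
    file_path = directory / 'folder1' / 'reads.fastq.gz'  # hypothetical
    key = f"datasets/1a1a/data/{file_path.relative_to(directory).as_posix()}"

    print(bytes_to_human_readable(1_234_567))  # -> 1.23 MB
    print(key)                                 # -> datasets/1a1a/data/folder1/reads.fastq.gz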

cirro/models/file.py

Lines changed: 10 additions & 5 deletions
@@ -1,16 +1,21 @@
 from dataclasses import dataclass
-from pathlib import PurePath
-from typing import TypedDict, Dict, Optional
+from pathlib import PurePath, Path
+from typing import Dict, Optional, TypeVar, NamedTuple

 from cirro_api_client.v1.models import FileAccessRequest, AccessType, FileEntry

 from cirro.models.s3_path import S3Path

+PathLike = TypeVar('PathLike', str, Path)

-class DirectoryStatistics(TypedDict):
+
+class DirectoryStatistics(NamedTuple):
     size: float
-    sizeFriendly: str
-    numberOfFiles: int
+    " Size in bytes"
+    size_friendly: str
+    " Size in user friendly format (e.g. 1.2 KB)"
+    number_of_files: int
+    " Number of files"


 class FileAccessContext:
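
A short standalone sketch of how these definitions are used: the attribute names replace the old TypedDict keys (numberOfFiles, sizeFriendly), and the constrained TypeVar signals to type checkers that a directory and its files should share one concrete type per call. The `describe` function and the values below are illustrative only.

    from pathlib import Path
    from typing import List, NamedTuple, TypeVar

    # Standalone copies of the definitions above, for illustration only.
    PathLike = TypeVar('PathLike', str, Path)


    class DirectoryStatistics(NamedTuple):
        size: float
        size_friendly: str
        number_of_files: int


    def describe(directory: PathLike, files: List[PathLike]) -> str:
        # Using the same constrained TypeVar for both parameters means a type
        # checker expects `directory` and `files` to agree on str vs Path.
        return f'{len(files)} files under {directory}'


    stats = DirectoryStatistics(size=1234.0, size_friendly='1.23 KB', number_of_files=2)

    # Old access: stats["numberOfFiles"], stats["sizeFriendly"]
    # New access, as used in cirro/cli/interactive/upload_args.py:
    print(f'{stats.number_of_files} files ({stats.size_friendly})')
    print(describe(Path('/data/dataset1'), [Path('/data/dataset1/a.txt')]))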

cirro/sdk/project.py

Lines changed: 1 addition & 1 deletion
@@ -195,7 +195,7 @@ def upload_dataset(
         self._client.datasets.upload_files(
             project_id=self.id,
             dataset_id=create_response.id,
-            local_directory=upload_folder,
+            directory=upload_folder,
             files=files
         )

cirro/services/dataset.py

Lines changed: 13 additions & 9 deletions
@@ -5,7 +5,7 @@
 from cirro_api_client.v1.models import ImportDataRequest, UploadDatasetRequest, UpdateDatasetRequest, Dataset, \
     DatasetDetail, CreateResponse, UploadDatasetCreateResponse

-from cirro.models.file import FileAccessContext, File
+from cirro.models.file import FileAccessContext, File, PathLike
 from cirro.services.base import get_all_records
 from cirro.services.file import FileEnabledService

@@ -166,16 +166,20 @@ def get_file_listing(self, project_id: str, dataset_id: str, file_limit: int = 1
         ]
         return files

-    def upload_files(self, project_id: str, dataset_id: str, local_directory: str, files: List[str]) -> None:
+    def upload_files(self,
+                     project_id: str,
+                     dataset_id: str,
+                     directory: PathLike,
+                     files: List[PathLike]) -> None:
         """
-        Uploads files to a given dataset from the specified local directory
+        Uploads files to a given dataset from the specified directory.

         Args:
             project_id (str): ID of the Project
             dataset_id (str): ID of the Dataset
-            local_directory (str): Path to local directory
-            files (typing.List[str]): List of relative paths to files within the local directory
-
+            directory (str|Path): Path to directory
+            files (typing.List[str|Path]): List of paths to files within the directory,
+                must be the same type as directory.
         """
         dataset = self.get(project_id, dataset_id)

@@ -186,9 +190,9 @@ def upload_files(self, project_id: str, dataset_id: str, local_directory: str, files: List[str]) -> None:
             )

         self._file_service.upload_files(
-            access_context,
-            local_directory,
-            files
+            access_context=access_context,
+            directory=directory,
+            files=files
         )

     def download_files(

cirro/services/file.py

Lines changed: 8 additions & 4 deletions
@@ -9,7 +9,7 @@

 from cirro.clients.s3 import S3Client
 from cirro.file_utils import upload_directory, download_directory
-from cirro.models.file import FileAccessContext, File
+from cirro.models.file import FileAccessContext, File, PathLike
 from cirro.services.base import BaseService


@@ -126,14 +126,18 @@ def create_file(self, access_context: FileAccessContext, key: str,
             bucket=access_context.bucket
         )

-    def upload_files(self, access_context: FileAccessContext, directory: str, files: List[str]) -> None:
+    def upload_files(self,
+                     access_context: FileAccessContext,
+                     directory: PathLike,
+                     files: List[PathLike]) -> None:
         """
         Uploads a list of files from the specified directory

         Args:
             access_context (cirro.models.file.FileAccessContext): File access context, use class methods to generate
-            directory (str): base path to upload from
-            files (List[str]): relative path of files to upload
+            directory (str|Path): Path to directory
+            files (typing.List[str|Path]): List of paths to files within the directory
+                must be the same type as directory.
         """

         s3_client = S3Client(
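
The "must be the same type as directory" note in the docstrings above is enforced one layer down, in cirro.file_utils.upload_directory. A minimal standalone sketch of that guard; `check_same_type` is a hypothetical name used here for illustration:

    from pathlib import Path
    from typing import List, Union


    def check_same_type(directory: Union[str, Path], files: List[Union[str, Path]]) -> None:
        # Same check as the guard added in upload_directory(): mixing a Path
        # directory with str file entries (or vice versa) is rejected up front.
        if not all(isinstance(file, type(directory)) for file in files):
            raise ValueError("All files must be of the same type as the directory (str or Path)")


    check_same_type('data', ['file1.txt', 'folder1/file2.txt'])         # ok
    check_same_type(Path('/data/ds1'), [Path('/data/ds1/file1.txt')])   # ok
    try:
        check_same_type(Path('/data/ds1'), ['/data/ds1/file1.txt'])     # mixed types
    except ValueError as err:
        print(err)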

sonar-project.properties

Lines changed: 2 additions & 2 deletions
@@ -1,6 +1,6 @@
 sonar.projectKey=CirroBio_Cirro-client
 sonar.organization=cirrobio
 sonar.python.version=3.9
-sonar.python.xunit.reportPath=tests/junit/test-results.xml
-sonar.python.coverage.reportPaths=tests/coverage.xml
+sonar.python.xunit.reportPath=junit/test-results.xml
+sonar.python.coverage.reportPaths=coverage.xml
 sonar.exclusions=**/tests/*.py

tests/test_file_utils.py

Lines changed: 75 additions & 0 deletions
@@ -0,0 +1,75 @@
+import unittest
+from pathlib import Path
+from unittest.mock import Mock, call
+
+from cirro.file_utils import upload_directory, get_files_in_directory, get_files_stats
+
+
+class TestFileUtils(unittest.TestCase):
+    def setUp(self):
+        self.mock_s3_client = Mock()
+        self.mock_s3_client.upload_file = Mock()
+        self.test_bucket = 'project-1a1a'
+        self.test_prefix = 'datasets/1a1a/data'
+
+    def test_get_file_stats(self):
+        directory = Path(__file__).parent / 'data'
+        files = get_files_in_directory(directory)
+        files = [Path(directory, f) for f in files]
+        stats = get_files_stats(files=files)
+        self.assertGreater(stats.size, 0)
+        self.assertGreater(stats.number_of_files, 0)
+        self.assertIn('KB', stats.size_friendly)
+
+    def test_upload_directory_pathlike(self):
+        test_path = Path('/Users/test/Documents/dataset1')
+        test_files = [
+            Path('/Users/test/Documents/dataset1/test_file.fastq'),
+            Path('/Users/test/Documents/dataset1/folder1/test_file.fastq'),
+        ]
+        upload_directory(directory=test_path,
+                         files=test_files,
+                         s3_client=self.mock_s3_client,
+                         bucket=self.test_bucket,
+                         prefix=self.test_prefix)
+
+        # The function should upload files relative to the directory path.
+        self.mock_s3_client.upload_file.assert_has_calls([
+            call(file_path=test_files[0], bucket=self.test_bucket, key=f'{self.test_prefix}/test_file.fastq'),
+            call(file_path=test_files[1], bucket=self.test_bucket, key=f'{self.test_prefix}/folder1/test_file.fastq')
+        ], any_order=True)
+
+    def test_upload_directory_string(self):
+        test_path = 'data'
+        test_files = [
+            'file1.txt',
+            'folder1/file2.txt'
+        ]
+        upload_directory(directory=test_path,
+                         files=test_files,
+                         s3_client=self.mock_s3_client,
+                         bucket=self.test_bucket,
+                         prefix=self.test_prefix)
+
+        # The function should upload files relative to the directory path,
+        # but also format file_path into the Path object.
+        self.mock_s3_client.upload_file.assert_has_calls([
+            call(file_path=Path(test_path, test_files[0]),
+                 bucket=self.test_bucket,
+                 key=f'{self.test_prefix}/file1.txt'),
+            call(file_path=Path(test_path, test_files[1]),
+                 bucket=self.test_bucket,
+                 key=f'{self.test_prefix}/folder1/file2.txt')
+        ], any_order=True)
+
+    def test_upload_directory_different_types(self):
+        test_path = Path('s3://bucket/dataset1')
+        test_files = [
+            's3://bucket/dataset1/file2.txt'
+        ]
+        with self.assertRaises(ValueError):
+            upload_directory(directory=test_path,
+                             files=test_files,
+                             s3_client=self.mock_s3_client,
+                             bucket=self.test_bucket,
+                             prefix=self.test_prefix)
