Skip to content

Commit 9bbb0f1

Browse files
authored
CI-614 - Allow override of destination file names when uploading via SDK (#151)
* expose aws s3 client * add optional file_path_map to dataset upload * bump version * test when file is omitted from map * finish comment * add example * lint * add helper function to flatten files * lint
1 parent 58d91cd commit 9bbb0f1

File tree

6 files changed

+144
-32
lines changed

6 files changed

+144
-32
lines changed

cirro/clients/s3.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,9 @@ def __init__(self, creds_getter: Callable[[], AWSCredentials], enable_additional
3838
self._upload_args = dict(ChecksumAlgorithm='SHA256') if enable_additional_checksum else dict()
3939
self._download_args = dict(ChecksumMode='ENABLED') if enable_additional_checksum else dict()
4040

41+
def get_aws_client(self):
    """
    Returns the underlying AWS client object held by this wrapper.

    Intended for callers that need operations this wrapper does not expose.
    """
    return self._client
43+
4144
def upload_file(self, file_path: Path, bucket: str, key: str):
4245
file_size = file_path.stat().st_size
4346
file_name = file_path.name

cirro/file_utils.py

Lines changed: 27 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
import random
33
import time
44
from pathlib import Path, PurePath
5-
from typing import List, Union
5+
from typing import List, Union, Dict
66

77
from boto3.exceptions import S3UploadFailedError
88
from botocore.exceptions import ConnectionError
@@ -35,6 +35,23 @@ def matches_glob(file: Union[File, str]):
3535
]
3636

3737

38+
def generate_flattened_file_map(files: List[PathLike]) -> Dict[PathLike, str]:
    """
    Generates a mapping of file paths "flattened" to their base name.

    Example: data1/sample1.fastq.gz -> sample1.fastq.gz

    Args:
        files: List[PathLike]: List of file paths

    Returns:
        Dict[PathLike, str]: Mapping of file paths to their base name

    Raises:
        ValueError: If two different source paths share the same base name.
            Flattening them would give both the same destination, so one
            would silently overwrite the other.
    """
    flattened: Dict[PathLike, str] = {}
    # Remember which source first claimed each base name so collisions can be reported
    source_by_name: Dict[str, Path] = {}

    for file in files:
        source = Path(file)
        base_name = source.name

        first_source = source_by_name.setdefault(base_name, source)
        # The same source listed twice is harmless; only distinct sources collide
        if first_source != source:
            raise ValueError(
                f'Cannot flatten files: {first_source.as_posix()} and {source.as_posix()} '
                f'would both be written to {base_name}'
            )

        flattened[file] = base_name

    return flattened
53+
54+
3855
def _is_hidden_file(file_path: Path):
3956
# Remove hidden files from listing, desktop.ini .DS_Store, etc.
4057
if os.name == 'nt':
@@ -105,6 +122,7 @@ def get_files_stats(files: List[PathLike]) -> DirectoryStatistics:
105122

106123
def upload_directory(directory: PathLike,
107124
files: List[PathLike],
125+
file_path_map: Dict[PathLike, str],
108126
s3_client: S3Client,
109127
bucket: str,
110128
prefix: str,
@@ -117,6 +135,7 @@ def upload_directory(directory: PathLike,
117135
directory (str|Path): Path to directory
118136
files (typing.List[str|Path]): List of paths to files within the directory
119137
must be the same type as directory.
138+
file_path_map (typing.Dict[str|Path, str]): Map of file paths from source to destination
120139
s3_client (cirro.clients.S3Client): S3 client
121140
bucket (str): S3 bucket
122141
prefix (str): S3 prefix
@@ -132,7 +151,13 @@ def upload_directory(directory: PathLike,
132151
else:
133152
file_path = file
134153

135-
file_relative = file_path.relative_to(directory).as_posix()
154+
# Check if the file is present in the file_path_map
155+
# if it is, use the mapped value as the destination path
156+
if file in file_path_map:
157+
file_relative = file_path_map[file]
158+
else:
159+
file_relative = file_path.relative_to(directory).as_posix()
160+
136161
key = f'{prefix}/{file_relative}'
137162
success = False
138163

cirro/services/dataset.py

Lines changed: 40 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from typing import List, Optional, Union
1+
from typing import List, Optional, Union, Dict
22

33
from cirro_api_client.v1.api.datasets import get_datasets, get_dataset, import_public_dataset, upload_dataset, \
44
update_dataset, delete_dataset, get_dataset_manifest
@@ -222,17 +222,53 @@ def upload_files(self,
222222
project_id: str,
223223
dataset_id: str,
224224
directory: PathLike,
225-
files: List[PathLike]) -> None:
225+
files: List[PathLike] = None,
226+
file_path_map: Dict[PathLike, str] = None) -> None:
226227
"""
227228
Uploads files to a given dataset from the specified directory.
228229
230+
All files must be relative to the specified directory.
231+
If files need to be flattened, or you are sourcing files from multiple directories,
232+
please include `file_path_map` or call this method multiple times.
233+
229234
Args:
230235
project_id (str): ID of the Project
231236
dataset_id (str): ID of the Dataset
232237
directory (str|Path): Path to directory
233238
files (typing.List[str|Path]): List of paths to files within the directory,
234239
must be the same type as directory.
240+
file_path_map (typing.Dict[str|Path, str]): Optional mapping of file paths to upload
241+
from source path to destination path, used to "re-write" paths within the dataset.
242+
```python
243+
from cirro.cirro_client import CirroApi
244+
from cirro.file_utils import generate_flattened_file_map
245+
246+
cirro = CirroApi()
247+
248+
directory = "~/Downloads"
249+
# Re-write file paths
250+
file_map = {
251+
"data1/file1.fastq.gz": "file1.fastq.gz",
252+
"data2/file2.fastq.gz": "file2.fastq.gz",
253+
"file3.fastq.gz": "new_file3.txt"
254+
}
255+
256+
# Or you could automate the flattening
257+
files = ["data1/file1.fastq.gz"]
258+
file_map = generate_flattened_file_map(files)
259+
260+
cirro.datasets.upload_files(
261+
project_id="project-id",
262+
dataset_id="dataset-id",
263+
directory=directory,
264+
files=list(file_map.keys()),
265+
file_path_map=file_map
266+
)
267+
```
235268
"""
269+
if file_path_map is None:
270+
file_path_map = {}
271+
236272
dataset = self.get(project_id, dataset_id)
237273

238274
access_context = FileAccessContext.upload_dataset(
@@ -244,7 +280,8 @@ def upload_files(self,
244280
self._file_service.upload_files(
245281
access_context=access_context,
246282
directory=directory,
247-
files=files
283+
files=files,
284+
file_path_map=file_path_map
248285
)
249286

250287
def download_files(

cirro/services/file.py

Lines changed: 35 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
from functools import partial
44
from typing import List, Dict
55

6+
from botocore.client import BaseClient
67
from cirro_api_client import CirroApiClient
78
from cirro_api_client.v1.api.file import generate_project_file_access_token
89
from cirro_api_client.v1.models import AWSCredentials, ProjectAccessType
@@ -69,6 +70,17 @@ def _get_project_read_credentials(self, access_context: FileAccessContext):
6970

7071
return self._read_token_cache[project_id]
7172

73+
def get_aws_s3_client(self, access_context: FileAccessContext) -> BaseClient:
    """
    Returns the raw AWS S3 client that sits behind the Cirro S3 wrapper.

    The wrapper is created for the supplied access_context, so the returned
    client is seeded with refreshable credentials from that context.

    Use this for advanced operations that are not wrapped by this service,
    such as CopyObject, S3 Select, etc.

    Args:
        access_context (FileAccessContext): Context used to obtain credentials

    Returns:
        The underlying AWS S3 client
    """
    return self._generate_s3_client(access_context).get_aws_client()
83+
7284
def get_file(self, file: File) -> bytes:
7385
"""
7486
Gets the contents of a file
@@ -92,11 +104,7 @@ def get_file_from_path(self, access_context: FileAccessContext, file_path: str)
92104
Returns:
93105
The raw bytes of the file
94106
"""
95-
96-
s3_client = S3Client(
97-
partial(self.get_access_credentials, access_context),
98-
self.enable_additional_checksum
99-
)
107+
s3_client = self._generate_s3_client(access_context)
100108

101109
full_path = f'{access_context.prefix}/{file_path}'.lstrip('/')
102110

@@ -113,11 +121,7 @@ def create_file(self, access_context: FileAccessContext, key: str,
113121
contents (str): Content of object
114122
content_type (str):
115123
"""
116-
117-
s3_client = S3Client(
118-
partial(self.get_access_credentials, access_context),
119-
self.enable_additional_checksum
120-
)
124+
s3_client = self._generate_s3_client(access_context)
121125

122126
s3_client.create_object(
123127
key=key,
@@ -129,7 +133,8 @@ def create_file(self, access_context: FileAccessContext, key: str,
129133
def upload_files(self,
130134
access_context: FileAccessContext,
131135
directory: PathLike,
132-
files: List[PathLike]) -> None:
136+
files: List[PathLike],
137+
file_path_map: Dict[PathLike, str]) -> None:
133138
"""
134139
Uploads a list of files from the specified directory
135140
@@ -138,19 +143,18 @@ def upload_files(self,
138143
directory (str|Path): Path to directory
139144
files (typing.List[str|Path]): List of paths to files within the directory
140145
must be the same type as directory.
146+
file_path_map (typing.Dict[str|Path, str]): Optional mapping of file paths to upload
147+
from source path to destination path, used to "re-write" paths within the dataset.
141148
"""
142-
143-
s3_client = S3Client(
144-
partial(self.get_access_credentials, access_context),
145-
self.enable_additional_checksum
146-
)
149+
s3_client = self._generate_s3_client(access_context)
147150

148151
upload_directory(
149-
directory,
150-
files,
151-
s3_client,
152-
access_context.bucket,
153-
access_context.prefix,
152+
directory=directory,
153+
files=files,
154+
file_path_map=file_path_map,
155+
s3_client=s3_client,
156+
bucket=access_context.bucket,
157+
prefix=access_context.prefix,
154158
max_retries=self.transfer_retries
155159
)
156160

@@ -163,10 +167,7 @@ def download_files(self, access_context: FileAccessContext, directory: str, file
163167
directory (str): download location
164168
files (List[str]): relative path of files to download
165169
"""
166-
s3_client = S3Client(
167-
partial(self.get_access_credentials, access_context),
168-
self.enable_additional_checksum
169-
)
170+
s3_client = self._generate_s3_client(access_context)
170171

171172
download_directory(
172173
directory,
@@ -176,6 +177,15 @@ def download_files(self, access_context: FileAccessContext, directory: str, file
176177
access_context.prefix
177178
)
178179

180+
def _generate_s3_client(self, access_context: FileAccessContext):
    """
    Builds the Cirro S3Client used to perform operations on files.

    Credentials are not fetched here; the client receives a getter bound to
    access_context and calls it when it needs them.
    """
    credentials_getter = partial(self.get_access_credentials, access_context)
    return S3Client(credentials_getter, self.enable_additional_checksum)
188+
179189

180190
class FileEnabledService(BaseService):
181191
"""

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[tool.poetry]
22
name = "cirro"
3-
version = "1.5.0"
3+
version = "1.5.1"
44
description = "CLI tool and SDK for interacting with the Cirro platform"
55
authors = ["Cirro Bio <[email protected]>"]
66
license = "MIT"

tests/test_file_utils.py

Lines changed: 38 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ def test_upload_directory_pathlike(self):
2929
]
3030
upload_directory(directory=test_path,
3131
files=test_files,
32+
file_path_map={},
3233
s3_client=self.mock_s3_client,
3334
bucket=self.test_bucket,
3435
prefix=self.test_prefix)
@@ -47,6 +48,7 @@ def test_upload_directory_string(self):
4748
]
4849
upload_directory(directory=test_path,
4950
files=test_files,
51+
file_path_map={},
5052
s3_client=self.mock_s3_client,
5153
bucket=self.test_bucket,
5254
prefix=self.test_prefix)
@@ -70,6 +72,41 @@ def test_upload_directory_different_types(self):
7072
with self.assertRaises(ValueError):
7173
upload_directory(directory=test_path,
7274
files=test_files,
75+
file_path_map={},
7376
s3_client=self.mock_s3_client,
7477
bucket=self.test_bucket,
75-
prefix=self.test_prefix)
78+
prefix=self.test_prefix)
79+
80+
def test_upload_directory_file_map_included(self):
    """Mapped files use their mapped key; files missing from the map keep their relative path."""
    test_path = 'data'
    file_path_map = {
        'file1.txt': 'mapped_file1.txt',
        'folder1/file2.txt': 'mapped_file2.txt'
        # 'folder1/unmapped.txt' is intentionally left out of the map
    }

    # Source file -> destination expected under the prefix
    expected_destinations = {
        'file1.txt': 'mapped_file1.txt',
        'folder1/file2.txt': 'mapped_file2.txt',
        'folder1/unmapped.txt': 'folder1/unmapped.txt'
    }
    test_files = list(expected_destinations)

    upload_directory(directory=test_path,
                     files=test_files,
                     file_path_map=file_path_map,
                     s3_client=self.mock_s3_client,
                     bucket=self.test_bucket,
                     prefix=self.test_prefix)

    # Every file must have been uploaded once with the expected key, in any order
    expected_calls = [
        call(file_path=Path(test_path, source),
             bucket=self.test_bucket,
             key=f'{self.test_prefix}/{destination}')
        for source, destination in expected_destinations.items()
    ]
    self.mock_s3_client.upload_file.assert_has_calls(expected_calls, any_order=True)

0 commit comments

Comments
 (0)