Skip to content

Commit 60f65c4

Browse files
authored
Support more than 20,000 files on file listing (#114)
* implement pagination on dataset files
* pr comments
1 parent a5b9b14 commit 60f65c4

File tree

3 files changed

+227
-206
lines changed

3 files changed

+227
-206
lines changed

cirro/services/dataset.py

Lines changed: 24 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ class DatasetService(FileEnabledService):
1414
"""
1515
Service for interacting with the Dataset endpoints
1616
"""
17+
1718
def list(self, project_id: str, max_items: int = 10000) -> List[Dataset]:
1819
"""List datasets
1920
@@ -126,27 +127,42 @@ def delete(self, project_id: str, dataset_id: str) -> None:
126127
"""
127128
delete_dataset.sync_detailed(project_id=project_id, dataset_id=dataset_id, client=self._api_client)
128129

129-
def get_file_listing(self, project_id: str, dataset_id: str) -> List[File]:
130+
def get_file_listing(self, project_id: str, dataset_id: str, file_limit: int = 100000) -> List[File]:
130131
"""
131132
Gets a listing of files, charts, and other assets available for the dataset
132133
133134
Args:
134135
project_id (str): ID of the Project
135136
dataset_id (str): ID of the Dataset
137+
file_limit (int): Maximum number of files to get (default 100,000)
136138
"""
137-
manifest = get_dataset_manifest.sync(
138-
project_id=project_id,
139-
dataset_id=dataset_id,
140-
client=self._api_client
141-
)
139+
if file_limit < 1:
140+
raise ValueError("file_limit must be greater than 0")
141+
all_files = []
142+
file_offset = 0
143+
domain = None
144+
145+
while len(all_files) < file_limit:
146+
manifest = get_dataset_manifest.sync(
147+
project_id=project_id,
148+
dataset_id=dataset_id,
149+
file_offset=file_offset,
150+
client=self._api_client
151+
)
152+
all_files.extend(manifest.files)
153+
file_offset += len(manifest.files)
154+
155+
if len(all_files) >= manifest.total_files or len(manifest.files) == 0:
156+
break
157+
domain = manifest.domain
142158

143159
files = [
144160
File.from_file_entry(
145161
f,
146162
project_id=project_id,
147-
domain=manifest.domain
163+
domain=domain
148164
)
149-
for f in manifest.files
165+
for f in all_files
150166
]
151167
return files
152168

0 commit comments

Comments
 (0)