|
137 | 137 | ) |
138 | 138 | from pyiceberg.utils.concurrent import ExecutorFactory |
139 | 139 | from pyiceberg.utils.datetime import datetime_to_millis |
| 140 | +from pyiceberg.utils.singleton import _convert_to_hashable_type |
140 | 141 |
|
141 | 142 | if TYPE_CHECKING: |
142 | 143 | import daft |
@@ -3422,6 +3423,94 @@ def _readable_metrics_struct(bound_type: PrimitiveType) -> pa.StructType: |
3422 | 3423 | schema=entries_schema, |
3423 | 3424 | ) |
3424 | 3425 |
|
def partitions(self, snapshot_id: Optional[int] = None) -> "pa.Table":
    """Aggregate per-partition file statistics for a snapshot into a PyArrow table.

    Every manifest entry reachable from the selected snapshot is folded into
    one row per distinct partition value. Data files, position-delete files
    and equality-delete files are counted in separate columns.

    Args:
        snapshot_id: Identifier handed to ``self._get_snapshot`` to choose the
            snapshot to inspect; how ``None`` is resolved is decided by that
            helper.

    Returns:
        A ``pa.Table`` with the columns ``record_count``, ``file_count``,
        ``total_data_file_size_in_bytes``, ``position_delete_record_count``,
        ``position_delete_file_count``, ``equality_delete_record_count``,
        ``equality_delete_file_count``, ``last_updated_at`` and
        ``last_updated_snapshot_id``. When the table has at least one
        partition field, the columns ``partition`` and ``spec_id`` come first.

    Raises:
        ValueError: If a manifest entry references a file whose content type is
            not DATA, POSITION_DELETES or EQUALITY_DELETES.
    """
    import pyarrow as pa

    from pyiceberg.io.pyarrow import schema_to_pyarrow

    table_schema = pa.schema([
        pa.field('record_count', pa.int64(), nullable=False),
        pa.field('file_count', pa.int32(), nullable=False),
        pa.field('total_data_file_size_in_bytes', pa.int64(), nullable=False),
        pa.field('position_delete_record_count', pa.int64(), nullable=False),
        pa.field('position_delete_file_count', pa.int32(), nullable=False),
        pa.field('equality_delete_record_count', pa.int64(), nullable=False),
        pa.field('equality_delete_file_count', pa.int32(), nullable=False),
        pa.field('last_updated_at', pa.timestamp(unit='ms'), nullable=True),
        pa.field('last_updated_snapshot_id', pa.int64(), nullable=True),
    ])

    partition_record = self.tbl.metadata.specs_struct()

    # Only partitioned tables get the leading `partition` / `spec_id` columns.
    if partition_record.fields:
        pa_record_struct = schema_to_pyarrow(partition_record)
        partitions_schema = pa.schema([
            pa.field('partition', pa_record_struct, nullable=False),
            pa.field('spec_id', pa.int32(), nullable=False),
        ])

        table_schema = pa.unify_schemas([partitions_schema, table_schema])

    def update_partitions_map(
        partitions_map: Dict[Tuple[str, Any], Any],
        file: DataFile,
        partition_record_dict: Dict[str, Any],
        snapshot: Optional[Snapshot],
    ) -> None:
        """Fold one file into the row of the partition it belongs to.

        The row is created on first sight of the partition value and is
        mutated in place afterwards.
        """
        # Dicts are unhashable, so the partition values are converted to a
        # hashable key before being used to index the accumulator.
        partition_record_key = _convert_to_hashable_type(partition_record_dict)
        if partition_record_key not in partitions_map:
            partitions_map[partition_record_key] = {
                "partition": partition_record_dict,
                "spec_id": file.spec_id,
                "record_count": 0,
                "file_count": 0,
                "total_data_file_size_in_bytes": 0,
                "position_delete_record_count": 0,
                "position_delete_file_count": 0,
                "equality_delete_record_count": 0,
                "equality_delete_file_count": 0,
                "last_updated_at": snapshot.timestamp_ms if snapshot else None,
                "last_updated_snapshot_id": snapshot.snapshot_id if snapshot else None,
            }

        partition_row = partitions_map[partition_record_key]

        if snapshot is not None:
            # Keep the most recent snapshot that touched this partition. The
            # comparison must be timestamp against timestamp: a snapshot id is
            # not ordered in time and is not comparable to `timestamp_ms`.
            if partition_row["last_updated_at"] is None or partition_row["last_updated_at"] < snapshot.timestamp_ms:
                partition_row["last_updated_at"] = snapshot.timestamp_ms
                partition_row["last_updated_snapshot_id"] = snapshot.snapshot_id

        if file.content == DataFileContent.DATA:
            partition_row["record_count"] += file.record_count
            partition_row["file_count"] += 1
            partition_row["total_data_file_size_in_bytes"] += file.file_size_in_bytes
        elif file.content == DataFileContent.POSITION_DELETES:
            partition_row["position_delete_record_count"] += file.record_count
            partition_row["position_delete_file_count"] += 1
        elif file.content == DataFileContent.EQUALITY_DELETES:
            partition_row["equality_delete_record_count"] += file.record_count
            partition_row["equality_delete_file_count"] += 1
        else:
            raise ValueError(f"Unknown DataFileContent ({file.content})")

    partitions_map: Dict[Tuple[str, Any], Any] = {}
    snapshot = self._get_snapshot(snapshot_id)
    # Loop-invariant: fetch the spec lookup once instead of once per entry.
    specs = self.tbl.metadata.specs()
    for manifest in snapshot.manifests(self.tbl.io):
        for entry in manifest.fetch_manifest_entry(io=self.tbl.io):
            partition = entry.data_file.partition
            # Partition values are positional; name them using the fields of
            # the spec the manifest was written with.
            partition_record_dict = {
                field.name: partition[pos] for pos, field in enumerate(specs[manifest.partition_spec_id].fields)
            }
            entry_snapshot = self.tbl.snapshot_by_id(entry.snapshot_id) if entry.snapshot_id is not None else None
            update_partitions_map(partitions_map, entry.data_file, partition_record_dict, entry_snapshot)

    # Row keys missing from `table_schema` (`partition` / `spec_id` on an
    # unpartitioned table) are not emitted, since the schema drives the columns.
    return pa.Table.from_pylist(
        list(partitions_map.values()),
        schema=table_schema,
    )
| 3513 | + |
3425 | 3514 |
|
3426 | 3515 | @dataclass(frozen=True) |
3427 | 3516 | class TablePartition: |
|
0 commit comments