|
26 | 26 | from __future__ import annotations |
27 | 27 |
|
28 | 28 | import concurrent.futures |
| 29 | +import itertools |
29 | 30 | import logging |
30 | 31 | import os |
31 | 32 | import re |
|
34 | 35 | from dataclasses import dataclass |
35 | 36 | from enum import Enum |
36 | 37 | from functools import lru_cache, singledispatch |
37 | | -from itertools import chain |
38 | 38 | from typing import ( |
39 | 39 |     TYPE_CHECKING,
40 | 40 |     Any,
@@ -637,7 +637,7 @@ def _combine_positional_deletes(positional_deletes: List[pa.ChunkedArray], rows: |
637 | 637 |     if len(positional_deletes) == 1:
638 | 638 |         all_chunks = positional_deletes[0]
639 | 639 |     else:
640 | | -        all_chunks = pa.chunked_array(chain(*[arr.chunks for arr in positional_deletes]))
| 640 | +        all_chunks = pa.chunked_array(itertools.chain(*[arr.chunks for arr in positional_deletes]))
641 | 641 |     return np.setdiff1d(np.arange(rows), all_chunks, assume_unique=False)
642 | 642 |
|
643 | 643 |
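For context on the helper touched above: `np.setdiff1d` subtracts the concatenated delete positions from `np.arange(rows)`, yielding the indices of rows that survive the positional deletes. A minimal illustration with made-up values, not part of this diff:

import numpy as np
import pyarrow as pa

# rows=5 with positions 1 and 3 deleted leaves indices [0, 2, 4];
# np.setdiff1d coerces the ChunkedArray via its __array__ protocol
deletes = pa.chunked_array([pa.array([1, 3])])
surviving = np.setdiff1d(np.arange(5), deletes, assume_unique=False)
print(surviving)  # -> [0 2 4]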
|
@@ -912,6 +912,21 @@ def after_map_value(self, element: pa.Field) -> None: |
912 | 912 |         self._field_names.pop()
913 | 913 |
|
914 | 914 |
|
| 915 | +class _ConvertToIcebergWithoutIDs(_ConvertToIceberg): |
| 916 | +    """
| 917 | +    Converts a PyArrow schema to an Iceberg Schema with all field IDs set to -1.
| 918 | +
| 919 | +    The schema generated through this visitor should always be
| 920 | +    used in conjunction with the `new_table_metadata` function,
| 921 | +    which assigns new field IDs in order. This visitor is
| 922 | +    currently used only when creating a new Iceberg table from
| 923 | +    a PyArrow schema.
| 924 | +    """
| 925 | +
| 926 | +    def _field_id(self, field: pa.Field) -> int:
| 927 | +        return -1
| 928 | + |
| 929 | + |
915 | 930 | def _task_to_table( |
916 | 931 |     fs: FileSystem,
917 | 932 |     task: FileScanTask,
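A minimal sketch of how the new visitor is intended to be used, assuming pyiceberg's `visit_pyarrow` dispatcher and the current `new_table_metadata` signature; `pa_schema`, `spec`, `order`, and the location string are placeholders:

from pyiceberg.io.pyarrow import visit_pyarrow, _ConvertToIcebergWithoutIDs
from pyiceberg.table.metadata import new_table_metadata

# Every field ID is -1 after this visit; new_table_metadata then
# reassigns fresh, sequential field IDs while building the metadata.
iceberg_schema = visit_pyarrow(pa_schema, _ConvertToIcebergWithoutIDs())
metadata = new_table_metadata(
    schema=iceberg_schema,
    partition_spec=spec,          # placeholder PartitionSpec
    sort_order=order,             # placeholder SortOrder
    location="s3://bucket/path",  # placeholder location
)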
@@ -999,7 +1014,7 @@ def _task_to_table( |
999 | 1014 |
|
1000 | 1015 | def _read_all_delete_files(fs: FileSystem, tasks: Iterable[FileScanTask]) -> Dict[str, List[ChunkedArray]]: |
1001 | 1016 |     deletes_per_file: Dict[str, List[ChunkedArray]] = {}
1002 | | -    unique_deletes = set(chain.from_iterable([task.delete_files for task in tasks]))
| 1017 | +    unique_deletes = set(itertools.chain.from_iterable([task.delete_files for task in tasks]))
1003 | 1018 |     if len(unique_deletes) > 0:
1004 | 1019 |         executor = ExecutorFactory.get_or_create()
1005 | 1020 |         deletes_per_files: Iterator[Dict[str, ChunkedArray]] = executor.map(
@@ -1421,7 +1436,7 @@ def schema(self, schema: Schema, struct_result: Callable[[], List[StatisticsColl |
1421 | 1436 |     def struct(
1422 | 1437 |         self, struct: StructType, field_results: List[Callable[[], List[StatisticsCollector]]]
1423 | 1438 |     ) -> List[StatisticsCollector]:
1424 | | -        return list(chain(*[result() for result in field_results]))
| 1439 | +        return list(itertools.chain(*[result() for result in field_results]))
1425 | 1440 |
|
1426 | 1441 |     def field(self, field: NestedField, field_result: Callable[[], List[StatisticsCollector]]) -> List[StatisticsCollector]:
1427 | 1442 |         self._field_id = field.field_id
@@ -1513,7 +1528,7 @@ def schema(self, schema: Schema, struct_result: Callable[[], List[ID2ParquetPath |
1513 | 1528 |         return struct_result()
1514 | 1529 |
|
1515 | 1530 |     def struct(self, struct: StructType, field_results: List[Callable[[], List[ID2ParquetPath]]]) -> List[ID2ParquetPath]:
1516 | | -        return list(chain(*[result() for result in field_results]))
| 1531 | +        return list(itertools.chain(*[result() for result in field_results]))
1517 | 1532 |
|
1518 | 1533 |     def field(self, field: NestedField, field_result: Callable[[], List[ID2ParquetPath]]) -> List[ID2ParquetPath]:
1519 | 1534 |         self._field_id = field.field_id
|