Skip to content

Commit d507fcd

Browse files
committed
Thanks Honah!
1 parent 405d36c commit d507fcd

File tree

2 files changed

+17
-14
lines changed

2 files changed

+17
-14
lines changed

pyiceberg/io/pyarrow.py

Lines changed: 15 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@
3434
from dataclasses import dataclass
3535
from enum import Enum
3636
from functools import lru_cache, singledispatch
37-
from itertools import chain, count
37+
from itertools import chain
3838
from typing import (
3939
TYPE_CHECKING,
4040
Any,
@@ -111,6 +111,7 @@
111111
Schema,
112112
SchemaVisitorPerPrimitiveType,
113113
SchemaWithPartnerVisitor,
114+
assign_fresh_schema_ids,
114115
pre_order_visit,
115116
promote,
116117
prune_columns,
@@ -617,7 +618,12 @@ def _combine_positional_deletes(positional_deletes: List[pa.ChunkedArray], rows:
617618

618619
def pyarrow_to_schema(schema: pa.Schema) -> Schema:
619620
visitor = _ConvertToIceberg()
620-
return visit_pyarrow(schema, visitor)
621+
schema = visit_pyarrow(schema, visitor)
622+
623+
if visitor.missing_id_metadata:
624+
return assign_fresh_schema_ids(schema)
625+
else:
626+
return schema
621627

622628

623629
@singledispatch
@@ -715,12 +721,10 @@ def primitive(self, primitive: pa.DataType) -> Optional[T]:
715721

716722

717723
class _ConvertToIceberg(PyArrowSchemaVisitor[Union[IcebergType, Schema]]):
718-
counter: count[int]
719-
missing_is_metadata: Optional[bool]
724+
missing_id_metadata: Optional[bool]
720725

721726
def __init__(self) -> None:
722-
self.counter = count()
723-
self.missing_is_metadata = None
727+
self.missing_id_metadata = None
724728

725729
def _get_field_id(self, field: pa.Field) -> int:
726730
field_id: Optional[int] = None
@@ -730,18 +734,17 @@ def _get_field_id(self, field: pa.Field) -> int:
730734
field_id = int(field_id_str.decode())
731735

732736
if field_id is None:
733-
if self.missing_is_metadata is None:
734-
warnings.warn("Field-ids are missing, generating new IDs")
735-
736-
field_id = next(self.counter)
737+
if self.missing_id_metadata is None:
738+
warnings.warn("Field-ids are missing, new IDs will be set")
739+
field_id = 0
737740
missing_is_metadata = True
738741
else:
739742
missing_is_metadata = False
740743

741-
if self.missing_is_metadata is not None and self.missing_is_metadata != missing_is_metadata:
744+
if self.missing_id_metadata is not None and self.missing_id_metadata != missing_is_metadata:
742745
raise ValueError("Parquet file contains partial field-ids")
743746
else:
744-
self.missing_is_metadata = missing_is_metadata
747+
self.missing_id_metadata = missing_is_metadata
745748

746749
return field_id
747750

tests/io/test_pyarrow_visitor.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -278,8 +278,8 @@ def test_schema_to_pyarrow_schema_missing_ids(warn: Mock) -> None:
278278
actual = pyarrow_to_schema(schema)
279279

280280
expected = Schema(
281-
NestedField(field_id=0, name="some_int", field_type=IntegerType(), required=False),
282-
NestedField(field_id=1, name="some_string", field_type=StringType(), required=True),
281+
NestedField(field_id=1, name="some_int", field_type=IntegerType(), required=False),
282+
NestedField(field_id=2, name="some_string", field_type=StringType(), required=True),
283283
)
284284

285285
assert actual == expected

0 commit comments

Comments
 (0)