3434from dataclasses import dataclass
3535from enum import Enum
3636from functools import lru_cache , singledispatch
37- from itertools import chain , count
37+ from itertools import chain
3838from typing import (
3939 TYPE_CHECKING ,
4040 Any ,
111111 Schema ,
112112 SchemaVisitorPerPrimitiveType ,
113113 SchemaWithPartnerVisitor ,
114+ assign_fresh_schema_ids ,
114115 pre_order_visit ,
115116 promote ,
116117 prune_columns ,
@@ -617,7 +618,12 @@ def _combine_positional_deletes(positional_deletes: List[pa.ChunkedArray], rows:
617618
618619def pyarrow_to_schema (schema : pa .Schema ) -> Schema :
619620 visitor = _ConvertToIceberg ()
620- return visit_pyarrow (schema , visitor )
621+ schema = visit_pyarrow (schema , visitor )
622+
623+ if visitor .missing_id_metadata :
624+ return assign_fresh_schema_ids (schema )
625+ else :
626+ return schema
621627
622628
623629@singledispatch
@@ -715,12 +721,10 @@ def primitive(self, primitive: pa.DataType) -> Optional[T]:
715721
716722
717723class _ConvertToIceberg (PyArrowSchemaVisitor [Union [IcebergType , Schema ]]):
718- counter : count [int ]
719- missing_is_metadata : Optional [bool ]
724+ missing_id_metadata : Optional [bool ]
720725
721726 def __init__ (self ) -> None :
722- self .counter = count ()
723- self .missing_is_metadata = None
727+ self .missing_id_metadata = None
724728
725729 def _get_field_id (self , field : pa .Field ) -> int :
726730 field_id : Optional [int ] = None
@@ -730,18 +734,17 @@ def _get_field_id(self, field: pa.Field) -> int:
730734 field_id = int (field_id_str .decode ())
731735
732736 if field_id is None :
733- if self .missing_is_metadata is None :
734- warnings .warn ("Field-ids are missing, generating new IDs" )
735-
736- field_id = next (self .counter )
737+ if self .missing_id_metadata is None :
738+ warnings .warn ("Field-ids are missing, new IDs will be set" )
739+ field_id = 0
737740 missing_is_metadata = True
738741 else :
739742 missing_is_metadata = False
740743
741- if self .missing_is_metadata is not None and self .missing_is_metadata != missing_is_metadata :
744+ if self .missing_id_metadata is not None and self .missing_id_metadata != missing_is_metadata :
742745 raise ValueError ("Parquet file contains partial field-ids" )
743746 else :
744- self .missing_is_metadata = missing_is_metadata
747+ self .missing_id_metadata = missing_is_metadata
745748
746749 return field_id
747750
0 commit comments