2525from __future__ import annotations
2626
2727import concurrent .futures
28+ import itertools
2829import logging
2930import os
3031import re
3435from dataclasses import dataclass
3536from enum import Enum
3637from functools import lru_cache , singledispatch
37- from itertools import chain , count
38+ from itertools import chain
3839from typing import (
3940 TYPE_CHECKING ,
4041 Any ,
111112 Schema ,
112113 SchemaVisitorPerPrimitiveType ,
113114 SchemaWithPartnerVisitor ,
115+ assign_fresh_schema_ids ,
114116 pre_order_visit ,
115117 promote ,
116118 prune_columns ,
@@ -617,7 +619,12 @@ def _combine_positional_deletes(positional_deletes: List[pa.ChunkedArray], rows:
617619
def pyarrow_to_schema(schema: pa.Schema) -> Schema:
    """Convert a PyArrow schema into an Iceberg schema.

    The PyArrow schema is walked with a ``_ConvertToIceberg`` visitor. If the
    visitor reports that field-id metadata was missing, the converted schema
    is passed through ``assign_fresh_schema_ids`` before being returned;
    otherwise the converted schema is returned exactly as the visitor
    produced it.

    Args:
        schema: The PyArrow schema to convert.

    Returns:
        The converted Iceberg schema.

    Raises:
        ValueError: Presumably propagated from the visitor when only some of
            the fields carry field-ids -- confirm against
            ``_ConvertToIceberg._get_field_id``.
    """
    visitor = _ConvertToIceberg()
    # Keep the converted result under its own name: the ``schema`` parameter
    # is a ``pa.Schema`` and should not be rebound to a value of another type.
    iceberg_schema = visit_pyarrow(schema, visitor)

    # ``missing_id_metadata`` is initialised to None and set by the visitor's
    # ``_get_field_id``; both None and False mean "return the schema as-is".
    if visitor.missing_id_metadata:
        return assign_fresh_schema_ids(iceberg_schema)
    return iceberg_schema
621628
622629
623630@singledispatch
@@ -715,12 +722,12 @@ def primitive(self, primitive: pa.DataType) -> Optional[T]:
715722
716723
717724class _ConvertToIceberg (PyArrowSchemaVisitor [Union [IcebergType , Schema ]]):
718- counter : count [int ]
719- missing_is_metadata : Optional [bool ]
725+ counter : itertools . count [int ]
726+ missing_id_metadata : Optional [bool ]
720727
721728 def __init__ (self ) -> None :
722- self .counter = count ()
723- self .missing_is_metadata = None
729+ self .counter = itertools . count (1 )
730+ self .missing_id_metadata = None
724731
725732 def _get_field_id (self , field : pa .Field ) -> int :
726733 field_id : Optional [int ] = None
@@ -730,18 +737,17 @@ def _get_field_id(self, field: pa.Field) -> int:
730737 field_id = int (field_id_str .decode ())
731738
732739 if field_id is None :
733- if self .missing_is_metadata is None :
734- warnings .warn ("Field-ids are missing, generating new IDs" )
735-
740+ if self .missing_id_metadata is None :
741+ warnings .warn ("Field-ids are missing, new IDs will be set" )
736742 field_id = next (self .counter )
737743 missing_is_metadata = True
738744 else :
739745 missing_is_metadata = False
740746
741- if self .missing_is_metadata is not None and self .missing_is_metadata != missing_is_metadata :
747+ if self .missing_id_metadata is not None and self .missing_id_metadata != missing_is_metadata :
742748 raise ValueError ("Parquet file contains partial field-ids" )
743749 else :
744- self .missing_is_metadata = missing_is_metadata
750+ self .missing_id_metadata = missing_is_metadata
745751
746752 return field_id
747753
0 commit comments