2828import logging
2929import os
3030import re
31+ import warnings
3132from abc import ABC , abstractmethod
3233from concurrent .futures import Future
3334from dataclasses import dataclass
3435from enum import Enum
3536from functools import lru_cache , singledispatch
36- from itertools import chain
37+ from itertools import chain , count
3738from typing import (
3839 TYPE_CHECKING ,
3940 Any ,
@@ -713,28 +714,50 @@ def primitive(self, primitive: pa.DataType) -> Optional[T]:
713714 """Visit a primitive type."""
714715
715716
716- def _get_field_id (field : pa .Field ) -> Optional [int ]:
717- for pyarrow_field_id_key in PYARROW_FIELD_ID_KEYS :
718- if field_id_str := field .metadata .get (pyarrow_field_id_key ):
719- return int (field_id_str .decode ())
720- return None
717+ class _ConvertToIceberg (PyArrowSchemaVisitor [Union [IcebergType , Schema ]]):
718+ counter : count [int ]
719+ missing_is_metadata : Optional [bool ]
721720
def __init__(self) -> None:
    """Initialise the visitor's field-ID bookkeeping."""
    # None until the first field has been inspected; afterwards True when
    # IDs are being generated and False when they are read from metadata.
    self.missing_is_metadata = None
    # Source of generated IDs; itertools.count() starts at 0.
    self.counter = count()
722724
723- def _get_field_doc (field : pa .Field ) -> Optional [str ]:
724- for pyarrow_doc_key in PYARROW_FIELD_DOC_KEYS :
725- if doc_str := field .metadata .get (pyarrow_doc_key ):
726- return doc_str .decode ()
727- return None
def _get_field_id(self, field: pa.Field) -> int:
    """Return the field ID for a PyArrow field, generating one when absent.

    The ID is read from the first key of ``PYARROW_FIELD_ID_KEYS`` that has
    a non-empty value in the field's metadata. When no key matches, the next
    value of ``self.counter`` is used instead, and a warning is emitted the
    first time this visitor falls back to generated IDs.

    Args:
        field: The PyArrow field to inspect. Its metadata may be ``None``
            or empty.

    Returns:
        The ID found in the metadata, or a newly generated one.

    Raises:
        ValueError: If some fields seen by this visitor carry an ID and
            others do not.
    """
    field_id: Optional[int] = None

    metadata = field.metadata
    if metadata:
        for pyarrow_field_id_key in PYARROW_FIELD_ID_KEYS:
            if field_id_str := metadata.get(pyarrow_field_id_key):
                field_id = int(field_id_str.decode())
                # Stop at the first key that yields an ID so that earlier
                # keys take precedence over later ones.
                break

    ids_missing = field_id is None

    # Every field handled by one visitor must agree: either all of them
    # carry an ID or none of them do.
    if self.missing_is_metadata is not None and self.missing_is_metadata != ids_missing:
        raise ValueError("Parquet file contains partial field-ids")

    if field_id is None:
        if self.missing_is_metadata is None:
            # Warn only once, on the first field that lacks an ID.
            warnings.warn("Field-ids are missing, generating new IDs")
        field_id = next(self.counter)

    self.missing_is_metadata = ids_missing
    return field_id
747+
def _get_field_doc(self, field: pa.Field) -> Optional[str]:
    """Return the decoded doc string stored in the field's metadata, if any.

    The keys in ``PYARROW_FIELD_DOC_KEYS`` are tried in order and the first
    non-empty value wins. Returns ``None`` when the field has no metadata or
    none of the keys is present.
    """
    metadata = field.metadata
    if not metadata:
        return None

    for doc_key in PYARROW_FIELD_DOC_KEYS:
        raw_doc = metadata.get(doc_key)
        if raw_doc:
            return raw_doc.decode()
    return None
729753
730- class _ConvertToIceberg (PyArrowSchemaVisitor [Union [IcebergType , Schema ]]):
def _convert_fields(self, arrow_fields: Iterable[pa.Field], field_results: List[Optional[IcebergType]]) -> List[NestedField]:
    """Build a NestedField for every PyArrow field whose result is not None.

    Args:
        arrow_fields: The PyArrow fields to convert.
        field_results: One entry per field, looked up by position; fields
            whose entry is ``None`` are left out of the output.

    Returns:
        The NestedFields for the fields that have a non-None result, in
        input order.
    """
    converted = []
    for position, arrow_field in enumerate(arrow_fields):
        # The ID and doc are resolved for every field, including those that
        # are skipped below, because _get_field_id updates visitor state.
        nested_id = self._get_field_id(arrow_field)
        nested_doc = self._get_field_doc(arrow_field)
        result_type = field_results[position]
        if result_type is None:
            continue
        converted.append(
            NestedField(nested_id, arrow_field.name, result_type, required=not arrow_field.nullable, doc=nested_doc)
        )
    return converted
740763
@@ -746,7 +769,7 @@ def struct(self, struct: pa.StructType, field_results: List[Optional[IcebergType
746769
def list(self, list_type: pa.ListType, element_result: Optional[IcebergType]) -> Optional[IcebergType]:
    """Build a ListType from a PyArrow list type and its visited element.

    Returns ``None`` when the element result or the element ID is ``None``.
    The element is marked required when the PyArrow value field is not
    nullable.
    """
    value_field = list_type.value_field
    # Resolved before the guard so the ID lookup happens for every list.
    value_field_id = self._get_field_id(value_field)
    if element_result is None or value_field_id is None:
        return None
    return ListType(value_field_id, element_result, element_required=not value_field.nullable)
@@ -755,9 +778,9 @@ def map(
755778 self , map_type : pa .MapType , key_result : Optional [IcebergType ], value_result : Optional [IcebergType ]
756779 ) -> Optional [IcebergType ]:
757780 key_field = map_type .key_field
758- key_id = _get_field_id (key_field )
781+ key_id = self . _get_field_id (key_field )
759782 value_field = map_type .item_field
760- value_id = _get_field_id (value_field )
783+ value_id = self . _get_field_id (value_field )
761784 if key_result is not None and value_result is not None and key_id is not None and value_id is not None :
762785 return MapType (key_id , key_result , value_id , value_result , value_required = not value_field .nullable )
763786 return None
0 commit comments