From f38d79b98dc8d96f5d65feb790fe9a40e45db831 Mon Sep 17 00:00:00 2001
From: Kevin Liu
Date: Thu, 14 Aug 2025 15:13:34 -0700
Subject: [PATCH 1/2] add unit test for projecting timestamp to timestamptz

---
 tests/io/test_pyarrow.py | 39 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)

diff --git a/tests/io/test_pyarrow.py b/tests/io/test_pyarrow.py
index f5c3082edc..bb70099a7d 100644
--- a/tests/io/test_pyarrow.py
+++ b/tests/io/test_pyarrow.py
@@ -2521,6 +2521,45 @@ def test_initial_value() -> None:
     assert val.as_py() == 22
 
 
+def test__to_requested_schema_timestamp_to_timestamptz_projection() -> None:
+    from datetime import datetime, timezone
+
+    # file is written with timestamp without timezone
+    file_schema = Schema(NestedField(1, "ts_field", TimestampType(), required=False))
+    batch = pa.record_batch(
+        [
+            pa.array(
+                [
+                    datetime(2025, 8, 14, 12, 0, 0),
+                    datetime(2025, 8, 14, 13, 0, 0),
+                ],
+                type=pa.timestamp("us"),
+            )
+        ],
+        names=["ts_field"],
+    )
+
+    # table is written with timestamp with timezone
+    table_schema = Schema(NestedField(1, "ts_field", TimestamptzType(), required=False))
+
+    actual_result = _to_requested_schema(table_schema, file_schema, batch, downcast_ns_timestamp_to_us=True)
+    expected = pa.record_batch(
+        [
+            pa.array(
+                [
+                    datetime(2025, 8, 14, 12, 0, 0),
+                    datetime(2025, 8, 14, 13, 0, 0),
+                ],
+                type=pa.timestamp("us", tz=timezone.utc),
+            )
+        ],
+        names=["ts_field"],
+    )
+
+    # expect actual_result to have timezone
+    assert expected.equals(actual_result)
+
+
 def test__to_requested_schema_timestamps(
     arrow_table_schema_with_all_timestamp_precisions: pa.Schema,
     arrow_table_with_all_timestamp_precisions: pa.Table,

From deb56d905a7baf0b84bb58caf53205ed16298c94 Mon Sep 17 00:00:00 2001
From: Kevin Liu
Date: Mon, 18 Aug 2025 20:38:26 -0700
Subject: [PATCH 2/2] reorder

---
 pyiceberg/io/pyarrow.py | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/pyiceberg/io/pyarrow.py b/pyiceberg/io/pyarrow.py
index c756487c32..6a64cace3a 100644
--- a/pyiceberg/io/pyarrow.py
+++ b/pyiceberg/io/pyarrow.py
@@ -1776,14 +1776,7 @@ def _cast_if_needed(self, field: NestedField, values: pa.Array) -> pa.Array:
         file_field = self._file_schema.find_field(field.field_id)
 
         if field.field_type.is_primitive:
-            if field.field_type != file_field.field_type:
-                target_schema = schema_to_pyarrow(
-                    promote(file_field.field_type, field.field_type), include_field_ids=self._include_field_ids
-                )
-                if self._use_large_types is False:
-                    target_schema = _pyarrow_schema_ensure_small_types(target_schema)
-                return values.cast(target_schema)
-            elif (target_type := schema_to_pyarrow(field.field_type, include_field_ids=self._include_field_ids)) != values.type:
+            if (target_type := schema_to_pyarrow(field.field_type, include_field_ids=self._include_field_ids)) != values.type:
                 if field.field_type == TimestampType():
                     # Downcasting of nanoseconds to microseconds
                     if (
@@ -1802,13 +1795,22 @@ def _cast_if_needed(self, field: NestedField, values: pa.Array) -> pa.Array:
                         pa.types.is_timestamp(target_type)
                         and target_type.tz == "UTC"
                         and pa.types.is_timestamp(values.type)
-                        and values.type.tz in UTC_ALIASES
+                        and (values.type.tz in UTC_ALIASES or values.type.tz is None)
                     ):
                         if target_type.unit == "us" and values.type.unit == "ns" and self._downcast_ns_timestamp_to_us:
                             return values.cast(target_type, safe=False)
                         elif target_type.unit == "us" and values.type.unit in {"s", "ms", "us"}:
                             return values.cast(target_type)
                     raise ValueError(f"Unsupported schema projection from {values.type} to {target_type}")
+
+            if field.field_type != file_field.field_type:
+                target_schema = schema_to_pyarrow(
+                    promote(file_field.field_type, field.field_type), include_field_ids=self._include_field_ids
+                )
+                if self._use_large_types is False:
+                    target_schema = _pyarrow_schema_ensure_small_types(target_schema)
+                return values.cast(target_schema)
+
         return values
 
     def _construct_field(self, field: NestedField, arrow_type: pa.DataType) -> pa.Field: