
Commit 805c19c

PartitionKey; hive partition path; transform key

1 parent 95029bb

File tree

4 files changed: +196 -64 lines

  pyiceberg/partitioning.py
  pyiceberg/table/__init__.py
  tests/integration/test_partitioned_writes.py
  tests/table/test_init.py

pyiceberg/partitioning.py

Lines changed: 56 additions & 4 deletions
@@ -16,7 +16,9 @@
 # under the License.
 from __future__ import annotations

-from functools import cached_property
+from dataclasses import dataclass
+from datetime import date, datetime
+from functools import cached_property, singledispatch
 from typing import (
     Any,
     Dict,
@@ -36,7 +38,8 @@
 from pyiceberg.schema import Schema
 from pyiceberg.transforms import Transform, parse_transform
 from pyiceberg.typedef import IcebergBaseModel, Record
-from pyiceberg.types import NestedField, StructType
+from pyiceberg.types import DateType, IcebergType, NestedField, StructType, TimestampType, TimestamptzType
+from pyiceberg.utils.datetime import date_to_days, datetime_to_micros

 INITIAL_PARTITION_SPEC_ID = 0
 PARTITION_FIELD_ID_START: int = 1000
@@ -97,7 +100,6 @@ class PartitionSpec(IcebergBaseModel):

     spec_id: int = Field(alias="spec-id", default=INITIAL_PARTITION_SPEC_ID)
     fields: Tuple[PartitionField, ...] = Field(default_factory=tuple)
-    schema: Schema

     def __init__(
         self,
@@ -205,7 +207,7 @@ def partition_to_path(self, data: Record, schema: Schema) -> str:
             value = getattr(data, field_name)

             partition_field = self.fields[pos]  # partition field
-            value_str = partition_field.transform.to_human_string(source_type=field_types[pos].field_type, value=value)
+            value_str = partition_field.transform.to_human_string(field_types[pos].field_type, value=value)
             value_strs.append(value_str)
             field_strs.append(partition_field.name)
             pos += 1
@@ -234,3 +236,53 @@ def assign_fresh_partition_spec_ids(spec: PartitionSpec, old_schema: Schema, fre
         )
     )
     return PartitionSpec(*partition_fields, spec_id=INITIAL_PARTITION_SPEC_ID)
+
+
+@dataclass(frozen=True)
+class PartitionFieldValue:
+    # It seems partition fields cannot be nested or have a map/list structure,
+    # so instead of using an accessor built through a schema visitor (as iceberg-spark does) to fetch the partition value,
+    # this simple class is used for the first iteration.
+    # Open to discussion and willing to change to conform to row accessors.
+    source_id: int
+    value: Any
+
+
+@dataclass(frozen=True)
+class PartitionKey:
+    raw_partition_field_values: list[PartitionFieldValue]
+    partition_spec: PartitionSpec
+    schema: Schema
+
+    @cached_property
+    def partition(self) -> Record:  # partition key in iceberg type
+        iceberg_typed_key_values = {}
+        for raw_partition_field_value in self.raw_partition_field_values:
+            partition_fields = self.partition_spec.source_id_to_fields_map[raw_partition_field_value.source_id]
+            assert len(partition_fields) == 1
+            partition_field = partition_fields[0]
+            iceberg_type = self.schema.find_field(name_or_id=raw_partition_field_value.source_id).field_type
+            _iceberg_typed_value = iceberg_typed_value(iceberg_type, raw_partition_field_value.value)
+            transformed_value = partition_field.transform.transform(iceberg_type)(_iceberg_typed_value)
+            iceberg_typed_key_values[partition_field.name] = transformed_value
+        return Record(**iceberg_typed_key_values)
+
+    def to_path(self) -> str:
+        return self.partition_spec.partition_to_path(self.partition, self.schema)
+
+
+@singledispatch
+def iceberg_typed_value(type: IcebergType, value: Any) -> Any:
+    return value
+
+
+@iceberg_typed_value.register(TimestampType)
+@iceberg_typed_value.register(TimestamptzType)
+def _(type: IcebergType, value: Optional[datetime]) -> Optional[int]:
+    return datetime_to_micros(value) if value is not None else None
+
+
+@iceberg_typed_value.register(DateType)
+def _(type: IcebergType, value: Optional[date]) -> Optional[int]:
+    return date_to_days(value) if value is not None else None
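
The singledispatch helpers above normalize raw Python values into Iceberg's internal representations (dates to days since the Unix epoch, timestamps to microseconds since the Unix epoch) before the partition transform runs. A minimal sketch, separate from the diff, of the values these converters and the subsequent transform call are expected to produce (the example values are illustrative):

from datetime import date, datetime, timedelta

from pyiceberg.transforms import MonthTransform
from pyiceberg.types import TimestampType

# Plain-Python equivalents of what the registered converters above are expected to return.
days_since_epoch = (date(2023, 1, 1) - date(1970, 1, 1)).days  # 19358
micros_since_epoch = (datetime(2023, 1, 1, 11, 55, 59) - datetime(1970, 1, 1)) // timedelta(microseconds=1)

# PartitionKey.partition then applies the partition transform to the converted value;
# a month transform over the timestamp should yield months since epoch.
months = MonthTransform().transform(TimestampType())(micros_since_epoch)  # expected: 636 for January 2023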

pyiceberg/table/__init__.py

Lines changed: 38 additions & 57 deletions
@@ -22,7 +22,7 @@
 from abc import ABC, abstractmethod
 from copy import copy
 from dataclasses import dataclass
-from datetime import date, datetime
+from datetime import datetime
 from enum import Enum
 from functools import cached_property, singledispatch
 from itertools import chain
@@ -67,7 +67,7 @@
     write_manifest,
     write_manifest_list,
 )
-from pyiceberg.partitioning import PartitionSpec
+from pyiceberg.partitioning import PartitionFieldValue, PartitionKey, PartitionSpec
 from pyiceberg.schema import (
     PartnerAccessor,
     Schema,
@@ -107,7 +107,6 @@
     Identifier,
     KeyDefaultDict,
     Properties,
-    Record,
 )
 from pyiceberg.types import (
     IcebergType,
@@ -118,7 +117,7 @@
     StructType,
 )
 from pyiceberg.utils.concurrent import ExecutorFactory
-from pyiceberg.utils.datetime import date_to_days, datetime_to_micros, datetime_to_millis
+from pyiceberg.utils.datetime import datetime_to_millis

 if TYPE_CHECKING:
     import pandas as pd
@@ -2257,7 +2256,7 @@ class WriteTask:
     def generate_data_file_partition_path(self) -> str:
         if self.partition_key is None:
             raise ValueError("Cannot generate partition path based on non-partitioned WriteTask")
-        return self.partition_key.to_path(self.schema)
+        return self.partition_key.to_path()

     def generate_data_file_filename(self, extension: str) -> str:
         # Mimics the behavior in the Java API:
@@ -2467,41 +2466,6 @@ class TablePartition:
     arrow_table_partition: pa.Table


-@dataclass(frozen=True)
-class PartitionKey:
-    raw_partition_key: Record  # partition key in raw python type
-    partition_spec: PartitionSpec
-
-    # this only supports identity transform now
-    @property
-    def partition(self) -> Record:  # partition key in iceberg type
-        iceberg_typed_key_values = {
-            field_name: iceberg_typed_value(getattr(self.raw_partition_key, field_name, None))
-            for field_name in self.raw_partition_key._position_to_field_name
-        }
-
-        return Record(**iceberg_typed_key_values)
-
-    def to_path(self, schema: Schema) -> str:
-        return self.partition_spec.partition_to_path(self.partition, schema)
-
-
-@singledispatch
-def iceberg_typed_value(value: Any) -> Any:
-    return value
-
-
-@iceberg_typed_value.register(datetime)
-def _(value: Any) -> int:
-    val = datetime_to_micros(value)
-    return val
-
-
-@iceberg_typed_value.register(date)
-def _(value: Any) -> int:
-    return date_to_days(value)
-
-
 def _get_partition_sort_order(partition_columns: list[str], reverse: bool = False) -> dict[str, Any]:
     order = 'ascending' if not reverse else 'descending'
     null_placement = 'at_start' if reverse else 'at_end'
@@ -2538,15 +2502,35 @@ def _get_partition_columns(iceberg_table: Table, arrow_table: pa.Table) -> list[
     return partition_cols


-def _get_partition_key(
-    arrow_table: pa.Table, partition_columns: list[str], offset: int, partition_spec: PartitionSpec
-) -> PartitionKey:
-    # todo: Instead of fetching partition keys one at a time, try filtering by a mask made of offsets, and convert to py together,
-    # possibly slightly more efficient.
-    return PartitionKey(
-        raw_partition_key=Record(**{col: arrow_table.column(col)[offset].as_py() for col in partition_columns}),
-        partition_spec=partition_spec,
-    )
+def _get_table_partitions(
+    arrow_table: pa.Table,
+    partition_spec: PartitionSpec,
+    schema: Schema,
+    slice_instructions: list[dict[str, Any]],
+) -> list[TablePartition]:
+    sorted_slice_instructions = sorted(slice_instructions, key=lambda x: x['offset'])
+
+    partition_fields = partition_spec.fields
+
+    offsets = [inst["offset"] for inst in sorted_slice_instructions]
+    projected_and_filtered = {
+        partition_field.source_id: arrow_table[schema.find_field(name_or_id=partition_field.source_id).name]
+        .take(offsets)
+        .to_pylist()
+        for partition_field in partition_fields
+    }
+
+    table_partitions = []
+    for inst in sorted_slice_instructions:
+        partition_slice = arrow_table.slice(**inst)
+        fieldvalues = [
+            PartitionFieldValue(partition_field.source_id, projected_and_filtered[partition_field.source_id][inst["offset"]])
+            for partition_field in partition_fields
+        ]
+        partition_key = PartitionKey(raw_partition_field_values=fieldvalues, partition_spec=partition_spec, schema=schema)
+        table_partitions.append(TablePartition(partition_key=partition_key, arrow_table_partition=partition_slice))
+
+    return table_partitions


 def _partition(iceberg_table: Table, arrow_table: pa.Table) -> Iterable[TablePartition]:
@@ -2584,7 +2568,7 @@ def _partition(iceberg_table: Table, arrow_table: pa.Table) -> Iterable[TablePar
     reversing_sort_order_options = _get_partition_sort_order(partition_columns, reverse=True)
     reversed_indices = pa.compute.sort_indices(arrow_table, **reversing_sort_order_options).to_pylist()

-    slice_instructions = []
+    slice_instructions: list[dict[str, Any]] = []
     last = len(reversed_indices)
     reversed_indices_size = len(reversed_indices)
     ptr = 0
@@ -2595,13 +2579,10 @@ def _partition(iceberg_table: Table, arrow_table: pa.Table) -> Iterable[TablePar
         last = reversed_indices[ptr]
         ptr = ptr + group_size

-    table_partitions: list[TablePartition] = [
-        TablePartition(
-            partition_key=_get_partition_key(arrow_table, partition_columns, inst["offset"], iceberg_table.spec()),
-            arrow_table_partition=arrow_table.slice(**inst),
-        )
-        for inst in slice_instructions
-    ]
+    table_partitions: list[TablePartition] = _get_table_partitions(
+        arrow_table, iceberg_table.spec(), iceberg_table.schema(), slice_instructions
+    )
+
     return table_partitions

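The new _get_table_partitions consumes slice_instructions, a list of {"offset", "length"} dicts computed over a table that has already been sorted by its partition columns: each dict carves one partition's contiguous rows out of the sorted table, and the partition source columns are gathered once with take(offsets). A small standalone PyArrow sketch of that slicing idea (the table contents and instructions are illustrative, not taken from the diff):

import pyarrow as pa

# A table assumed to be pre-sorted by its single partition source column.
table = pa.table({"ts_month": [636, 636, 637], "value": [10, 20, 30]})

slice_instructions = [
    {"offset": 0, "length": 2},  # rows for month 636
    {"offset": 2, "length": 1},  # rows for month 637
]

# One contiguous slice per partition, plus the partition value at each slice's first row.
partition_slices = [table.slice(**inst) for inst in slice_instructions]
partition_values = table["ts_month"].take([inst["offset"] for inst in slice_instructions]).to_pylist()

print([s.num_rows for s in partition_slices])  # [2, 1]
print(partition_values)                        # [636, 637]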

tests/integration/test_partitioned_writes.py

Lines changed: 0 additions & 3 deletions
@@ -360,9 +360,6 @@ def test_query_filter_null_partitioned(spark: SparkSession, part_col: str, forma
     for col in TEST_DATA_WITH_NULL.keys():
         assert df.where(f"{col} is not null").count() == 2, f"Expected 2 rows for {col}"

-    spark.sql(f"select path from {identifier}.manifests").show(20, False)
-    spark.sql(f"select path from {identifier}.manifests").collect()
-

 @pytest.mark.integration
 @pytest.mark.parametrize(

tests/table/test_init.py

Lines changed: 102 additions & 0 deletions
@@ -982,3 +982,105 @@ def test_correct_schema() -> None:
         _ = t.scan(snapshot_id=-1).projection()

     assert "Snapshot not found: -1" in str(exc_info.value)
+
+import pytz
+from datetime import date, datetime
+TEST_DATA_WITH_NULL = {
+    'bool': [False, None, True],
+    'string': ['a', None, 'z'],
+    # Go over the 16 bytes to kick in truncation
+    'string_long': ['a' * 22, None, 'z' * 22],
+    'int': [1, None, 9],
+    'long': [1, None, 9],
+    'float': [0.0, None, 0.9],
+    'double': [0.0, None, 0.9],
+    'timestamp': [datetime(2023, 1, 1, 19, 25, 00), None, datetime(2023, 3, 1, 19, 25, 00)],
+    'timestamptz': [
+        datetime(2023, 1, 1, 19, 25, 00, tzinfo=pytz.timezone('America/New_York')),
+        None,
+        datetime(2023, 3, 1, 19, 25, 00, tzinfo=pytz.timezone('America/New_York')),
+    ],
+    'date': [date(2023, 1, 1), None, date(2023, 3, 1)],
+    # Not supported by Spark
+    # 'time': [time(1, 22, 0), None, time(19, 25, 0)],
+    # Not natively supported by Arrow
+    # 'uuid': [uuid.UUID('00000000-0000-0000-0000-000000000000').bytes, None, uuid.UUID('11111111-1111-1111-1111-111111111111').bytes],
+    'binary': [b'\01', None, b'\22'],
+    'fixed': [
+        uuid.UUID('00000000-0000-0000-0000-000000000000').bytes,
+        None,
+        uuid.UUID('11111111-1111-1111-1111-111111111111').bytes,
+    ],
+}
+import pyarrow as pa
+@pytest.fixture(scope="session")
+def arrow_table_with_null() -> pa.Table:
+    """PyArrow table with all kinds of columns"""
+    pa_schema = pa.schema([
+        ("bool", pa.bool_()),
+        ("string", pa.string()),
+        ("string_long", pa.string()),
+        ("int", pa.int32()),
+        ("long", pa.int64()),
+        ("float", pa.float32()),
+        ("double", pa.float64()),
+        ("timestamp", pa.timestamp(unit="us")),
+        ("timestamptz", pa.timestamp(unit="us", tz="UTC")),
+        ("date", pa.date32()),
+        # Not supported by Spark
+        # ("time", pa.time64("us")),
+        # Not natively supported by Arrow
+        # ("uuid", pa.fixed(16)),
+        ("binary", pa.binary()),
+        ("fixed", pa.binary(16)),
+    ])
+    return pa.Table.from_pydict(TEST_DATA_WITH_NULL, schema=pa_schema)

+from pyiceberg.schema import Schema
+from pyiceberg.transforms import IdentityTransform, DayTransform, MonthTransform
+from pyiceberg.types import (
+    BinaryType,
+    BooleanType,
+    DateType,
+    DoubleType,
+    FixedType,
+    FloatType,
+    IntegerType,
+    LongType,
+    NestedField,
+    StringType,
+    TimestampType,
+    TimestamptzType,
+)
+TABLE_SCHEMA = Schema(
+    NestedField(field_id=1, name="bool", field_type=BooleanType(), required=False),
+    NestedField(field_id=2, name="string", field_type=StringType(), required=False),
+    NestedField(field_id=3, name="string_long", field_type=StringType(), required=False),
+    NestedField(field_id=4, name="int", field_type=IntegerType(), required=False),
+    NestedField(field_id=5, name="long", field_type=LongType(), required=False),
+    NestedField(field_id=6, name="float", field_type=FloatType(), required=False),
+    NestedField(field_id=7, name="double", field_type=DoubleType(), required=False),
+    NestedField(field_id=8, name="timestamp", field_type=TimestampType(), required=False),
+    NestedField(field_id=9, name="timestamptz", field_type=TimestamptzType(), required=False),
+    NestedField(field_id=10, name="date", field_type=DateType(), required=False),
+    # NestedField(field_id=11, name="time", field_type=TimeType(), required=False),
+    # NestedField(field_id=12, name="uuid", field_type=UuidType(), required=False),
+    NestedField(field_id=11, name="binary", field_type=BinaryType(), required=False),
+    NestedField(field_id=12, name="fixed", field_type=FixedType(16), required=False),
+)

+@pytest.mark.mexico
+def test_partition_key(arrow_table_with_null) -> None:
+    from pyiceberg.table import PartitionKeyNew, PartitionFieldValue
+    from pyiceberg.partitioning import PartitionField, PartitionSpec
+
+    spec = PartitionSpec(
+        PartitionField(source_id=8, field_id=1001, transform=MonthTransform(), name="test_partition_field")
+    )
+
+
+    key = PartitionKeyNew(
+        raw_partition_key_values=[PartitionFieldValue(source_id=8, value=datetime(2023, 1, 1, 11, 55, 59))],
+        partition_spec=spec,
+        schema=TABLE_SCHEMA,
+    )
+    print(key.partition)
+    print("-----------")
+    print(key.to_path())
+    print("1")
