-
Notifications
You must be signed in to change notification settings - Fork 393
Closed
Description
Apache Iceberg version
0.9.1 (latest release)
Please describe the bug 🐞
Traceback (most recent call last):
File "/home/nv/src/pyiceberg-example/nested_partition_field.py", line 145, in <module>
main()
File "/home/nv/src/pyiceberg-example/nested_partition_field.py", line 132, in main
table.append(batch_data)
File "/home/nv/src/pyiceberg-example/.venv/lib/python3.9/site-packages/pyiceberg/table/__init__.py", line 1229, in append
tx.append(df=df, snapshot_properties=snapshot_properties)
File "/home/nv/src/pyiceberg-example/.venv/lib/python3.9/site-packages/pyiceberg/table/__init__.py", line 473, in append
data_files = list(
File "/home/nv/src/pyiceberg-example/.venv/lib/python3.9/site-packages/pyiceberg/io/pyarrow.py", line 2603, in _dataframe_to_data_files
partitions = _determine_partitions(spec=table_metadata.spec(), schema=table_metadata.schema(), arrow_table=df)
File "/home/nv/src/pyiceberg-example/.venv/lib/python3.9/site-packages/pyiceberg/io/pyarrow.py", line 2650, in _determine_partitions
name, partition.transform.pyarrow_transform(source_field.field_type)(arrow_table[source_field.name])
File "pyarrow/table.pxi", line 1693, in pyarrow.lib._Tabular.__getitem__
File "pyarrow/table.pxi", line 1779, in pyarrow.lib._Tabular.column
File "pyarrow/table.pxi", line 1715, in pyarrow.lib._Tabular._ensure_integer_index
KeyError: 'Field "timestamp" does not exist in schema'
# Schema: a required string column plus a required struct column. The struct
# wraps a single required timestamp field (id=5), so the column used for
# partitioning below sits one level deep inside "nested_field".
timestamp_field = NestedField(
    id=5,
    name="timestamp",
    field_type=TimestampType(),
    required=True,
)
schema = Schema(
    NestedField(id=1, name="name", field_type=StringType(), required=True),
    NestedField(
        id=2,
        name="nested_field",
        field_type=StructType(timestamp_field),
        required=True,
    ),
)

# Partition spec: one hourly partition whose source is the nested timestamp.
hourly_partition = PartitionField(
    source_id=5,  # same id as the timestamp field inside the struct
    field_id=1000,  # id of the partition field itself
    transform=HourTransform(),  # partition by hour
    name="timestamp_hour",  # name of the resulting partition
)
partition_spec = PartitionSpec(hourly_partition)

# Create the table with the schema and partition spec defined above.
table = catalog.create_table(
    identifier=table_identifier,
    schema=schema,
    partition_spec=partition_spec,
)
creation_message = (
    f"Table {table.name()} created successfully with partition spec: {table.spec()}"
)
print(creation_message)
# Function to generate random data with timestamps
def generate_batch_data(batch_size):
    """Build one Arrow table holding ``batch_size`` randomly named records.

    Every record carries a ``name`` of the form "<first> <last>", drawn at
    random from two small fixed pools, and a ``nested_field`` struct whose
    ``timestamp`` is the value of ``datetime.now()`` at generation time.

    Args:
        batch_size: Number of rows to generate.

    Returns:
        The result of ``pa.Table.from_pylist`` over the generated records,
        using the Arrow schema obtained from the module-level ``table``
        via ``table.schema().as_arrow()``.
    """
    # The name pools are constant, so they are built once per call instead of
    # once per generated row.
    first_names = (
        "John", "Jane", "Alex", "Sarah", "Mike", "Emma", "David", "Lisa",
    )
    last_names = (
        "Smith", "Jones", "Doe", "Brown", "Wilson", "Davis", "Miller", "Taylor",
    )

    # The first name is drawn before the last name for each row, matching the
    # left-to-right evaluation of the f-string.
    records = [
        {
            "name": f"{random.choice(first_names)} {random.choice(last_names)}",
            "nested_field": {"timestamp": datetime.now()},
        }
        for _ in range(batch_size)
    ]
    return pa.Table.from_pylist(records, schema=table.schema().as_arrow())
batch_size = 1000
num_batches = 10
for batch_num in range(num_batches):
batch_data = generate_batch_data(batch_size=batch_size)
# Write the batch to the table
table.append(batch_data)
print(
f"Wrote batch {batch_num + 1}/{num_batches} with {batch_size} records to {table.name()}"
)
Willingness to contribute
- I can contribute a fix for this bug independently
- I would be willing to contribute a fix for this bug with guidance from the Iceberg community
- I cannot contribute a fix for this bug at this time
Metadata
Metadata
Assignees
Labels
No labels