From b1af884eea4909f4f3f78bbe989598330938130b Mon Sep 17 00:00:00 2001 From: ZENOTME Date: Thu, 28 Nov 2024 19:41:26 +0800 Subject: [PATCH 01/11] add arrow_struct_to_iceberg_struct --- crates/iceberg/src/arrow/mod.rs | 3 +- crates/iceberg/src/arrow/value.rs | 934 ++++++++++++++++++++++++++++++ crates/iceberg/src/spec/values.rs | 10 + 3 files changed, 946 insertions(+), 1 deletion(-) create mode 100644 crates/iceberg/src/arrow/value.rs diff --git a/crates/iceberg/src/arrow/mod.rs b/crates/iceberg/src/arrow/mod.rs index 0f01324cb8..0c885e65f4 100644 --- a/crates/iceberg/src/arrow/mod.rs +++ b/crates/iceberg/src/arrow/mod.rs @@ -22,5 +22,6 @@ pub use schema::*; mod reader; pub(crate) mod record_batch_projector; pub(crate) mod record_batch_transformer; - +mod value; pub use reader::*; +pub use value::*; diff --git a/crates/iceberg/src/arrow/value.rs b/crates/iceberg/src/arrow/value.rs new file mode 100644 index 0000000000..95712018be --- /dev/null +++ b/crates/iceberg/src/arrow/value.rs @@ -0,0 +1,934 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use arrow_array::{ + Array, BinaryArray, BooleanArray, Date32Array, Decimal128Array, Float16Array, Float32Array, + Float64Array, Int16Array, Int32Array, Int64Array, LargeBinaryArray, LargeStringArray, + StringArray, StructArray, Time64MicrosecondArray, TimestampMicrosecondArray, + TimestampNanosecondArray, +}; +use arrow_schema::{DataType, TimeUnit}; +use itertools::Itertools; + +use crate::spec::{Literal, PrimitiveType, Struct, StructType, Type}; +use crate::{Error, ErrorKind, Result}; + +trait ToIcebergLiteralArray { + fn to_primitive_literal_array( + &self, + _arrow_type: &DataType, + _iceberg_type: &PrimitiveType, + ) -> Result>>; + fn to_struct_literal_array( + &self, + _arrow_type: &DataType, + _iceberg_type: &StructType, + ) -> Result>>; +} + +impl ToIcebergLiteralArray for BooleanArray { + fn to_primitive_literal_array( + &self, + _arrow_type: &DataType, + iceberg_type: &PrimitiveType, + ) -> Result>> { + match iceberg_type { + PrimitiveType::Boolean => Ok(self.iter().map(|v| v.map(Literal::bool)).collect()), + _ => Err(Error::new( + ErrorKind::DataInvalid, + format!( + "The type of arrow boolean array is not compatitable with iceberg type {}", + iceberg_type + ), + )), + } + } + + fn to_struct_literal_array( + &self, + _arrow_type: &DataType, + _iceberg_type: &StructType, + ) -> Result>> { + unreachable!() + } +} + +impl ToIcebergLiteralArray for Int16Array { + fn to_primitive_literal_array( + &self, + _arrow_type: &DataType, + iceberg_type: &PrimitiveType, + ) -> Result>> { + match iceberg_type { + PrimitiveType::Int => Ok(self.iter().map(|v| v.map(Literal::int)).collect()), + PrimitiveType::Long => Ok(self.iter().map(|v| v.map(Literal::long)).collect()), + _ => Err(Error::new( + ErrorKind::DataInvalid, + format!( + "The type of arrow int16 array is not compatitable with iceberg type {}", + iceberg_type + ), + )), + } + } + + fn to_struct_literal_array( + &self, + _arrow_type: &DataType, + _iceberg_type: &StructType, + ) -> Result>> { + unreachable!() 
+ } +} + +impl ToIcebergLiteralArray for Int32Array { + fn to_primitive_literal_array( + &self, + _arrow_type: &DataType, + iceberg_type: &PrimitiveType, + ) -> Result>> { + match iceberg_type { + PrimitiveType::Int => Ok(self.iter().map(|v| v.map(Literal::int)).collect()), + PrimitiveType::Long => Ok(self.iter().map(|v| v.map(Literal::long)).collect()), + _ => Err(Error::new( + ErrorKind::DataInvalid, + format!( + "The type of arrow int32 array is not compatitable with iceberg type {}", + iceberg_type + ), + )), + } + } + + fn to_struct_literal_array( + &self, + _arrow_type: &DataType, + _iceberg_type: &StructType, + ) -> Result>> { + unreachable!() + } +} + +impl ToIcebergLiteralArray for Int64Array { + fn to_primitive_literal_array( + &self, + _arrow_type: &DataType, + iceberg_type: &PrimitiveType, + ) -> Result>> { + match iceberg_type { + PrimitiveType::Long => Ok(self.iter().map(|v| v.map(Literal::long)).collect()), + _ => Err(Error::new( + ErrorKind::DataInvalid, + format!( + "The type of arrow int64 array is not compatitable with iceberg type {}", + iceberg_type + ), + )), + } + } + + fn to_struct_literal_array( + &self, + _arrow_type: &DataType, + _iceberg_type: &StructType, + ) -> Result>> { + unreachable!() + } +} + +impl ToIcebergLiteralArray for Float16Array { + fn to_primitive_literal_array( + &self, + _arrow_type: &DataType, + iceberg_type: &PrimitiveType, + ) -> Result>> { + match iceberg_type { + PrimitiveType::Float => Ok(self + .iter() + .map(|v| v.map(|v| Literal::float(v.to_f32()))) + .collect()), + _ => Err(Error::new( + ErrorKind::DataInvalid, + format!( + "The type of arrow float16 array is not compatitable with iceberg type {}", + iceberg_type + ), + )), + } + } + + fn to_struct_literal_array( + &self, + _arrow_type: &DataType, + _iceberg_type: &StructType, + ) -> Result>> { + unreachable!() + } +} + +impl ToIcebergLiteralArray for Float32Array { + fn to_primitive_literal_array( + &self, + _arrow_type: &DataType, + iceberg_type: 
&PrimitiveType, + ) -> Result>> { + match iceberg_type { + PrimitiveType::Float => Ok(self.iter().map(|v| v.map(Literal::float)).collect()), + _ => Err(Error::new( + ErrorKind::DataInvalid, + format!( + "The type of arrow float32 array is not compatitable with iceberg type {}", + iceberg_type + ), + )), + } + } + + fn to_struct_literal_array( + &self, + _arrow_type: &DataType, + _iceberg_type: &StructType, + ) -> Result>> { + unreachable!() + } +} + +impl ToIcebergLiteralArray for Float64Array { + fn to_primitive_literal_array( + &self, + _arrow_type: &DataType, + iceberg_type: &PrimitiveType, + ) -> Result>> { + match iceberg_type { + PrimitiveType::Double => Ok(self.iter().map(|v| v.map(Literal::double)).collect()), + _ => Err(Error::new( + ErrorKind::DataInvalid, + format!( + "The type of arrow float64 array is not compatitable with iceberg type {}", + iceberg_type + ), + )), + } + } + + fn to_struct_literal_array( + &self, + _arrow_type: &DataType, + _iceberg_type: &StructType, + ) -> Result>> { + unreachable!() + } +} + +impl ToIcebergLiteralArray for Decimal128Array { + fn to_primitive_literal_array( + &self, + arrow_type: &DataType, + iceberg_type: &PrimitiveType, + ) -> Result>> { + let DataType::Decimal128(arrow_precision, arrow_scale) = arrow_type else { + unreachable!() + }; + match iceberg_type { + PrimitiveType::Decimal { precision, scale } => { + if *arrow_precision as u32 != *precision || *arrow_scale as u32 != *scale { + return Err(Error::new( + ErrorKind::DataInvalid, + format!( + "The precision or scale ({},{}) of arrow decimal128 array is not compatitable with iceberg decimal type ({},{})", + arrow_precision, arrow_scale, precision, scale + ), + )); + } + Ok(self.iter().map(|v| v.map(Literal::decimal)).collect()) + } + _ => Err(Error::new( + ErrorKind::DataInvalid, + format!( + "The type of arrow decimal128 array is not compatitable with iceberg type {}", + iceberg_type + ), + )), + } + } + + fn to_struct_literal_array( + &self, + _arrow_type: 
&DataType, + _iceberg_type: &StructType, + ) -> Result>> { + unreachable!() + } +} + +impl ToIcebergLiteralArray for Date32Array { + fn to_primitive_literal_array( + &self, + _arrow_type: &DataType, + iceberg_type: &PrimitiveType, + ) -> Result>> { + match iceberg_type { + PrimitiveType::Date => Ok(self.iter().map(|v| v.map(Literal::date)).collect()), + _ => Err(Error::new( + ErrorKind::DataInvalid, + format!( + "The type of arrow date32 array is not compatitable with iceberg type {}", + iceberg_type + ), + )), + } + } + + fn to_struct_literal_array( + &self, + _arrow_type: &DataType, + _iceberg_type: &StructType, + ) -> Result>> { + unreachable!() + } +} + +impl ToIcebergLiteralArray for Time64MicrosecondArray { + fn to_primitive_literal_array( + &self, + _arrow_type: &DataType, + iceberg_type: &PrimitiveType, + ) -> Result>> { + match iceberg_type { + PrimitiveType::Time => Ok(self + .iter() + .map(|v| v.map(Literal::time)) + .collect()), + _ => Err(Error::new( + ErrorKind::DataInvalid, + format!( + "The type of arrow time64 microsecond array is not compatitable with iceberg type {}", + iceberg_type + ), + )), + } + } + + fn to_struct_literal_array( + &self, + _arrow_type: &DataType, + _iceberg_type: &StructType, + ) -> Result>> { + unreachable!() + } +} + +impl ToIcebergLiteralArray for TimestampMicrosecondArray { + fn to_primitive_literal_array( + &self, + _arrow_type: &DataType, + iceberg_type: &PrimitiveType, + ) -> Result>> { + match iceberg_type { + PrimitiveType::Timestamp => Ok(self + .iter() + .map(|v| v.map(Literal::timestamp)) + .collect()), + PrimitiveType::Timestamptz => Ok(self + .iter() + .map(|v| v.map(Literal::timestamptz)) + .collect()), + _ => Err(Error::new( + ErrorKind::DataInvalid, + format!( + "The type of arrow timestamp microsecond array is not compatitable with iceberg type {}", + iceberg_type + ), + )), + } + } + + fn to_struct_literal_array( + &self, + _arrow_type: &DataType, + _iceberg_type: &StructType, + ) -> Result>> { + 
unreachable!() + } +} + +impl ToIcebergLiteralArray for TimestampNanosecondArray { + fn to_primitive_literal_array( + &self, + _arrow_type: &DataType, + iceberg_type: &PrimitiveType, + ) -> Result>> { + match iceberg_type { + PrimitiveType::TimestampNs => Ok(self + .iter() + .map(|v| v.map(Literal::timestamp_nano)) + .collect()), + PrimitiveType::TimestamptzNs => Ok(self + .iter() + .map(|v| v.map(Literal::timestamptz_nano)) + .collect()), + _ => Err(Error::new( + ErrorKind::DataInvalid, + format!( + "The type of arrow timestamp nanosecond array is not compatitable with iceberg type {}", + iceberg_type + ), + )), + } + } + + fn to_struct_literal_array( + &self, + _arrow_type: &DataType, + _iceberg_type: &StructType, + ) -> Result>> { + unreachable!() + } +} + +impl ToIcebergLiteralArray for StringArray { + fn to_primitive_literal_array( + &self, + _arrow_type: &DataType, + iceberg_type: &PrimitiveType, + ) -> Result>> { + match iceberg_type { + PrimitiveType::String => Ok(self.iter().map(|v| v.map(Literal::string)).collect()), + _ => Err(Error::new( + ErrorKind::DataInvalid, + format!( + "The type of arrow string array is not compatitable with iceberg type {}", + iceberg_type + ), + )), + } + } + + fn to_struct_literal_array( + &self, + _arrow_type: &DataType, + _iceberg_type: &StructType, + ) -> Result>> { + unreachable!() + } +} + +impl ToIcebergLiteralArray for LargeStringArray { + fn to_primitive_literal_array( + &self, + _arrow_type: &DataType, + iceberg_type: &PrimitiveType, + ) -> Result>> { + match iceberg_type { + PrimitiveType::String => Ok(self.iter().map(|v| v.map(Literal::string)).collect()), + _ => Err(Error::new( + ErrorKind::DataInvalid, + format!( + "The type of arrow large string array is not compatitable with iceberg type {}", + iceberg_type + ), + )), + } + } + + fn to_struct_literal_array( + &self, + _arrow_type: &DataType, + _iceberg_type: &StructType, + ) -> Result>> { + unreachable!() + } +} + +impl ToIcebergLiteralArray for BinaryArray { + 
fn to_primitive_literal_array( + &self, + _arrow_type: &DataType, + iceberg_type: &PrimitiveType, + ) -> Result>> { + match iceberg_type { + PrimitiveType::Binary => Ok(self + .iter() + .map(|v| v.map(|v| Literal::binary(v.to_vec()))) + .collect()), + _ => Err(Error::new( + ErrorKind::DataInvalid, + format!( + "The type of arrow binary array is not compatitable with iceberg type {}", + iceberg_type + ), + )), + } + } + + fn to_struct_literal_array( + &self, + _arrow_type: &DataType, + _iceberg_type: &StructType, + ) -> Result>> { + unreachable!() + } +} + +impl ToIcebergLiteralArray for LargeBinaryArray { + fn to_primitive_literal_array( + &self, + _arrow_type: &DataType, + iceberg_type: &PrimitiveType, + ) -> Result>> { + match iceberg_type { + PrimitiveType::Binary => Ok(self + .iter() + .map(|v| v.map(|v| Literal::binary(v.to_vec()))) + .collect()), + _ => Err(Error::new( + ErrorKind::DataInvalid, + format!( + "The type of arrow large binary array is not compatitable with iceberg type {}", + iceberg_type + ), + )), + } + } + + fn to_struct_literal_array( + &self, + _arrow_type: &DataType, + _iceberg_type: &StructType, + ) -> Result>> { + unreachable!() + } +} + +impl ToIcebergLiteralArray for StructArray { + fn to_primitive_literal_array( + &self, + _arrow_type: &DataType, + _iceberg_type: &PrimitiveType, + ) -> Result>> { + unreachable!() + } + + fn to_struct_literal_array( + &self, + arrow_type: &DataType, + iceberg_type: &StructType, + ) -> Result>> { + let DataType::Struct(arrow_struct_fields) = arrow_type else { + return Err(Error::new( + ErrorKind::DataInvalid, + "The type of arrow struct array is not a struct type", + )); + }; + + if self.columns().len() != iceberg_type.fields().len() + || arrow_struct_fields.len() != iceberg_type.fields().len() + { + return Err(Error::new( + ErrorKind::DataInvalid, + "The type of arrow struct array is not compatitable with iceberg struct type", + )); + } + + let mut columns = Vec::with_capacity(self.columns().len()); + + 
for ((array, arrow_type), iceberg_field) in self + .columns() + .iter() + .zip_eq(arrow_struct_fields.iter().map(|field| field.data_type())) + .zip_eq(iceberg_type.fields().iter()) + { + if array.is_nullable() == iceberg_field.required { + return Err(Error::new( + ErrorKind::DataInvalid, + "The nullable field of arrow struct array is not compatitable with iceberg type", + )); + } + match (arrow_type, iceberg_field.field_type.as_ref()) { + (DataType::Null, _) => { + if iceberg_field.required { + return Err(Error::new( + ErrorKind::DataInvalid, + "column in arrow array should not be optional", + )); + } + columns.push(vec![None; array.len()]); + } + (DataType::Boolean, Type::Primitive(primitive_type)) => { + let array = array.as_any().downcast_ref::().unwrap(); + columns.push(array.to_primitive_literal_array(arrow_type, primitive_type)?); + } + (DataType::Int16, Type::Primitive(primitive_type)) => { + let array = array.as_any().downcast_ref::().unwrap(); + columns.push(array.to_primitive_literal_array(arrow_type, primitive_type)?); + } + (DataType::Int32, Type::Primitive(primitive_type)) => { + let array = array.as_any().downcast_ref::().unwrap(); + columns.push(array.to_primitive_literal_array(arrow_type, primitive_type)?); + } + (DataType::Int64, Type::Primitive(primitive_type)) => { + let array = array.as_any().downcast_ref::().unwrap(); + columns.push(array.to_primitive_literal_array(arrow_type, primitive_type)?); + } + (DataType::Float32, Type::Primitive(primitive_type)) => { + let array = array.as_any().downcast_ref::().unwrap(); + columns.push(array.to_primitive_literal_array(arrow_type, primitive_type)?); + } + (DataType::Float64, Type::Primitive(primitive_type)) => { + let array = array.as_any().downcast_ref::().unwrap(); + columns.push(array.to_primitive_literal_array(arrow_type, primitive_type)?); + } + (DataType::Decimal128(_, _), Type::Primitive(primitive_type)) => { + let array = array.as_any().downcast_ref::().unwrap(); + 
columns.push(array.to_primitive_literal_array(arrow_type, primitive_type)?); + } + (DataType::Date32, Type::Primitive(primitive_type)) => { + let array = array.as_any().downcast_ref::().unwrap(); + columns.push(array.to_primitive_literal_array(arrow_type, primitive_type)?); + } + (DataType::Time64(TimeUnit::Microsecond), Type::Primitive(primitive_type)) => { + let array = array + .as_any() + .downcast_ref::() + .unwrap(); + columns.push(array.to_primitive_literal_array(arrow_type, primitive_type)?); + } + ( + DataType::Timestamp(TimeUnit::Microsecond, _), + Type::Primitive(primitive_type), + ) => { + let array = array + .as_any() + .downcast_ref::() + .unwrap(); + columns.push(array.to_primitive_literal_array(arrow_type, primitive_type)?); + } + (DataType::Timestamp(TimeUnit::Nanosecond, _), Type::Primitive(primitive_type)) => { + let array = array + .as_any() + .downcast_ref::() + .unwrap(); + columns.push(array.to_primitive_literal_array(arrow_type, primitive_type)?); + } + (DataType::Utf8, Type::Primitive(primitive_type)) => { + let array = array.as_any().downcast_ref::().unwrap(); + columns.push(array.to_primitive_literal_array(arrow_type, primitive_type)?); + } + (DataType::LargeUtf8, Type::Primitive(primitive_type)) => { + let array = array.as_any().downcast_ref::().unwrap(); + columns.push(array.to_primitive_literal_array(arrow_type, primitive_type)?); + } + (DataType::Binary, Type::Primitive(primitive_type)) => { + let array = array.as_any().downcast_ref::().unwrap(); + columns.push(array.to_primitive_literal_array(arrow_type, primitive_type)?); + } + (DataType::LargeBinary, Type::Primitive(primitive_type)) => { + let array = array.as_any().downcast_ref::().unwrap(); + columns.push(array.to_primitive_literal_array(arrow_type, primitive_type)?); + } + (DataType::Struct(_), Type::Struct(struct_type)) => { + let array = array.as_any().downcast_ref::().unwrap(); + columns.push(array.to_struct_literal_array(arrow_type, struct_type)?); + } + (arrow_type, 
iceberg_field_type) => { + return Err(Error::new( + ErrorKind::FeatureUnsupported, + format!( + "Unsupported convert arrow type {} to iceberg type: {}", + arrow_type, iceberg_field_type + ), + )) + } + } + } + + let struct_literal_len = columns.first().map(|column| column.len()).unwrap_or(0); + let mut struct_literals = Vec::with_capacity(struct_literal_len); + let mut columns_iters = columns + .into_iter() + .map(|column| column.into_iter()) + .collect::>(); + + for row_idx in 0..struct_literal_len { + if self.is_null(row_idx) { + struct_literals.push(None); + continue; + } + let mut literals = Vec::with_capacity(columns_iters.len()); + for column_iter in columns_iters.iter_mut() { + literals.push(column_iter.next().unwrap()); + } + struct_literals.push(Some(Literal::Struct(Struct::from_iter(literals)))); + } + + Ok(struct_literals) + } +} + +/// Convert arrow struct array to iceberg struct value array. +pub fn arrow_struct_to_iceberg_struct( + struct_array: &StructArray, + ty: StructType, +) -> Result>> { + struct_array.to_struct_literal_array(struct_array.data_type(), &ty) +} + +#[cfg(test)] +mod test { + use std::sync::Arc; + + use arrow_array::{ + ArrayRef, BinaryArray, BooleanArray, Date32Array, Decimal128Array, Float32Array, + Float64Array, Int16Array, Int32Array, Int64Array, StringArray, StructArray, + Time64MicrosecondArray, TimestampMicrosecondArray, TimestampNanosecondArray, + }; + use arrow_schema::{DataType, Field, Fields, TimeUnit}; + + use super::*; + use crate::spec::{Literal, NestedField, PrimitiveType, StructType, Type}; + + #[test] + fn test_arrow_struct_to_iceberg_struct() { + let bool_array = BooleanArray::from(vec![Some(true), Some(false), None]); + let int16_array = Int16Array::from(vec![Some(1), Some(2), None]); + let int32_array = Int32Array::from(vec![Some(3), Some(4), None]); + let int64_array = Int64Array::from(vec![Some(5), Some(6), None]); + let float32_array = Float32Array::from(vec![Some(1.1), Some(2.2), None]); + let float64_array = 
Float64Array::from(vec![Some(3.3), Some(4.4), None]); + let decimal_array = Decimal128Array::from(vec![Some(1000), Some(2000), None]) + .with_precision_and_scale(10, 2) + .unwrap(); + let date_array = Date32Array::from(vec![Some(18628), Some(18629), None]); + let time_array = Time64MicrosecondArray::from(vec![Some(123456789), Some(987654321), None]); + let timestamp_micro_array = TimestampMicrosecondArray::from(vec![ + Some(1622548800000000), + Some(1622635200000000), + None, + ]); + let timestamp_nano_array = TimestampNanosecondArray::from(vec![ + Some(1622548800000000000), + Some(1622635200000000000), + None, + ]); + let string_array = StringArray::from(vec![Some("a"), Some("b"), None]); + let binary_array = + BinaryArray::from(vec![Some(b"abc".as_ref()), Some(b"def".as_ref()), None]); + + let struct_array = StructArray::from(vec![ + ( + Arc::new(Field::new("bool_field", DataType::Boolean, true)), + Arc::new(bool_array) as ArrayRef, + ), + ( + Arc::new(Field::new("int16_field", DataType::Int16, true)), + Arc::new(int16_array) as ArrayRef, + ), + ( + Arc::new(Field::new("int32_field", DataType::Int32, true)), + Arc::new(int32_array) as ArrayRef, + ), + ( + Arc::new(Field::new("int64_field", DataType::Int64, true)), + Arc::new(int64_array) as ArrayRef, + ), + ( + Arc::new(Field::new("float32_field", DataType::Float32, true)), + Arc::new(float32_array) as ArrayRef, + ), + ( + Arc::new(Field::new("float64_field", DataType::Float64, true)), + Arc::new(float64_array) as ArrayRef, + ), + ( + Arc::new(Field::new( + "decimal_field", + DataType::Decimal128(10, 2), + true, + )), + Arc::new(decimal_array) as ArrayRef, + ), + ( + Arc::new(Field::new("date_field", DataType::Date32, true)), + Arc::new(date_array) as ArrayRef, + ), + ( + Arc::new(Field::new( + "time_field", + DataType::Time64(TimeUnit::Microsecond), + true, + )), + Arc::new(time_array) as ArrayRef, + ), + ( + Arc::new(Field::new( + "timestamp_micro_field", + DataType::Timestamp(TimeUnit::Microsecond, None), + 
true, + )), + Arc::new(timestamp_micro_array) as ArrayRef, + ), + ( + Arc::new(Field::new( + "timestamp_nano_field", + DataType::Timestamp(TimeUnit::Nanosecond, None), + true, + )), + Arc::new(timestamp_nano_array) as ArrayRef, + ), + ( + Arc::new(Field::new("string_field", DataType::Utf8, true)), + Arc::new(string_array) as ArrayRef, + ), + ( + Arc::new(Field::new("binary_field", DataType::Binary, true)), + Arc::new(binary_array) as ArrayRef, + ), + ]); + + let iceberg_struct_type = StructType::new(vec![ + Arc::new(NestedField::optional( + 0, + "bool_field", + Type::Primitive(PrimitiveType::Boolean), + )), + Arc::new(NestedField::optional( + 1, + "int16_field", + Type::Primitive(PrimitiveType::Int), + )), + Arc::new(NestedField::optional( + 2, + "int32_field", + Type::Primitive(PrimitiveType::Int), + )), + Arc::new(NestedField::optional( + 3, + "int64_field", + Type::Primitive(PrimitiveType::Long), + )), + Arc::new(NestedField::optional( + 4, + "float32_field", + Type::Primitive(PrimitiveType::Float), + )), + Arc::new(NestedField::optional( + 5, + "float64_field", + Type::Primitive(PrimitiveType::Double), + )), + Arc::new(NestedField::optional( + 6, + "decimal_field", + Type::Primitive(PrimitiveType::Decimal { + precision: 10, + scale: 2, + }), + )), + Arc::new(NestedField::optional( + 7, + "date_field", + Type::Primitive(PrimitiveType::Date), + )), + Arc::new(NestedField::optional( + 8, + "time_field", + Type::Primitive(PrimitiveType::Time), + )), + Arc::new(NestedField::optional( + 9, + "timestamp_micro_field", + Type::Primitive(PrimitiveType::Timestamp), + )), + Arc::new(NestedField::optional( + 10, + "timestamp_nao_field", + Type::Primitive(PrimitiveType::TimestampNs), + )), + Arc::new(NestedField::optional( + 11, + "string_field", + Type::Primitive(PrimitiveType::String), + )), + Arc::new(NestedField::optional( + 12, + "binary_field", + Type::Primitive(PrimitiveType::Binary), + )), + ]); + + let result = arrow_struct_to_iceberg_struct(&struct_array, 
iceberg_struct_type).unwrap(); + + assert_eq!(result, vec![ + Some(Literal::Struct(Struct::from_iter(vec![ + Some(Literal::bool(true)), + Some(Literal::int(1)), + Some(Literal::int(3)), + Some(Literal::long(5)), + Some(Literal::float(1.1)), + Some(Literal::double(3.3)), + Some(Literal::decimal(1000)), + Some(Literal::date(18628)), + Some(Literal::time(123456789)), + Some(Literal::timestamp(1622548800000000)), + Some(Literal::timestamp_nano(1622548800000000000)), + Some(Literal::string("a".to_string())), + Some(Literal::binary(b"abc".to_vec())), + ]))), + Some(Literal::Struct(Struct::from_iter(vec![ + Some(Literal::bool(false)), + Some(Literal::int(2)), + Some(Literal::int(4)), + Some(Literal::long(6)), + Some(Literal::float(2.2)), + Some(Literal::double(4.4)), + Some(Literal::decimal(2000)), + Some(Literal::date(18629)), + Some(Literal::time(987654321)), + Some(Literal::timestamp(1622635200000000)), + Some(Literal::timestamp_nano(1622635200000000000)), + Some(Literal::string("b".to_string())), + Some(Literal::binary(b"def".to_vec())), + ]))), + Some(Literal::Struct(Struct::from_iter(vec![ + None, None, None, None, None, None, None, None, None, None, None, None, None, + ]))), + ]); + } + + #[test] + fn test_single_column_nullable_struct() { + let struct_array = StructArray::new_null( + Fields::from(vec![Field::new("bool_field", DataType::Boolean, true)]), + 3, + ); + let iceberg_struct_type = StructType::new(vec![Arc::new(NestedField::optional( + 0, + "bool_field", + Type::Primitive(PrimitiveType::Boolean), + ))]); + let result = arrow_struct_to_iceberg_struct(&struct_array, iceberg_struct_type).unwrap(); + assert_eq!(result, vec![None; 3]); + } + + #[test] + fn test_empty_struct() { + let struct_array = StructArray::new_null(Fields::empty(), 3); + let iceberg_struct_type = StructType::new(vec![]); + let result = arrow_struct_to_iceberg_struct(&struct_array, iceberg_struct_type).unwrap(); + assert_eq!(result, vec![None; 0]); + } +} diff --git 
a/crates/iceberg/src/spec/values.rs b/crates/iceberg/src/spec/values.rs index 0dbd3ad5e1..1d464be591 100644 --- a/crates/iceberg/src/spec/values.rs +++ b/crates/iceberg/src/spec/values.rs @@ -1564,6 +1564,16 @@ impl Literal { Self::Primitive(PrimitiveLiteral::Long(value)) } + /// Creates a timestamp from unix epoch in nanoseconds. + pub fn timestamp_nano(value: i64) -> Self { + Self::Primitive(PrimitiveLiteral::Long(value)) + } + + /// Creates a timestamp with timezone from unix epoch in nanoseconds. + pub fn timestamptz_nano(value: i64) -> Self { + Self::Primitive(PrimitiveLiteral::Long(value)) + } + /// Creates a timestamp from [`DateTime`]. pub fn timestamp_from_datetime(dt: DateTime) -> Self { Self::timestamp(dt.with_timezone(&Utc).timestamp_micros()) From 81e34d22ca04083c92febc9c93b3eac7788e5612 Mon Sep 17 00:00:00 2001 From: ZENOTME Date: Wed, 4 Dec 2024 15:39:51 +0800 Subject: [PATCH 02/11] use visitor pattern --- crates/iceberg/src/arrow/value.rs | 714 ++++++++++++++---------------- 1 file changed, 327 insertions(+), 387 deletions(-) diff --git a/crates/iceberg/src/arrow/value.rs b/crates/iceberg/src/arrow/value.rs index 95712018be..0048826086 100644 --- a/crates/iceberg/src/arrow/value.rs +++ b/crates/iceberg/src/arrow/value.rs @@ -16,10 +16,9 @@ // under the License. 
use arrow_array::{ - Array, BinaryArray, BooleanArray, Date32Array, Decimal128Array, Float16Array, Float32Array, - Float64Array, Int16Array, Int32Array, Int64Array, LargeBinaryArray, LargeStringArray, - StringArray, StructArray, Time64MicrosecondArray, TimestampMicrosecondArray, - TimestampNanosecondArray, + Array, BinaryArray, BooleanArray, Date32Array, Decimal128Array, Float32Array, Float64Array, + Int16Array, Int32Array, Int64Array, LargeBinaryArray, LargeStringArray, NullArray, StringArray, + StructArray, Time64MicrosecondArray, TimestampMicrosecondArray, TimestampNanosecondArray, }; use arrow_schema::{DataType, TimeUnit}; use itertools::Itertools; @@ -27,194 +26,362 @@ use itertools::Itertools; use crate::spec::{Literal, PrimitiveType, Struct, StructType, Type}; use crate::{Error, ErrorKind, Result}; -trait ToIcebergLiteralArray { - fn to_primitive_literal_array( +trait ArrowArrayVistor { + type T; + fn null( &self, - _arrow_type: &DataType, - _iceberg_type: &PrimitiveType, - ) -> Result>>; - fn to_struct_literal_array( + array: &NullArray, + arrow_type: &DataType, + iceberg_type: &PrimitiveType, + ) -> Result>; + fn boolean( &self, - _arrow_type: &DataType, - _iceberg_type: &StructType, - ) -> Result>>; -} - -impl ToIcebergLiteralArray for BooleanArray { - fn to_primitive_literal_array( + array: &BooleanArray, + arrow_type: &DataType, + iceberg_type: &PrimitiveType, + ) -> Result>; + fn int16( &self, - _arrow_type: &DataType, + array: &Int16Array, + arrow_type: &DataType, iceberg_type: &PrimitiveType, - ) -> Result>> { - match iceberg_type { - PrimitiveType::Boolean => Ok(self.iter().map(|v| v.map(Literal::bool)).collect()), - _ => Err(Error::new( + ) -> Result>; + fn int32( + &self, + array: &Int32Array, + arrow_type: &DataType, + iceberg_type: &PrimitiveType, + ) -> Result>; + fn int64( + &self, + array: &Int64Array, + arrow_type: &DataType, + iceberg_type: &PrimitiveType, + ) -> Result>; + fn float( + &self, + array: &Float32Array, + arrow_type: &DataType, 
+ iceberg_type: &PrimitiveType, + ) -> Result>; + fn double( + &self, + array: &Float64Array, + arrow_type: &DataType, + iceberg_type: &PrimitiveType, + ) -> Result>; + fn decimal( + &self, + array: &Decimal128Array, + arrow_type: &DataType, + iceberg_type: &PrimitiveType, + ) -> Result>; + fn date( + &self, + array: &Date32Array, + arrow_type: &DataType, + iceberg_type: &PrimitiveType, + ) -> Result>; + fn time( + &self, + array: &Time64MicrosecondArray, + arrow_type: &DataType, + iceberg_type: &PrimitiveType, + ) -> Result>; + fn timestamp( + &self, + array: &TimestampMicrosecondArray, + arrow_type: &DataType, + iceberg_type: &PrimitiveType, + ) -> Result>; + fn timestamp_nano( + &self, + array: &TimestampNanosecondArray, + arrow_type: &DataType, + iceberg_type: &PrimitiveType, + ) -> Result>; + fn string( + &self, + array: &StringArray, + arrow_type: &DataType, + iceberg_type: &PrimitiveType, + ) -> Result>; + fn large_string( + &self, + array: &LargeStringArray, + arrow_type: &DataType, + iceberg_type: &PrimitiveType, + ) -> Result>; + fn binary( + &self, + array: &BinaryArray, + arrow_type: &DataType, + iceberg_type: &PrimitiveType, + ) -> Result>; + fn large_binary( + &self, + array: &LargeBinaryArray, + arrow_type: &DataType, + iceberg_type: &PrimitiveType, + ) -> Result>; + fn combine_struct( + &self, + array: &StructArray, + columns: Vec>, + ) -> Result>; + fn r#struct( + &self, + array: &StructArray, + arrow_type: &DataType, + iceberg_type: &StructType, + ) -> Result> { + let DataType::Struct(arrow_struct_fields) = arrow_type else { + return Err(Error::new( ErrorKind::DataInvalid, - format!( - "The type of arrow boolean array is not compatitable with iceberg type {}", - iceberg_type - ), - )), + "The type of arrow struct array is not a struct type", + )); + }; + + if array.columns().len() != iceberg_type.fields().len() + || arrow_struct_fields.len() != iceberg_type.fields().len() + { + return Err(Error::new( + ErrorKind::DataInvalid, + "The type of arrow 
struct array is not compatitable with iceberg struct type", + )); + } + + let mut columns = Vec::with_capacity(array.columns().len()); + + for ((array, arrow_type), iceberg_field) in array + .columns() + .iter() + .zip_eq(arrow_struct_fields.iter().map(|field| field.data_type())) + .zip_eq(iceberg_type.fields().iter()) + { + if array.is_nullable() == iceberg_field.required { + return Err(Error::new( + ErrorKind::DataInvalid, + "The nullable field of arrow struct array is not compatitable with iceberg type", + )); + } + match (arrow_type, iceberg_field.field_type.as_ref()) { + (DataType::Null, Type::Primitive(primitive_type)) => { + if iceberg_field.required { + return Err(Error::new( + ErrorKind::DataInvalid, + "column in arrow array should not be optional", + )); + } + let array = array.as_any().downcast_ref::().unwrap(); + columns.push(self.null(array, arrow_type, primitive_type)?); + } + (DataType::Boolean, Type::Primitive(primitive_type)) => { + let array = array.as_any().downcast_ref::().unwrap(); + columns.push(self.boolean(array, arrow_type, primitive_type)?); + } + (DataType::Int16, Type::Primitive(primitive_type)) => { + let array = array.as_any().downcast_ref::().unwrap(); + columns.push(self.int16(array, arrow_type, primitive_type)?); + } + (DataType::Int32, Type::Primitive(primitive_type)) => { + let array = array.as_any().downcast_ref::().unwrap(); + columns.push(self.int32(array, arrow_type, primitive_type)?); + } + (DataType::Int64, Type::Primitive(primitive_type)) => { + let array = array.as_any().downcast_ref::().unwrap(); + columns.push(self.int64(array, arrow_type, primitive_type)?); + } + (DataType::Float32, Type::Primitive(primitive_type)) => { + let array = array.as_any().downcast_ref::().unwrap(); + columns.push(self.float(array, arrow_type, primitive_type)?); + } + (DataType::Float64, Type::Primitive(primitive_type)) => { + let array = array.as_any().downcast_ref::().unwrap(); + columns.push(self.double(array, arrow_type, primitive_type)?); 
+ } + (DataType::Decimal128(_, _), Type::Primitive(primitive_type)) => { + let array = array.as_any().downcast_ref::().unwrap(); + columns.push(self.decimal(array, arrow_type, primitive_type)?); + } + (DataType::Date32, Type::Primitive(primitive_type)) => { + let array = array.as_any().downcast_ref::().unwrap(); + columns.push(self.date(array, arrow_type, primitive_type)?); + } + (DataType::Time64(TimeUnit::Microsecond), Type::Primitive(primitive_type)) => { + let array = array + .as_any() + .downcast_ref::() + .unwrap(); + columns.push(self.time(array, arrow_type, primitive_type)?); + } + ( + DataType::Timestamp(TimeUnit::Microsecond, _), + Type::Primitive(primitive_type), + ) => { + let array = array + .as_any() + .downcast_ref::() + .unwrap(); + columns.push(self.timestamp(array, arrow_type, primitive_type)?); + } + (DataType::Timestamp(TimeUnit::Nanosecond, _), Type::Primitive(primitive_type)) => { + let array = array + .as_any() + .downcast_ref::() + .unwrap(); + columns.push(self.timestamp_nano(array, arrow_type, primitive_type)?); + } + (DataType::Utf8, Type::Primitive(primitive_type)) => { + let array = array.as_any().downcast_ref::().unwrap(); + columns.push(self.string(array, arrow_type, primitive_type)?); + } + (DataType::LargeUtf8, Type::Primitive(primitive_type)) => { + let array = array.as_any().downcast_ref::().unwrap(); + columns.push(self.large_string(array, arrow_type, primitive_type)?); + } + (DataType::Binary, Type::Primitive(primitive_type)) => { + let array = array.as_any().downcast_ref::().unwrap(); + columns.push(self.binary(array, arrow_type, primitive_type)?); + } + (DataType::LargeBinary, Type::Primitive(primitive_type)) => { + let array = array.as_any().downcast_ref::().unwrap(); + columns.push(self.large_binary(array, arrow_type, primitive_type)?); + } + (DataType::Struct(_), Type::Struct(struct_type)) => { + let array = array.as_any().downcast_ref::().unwrap(); + columns.push(self.r#struct(array, arrow_type, struct_type)?); + } + 
(arrow_type, iceberg_field_type) => { + return Err(Error::new( + ErrorKind::FeatureUnsupported, + format!( + "Unsupported convert arrow type {} to iceberg type: {}", + arrow_type, iceberg_field_type + ), + )) + } + } } + + self.combine_struct(array, columns) } +} + +struct LiteralArrayVisitor; + +impl ArrowArrayVistor for LiteralArrayVisitor { + type T = Option; - fn to_struct_literal_array( + fn null( &self, + array: &NullArray, _arrow_type: &DataType, - _iceberg_type: &StructType, - ) -> Result>> { - unreachable!() + _iceberg_type: &PrimitiveType, + ) -> Result> { + Ok(vec![None; array.len()]) } -} -impl ToIcebergLiteralArray for Int16Array { - fn to_primitive_literal_array( + fn boolean( &self, + array: &BooleanArray, _arrow_type: &DataType, iceberg_type: &PrimitiveType, - ) -> Result>> { + ) -> Result> { match iceberg_type { - PrimitiveType::Int => Ok(self.iter().map(|v| v.map(Literal::int)).collect()), - PrimitiveType::Long => Ok(self.iter().map(|v| v.map(Literal::long)).collect()), + PrimitiveType::Boolean => Ok(array.iter().map(|v| v.map(Literal::bool)).collect()), _ => Err(Error::new( ErrorKind::DataInvalid, format!( - "The type of arrow int16 array is not compatitable with iceberg type {}", + "The type of arrow boolean array is not compatitable with iceberg type {}", iceberg_type ), )), } } - fn to_struct_literal_array( - &self, - _arrow_type: &DataType, - _iceberg_type: &StructType, - ) -> Result>> { - unreachable!() - } -} - -impl ToIcebergLiteralArray for Int32Array { - fn to_primitive_literal_array( + fn int16( &self, + array: &Int16Array, _arrow_type: &DataType, iceberg_type: &PrimitiveType, - ) -> Result>> { + ) -> Result> { match iceberg_type { - PrimitiveType::Int => Ok(self.iter().map(|v| v.map(Literal::int)).collect()), - PrimitiveType::Long => Ok(self.iter().map(|v| v.map(Literal::long)).collect()), + PrimitiveType::Int => Ok(array.iter().map(|v| v.map(Literal::int)).collect()), + PrimitiveType::Long => Ok(array.iter().map(|v| 
v.map(Literal::long)).collect()), _ => Err(Error::new( ErrorKind::DataInvalid, format!( - "The type of arrow int32 array is not compatitable with iceberg type {}", + "The type of arrow int16 array is not compatitable with iceberg type {}", iceberg_type ), )), } } - fn to_struct_literal_array( - &self, - _arrow_type: &DataType, - _iceberg_type: &StructType, - ) -> Result>> { - unreachable!() - } -} - -impl ToIcebergLiteralArray for Int64Array { - fn to_primitive_literal_array( + fn int32( &self, + array: &Int32Array, _arrow_type: &DataType, iceberg_type: &PrimitiveType, - ) -> Result>> { + ) -> Result> { match iceberg_type { - PrimitiveType::Long => Ok(self.iter().map(|v| v.map(Literal::long)).collect()), + PrimitiveType::Int => Ok(array.iter().map(|v| v.map(Literal::int)).collect()), + PrimitiveType::Long => Ok(array.iter().map(|v| v.map(Literal::long)).collect()), _ => Err(Error::new( ErrorKind::DataInvalid, format!( - "The type of arrow int64 array is not compatitable with iceberg type {}", + "The type of arrow int32 array is not compatitable with iceberg type {}", iceberg_type ), )), } } - fn to_struct_literal_array( - &self, - _arrow_type: &DataType, - _iceberg_type: &StructType, - ) -> Result>> { - unreachable!() - } -} - -impl ToIcebergLiteralArray for Float16Array { - fn to_primitive_literal_array( + fn int64( &self, + array: &Int64Array, _arrow_type: &DataType, iceberg_type: &PrimitiveType, - ) -> Result>> { + ) -> Result> { match iceberg_type { - PrimitiveType::Float => Ok(self - .iter() - .map(|v| v.map(|v| Literal::float(v.to_f32()))) - .collect()), + PrimitiveType::Long => Ok(array.iter().map(|v| v.map(Literal::long)).collect()), _ => Err(Error::new( ErrorKind::DataInvalid, format!( - "The type of arrow float16 array is not compatitable with iceberg type {}", + "The type of arrow int64 array is not compatitable with iceberg type {}", iceberg_type ), )), } } - fn to_struct_literal_array( - &self, - _arrow_type: &DataType, - _iceberg_type: &StructType, - 
) -> Result>> { - unreachable!() - } -} - -impl ToIcebergLiteralArray for Float32Array { - fn to_primitive_literal_array( + fn float( &self, + array: &Float32Array, _arrow_type: &DataType, iceberg_type: &PrimitiveType, - ) -> Result>> { + ) -> Result> { match iceberg_type { - PrimitiveType::Float => Ok(self.iter().map(|v| v.map(Literal::float)).collect()), + PrimitiveType::Float => Ok(array.iter().map(|v| v.map(Literal::float)).collect()), _ => Err(Error::new( ErrorKind::DataInvalid, format!( - "The type of arrow float32 array is not compatitable with iceberg type {}", + "The type of arrow float16 array is not compatitable with iceberg type {}", iceberg_type ), )), } } - fn to_struct_literal_array( - &self, - _arrow_type: &DataType, - _iceberg_type: &StructType, - ) -> Result>> { - unreachable!() - } -} - -impl ToIcebergLiteralArray for Float64Array { - fn to_primitive_literal_array( + fn double( &self, + array: &Float64Array, _arrow_type: &DataType, iceberg_type: &PrimitiveType, - ) -> Result>> { + ) -> Result> { match iceberg_type { - PrimitiveType::Double => Ok(self.iter().map(|v| v.map(Literal::double)).collect()), + PrimitiveType::Double => Ok(array.iter().map(|v| v.map(Literal::double)).collect()), _ => Err(Error::new( ErrorKind::DataInvalid, format!( @@ -225,21 +392,12 @@ impl ToIcebergLiteralArray for Float64Array { } } - fn to_struct_literal_array( - &self, - _arrow_type: &DataType, - _iceberg_type: &StructType, - ) -> Result>> { - unreachable!() - } -} - -impl ToIcebergLiteralArray for Decimal128Array { - fn to_primitive_literal_array( + fn decimal( &self, + array: &Decimal128Array, arrow_type: &DataType, iceberg_type: &PrimitiveType, - ) -> Result>> { + ) -> Result> { let DataType::Decimal128(arrow_precision, arrow_scale) = arrow_type else { unreachable!() }; @@ -254,7 +412,7 @@ impl ToIcebergLiteralArray for Decimal128Array { ), )); } - Ok(self.iter().map(|v| v.map(Literal::decimal)).collect()) + Ok(array.iter().map(|v| 
v.map(Literal::decimal)).collect()) } _ => Err(Error::new( ErrorKind::DataInvalid, @@ -266,23 +424,14 @@ impl ToIcebergLiteralArray for Decimal128Array { } } - fn to_struct_literal_array( - &self, - _arrow_type: &DataType, - _iceberg_type: &StructType, - ) -> Result>> { - unreachable!() - } -} - -impl ToIcebergLiteralArray for Date32Array { - fn to_primitive_literal_array( + fn date( &self, + array: &Date32Array, _arrow_type: &DataType, iceberg_type: &PrimitiveType, - ) -> Result>> { + ) -> Result> { match iceberg_type { - PrimitiveType::Date => Ok(self.iter().map(|v| v.map(Literal::date)).collect()), + PrimitiveType::Date => Ok(array.iter().map(|v| v.map(Literal::date)).collect()), _ => Err(Error::new( ErrorKind::DataInvalid, format!( @@ -293,23 +442,14 @@ impl ToIcebergLiteralArray for Date32Array { } } - fn to_struct_literal_array( - &self, - _arrow_type: &DataType, - _iceberg_type: &StructType, - ) -> Result>> { - unreachable!() - } -} - -impl ToIcebergLiteralArray for Time64MicrosecondArray { - fn to_primitive_literal_array( + fn time( &self, + array: &Time64MicrosecondArray, _arrow_type: &DataType, iceberg_type: &PrimitiveType, - ) -> Result>> { + ) -> Result> { match iceberg_type { - PrimitiveType::Time => Ok(self + PrimitiveType::Time => Ok(array .iter() .map(|v| v.map(Literal::time)) .collect()), @@ -323,27 +463,18 @@ impl ToIcebergLiteralArray for Time64MicrosecondArray { } } - fn to_struct_literal_array( - &self, - _arrow_type: &DataType, - _iceberg_type: &StructType, - ) -> Result>> { - unreachable!() - } -} - -impl ToIcebergLiteralArray for TimestampMicrosecondArray { - fn to_primitive_literal_array( + fn timestamp( &self, + array: &TimestampMicrosecondArray, _arrow_type: &DataType, iceberg_type: &PrimitiveType, - ) -> Result>> { + ) -> Result> { match iceberg_type { - PrimitiveType::Timestamp => Ok(self + PrimitiveType::Timestamp => Ok(array .iter() .map(|v| v.map(Literal::timestamp)) .collect()), - PrimitiveType::Timestamptz => Ok(self + 
PrimitiveType::Timestamptz => Ok(array .iter() .map(|v| v.map(Literal::timestamptz)) .collect()), @@ -357,27 +488,18 @@ impl ToIcebergLiteralArray for TimestampMicrosecondArray { } } - fn to_struct_literal_array( - &self, - _arrow_type: &DataType, - _iceberg_type: &StructType, - ) -> Result>> { - unreachable!() - } -} - -impl ToIcebergLiteralArray for TimestampNanosecondArray { - fn to_primitive_literal_array( + fn timestamp_nano( &self, + array: &TimestampNanosecondArray, _arrow_type: &DataType, iceberg_type: &PrimitiveType, - ) -> Result>> { + ) -> Result> { match iceberg_type { - PrimitiveType::TimestampNs => Ok(self + PrimitiveType::TimestampNs => Ok(array .iter() .map(|v| v.map(Literal::timestamp_nano)) .collect()), - PrimitiveType::TimestamptzNs => Ok(self + PrimitiveType::TimestamptzNs => Ok(array .iter() .map(|v| v.map(Literal::timestamptz_nano)) .collect()), @@ -391,23 +513,14 @@ impl ToIcebergLiteralArray for TimestampNanosecondArray { } } - fn to_struct_literal_array( - &self, - _arrow_type: &DataType, - _iceberg_type: &StructType, - ) -> Result>> { - unreachable!() - } -} - -impl ToIcebergLiteralArray for StringArray { - fn to_primitive_literal_array( + fn string( &self, + array: &StringArray, _arrow_type: &DataType, iceberg_type: &PrimitiveType, - ) -> Result>> { + ) -> Result> { match iceberg_type { - PrimitiveType::String => Ok(self.iter().map(|v| v.map(Literal::string)).collect()), + PrimitiveType::String => Ok(array.iter().map(|v| v.map(Literal::string)).collect()), _ => Err(Error::new( ErrorKind::DataInvalid, format!( @@ -418,23 +531,14 @@ impl ToIcebergLiteralArray for StringArray { } } - fn to_struct_literal_array( - &self, - _arrow_type: &DataType, - _iceberg_type: &StructType, - ) -> Result>> { - unreachable!() - } -} - -impl ToIcebergLiteralArray for LargeStringArray { - fn to_primitive_literal_array( + fn large_string( &self, + array: &LargeStringArray, _arrow_type: &DataType, iceberg_type: &PrimitiveType, - ) -> Result>> { + ) -> Result> { 
match iceberg_type { - PrimitiveType::String => Ok(self.iter().map(|v| v.map(Literal::string)).collect()), + PrimitiveType::String => Ok(array.iter().map(|v| v.map(Literal::string)).collect()), _ => Err(Error::new( ErrorKind::DataInvalid, format!( @@ -445,23 +549,14 @@ impl ToIcebergLiteralArray for LargeStringArray { } } - fn to_struct_literal_array( - &self, - _arrow_type: &DataType, - _iceberg_type: &StructType, - ) -> Result>> { - unreachable!() - } -} - -impl ToIcebergLiteralArray for BinaryArray { - fn to_primitive_literal_array( + fn binary( &self, + array: &BinaryArray, _arrow_type: &DataType, iceberg_type: &PrimitiveType, - ) -> Result>> { + ) -> Result> { match iceberg_type { - PrimitiveType::Binary => Ok(self + PrimitiveType::Binary => Ok(array .iter() .map(|v| v.map(|v| Literal::binary(v.to_vec()))) .collect()), @@ -475,187 +570,32 @@ impl ToIcebergLiteralArray for BinaryArray { } } - fn to_struct_literal_array( - &self, - _arrow_type: &DataType, - _iceberg_type: &StructType, - ) -> Result>> { - unreachable!() - } -} - -impl ToIcebergLiteralArray for LargeBinaryArray { - fn to_primitive_literal_array( + fn large_binary( &self, + array: &LargeBinaryArray, _arrow_type: &DataType, iceberg_type: &PrimitiveType, - ) -> Result>> { + ) -> Result> { match iceberg_type { - PrimitiveType::Binary => Ok(self + PrimitiveType::Binary => Ok(array .iter() .map(|v| v.map(|v| Literal::binary(v.to_vec()))) .collect()), _ => Err(Error::new( ErrorKind::DataInvalid, format!( - "The type of arrow large binary array is not compatitable with iceberg type {}", + "The type of arrow binary array is not compatitable with iceberg type {}", iceberg_type ), )), } } - fn to_struct_literal_array( - &self, - _arrow_type: &DataType, - _iceberg_type: &StructType, - ) -> Result>> { - unreachable!() - } -} - -impl ToIcebergLiteralArray for StructArray { - fn to_primitive_literal_array( - &self, - _arrow_type: &DataType, - _iceberg_type: &PrimitiveType, - ) -> Result>> { - unreachable!() - } 
- - fn to_struct_literal_array( + fn combine_struct( &self, - arrow_type: &DataType, - iceberg_type: &StructType, - ) -> Result>> { - let DataType::Struct(arrow_struct_fields) = arrow_type else { - return Err(Error::new( - ErrorKind::DataInvalid, - "The type of arrow struct array is not a struct type", - )); - }; - - if self.columns().len() != iceberg_type.fields().len() - || arrow_struct_fields.len() != iceberg_type.fields().len() - { - return Err(Error::new( - ErrorKind::DataInvalid, - "The type of arrow struct array is not compatitable with iceberg struct type", - )); - } - - let mut columns = Vec::with_capacity(self.columns().len()); - - for ((array, arrow_type), iceberg_field) in self - .columns() - .iter() - .zip_eq(arrow_struct_fields.iter().map(|field| field.data_type())) - .zip_eq(iceberg_type.fields().iter()) - { - if array.is_nullable() == iceberg_field.required { - return Err(Error::new( - ErrorKind::DataInvalid, - "The nullable field of arrow struct array is not compatitable with iceberg type", - )); - } - match (arrow_type, iceberg_field.field_type.as_ref()) { - (DataType::Null, _) => { - if iceberg_field.required { - return Err(Error::new( - ErrorKind::DataInvalid, - "column in arrow array should not be optional", - )); - } - columns.push(vec![None; array.len()]); - } - (DataType::Boolean, Type::Primitive(primitive_type)) => { - let array = array.as_any().downcast_ref::().unwrap(); - columns.push(array.to_primitive_literal_array(arrow_type, primitive_type)?); - } - (DataType::Int16, Type::Primitive(primitive_type)) => { - let array = array.as_any().downcast_ref::().unwrap(); - columns.push(array.to_primitive_literal_array(arrow_type, primitive_type)?); - } - (DataType::Int32, Type::Primitive(primitive_type)) => { - let array = array.as_any().downcast_ref::().unwrap(); - columns.push(array.to_primitive_literal_array(arrow_type, primitive_type)?); - } - (DataType::Int64, Type::Primitive(primitive_type)) => { - let array = 
array.as_any().downcast_ref::().unwrap(); - columns.push(array.to_primitive_literal_array(arrow_type, primitive_type)?); - } - (DataType::Float32, Type::Primitive(primitive_type)) => { - let array = array.as_any().downcast_ref::().unwrap(); - columns.push(array.to_primitive_literal_array(arrow_type, primitive_type)?); - } - (DataType::Float64, Type::Primitive(primitive_type)) => { - let array = array.as_any().downcast_ref::().unwrap(); - columns.push(array.to_primitive_literal_array(arrow_type, primitive_type)?); - } - (DataType::Decimal128(_, _), Type::Primitive(primitive_type)) => { - let array = array.as_any().downcast_ref::().unwrap(); - columns.push(array.to_primitive_literal_array(arrow_type, primitive_type)?); - } - (DataType::Date32, Type::Primitive(primitive_type)) => { - let array = array.as_any().downcast_ref::().unwrap(); - columns.push(array.to_primitive_literal_array(arrow_type, primitive_type)?); - } - (DataType::Time64(TimeUnit::Microsecond), Type::Primitive(primitive_type)) => { - let array = array - .as_any() - .downcast_ref::() - .unwrap(); - columns.push(array.to_primitive_literal_array(arrow_type, primitive_type)?); - } - ( - DataType::Timestamp(TimeUnit::Microsecond, _), - Type::Primitive(primitive_type), - ) => { - let array = array - .as_any() - .downcast_ref::() - .unwrap(); - columns.push(array.to_primitive_literal_array(arrow_type, primitive_type)?); - } - (DataType::Timestamp(TimeUnit::Nanosecond, _), Type::Primitive(primitive_type)) => { - let array = array - .as_any() - .downcast_ref::() - .unwrap(); - columns.push(array.to_primitive_literal_array(arrow_type, primitive_type)?); - } - (DataType::Utf8, Type::Primitive(primitive_type)) => { - let array = array.as_any().downcast_ref::().unwrap(); - columns.push(array.to_primitive_literal_array(arrow_type, primitive_type)?); - } - (DataType::LargeUtf8, Type::Primitive(primitive_type)) => { - let array = array.as_any().downcast_ref::().unwrap(); - 
columns.push(array.to_primitive_literal_array(arrow_type, primitive_type)?); - } - (DataType::Binary, Type::Primitive(primitive_type)) => { - let array = array.as_any().downcast_ref::().unwrap(); - columns.push(array.to_primitive_literal_array(arrow_type, primitive_type)?); - } - (DataType::LargeBinary, Type::Primitive(primitive_type)) => { - let array = array.as_any().downcast_ref::().unwrap(); - columns.push(array.to_primitive_literal_array(arrow_type, primitive_type)?); - } - (DataType::Struct(_), Type::Struct(struct_type)) => { - let array = array.as_any().downcast_ref::().unwrap(); - columns.push(array.to_struct_literal_array(arrow_type, struct_type)?); - } - (arrow_type, iceberg_field_type) => { - return Err(Error::new( - ErrorKind::FeatureUnsupported, - format!( - "Unsupported convert arrow type {} to iceberg type: {}", - arrow_type, iceberg_field_type - ), - )) - } - } - } - + array: &StructArray, + columns: Vec>, + ) -> Result> { let struct_literal_len = columns.first().map(|column| column.len()).unwrap_or(0); let mut struct_literals = Vec::with_capacity(struct_literal_len); let mut columns_iters = columns @@ -664,7 +604,7 @@ impl ToIcebergLiteralArray for StructArray { .collect::>(); for row_idx in 0..struct_literal_len { - if self.is_null(row_idx) { + if array.is_null(row_idx) { struct_literals.push(None); continue; } @@ -680,11 +620,11 @@ impl ToIcebergLiteralArray for StructArray { } /// Convert arrow struct array to iceberg struct value array. 
-pub fn arrow_struct_to_iceberg_struct( +pub fn arrow_struct_to_literal( struct_array: &StructArray, ty: StructType, ) -> Result>> { - struct_array.to_struct_literal_array(struct_array.data_type(), &ty) + LiteralArrayVisitor.r#struct(struct_array, struct_array.data_type(), &ty) } #[cfg(test)] @@ -870,7 +810,7 @@ mod test { )), ]); - let result = arrow_struct_to_iceberg_struct(&struct_array, iceberg_struct_type).unwrap(); + let result = arrow_struct_to_literal(&struct_array, iceberg_struct_type).unwrap(); assert_eq!(result, vec![ Some(Literal::Struct(Struct::from_iter(vec![ @@ -920,7 +860,7 @@ mod test { "bool_field", Type::Primitive(PrimitiveType::Boolean), ))]); - let result = arrow_struct_to_iceberg_struct(&struct_array, iceberg_struct_type).unwrap(); + let result = arrow_struct_to_literal(&struct_array, iceberg_struct_type).unwrap(); assert_eq!(result, vec![None; 3]); } @@ -928,7 +868,7 @@ mod test { fn test_empty_struct() { let struct_array = StructArray::new_null(Fields::empty(), 3); let iceberg_struct_type = StructType::new(vec![]); - let result = arrow_struct_to_iceberg_struct(&struct_array, iceberg_struct_type).unwrap(); + let result = arrow_struct_to_literal(&struct_array, iceberg_struct_type).unwrap(); assert_eq!(result, vec![None; 0]); } } From 44fd1e0b6ca8d24fd41bdec70363c48d81df2e35 Mon Sep 17 00:00:00 2001 From: ZENOTME Date: Wed, 11 Dec 2024 12:21:52 +0800 Subject: [PATCH 03/11] refine visitor design --- crates/iceberg/src/arrow/value.rs | 686 ++++++++++++++++++------------ 1 file changed, 411 insertions(+), 275 deletions(-) diff --git a/crates/iceberg/src/arrow/value.rs b/crates/iceberg/src/arrow/value.rs index 0048826086..c1e075e781 100644 --- a/crates/iceberg/src/arrow/value.rs +++ b/crates/iceberg/src/arrow/value.rs @@ -22,272 +22,72 @@ use arrow_array::{ }; use arrow_schema::{DataType, TimeUnit}; use itertools::Itertools; +use parquet::arrow::PARQUET_FIELD_ID_META_KEY; use crate::spec::{Literal, PrimitiveType, Struct, StructType, Type}; use 
crate::{Error, ErrorKind, Result}; +/// A post order arrow array visitor. trait ArrowArrayVistor { type T; - fn null( - &self, - array: &NullArray, - arrow_type: &DataType, - iceberg_type: &PrimitiveType, - ) -> Result>; - fn boolean( - &self, - array: &BooleanArray, - arrow_type: &DataType, - iceberg_type: &PrimitiveType, - ) -> Result>; - fn int16( - &self, - array: &Int16Array, - arrow_type: &DataType, - iceberg_type: &PrimitiveType, - ) -> Result>; - fn int32( - &self, - array: &Int32Array, - arrow_type: &DataType, - iceberg_type: &PrimitiveType, - ) -> Result>; - fn int64( - &self, - array: &Int64Array, - arrow_type: &DataType, - iceberg_type: &PrimitiveType, - ) -> Result>; - fn float( - &self, - array: &Float32Array, - arrow_type: &DataType, - iceberg_type: &PrimitiveType, - ) -> Result>; - fn double( - &self, - array: &Float64Array, - arrow_type: &DataType, - iceberg_type: &PrimitiveType, - ) -> Result>; + fn null(&self, array: &NullArray, iceberg_type: &PrimitiveType) -> Result>; + fn boolean(&self, array: &BooleanArray, iceberg_type: &PrimitiveType) -> Result>; + fn int16(&self, array: &Int16Array, iceberg_type: &PrimitiveType) -> Result>; + fn int32(&self, array: &Int32Array, iceberg_type: &PrimitiveType) -> Result>; + fn int64(&self, array: &Int64Array, iceberg_type: &PrimitiveType) -> Result>; + fn float(&self, array: &Float32Array, iceberg_type: &PrimitiveType) -> Result>; + fn double(&self, array: &Float64Array, iceberg_type: &PrimitiveType) -> Result>; fn decimal( &self, array: &Decimal128Array, - arrow_type: &DataType, - iceberg_type: &PrimitiveType, - ) -> Result>; - fn date( - &self, - array: &Date32Array, - arrow_type: &DataType, iceberg_type: &PrimitiveType, ) -> Result>; + fn date(&self, array: &Date32Array, iceberg_type: &PrimitiveType) -> Result>; fn time( &self, array: &Time64MicrosecondArray, - arrow_type: &DataType, iceberg_type: &PrimitiveType, ) -> Result>; fn timestamp( &self, array: &TimestampMicrosecondArray, - arrow_type: &DataType, 
iceberg_type: &PrimitiveType, ) -> Result>; fn timestamp_nano( &self, array: &TimestampNanosecondArray, - arrow_type: &DataType, - iceberg_type: &PrimitiveType, - ) -> Result>; - fn string( - &self, - array: &StringArray, - arrow_type: &DataType, iceberg_type: &PrimitiveType, ) -> Result>; + fn string(&self, array: &StringArray, iceberg_type: &PrimitiveType) -> Result>; fn large_string( &self, array: &LargeStringArray, - arrow_type: &DataType, - iceberg_type: &PrimitiveType, - ) -> Result>; - fn binary( - &self, - array: &BinaryArray, - arrow_type: &DataType, iceberg_type: &PrimitiveType, ) -> Result>; + fn binary(&self, array: &BinaryArray, iceberg_type: &PrimitiveType) -> Result>; fn large_binary( &self, array: &LargeBinaryArray, - arrow_type: &DataType, iceberg_type: &PrimitiveType, ) -> Result>; - fn combine_struct( - &self, - array: &StructArray, - columns: Vec>, - ) -> Result>; fn r#struct( &self, array: &StructArray, - arrow_type: &DataType, iceberg_type: &StructType, - ) -> Result> { - let DataType::Struct(arrow_struct_fields) = arrow_type else { - return Err(Error::new( - ErrorKind::DataInvalid, - "The type of arrow struct array is not a struct type", - )); - }; - - if array.columns().len() != iceberg_type.fields().len() - || arrow_struct_fields.len() != iceberg_type.fields().len() - { - return Err(Error::new( - ErrorKind::DataInvalid, - "The type of arrow struct array is not compatitable with iceberg struct type", - )); - } - - let mut columns = Vec::with_capacity(array.columns().len()); - - for ((array, arrow_type), iceberg_field) in array - .columns() - .iter() - .zip_eq(arrow_struct_fields.iter().map(|field| field.data_type())) - .zip_eq(iceberg_type.fields().iter()) - { - if array.is_nullable() == iceberg_field.required { - return Err(Error::new( - ErrorKind::DataInvalid, - "The nullable field of arrow struct array is not compatitable with iceberg type", - )); - } - match (arrow_type, iceberg_field.field_type.as_ref()) { - (DataType::Null, 
Type::Primitive(primitive_type)) => { - if iceberg_field.required { - return Err(Error::new( - ErrorKind::DataInvalid, - "column in arrow array should not be optional", - )); - } - let array = array.as_any().downcast_ref::().unwrap(); - columns.push(self.null(array, arrow_type, primitive_type)?); - } - (DataType::Boolean, Type::Primitive(primitive_type)) => { - let array = array.as_any().downcast_ref::().unwrap(); - columns.push(self.boolean(array, arrow_type, primitive_type)?); - } - (DataType::Int16, Type::Primitive(primitive_type)) => { - let array = array.as_any().downcast_ref::().unwrap(); - columns.push(self.int16(array, arrow_type, primitive_type)?); - } - (DataType::Int32, Type::Primitive(primitive_type)) => { - let array = array.as_any().downcast_ref::().unwrap(); - columns.push(self.int32(array, arrow_type, primitive_type)?); - } - (DataType::Int64, Type::Primitive(primitive_type)) => { - let array = array.as_any().downcast_ref::().unwrap(); - columns.push(self.int64(array, arrow_type, primitive_type)?); - } - (DataType::Float32, Type::Primitive(primitive_type)) => { - let array = array.as_any().downcast_ref::().unwrap(); - columns.push(self.float(array, arrow_type, primitive_type)?); - } - (DataType::Float64, Type::Primitive(primitive_type)) => { - let array = array.as_any().downcast_ref::().unwrap(); - columns.push(self.double(array, arrow_type, primitive_type)?); - } - (DataType::Decimal128(_, _), Type::Primitive(primitive_type)) => { - let array = array.as_any().downcast_ref::().unwrap(); - columns.push(self.decimal(array, arrow_type, primitive_type)?); - } - (DataType::Date32, Type::Primitive(primitive_type)) => { - let array = array.as_any().downcast_ref::().unwrap(); - columns.push(self.date(array, arrow_type, primitive_type)?); - } - (DataType::Time64(TimeUnit::Microsecond), Type::Primitive(primitive_type)) => { - let array = array - .as_any() - .downcast_ref::() - .unwrap(); - columns.push(self.time(array, arrow_type, primitive_type)?); - } - ( - 
DataType::Timestamp(TimeUnit::Microsecond, _), - Type::Primitive(primitive_type), - ) => { - let array = array - .as_any() - .downcast_ref::() - .unwrap(); - columns.push(self.timestamp(array, arrow_type, primitive_type)?); - } - (DataType::Timestamp(TimeUnit::Nanosecond, _), Type::Primitive(primitive_type)) => { - let array = array - .as_any() - .downcast_ref::() - .unwrap(); - columns.push(self.timestamp_nano(array, arrow_type, primitive_type)?); - } - (DataType::Utf8, Type::Primitive(primitive_type)) => { - let array = array.as_any().downcast_ref::().unwrap(); - columns.push(self.string(array, arrow_type, primitive_type)?); - } - (DataType::LargeUtf8, Type::Primitive(primitive_type)) => { - let array = array.as_any().downcast_ref::().unwrap(); - columns.push(self.large_string(array, arrow_type, primitive_type)?); - } - (DataType::Binary, Type::Primitive(primitive_type)) => { - let array = array.as_any().downcast_ref::().unwrap(); - columns.push(self.binary(array, arrow_type, primitive_type)?); - } - (DataType::LargeBinary, Type::Primitive(primitive_type)) => { - let array = array.as_any().downcast_ref::().unwrap(); - columns.push(self.large_binary(array, arrow_type, primitive_type)?); - } - (DataType::Struct(_), Type::Struct(struct_type)) => { - let array = array.as_any().downcast_ref::().unwrap(); - columns.push(self.r#struct(array, arrow_type, struct_type)?); - } - (arrow_type, iceberg_field_type) => { - return Err(Error::new( - ErrorKind::FeatureUnsupported, - format!( - "Unsupported convert arrow type {} to iceberg type: {}", - arrow_type, iceberg_field_type - ), - )) - } - } - } - - self.combine_struct(array, columns) - } + childs: Vec>, + ) -> Result>; } -struct LiteralArrayVisitor; +struct ArrowArrayConvert; -impl ArrowArrayVistor for LiteralArrayVisitor { +impl ArrowArrayVistor for ArrowArrayConvert { type T = Option; - fn null( - &self, - array: &NullArray, - _arrow_type: &DataType, - _iceberg_type: &PrimitiveType, - ) -> Result> { + fn null(&self, 
array: &NullArray, _iceberg_type: &PrimitiveType) -> Result> { Ok(vec![None; array.len()]) } - fn boolean( - &self, - array: &BooleanArray, - _arrow_type: &DataType, - iceberg_type: &PrimitiveType, - ) -> Result> { + fn boolean(&self, array: &BooleanArray, iceberg_type: &PrimitiveType) -> Result> { match iceberg_type { PrimitiveType::Boolean => Ok(array.iter().map(|v| v.map(Literal::bool)).collect()), _ => Err(Error::new( @@ -300,12 +100,7 @@ impl ArrowArrayVistor for LiteralArrayVisitor { } } - fn int16( - &self, - array: &Int16Array, - _arrow_type: &DataType, - iceberg_type: &PrimitiveType, - ) -> Result> { + fn int16(&self, array: &Int16Array, iceberg_type: &PrimitiveType) -> Result> { match iceberg_type { PrimitiveType::Int => Ok(array.iter().map(|v| v.map(Literal::int)).collect()), PrimitiveType::Long => Ok(array.iter().map(|v| v.map(Literal::long)).collect()), @@ -319,12 +114,7 @@ impl ArrowArrayVistor for LiteralArrayVisitor { } } - fn int32( - &self, - array: &Int32Array, - _arrow_type: &DataType, - iceberg_type: &PrimitiveType, - ) -> Result> { + fn int32(&self, array: &Int32Array, iceberg_type: &PrimitiveType) -> Result> { match iceberg_type { PrimitiveType::Int => Ok(array.iter().map(|v| v.map(Literal::int)).collect()), PrimitiveType::Long => Ok(array.iter().map(|v| v.map(Literal::long)).collect()), @@ -338,12 +128,7 @@ impl ArrowArrayVistor for LiteralArrayVisitor { } } - fn int64( - &self, - array: &Int64Array, - _arrow_type: &DataType, - iceberg_type: &PrimitiveType, - ) -> Result> { + fn int64(&self, array: &Int64Array, iceberg_type: &PrimitiveType) -> Result> { match iceberg_type { PrimitiveType::Long => Ok(array.iter().map(|v| v.map(Literal::long)).collect()), _ => Err(Error::new( @@ -356,12 +141,7 @@ impl ArrowArrayVistor for LiteralArrayVisitor { } } - fn float( - &self, - array: &Float32Array, - _arrow_type: &DataType, - iceberg_type: &PrimitiveType, - ) -> Result> { + fn float(&self, array: &Float32Array, iceberg_type: &PrimitiveType) -> 
Result> { match iceberg_type { PrimitiveType::Float => Ok(array.iter().map(|v| v.map(Literal::float)).collect()), _ => Err(Error::new( @@ -374,12 +154,7 @@ impl ArrowArrayVistor for LiteralArrayVisitor { } } - fn double( - &self, - array: &Float64Array, - _arrow_type: &DataType, - iceberg_type: &PrimitiveType, - ) -> Result> { + fn double(&self, array: &Float64Array, iceberg_type: &PrimitiveType) -> Result> { match iceberg_type { PrimitiveType::Double => Ok(array.iter().map(|v| v.map(Literal::double)).collect()), _ => Err(Error::new( @@ -395,10 +170,9 @@ impl ArrowArrayVistor for LiteralArrayVisitor { fn decimal( &self, array: &Decimal128Array, - arrow_type: &DataType, iceberg_type: &PrimitiveType, ) -> Result> { - let DataType::Decimal128(arrow_precision, arrow_scale) = arrow_type else { + let DataType::Decimal128(arrow_precision, arrow_scale) = array.data_type() else { unreachable!() }; match iceberg_type { @@ -424,12 +198,7 @@ impl ArrowArrayVistor for LiteralArrayVisitor { } } - fn date( - &self, - array: &Date32Array, - _arrow_type: &DataType, - iceberg_type: &PrimitiveType, - ) -> Result> { + fn date(&self, array: &Date32Array, iceberg_type: &PrimitiveType) -> Result> { match iceberg_type { PrimitiveType::Date => Ok(array.iter().map(|v| v.map(Literal::date)).collect()), _ => Err(Error::new( @@ -445,7 +214,6 @@ impl ArrowArrayVistor for LiteralArrayVisitor { fn time( &self, array: &Time64MicrosecondArray, - _arrow_type: &DataType, iceberg_type: &PrimitiveType, ) -> Result> { match iceberg_type { @@ -466,7 +234,6 @@ impl ArrowArrayVistor for LiteralArrayVisitor { fn timestamp( &self, array: &TimestampMicrosecondArray, - _arrow_type: &DataType, iceberg_type: &PrimitiveType, ) -> Result> { match iceberg_type { @@ -491,7 +258,6 @@ impl ArrowArrayVistor for LiteralArrayVisitor { fn timestamp_nano( &self, array: &TimestampNanosecondArray, - _arrow_type: &DataType, iceberg_type: &PrimitiveType, ) -> Result> { match iceberg_type { @@ -513,12 +279,7 @@ impl 
ArrowArrayVistor for LiteralArrayVisitor { } } - fn string( - &self, - array: &StringArray, - _arrow_type: &DataType, - iceberg_type: &PrimitiveType, - ) -> Result> { + fn string(&self, array: &StringArray, iceberg_type: &PrimitiveType) -> Result> { match iceberg_type { PrimitiveType::String => Ok(array.iter().map(|v| v.map(Literal::string)).collect()), _ => Err(Error::new( @@ -534,7 +295,6 @@ impl ArrowArrayVistor for LiteralArrayVisitor { fn large_string( &self, array: &LargeStringArray, - _arrow_type: &DataType, iceberg_type: &PrimitiveType, ) -> Result> { match iceberg_type { @@ -549,12 +309,7 @@ impl ArrowArrayVistor for LiteralArrayVisitor { } } - fn binary( - &self, - array: &BinaryArray, - _arrow_type: &DataType, - iceberg_type: &PrimitiveType, - ) -> Result> { + fn binary(&self, array: &BinaryArray, iceberg_type: &PrimitiveType) -> Result> { match iceberg_type { PrimitiveType::Binary => Ok(array .iter() @@ -573,7 +328,6 @@ impl ArrowArrayVistor for LiteralArrayVisitor { fn large_binary( &self, array: &LargeBinaryArray, - _arrow_type: &DataType, iceberg_type: &PrimitiveType, ) -> Result> { match iceberg_type { @@ -591,9 +345,10 @@ impl ArrowArrayVistor for LiteralArrayVisitor { } } - fn combine_struct( + fn r#struct( &self, array: &StructArray, + _iceberg_type: &StructType, columns: Vec>, ) -> Result> { let struct_literal_len = columns.first().map(|column| column.len()).unwrap_or(0); @@ -619,16 +374,306 @@ impl ArrowArrayVistor for LiteralArrayVisitor { } } +fn visit_arrow_struct_array( + array: &StructArray, + iceberg_type: &StructType, + visitor: &V, +) -> Result> { + let DataType::Struct(arrow_struct_fields) = array.data_type() else { + return Err(Error::new( + ErrorKind::DataInvalid, + "The type of arrow struct array is not a struct type", + )); + }; + + if array.columns().len() != iceberg_type.fields().len() + || arrow_struct_fields.len() != iceberg_type.fields().len() + { + return Err(Error::new( + ErrorKind::DataInvalid, + "The type of arrow struct 
array is not compatitable with iceberg struct type", + )); + } + + let mut columns = Vec::with_capacity(array.columns().len()); + + for ((array, arrow_type), iceberg_field) in array + .columns() + .iter() + .zip_eq(arrow_struct_fields.iter().map(|field| field.data_type())) + .zip_eq(iceberg_type.fields().iter()) + { + if array.is_nullable() == iceberg_field.required { + return Err(Error::new( + ErrorKind::DataInvalid, + "The nullable field of arrow struct array is not compatitable with iceberg type", + )); + } + match (arrow_type, iceberg_field.field_type.as_ref()) { + (DataType::Null, Type::Primitive(primitive_type)) => { + if iceberg_field.required { + return Err(Error::new( + ErrorKind::DataInvalid, + "column in arrow array should not be optional", + )); + } + let array = array.as_any().downcast_ref::().unwrap(); + columns.push(visitor.null(array, primitive_type)?); + } + (DataType::Boolean, Type::Primitive(primitive_type)) => { + let array = array.as_any().downcast_ref::().unwrap(); + columns.push(visitor.boolean(array, primitive_type)?); + } + (DataType::Int16, Type::Primitive(primitive_type)) => { + let array = array.as_any().downcast_ref::().unwrap(); + columns.push(visitor.int16(array, primitive_type)?); + } + (DataType::Int32, Type::Primitive(primitive_type)) => { + let array = array.as_any().downcast_ref::().unwrap(); + columns.push(visitor.int32(array, primitive_type)?); + } + (DataType::Int64, Type::Primitive(primitive_type)) => { + let array = array.as_any().downcast_ref::().unwrap(); + columns.push(visitor.int64(array, primitive_type)?); + } + (DataType::Float32, Type::Primitive(primitive_type)) => { + let array = array.as_any().downcast_ref::().unwrap(); + columns.push(visitor.float(array, primitive_type)?); + } + (DataType::Float64, Type::Primitive(primitive_type)) => { + let array = array.as_any().downcast_ref::().unwrap(); + columns.push(visitor.double(array, primitive_type)?); + } + (DataType::Decimal128(_, _), Type::Primitive(primitive_type)) => 
{ + let array = array.as_any().downcast_ref::().unwrap(); + columns.push(visitor.decimal(array, primitive_type)?); + } + (DataType::Date32, Type::Primitive(primitive_type)) => { + let array = array.as_any().downcast_ref::().unwrap(); + columns.push(visitor.date(array, primitive_type)?); + } + (DataType::Time64(TimeUnit::Microsecond), Type::Primitive(primitive_type)) => { + let array = array + .as_any() + .downcast_ref::() + .unwrap(); + columns.push(visitor.time(array, primitive_type)?); + } + (DataType::Timestamp(TimeUnit::Microsecond, _), Type::Primitive(primitive_type)) => { + let array = array + .as_any() + .downcast_ref::() + .unwrap(); + columns.push(visitor.timestamp(array, primitive_type)?); + } + (DataType::Timestamp(TimeUnit::Nanosecond, _), Type::Primitive(primitive_type)) => { + let array = array + .as_any() + .downcast_ref::() + .unwrap(); + columns.push(visitor.timestamp_nano(array, primitive_type)?); + } + (DataType::Utf8, Type::Primitive(primitive_type)) => { + let array = array.as_any().downcast_ref::().unwrap(); + columns.push(visitor.string(array, primitive_type)?); + } + (DataType::LargeUtf8, Type::Primitive(primitive_type)) => { + let array = array.as_any().downcast_ref::().unwrap(); + columns.push(visitor.large_string(array, primitive_type)?); + } + (DataType::Binary, Type::Primitive(primitive_type)) => { + let array = array.as_any().downcast_ref::().unwrap(); + columns.push(visitor.binary(array, primitive_type)?); + } + (DataType::LargeBinary, Type::Primitive(primitive_type)) => { + let array = array.as_any().downcast_ref::().unwrap(); + columns.push(visitor.large_binary(array, primitive_type)?); + } + (DataType::Struct(_), Type::Struct(struct_type)) => { + let array = array.as_any().downcast_ref::().unwrap(); + columns.push(visit_arrow_struct_array(array, struct_type, visitor)?); + } + (arrow_type, iceberg_field_type) => { + return Err(Error::new( + ErrorKind::FeatureUnsupported, + format!( + "Unsupported convert arrow type {} to iceberg 
type: {}", + arrow_type, iceberg_field_type + ), + )) + } + } + } + + visitor.r#struct(array, iceberg_type, columns) +} + +fn visit_arrow_struct_array_from_field_id( + array: &StructArray, + iceberg_type: &StructType, + visitor: &V, +) -> Result> { + let DataType::Struct(arrow_struct_fields) = array.data_type() else { + return Err(Error::new( + ErrorKind::DataInvalid, + "The type of arrow struct array is not a struct type", + )); + }; + + if array.columns().len() < iceberg_type.fields().len() + || arrow_struct_fields.len() < iceberg_type.fields().len() + { + return Err(Error::new( + ErrorKind::DataInvalid, + "The type of arrow struct array is not compatitable with iceberg struct type", + )); + } + + let mut columns = Vec::with_capacity(array.columns().len()); + + for iceberg_field in iceberg_type.fields() { + let Some((idx, field)) = arrow_struct_fields.iter().enumerate().find(|(_idx, f)| { + f.metadata() + .get(PARQUET_FIELD_ID_META_KEY) + .and_then(|id| id.parse::().ok().map(|id: i32| id == iceberg_field.id)) + .unwrap_or(false) + }) else { + return Err(Error::new( + ErrorKind::DataInvalid, + format!( + "The field {} in iceberg struct type is not found in arrow struct type", + iceberg_field.name + ), + )); + }; + let array = array.column(idx); + let arrow_type = field.data_type(); + if array.is_nullable() == iceberg_field.required { + return Err(Error::new( + ErrorKind::DataInvalid, + "The nullable field of arrow struct array is not compatitable with iceberg type", + )); + } + match (arrow_type, iceberg_field.field_type.as_ref()) { + (DataType::Null, Type::Primitive(primitive_type)) => { + if iceberg_field.required { + return Err(Error::new( + ErrorKind::DataInvalid, + "column in arrow array should not be optional", + )); + } + let array = array.as_any().downcast_ref::().unwrap(); + columns.push(visitor.null(array, primitive_type)?); + } + (DataType::Boolean, Type::Primitive(primitive_type)) => { + let array = array.as_any().downcast_ref::().unwrap(); + 
columns.push(visitor.boolean(array, primitive_type)?); + } + (DataType::Int16, Type::Primitive(primitive_type)) => { + let array = array.as_any().downcast_ref::().unwrap(); + columns.push(visitor.int16(array, primitive_type)?); + } + (DataType::Int32, Type::Primitive(primitive_type)) => { + let array = array.as_any().downcast_ref::().unwrap(); + columns.push(visitor.int32(array, primitive_type)?); + } + (DataType::Int64, Type::Primitive(primitive_type)) => { + let array = array.as_any().downcast_ref::().unwrap(); + columns.push(visitor.int64(array, primitive_type)?); + } + (DataType::Float32, Type::Primitive(primitive_type)) => { + let array = array.as_any().downcast_ref::().unwrap(); + columns.push(visitor.float(array, primitive_type)?); + } + (DataType::Float64, Type::Primitive(primitive_type)) => { + let array = array.as_any().downcast_ref::().unwrap(); + columns.push(visitor.double(array, primitive_type)?); + } + (DataType::Decimal128(_, _), Type::Primitive(primitive_type)) => { + let array = array.as_any().downcast_ref::().unwrap(); + columns.push(visitor.decimal(array, primitive_type)?); + } + (DataType::Date32, Type::Primitive(primitive_type)) => { + let array = array.as_any().downcast_ref::().unwrap(); + columns.push(visitor.date(array, primitive_type)?); + } + (DataType::Time64(TimeUnit::Microsecond), Type::Primitive(primitive_type)) => { + let array = array + .as_any() + .downcast_ref::() + .unwrap(); + columns.push(visitor.time(array, primitive_type)?); + } + (DataType::Timestamp(TimeUnit::Microsecond, _), Type::Primitive(primitive_type)) => { + let array = array + .as_any() + .downcast_ref::() + .unwrap(); + columns.push(visitor.timestamp(array, primitive_type)?); + } + (DataType::Timestamp(TimeUnit::Nanosecond, _), Type::Primitive(primitive_type)) => { + let array = array + .as_any() + .downcast_ref::() + .unwrap(); + columns.push(visitor.timestamp_nano(array, primitive_type)?); + } + (DataType::Utf8, Type::Primitive(primitive_type)) => { + let array = 
array.as_any().downcast_ref::().unwrap(); + columns.push(visitor.string(array, primitive_type)?); + } + (DataType::LargeUtf8, Type::Primitive(primitive_type)) => { + let array = array.as_any().downcast_ref::().unwrap(); + columns.push(visitor.large_string(array, primitive_type)?); + } + (DataType::Binary, Type::Primitive(primitive_type)) => { + let array = array.as_any().downcast_ref::().unwrap(); + columns.push(visitor.binary(array, primitive_type)?); + } + (DataType::LargeBinary, Type::Primitive(primitive_type)) => { + let array = array.as_any().downcast_ref::().unwrap(); + columns.push(visitor.large_binary(array, primitive_type)?); + } + (DataType::Struct(_), Type::Struct(struct_type)) => { + let array = array.as_any().downcast_ref::().unwrap(); + columns.push(visit_arrow_struct_array(array, struct_type, visitor)?); + } + (arrow_type, iceberg_field_type) => { + return Err(Error::new( + ErrorKind::FeatureUnsupported, + format!( + "Unsupported convert arrow type {} to iceberg type: {}", + arrow_type, iceberg_field_type + ), + )) + } + } + } + + visitor.r#struct(array, iceberg_type, columns) +} + /// Convert arrow struct array to iceberg struct value array. +/// This function will assume the schema of arrow struct array is the same as iceberg struct type. pub fn arrow_struct_to_literal( struct_array: &StructArray, ty: StructType, ) -> Result>> { - LiteralArrayVisitor.r#struct(struct_array, struct_array.data_type(), &ty) + visit_arrow_struct_array(struct_array, &ty, &ArrowArrayConvert) +} + +/// Convert arrow struct array to iceberg struct value array. +/// This function will use field id to find the corresponding field in arrow struct array. 
+pub fn arrow_struct_to_literal_from_field_id( + struct_array: &StructArray, + ty: StructType, +) -> Result>> { + visit_arrow_struct_array_from_field_id(struct_array, &ty, &ArrowArrayConvert) } #[cfg(test)] mod test { + use std::collections::HashMap; use std::sync::Arc; use arrow_array::{ @@ -871,4 +916,95 @@ mod test { let result = arrow_struct_to_literal(&struct_array, iceberg_struct_type).unwrap(); assert_eq!(result, vec![None; 0]); } + + #[test] + fn test_arrow_struct_to_iceberg_struct_from_field_id() { + let bool_array = BooleanArray::from(vec![Some(true), Some(false), None]); + let int16_array = Int16Array::from(vec![Some(1), Some(2), None]); + let int32_array = Int32Array::from(vec![Some(3), Some(4), None]); + let int64_array = Int64Array::from(vec![Some(5), Some(6), None]); + let float32_array = Float32Array::from(vec![Some(1.1), Some(2.2), None]); + let struct_array = StructArray::from(vec![ + ( + Arc::new( + Field::new("bool_field", DataType::Boolean, true).with_metadata(HashMap::from( + [(PARQUET_FIELD_ID_META_KEY.to_string(), "2".to_string())], + )), + ), + Arc::new(bool_array) as ArrayRef, + ), + ( + Arc::new( + Field::new("int16_field", DataType::Int16, true).with_metadata(HashMap::from( + [(PARQUET_FIELD_ID_META_KEY.to_string(), "1".to_string())], + )), + ), + Arc::new(int16_array) as ArrayRef, + ), + ( + Arc::new( + Field::new("int32_field", DataType::Int32, true).with_metadata(HashMap::from( + [(PARQUET_FIELD_ID_META_KEY.to_string(), "4".to_string())], + )), + ), + Arc::new(int32_array) as ArrayRef, + ), + ( + Arc::new( + Field::new("int64_field", DataType::Int64, true).with_metadata(HashMap::from( + [(PARQUET_FIELD_ID_META_KEY.to_string(), "3".to_string())], + )), + ), + Arc::new(int64_array) as ArrayRef, + ), + ( + Arc::new( + Field::new("float32_field", DataType::Float32, true).with_metadata( + HashMap::from([(PARQUET_FIELD_ID_META_KEY.to_string(), "5".to_string())]), + ), + ), + Arc::new(float32_array) as ArrayRef, + ), + ]); + let struct_type 
= StructType::new(vec![ + Arc::new(NestedField::optional( + 1, + "int16_field", + Type::Primitive(PrimitiveType::Int), + )), + Arc::new(NestedField::optional( + 2, + "bool_field", + Type::Primitive(PrimitiveType::Boolean), + )), + Arc::new(NestedField::optional( + 3, + "int64_field", + Type::Primitive(PrimitiveType::Long), + )), + Arc::new(NestedField::optional( + 4, + "int32_field", + Type::Primitive(PrimitiveType::Int), + )), + ]); + let result = arrow_struct_to_literal_from_field_id(&struct_array, struct_type).unwrap(); + assert_eq!(result, vec![ + Some(Literal::Struct(Struct::from_iter(vec![ + Some(Literal::int(1)), + Some(Literal::bool(true)), + Some(Literal::long(5)), + Some(Literal::int(3)), + ]))), + Some(Literal::Struct(Struct::from_iter(vec![ + Some(Literal::int(2)), + Some(Literal::bool(false)), + Some(Literal::long(6)), + Some(Literal::int(4)), + ]))), + Some(Literal::Struct(Struct::from_iter(vec![ + None, None, None, None, + ]))), + ]); + } } From e043e1f6f81654f81d053290c241f73d3990f314 Mon Sep 17 00:00:00 2001 From: ZENOTME Date: Wed, 11 Dec 2024 12:23:41 +0800 Subject: [PATCH 04/11] add todo comment --- crates/iceberg/src/arrow/value.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/crates/iceberg/src/arrow/value.rs b/crates/iceberg/src/arrow/value.rs index c1e075e781..563cbb2a28 100644 --- a/crates/iceberg/src/arrow/value.rs +++ b/crates/iceberg/src/arrow/value.rs @@ -28,6 +28,8 @@ use crate::spec::{Literal, PrimitiveType, Struct, StructType, Type}; use crate::{Error, ErrorKind, Result}; /// A post order arrow array visitor. 
+/// # TODO +/// - Add support for ListArray, MapArray trait ArrowArrayVistor { type T; fn null(&self, array: &NullArray, iceberg_type: &PrimitiveType) -> Result>; @@ -508,6 +510,8 @@ fn visit_arrow_struct_array( visitor.r#struct(array, iceberg_type, columns) } +// # TODO +// Add support for fullfill the missing field in arrow struct array fn visit_arrow_struct_array_from_field_id( array: &StructArray, iceberg_type: &StructType, From e9e3cd79fe86555812d8e73c2455274be190a64d Mon Sep 17 00:00:00 2001 From: ZENOTME Date: Wed, 11 Dec 2024 12:24:49 +0800 Subject: [PATCH 05/11] fix typos --- crates/iceberg/src/arrow/value.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crates/iceberg/src/arrow/value.rs b/crates/iceberg/src/arrow/value.rs index 563cbb2a28..8fdc6428eb 100644 --- a/crates/iceberg/src/arrow/value.rs +++ b/crates/iceberg/src/arrow/value.rs @@ -76,7 +76,7 @@ trait ArrowArrayVistor { &self, array: &StructArray, iceberg_type: &StructType, - childs: Vec>, + columns: Vec>, ) -> Result>; } @@ -511,7 +511,7 @@ fn visit_arrow_struct_array( } // # TODO -// Add support for fullfill the missing field in arrow struct array +// Add support for fulfill the missing field in arrow struct array fn visit_arrow_struct_array_from_field_id( array: &StructArray, iceberg_type: &StructType, From 32625e83e37726442cc0d3fa799efc6563a3ff0a Mon Sep 17 00:00:00 2001 From: ZENOTME Date: Wed, 11 Dec 2024 14:23:22 +0800 Subject: [PATCH 06/11] use field nullable instead of array nullable --- crates/iceberg/src/arrow/value.rs | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/crates/iceberg/src/arrow/value.rs b/crates/iceberg/src/arrow/value.rs index 8fdc6428eb..e86509927f 100644 --- a/crates/iceberg/src/arrow/value.rs +++ b/crates/iceberg/src/arrow/value.rs @@ -399,13 +399,14 @@ fn visit_arrow_struct_array( let mut columns = Vec::with_capacity(array.columns().len()); - for ((array, arrow_type), iceberg_field) in array + for ((array, 
arrow_field), iceberg_field) in array .columns() .iter() - .zip_eq(arrow_struct_fields.iter().map(|field| field.data_type())) + .zip_eq(arrow_struct_fields.iter()) .zip_eq(iceberg_type.fields().iter()) { - if array.is_nullable() == iceberg_field.required { + let arrow_type = arrow_field.data_type(); + if arrow_field.is_nullable() == iceberg_field.required { return Err(Error::new( ErrorKind::DataInvalid, "The nullable field of arrow struct array is not compatitable with iceberg type", @@ -552,7 +553,7 @@ fn visit_arrow_struct_array_from_field_id( }; let array = array.column(idx); let arrow_type = field.data_type(); - if array.is_nullable() == iceberg_field.required { + if field.is_nullable() == iceberg_field.required { return Err(Error::new( ErrorKind::DataInvalid, "The nullable field of arrow struct array is not compatitable with iceberg type", From b70e347da83481389f51e78e8545328fb24c1373 Mon Sep 17 00:00:00 2001 From: ZENOTME Date: Mon, 16 Dec 2024 17:03:35 +0800 Subject: [PATCH 07/11] init SchemaWithPartnerVisitor design --- crates/iceberg/src/arrow/schema.rs | 2 +- crates/iceberg/src/arrow/value.rs | 1051 +++++++++------------ crates/iceberg/src/spec/schema/visitor.rs | 207 ++++ 3 files changed, 666 insertions(+), 594 deletions(-) diff --git a/crates/iceberg/src/arrow/schema.rs b/crates/iceberg/src/arrow/schema.rs index 41afd8ea4f..c0cd1a2213 100644 --- a/crates/iceberg/src/arrow/schema.rs +++ b/crates/iceberg/src/arrow/schema.rs @@ -226,7 +226,7 @@ pub fn arrow_type_to_type(ty: &DataType) -> Result { const ARROW_FIELD_DOC_KEY: &str = "doc"; -fn get_field_id(field: &Field) -> Result { +pub(super) fn get_field_id(field: &Field) -> Result { if let Some(value) = field.metadata().get(PARQUET_FIELD_ID_META_KEY) { return value.parse::().map_err(|e| { Error::new( diff --git a/crates/iceberg/src/arrow/value.rs b/crates/iceberg/src/arrow/value.rs index e86509927f..3df12e88cd 100644 --- a/crates/iceberg/src/arrow/value.rs +++ b/crates/iceberg/src/arrow/value.rs @@ 
-16,355 +16,72 @@ // under the License. use arrow_array::{ - Array, BinaryArray, BooleanArray, Date32Array, Decimal128Array, Float32Array, Float64Array, - Int16Array, Int32Array, Int64Array, LargeBinaryArray, LargeStringArray, NullArray, StringArray, - StructArray, Time64MicrosecondArray, TimestampMicrosecondArray, TimestampNanosecondArray, + Array, ArrayRef, BinaryArray, BooleanArray, Date32Array, Decimal128Array, FixedSizeBinaryArray, + FixedSizeListArray, Float32Array, Float64Array, Int32Array, Int64Array, LargeBinaryArray, + LargeListArray, LargeStringArray, ListArray, MapArray, NullArray, StringArray, StructArray, + Time64MicrosecondArray, TimestampMicrosecondArray, TimestampNanosecondArray, }; -use arrow_schema::{DataType, TimeUnit}; -use itertools::Itertools; -use parquet::arrow::PARQUET_FIELD_ID_META_KEY; +use arrow_schema::DataType; +use uuid::Uuid; -use crate::spec::{Literal, PrimitiveType, Struct, StructType, Type}; +use super::get_field_id; +use crate::spec::{ + visit_struct_with_partner, ListPartnerIterator, Literal, Map, MapPartnerIterator, + PartnerAccessor, PrimitiveType, SchemaWithPartnerVisitor, Struct, StructType, +}; use crate::{Error, ErrorKind, Result}; -/// A post order arrow array visitor. 
-/// # TODO -/// - Add support for ListArray, MapArray -trait ArrowArrayVistor { - type T; - fn null(&self, array: &NullArray, iceberg_type: &PrimitiveType) -> Result>; - fn boolean(&self, array: &BooleanArray, iceberg_type: &PrimitiveType) -> Result>; - fn int16(&self, array: &Int16Array, iceberg_type: &PrimitiveType) -> Result>; - fn int32(&self, array: &Int32Array, iceberg_type: &PrimitiveType) -> Result>; - fn int64(&self, array: &Int64Array, iceberg_type: &PrimitiveType) -> Result>; - fn float(&self, array: &Float32Array, iceberg_type: &PrimitiveType) -> Result>; - fn double(&self, array: &Float64Array, iceberg_type: &PrimitiveType) -> Result>; - fn decimal( - &self, - array: &Decimal128Array, - iceberg_type: &PrimitiveType, - ) -> Result>; - fn date(&self, array: &Date32Array, iceberg_type: &PrimitiveType) -> Result>; - fn time( - &self, - array: &Time64MicrosecondArray, - iceberg_type: &PrimitiveType, - ) -> Result>; - fn timestamp( - &self, - array: &TimestampMicrosecondArray, - iceberg_type: &PrimitiveType, - ) -> Result>; - fn timestamp_nano( - &self, - array: &TimestampNanosecondArray, - iceberg_type: &PrimitiveType, - ) -> Result>; - fn string(&self, array: &StringArray, iceberg_type: &PrimitiveType) -> Result>; - fn large_string( - &self, - array: &LargeStringArray, - iceberg_type: &PrimitiveType, - ) -> Result>; - fn binary(&self, array: &BinaryArray, iceberg_type: &PrimitiveType) -> Result>; - fn large_binary( - &self, - array: &LargeBinaryArray, - iceberg_type: &PrimitiveType, - ) -> Result>; - fn r#struct( - &self, - array: &StructArray, - iceberg_type: &StructType, - columns: Vec>, - ) -> Result>; -} - -struct ArrowArrayConvert; - -impl ArrowArrayVistor for ArrowArrayConvert { - type T = Option; +struct ArrowArrayConverter; - fn null(&self, array: &NullArray, _iceberg_type: &PrimitiveType) -> Result> { - Ok(vec![None; array.len()]) - } - - fn boolean(&self, array: &BooleanArray, iceberg_type: &PrimitiveType) -> Result> { - match iceberg_type { - 
PrimitiveType::Boolean => Ok(array.iter().map(|v| v.map(Literal::bool)).collect()), - _ => Err(Error::new( - ErrorKind::DataInvalid, - format!( - "The type of arrow boolean array is not compatitable with iceberg type {}", - iceberg_type - ), - )), - } - } - - fn int16(&self, array: &Int16Array, iceberg_type: &PrimitiveType) -> Result> { - match iceberg_type { - PrimitiveType::Int => Ok(array.iter().map(|v| v.map(Literal::int)).collect()), - PrimitiveType::Long => Ok(array.iter().map(|v| v.map(Literal::long)).collect()), - _ => Err(Error::new( - ErrorKind::DataInvalid, - format!( - "The type of arrow int16 array is not compatitable with iceberg type {}", - iceberg_type - ), - )), - } - } - - fn int32(&self, array: &Int32Array, iceberg_type: &PrimitiveType) -> Result> { - match iceberg_type { - PrimitiveType::Int => Ok(array.iter().map(|v| v.map(Literal::int)).collect()), - PrimitiveType::Long => Ok(array.iter().map(|v| v.map(Literal::long)).collect()), - _ => Err(Error::new( - ErrorKind::DataInvalid, - format!( - "The type of arrow int32 array is not compatitable with iceberg type {}", - iceberg_type - ), - )), - } - } +impl SchemaWithPartnerVisitor for ArrowArrayConverter { + type T = Vec>; - fn int64(&self, array: &Int64Array, iceberg_type: &PrimitiveType) -> Result> { - match iceberg_type { - PrimitiveType::Long => Ok(array.iter().map(|v| v.map(Literal::long)).collect()), - _ => Err(Error::new( - ErrorKind::DataInvalid, - format!( - "The type of arrow int64 array is not compatitable with iceberg type {}", - iceberg_type - ), - )), - } - } - - fn float(&self, array: &Float32Array, iceberg_type: &PrimitiveType) -> Result> { - match iceberg_type { - PrimitiveType::Float => Ok(array.iter().map(|v| v.map(Literal::float)).collect()), - _ => Err(Error::new( - ErrorKind::DataInvalid, - format!( - "The type of arrow float16 array is not compatitable with iceberg type {}", - iceberg_type - ), - )), - } - } - - fn double(&self, array: &Float64Array, iceberg_type: 
&PrimitiveType) -> Result> { - match iceberg_type { - PrimitiveType::Double => Ok(array.iter().map(|v| v.map(Literal::double)).collect()), - _ => Err(Error::new( - ErrorKind::DataInvalid, - format!( - "The type of arrow float64 array is not compatitable with iceberg type {}", - iceberg_type - ), - )), - } - } - - fn decimal( - &self, - array: &Decimal128Array, - iceberg_type: &PrimitiveType, - ) -> Result> { - let DataType::Decimal128(arrow_precision, arrow_scale) = array.data_type() else { - unreachable!() - }; - match iceberg_type { - PrimitiveType::Decimal { precision, scale } => { - if *arrow_precision as u32 != *precision || *arrow_scale as u32 != *scale { - return Err(Error::new( - ErrorKind::DataInvalid, - format!( - "The precision or scale ({},{}) of arrow decimal128 array is not compatitable with iceberg decimal type ({},{})", - arrow_precision, arrow_scale, precision, scale - ), - )); - } - Ok(array.iter().map(|v| v.map(Literal::decimal)).collect()) - } - _ => Err(Error::new( - ErrorKind::DataInvalid, - format!( - "The type of arrow decimal128 array is not compatitable with iceberg type {}", - iceberg_type - ), - )), - } + fn schema( + &mut self, + _schema: &crate::spec::Schema, + _partner: &ArrayRef, + value: Vec>, + ) -> Result>> { + Ok(value) } - fn date(&self, array: &Date32Array, iceberg_type: &PrimitiveType) -> Result> { - match iceberg_type { - PrimitiveType::Date => Ok(array.iter().map(|v| v.map(Literal::date)).collect()), - _ => Err(Error::new( - ErrorKind::DataInvalid, - format!( - "The type of arrow date32 array is not compatitable with iceberg type {}", - iceberg_type - ), - )), - } - } - - fn time( - &self, - array: &Time64MicrosecondArray, - iceberg_type: &PrimitiveType, - ) -> Result> { - match iceberg_type { - PrimitiveType::Time => Ok(array - .iter() - .map(|v| v.map(Literal::time)) - .collect()), - _ => Err(Error::new( - ErrorKind::DataInvalid, - format!( - "The type of arrow time64 microsecond array is not compatitable with iceberg type 
{}", - iceberg_type - ), - )), - } - } - - fn timestamp( - &self, - array: &TimestampMicrosecondArray, - iceberg_type: &PrimitiveType, - ) -> Result> { - match iceberg_type { - PrimitiveType::Timestamp => Ok(array - .iter() - .map(|v| v.map(Literal::timestamp)) - .collect()), - PrimitiveType::Timestamptz => Ok(array - .iter() - .map(|v| v.map(Literal::timestamptz)) - .collect()), - _ => Err(Error::new( - ErrorKind::DataInvalid, - format!( - "The type of arrow timestamp microsecond array is not compatitable with iceberg type {}", - iceberg_type - ), - )), - } - } - - fn timestamp_nano( - &self, - array: &TimestampNanosecondArray, - iceberg_type: &PrimitiveType, - ) -> Result> { - match iceberg_type { - PrimitiveType::TimestampNs => Ok(array - .iter() - .map(|v| v.map(Literal::timestamp_nano)) - .collect()), - PrimitiveType::TimestamptzNs => Ok(array - .iter() - .map(|v| v.map(Literal::timestamptz_nano)) - .collect()), - _ => Err(Error::new( - ErrorKind::DataInvalid, - format!( - "The type of arrow timestamp nanosecond array is not compatitable with iceberg type {}", - iceberg_type - ), - )), - } - } - - fn string(&self, array: &StringArray, iceberg_type: &PrimitiveType) -> Result> { - match iceberg_type { - PrimitiveType::String => Ok(array.iter().map(|v| v.map(Literal::string)).collect()), - _ => Err(Error::new( - ErrorKind::DataInvalid, - format!( - "The type of arrow string array is not compatitable with iceberg type {}", - iceberg_type - ), - )), - } - } - - fn large_string( - &self, - array: &LargeStringArray, - iceberg_type: &PrimitiveType, - ) -> Result> { - match iceberg_type { - PrimitiveType::String => Ok(array.iter().map(|v| v.map(Literal::string)).collect()), - _ => Err(Error::new( - ErrorKind::DataInvalid, - format!( - "The type of arrow large string array is not compatitable with iceberg type {}", - iceberg_type - ), - )), - } - } - - fn binary(&self, array: &BinaryArray, iceberg_type: &PrimitiveType) -> Result> { - match iceberg_type { - 
PrimitiveType::Binary => Ok(array - .iter() - .map(|v| v.map(|v| Literal::binary(v.to_vec()))) - .collect()), - _ => Err(Error::new( + fn field( + &mut self, + field: &crate::spec::NestedFieldRef, + _partner: &ArrayRef, + value: Vec>, + ) -> Result>> { + // Make there is no null value if the field is required + if field.required && value.iter().any(Option::is_none) { + return Err(Error::new( ErrorKind::DataInvalid, - format!( - "The type of arrow binary array is not compatitable with iceberg type {}", - iceberg_type - ), - )), + "The field is required but has null value", + )); } + Ok(value) } - fn large_binary( - &self, - array: &LargeBinaryArray, - iceberg_type: &PrimitiveType, - ) -> Result> { - match iceberg_type { - PrimitiveType::Binary => Ok(array - .iter() - .map(|v| v.map(|v| Literal::binary(v.to_vec()))) - .collect()), - _ => Err(Error::new( + fn r#struct( + &mut self, + _struct: &StructType, + _partner: &ArrayRef, + results: Vec>>, + ) -> Result>> { + let row_len = results.first().map(|column| column.len()).unwrap_or(0); + if results.iter().any(|column| column.len() != row_len) { + return Err(Error::new( ErrorKind::DataInvalid, - format!( - "The type of arrow binary array is not compatitable with iceberg type {}", - iceberg_type - ), - )), + "The struct columns have different row length", + )); } - } - fn r#struct( - &self, - array: &StructArray, - _iceberg_type: &StructType, - columns: Vec>, - ) -> Result> { - let struct_literal_len = columns.first().map(|column| column.len()).unwrap_or(0); - let mut struct_literals = Vec::with_capacity(struct_literal_len); - let mut columns_iters = columns + let mut struct_literals = Vec::with_capacity(row_len); + let mut columns_iters = results .into_iter() .map(|column| column.into_iter()) .collect::>(); - for row_idx in 0..struct_literal_len { - if array.is_null(row_idx) { - struct_literals.push(None); - continue; - } + for _ in 0..row_len { let mut literals = Vec::with_capacity(columns_iters.len()); for column_iter 
in columns_iters.iter_mut() { literals.push(column_iter.next().unwrap()); @@ -374,306 +91,453 @@ impl ArrowArrayVistor for ArrowArrayConvert { Ok(struct_literals) } -} - -fn visit_arrow_struct_array( - array: &StructArray, - iceberg_type: &StructType, - visitor: &V, -) -> Result> { - let DataType::Struct(arrow_struct_fields) = array.data_type() else { - return Err(Error::new( - ErrorKind::DataInvalid, - "The type of arrow struct array is not a struct type", - )); - }; - if array.columns().len() != iceberg_type.fields().len() - || arrow_struct_fields.len() != iceberg_type.fields().len() - { - return Err(Error::new( - ErrorKind::DataInvalid, - "The type of arrow struct array is not compatitable with iceberg struct type", - )); + fn list( + &mut self, + list: &crate::spec::ListType, + _partner: &ArrayRef, + results: Vec>>, + ) -> Result>> { + if list.element_field.required { + if results.iter().any(|row| row.iter().any(Option::is_none)) { + return Err(Error::new( + ErrorKind::DataInvalid, + "The list should not have null value", + )); + } + } + Ok(results + .into_iter() + .map(|row| Some(Literal::List(row))) + .collect()) } - let mut columns = Vec::with_capacity(array.columns().len()); - - for ((array, arrow_field), iceberg_field) in array - .columns() - .iter() - .zip_eq(arrow_struct_fields.iter()) - .zip_eq(iceberg_type.fields().iter()) - { - let arrow_type = arrow_field.data_type(); - if arrow_field.is_nullable() == iceberg_field.required { + fn map( + &mut self, + map: &crate::spec::MapType, + _partner: &ArrayRef, + key_values: Vec>>, + values: Vec>>, + ) -> Result>> { + // Make sure key_value and value have the same row length + if key_values.len() != values.len() { return Err(Error::new( ErrorKind::DataInvalid, - "The nullable field of arrow struct array is not compatitable with iceberg type", + "The key value and value of map should have the same row length", )); } - match (arrow_type, iceberg_field.field_type.as_ref()) { - (DataType::Null, 
Type::Primitive(primitive_type)) => { - if iceberg_field.required { - return Err(Error::new( - ErrorKind::DataInvalid, - "column in arrow array should not be optional", - )); - } - let array = array.as_any().downcast_ref::().unwrap(); - columns.push(visitor.null(array, primitive_type)?); - } - (DataType::Boolean, Type::Primitive(primitive_type)) => { - let array = array.as_any().downcast_ref::().unwrap(); - columns.push(visitor.boolean(array, primitive_type)?); - } - (DataType::Int16, Type::Primitive(primitive_type)) => { - let array = array.as_any().downcast_ref::().unwrap(); - columns.push(visitor.int16(array, primitive_type)?); - } - (DataType::Int32, Type::Primitive(primitive_type)) => { - let array = array.as_any().downcast_ref::().unwrap(); - columns.push(visitor.int32(array, primitive_type)?); + + let mut result = Vec::with_capacity(key_values.len()); + for (key, value) in key_values.into_iter().zip(values.into_iter()) { + // Make sure key_value and value have the same length + if key.len() != value.len() { + return Err(Error::new( + ErrorKind::DataInvalid, + "The key value and value of map should have the same length", + )); } - (DataType::Int64, Type::Primitive(primitive_type)) => { - let array = array.as_any().downcast_ref::().unwrap(); - columns.push(visitor.int64(array, primitive_type)?); + // Make sure no null value in key_value + if key.iter().any(Option::is_none) { + return Err(Error::new( + ErrorKind::DataInvalid, + "The key value of map should not have null value", + )); } - (DataType::Float32, Type::Primitive(primitive_type)) => { - let array = array.as_any().downcast_ref::().unwrap(); - columns.push(visitor.float(array, primitive_type)?); + + // Make sure no null value in value if value field is required + if map.value_field.required && value.iter().any(Option::is_none) { + return Err(Error::new( + ErrorKind::DataInvalid, + "The value of map should not have null value", + )); } - (DataType::Float64, Type::Primitive(primitive_type)) => { - let 
array = array.as_any().downcast_ref::().unwrap(); - columns.push(visitor.double(array, primitive_type)?); + + let mut map = Map::new(); + for (k, v) in key.into_iter().zip(value.into_iter()) { + map.insert(k.unwrap(), v.clone()); } - (DataType::Decimal128(_, _), Type::Primitive(primitive_type)) => { - let array = array.as_any().downcast_ref::().unwrap(); - columns.push(visitor.decimal(array, primitive_type)?); + result.push(Some(Literal::Map(map))); + } + + Ok(result) + } + + fn primitive(&mut self, p: &PrimitiveType, partner: &ArrayRef) -> Result>> { + if let Some(_) = partner.as_any().downcast_ref::() { + return Ok(vec![None; partner.len()]); + } + match p { + PrimitiveType::Boolean => { + let array = partner + .as_any() + .downcast_ref::() + .ok_or_else(|| { + Error::new(ErrorKind::DataInvalid, "The partner is not a boolean array") + })?; + Ok(array.iter().map(|v| v.map(Literal::bool)).collect()) + } + PrimitiveType::Int => { + let array = partner + .as_any() + .downcast_ref::() + .ok_or_else(|| { + Error::new(ErrorKind::DataInvalid, "The partner is not a int32 array") + })?; + Ok(array.iter().map(|v| v.map(Literal::int)).collect()) + } + PrimitiveType::Long => { + let array = partner + .as_any() + .downcast_ref::() + .ok_or_else(|| { + Error::new(ErrorKind::DataInvalid, "The partner is not a int64 array") + })?; + Ok(array.iter().map(|v| v.map(Literal::long)).collect()) + } + PrimitiveType::Float => { + let array = partner + .as_any() + .downcast_ref::() + .ok_or_else(|| { + Error::new(ErrorKind::DataInvalid, "The partner is not a float32 array") + })?; + Ok(array.iter().map(|v| v.map(Literal::float)).collect()) + } + PrimitiveType::Double => { + let array = partner + .as_any() + .downcast_ref::() + .ok_or_else(|| { + Error::new(ErrorKind::DataInvalid, "The partner is not a float64 array") + })?; + Ok(array.iter().map(|v| v.map(Literal::double)).collect()) } - (DataType::Date32, Type::Primitive(primitive_type)) => { - let array = 
array.as_any().downcast_ref::().unwrap(); - columns.push(visitor.date(array, primitive_type)?); + PrimitiveType::Decimal { precision, scale } => { + let array = partner + .as_any() + .downcast_ref::() + .ok_or_else(|| { + Error::new( + ErrorKind::DataInvalid, + "The partner is not a decimal128 array", + ) + })?; + if let DataType::Decimal128(arrow_precision, arrow_scale) = array.data_type() { + if *arrow_precision as u32 != *precision || *arrow_scale as u32 != *scale { + return Err(Error::new( + ErrorKind::DataInvalid, + format!( + "The precision or scale ({},{}) of arrow decimal128 array is not compatitable with iceberg decimal type ({},{})", + arrow_precision, arrow_scale, precision, scale + ), + )); + } + } + Ok(array.iter().map(|v| v.map(Literal::decimal)).collect()) } - (DataType::Time64(TimeUnit::Microsecond), Type::Primitive(primitive_type)) => { - let array = array + PrimitiveType::Date => { + let array = partner + .as_any() + .downcast_ref::() + .ok_or_else(|| { + Error::new(ErrorKind::DataInvalid, "The partner is not a date32 array") + })?; + Ok(array.iter().map(|v| v.map(Literal::date)).collect()) + } + PrimitiveType::Time => { + let array = partner .as_any() .downcast_ref::() - .unwrap(); - columns.push(visitor.time(array, primitive_type)?); + .ok_or_else(|| { + Error::new(ErrorKind::DataInvalid, "The partner is not a time64 array") + })?; + Ok(array.iter().map(|v| v.map(Literal::time)).collect()) } - (DataType::Timestamp(TimeUnit::Microsecond, _), Type::Primitive(primitive_type)) => { - let array = array + PrimitiveType::Timestamp => { + let array = partner .as_any() .downcast_ref::() - .unwrap(); - columns.push(visitor.timestamp(array, primitive_type)?); - } - (DataType::Timestamp(TimeUnit::Nanosecond, _), Type::Primitive(primitive_type)) => { - let array = array + .ok_or_else(|| { + Error::new( + ErrorKind::DataInvalid, + "The partner is not a timestamp array", + ) + })?; + Ok(array.iter().map(|v| v.map(Literal::timestamp)).collect()) + } + 
PrimitiveType::Timestamptz => { + let array = partner + .as_any() + .downcast_ref::() + .ok_or_else(|| { + Error::new( + ErrorKind::DataInvalid, + "The partner is not a timestamptz array", + ) + })?; + Ok(array.iter().map(|v| v.map(Literal::timestamptz)).collect()) + } + PrimitiveType::TimestampNs => { + let array = partner .as_any() .downcast_ref::() - .unwrap(); - columns.push(visitor.timestamp_nano(array, primitive_type)?); - } - (DataType::Utf8, Type::Primitive(primitive_type)) => { - let array = array.as_any().downcast_ref::().unwrap(); - columns.push(visitor.string(array, primitive_type)?); - } - (DataType::LargeUtf8, Type::Primitive(primitive_type)) => { - let array = array.as_any().downcast_ref::().unwrap(); - columns.push(visitor.large_string(array, primitive_type)?); - } - (DataType::Binary, Type::Primitive(primitive_type)) => { - let array = array.as_any().downcast_ref::().unwrap(); - columns.push(visitor.binary(array, primitive_type)?); - } - (DataType::LargeBinary, Type::Primitive(primitive_type)) => { - let array = array.as_any().downcast_ref::().unwrap(); - columns.push(visitor.large_binary(array, primitive_type)?); + .ok_or_else(|| { + Error::new( + ErrorKind::DataInvalid, + "The partner is not a timestamp_ns array", + ) + })?; + Ok(array + .iter() + .map(|v| v.map(Literal::timestamp_nano)) + .collect()) + } + PrimitiveType::TimestamptzNs => { + let array = partner + .as_any() + .downcast_ref::() + .ok_or_else(|| { + Error::new( + ErrorKind::DataInvalid, + "The partner is not a timestamptz_ns array", + ) + })?; + Ok(array + .iter() + .map(|v| v.map(Literal::timestamptz_nano)) + .collect()) + } + PrimitiveType::String => { + if let Some(array) = partner.as_any().downcast_ref::() { + Ok(array.iter().map(|v| v.map(Literal::string)).collect()) + } else if let Some(array) = partner.as_any().downcast_ref::() { + Ok(array.iter().map(|v| v.map(Literal::string)).collect()) + } else { + return Err(Error::new( + ErrorKind::DataInvalid, + "The partner is not a 
string array", + )); + } } - (DataType::Struct(_), Type::Struct(struct_type)) => { - let array = array.as_any().downcast_ref::().unwrap(); - columns.push(visit_arrow_struct_array(array, struct_type, visitor)?); + PrimitiveType::Uuid => { + if let Some(array) = partner.as_any().downcast_ref::() { + if array.value_length() != 16 { + return Err(Error::new( + ErrorKind::DataInvalid, + "The partner is not a uuid array", + )); + } + Ok(array + .iter() + .map(|v| { + v.map(|v| { + Ok(Literal::uuid(Uuid::from_bytes(v.try_into().map_err( + |_| { + Error::new( + ErrorKind::DataInvalid, + "Failed to convert binary to uuid", + ) + }, + )?))) + }) + .transpose() + }) + .collect::>>()?) + } else { + return Err(Error::new( + ErrorKind::DataInvalid, + "The partner is not a uuid array", + )); + } } - (arrow_type, iceberg_field_type) => { - return Err(Error::new( - ErrorKind::FeatureUnsupported, - format!( - "Unsupported convert arrow type {} to iceberg type: {}", - arrow_type, iceberg_field_type - ), - )) + PrimitiveType::Fixed(len) => { + let array = partner + .as_any() + .downcast_ref::() + .ok_or_else(|| { + Error::new(ErrorKind::DataInvalid, "The partner is not a fixed array") + })?; + if array.value_length() != *len as i32 { + return Err(Error::new( + ErrorKind::DataInvalid, + "The length of fixed size binary array is not compatitable with iceberg fixed type", + )); + } + Ok(array + .iter() + .map(|v| v.map(|v| Literal::fixed(v.iter().cloned()))) + .collect()) + } + PrimitiveType::Binary => { + if let Some(array) = partner.as_any().downcast_ref::() { + Ok(array + .iter() + .map(|v| v.map(|v| Literal::binary(v.to_vec()))) + .collect()) + } else if let Some(array) = partner.as_any().downcast_ref::() { + Ok(array + .iter() + .map(|v| v.map(|v| Literal::binary(v.to_vec()))) + .collect()) + } else { + return Err(Error::new( + ErrorKind::DataInvalid, + "The partner is not a binary array", + )); + } } } } - visitor.r#struct(array, iceberg_type, columns) + fn visit_type_before( + &mut 
self, + _ty: &crate::spec::Type, + partner: &ArrayRef, + ) -> Result>>> { + if let Some(_) = partner.as_any().downcast_ref::() { + return Ok(Some(vec![None; partner.len()])); + } + Ok(None) + } } -// # TODO -// Add support for fulfill the missing field in arrow struct array -fn visit_arrow_struct_array_from_field_id( - array: &StructArray, - iceberg_type: &StructType, - visitor: &V, -) -> Result> { - let DataType::Struct(arrow_struct_fields) = array.data_type() else { - return Err(Error::new( - ErrorKind::DataInvalid, - "The type of arrow struct array is not a struct type", - )); - }; +struct ArrowArrayAccessor; + +impl PartnerAccessor for ArrowArrayAccessor { + type L = ArrowArrayListIterator; + type M = ArrowArrayMapIterator; - if array.columns().len() < iceberg_type.fields().len() - || arrow_struct_fields.len() < iceberg_type.fields().len() - { - return Err(Error::new( - ErrorKind::DataInvalid, - "The type of arrow struct array is not compatitable with iceberg struct type", - )); + fn struct_parner<'a>(&self, schema_partner: &'a ArrayRef) -> Result<&'a ArrayRef> { + if !matches!(schema_partner.data_type(), DataType::Struct(_)) { + return Err(Error::new( + ErrorKind::DataInvalid, + "The schema partner is not a struct type", + )); + } + Ok(schema_partner) } - let mut columns = Vec::with_capacity(array.columns().len()); + fn field_partner<'a>( + &self, + struct_partner: &'a ArrayRef, + field_id: i32, + _field_name: &str, + ) -> Result<&'a ArrayRef> { + let struct_array = struct_partner + .as_any() + .downcast_ref::() + .ok_or_else(|| { + Error::new( + ErrorKind::DataInvalid, + "The struct partner is not a struct array", + ) + })?; + let field_pos = struct_array + .fields() + .iter() + .position(|field| { + get_field_id(field) + .map(|id| id == field_id) + .unwrap_or(false) + }) + .ok_or_else(|| { + Error::new( + ErrorKind::DataInvalid, + format!("Field id {} not found in struct array", field_id), + ) + })?; + Ok(struct_array.column(field_pos)) + } - for 
iceberg_field in iceberg_type.fields() { - let Some((idx, field)) = arrow_struct_fields.iter().enumerate().find(|(_idx, f)| { - f.metadata() - .get(PARQUET_FIELD_ID_META_KEY) - .and_then(|id| id.parse::().ok().map(|id: i32| id == iceberg_field.id)) - .unwrap_or(false) - }) else { + fn list_element_partner<'a>( + &self, + list_partner: &'a ArrayRef, + ) -> Result { + if !matches!( + list_partner.data_type(), + DataType::List(_) | DataType::LargeList(_) | DataType::FixedSizeList(_, _) + ) { return Err(Error::new( ErrorKind::DataInvalid, - format!( - "The field {} in iceberg struct type is not found in arrow struct type", - iceberg_field.name - ), + "The list partner is not a list type", )); - }; - let array = array.column(idx); - let arrow_type = field.data_type(); - if field.is_nullable() == iceberg_field.required { + } + Ok(ArrowArrayListIterator { + array: list_partner.clone(), + index: 0, + }) + } + + fn map_element_partner<'a>(&self, map_partner: &'a ArrayRef) -> Result { + if !matches!(map_partner.data_type(), DataType::Map(_, _)) { return Err(Error::new( ErrorKind::DataInvalid, - "The nullable field of arrow struct array is not compatitable with iceberg type", + "The map partner is not a map type", )); } - match (arrow_type, iceberg_field.field_type.as_ref()) { - (DataType::Null, Type::Primitive(primitive_type)) => { - if iceberg_field.required { - return Err(Error::new( - ErrorKind::DataInvalid, - "column in arrow array should not be optional", - )); - } - let array = array.as_any().downcast_ref::().unwrap(); - columns.push(visitor.null(array, primitive_type)?); - } - (DataType::Boolean, Type::Primitive(primitive_type)) => { - let array = array.as_any().downcast_ref::().unwrap(); - columns.push(visitor.boolean(array, primitive_type)?); - } - (DataType::Int16, Type::Primitive(primitive_type)) => { - let array = array.as_any().downcast_ref::().unwrap(); - columns.push(visitor.int16(array, primitive_type)?); - } - (DataType::Int32, 
Type::Primitive(primitive_type)) => { - let array = array.as_any().downcast_ref::().unwrap(); - columns.push(visitor.int32(array, primitive_type)?); - } - (DataType::Int64, Type::Primitive(primitive_type)) => { - let array = array.as_any().downcast_ref::().unwrap(); - columns.push(visitor.int64(array, primitive_type)?); - } - (DataType::Float32, Type::Primitive(primitive_type)) => { - let array = array.as_any().downcast_ref::().unwrap(); - columns.push(visitor.float(array, primitive_type)?); - } - (DataType::Float64, Type::Primitive(primitive_type)) => { - let array = array.as_any().downcast_ref::().unwrap(); - columns.push(visitor.double(array, primitive_type)?); - } - (DataType::Decimal128(_, _), Type::Primitive(primitive_type)) => { - let array = array.as_any().downcast_ref::().unwrap(); - columns.push(visitor.decimal(array, primitive_type)?); - } - (DataType::Date32, Type::Primitive(primitive_type)) => { - let array = array.as_any().downcast_ref::().unwrap(); - columns.push(visitor.date(array, primitive_type)?); - } - (DataType::Time64(TimeUnit::Microsecond), Type::Primitive(primitive_type)) => { - let array = array - .as_any() - .downcast_ref::() - .unwrap(); - columns.push(visitor.time(array, primitive_type)?); - } - (DataType::Timestamp(TimeUnit::Microsecond, _), Type::Primitive(primitive_type)) => { - let array = array - .as_any() - .downcast_ref::() - .unwrap(); - columns.push(visitor.timestamp(array, primitive_type)?); - } - (DataType::Timestamp(TimeUnit::Nanosecond, _), Type::Primitive(primitive_type)) => { - let array = array - .as_any() - .downcast_ref::() - .unwrap(); - columns.push(visitor.timestamp_nano(array, primitive_type)?); - } - (DataType::Utf8, Type::Primitive(primitive_type)) => { - let array = array.as_any().downcast_ref::().unwrap(); - columns.push(visitor.string(array, primitive_type)?); - } - (DataType::LargeUtf8, Type::Primitive(primitive_type)) => { - let array = array.as_any().downcast_ref::().unwrap(); - 
columns.push(visitor.large_string(array, primitive_type)?); - } - (DataType::Binary, Type::Primitive(primitive_type)) => { - let array = array.as_any().downcast_ref::().unwrap(); - columns.push(visitor.binary(array, primitive_type)?); - } - (DataType::LargeBinary, Type::Primitive(primitive_type)) => { - let array = array.as_any().downcast_ref::().unwrap(); - columns.push(visitor.large_binary(array, primitive_type)?); - } - (DataType::Struct(_), Type::Struct(struct_type)) => { - let array = array.as_any().downcast_ref::().unwrap(); - columns.push(visit_arrow_struct_array(array, struct_type, visitor)?); - } - (arrow_type, iceberg_field_type) => { - return Err(Error::new( - ErrorKind::FeatureUnsupported, - format!( - "Unsupported convert arrow type {} to iceberg type: {}", - arrow_type, iceberg_field_type - ), - )) - } + Ok(ArrowArrayMapIterator { + array: map_partner.clone(), + index: 0, + }) + } +} + +struct ArrowArrayListIterator { + array: ArrayRef, + index: usize, +} + +impl ListPartnerIterator for ArrowArrayListIterator { + fn next(&mut self) -> Option { + if self.index >= self.array.len() { + return None; + } + if let Some(array) = self.array.as_any().downcast_ref::() { + let result = Some(array.value(self.index)); + self.index += 1; + result + } else if let Some(array) = self.array.as_any().downcast_ref::() { + let result = Some(array.value(self.index)); + self.index += 1; + result + } else if let Some(array) = self.array.as_any().downcast_ref::() { + let result = Some(array.value(self.index)); + self.index += 1; + result + } else { + None } } +} - visitor.r#struct(array, iceberg_type, columns) +struct ArrowArrayMapIterator { + array: ArrayRef, + index: usize, } -/// Convert arrow struct array to iceberg struct value array. -/// This function will assume the schema of arrow struct array is the same as iceberg struct type. 
-pub fn arrow_struct_to_literal( - struct_array: &StructArray, - ty: StructType, -) -> Result>> { - visit_arrow_struct_array(struct_array, &ty, &ArrowArrayConvert) +impl MapPartnerIterator for ArrowArrayMapIterator { + fn next(&mut self) -> Option<(ArrayRef, ArrayRef)> { + if let Some(array) = self.array.as_any().downcast_ref::() { + let entry = array.value(self.index); + Some((entry.column(0).clone(), entry.column(1).clone())) + } else { + None + } + } } /// Convert arrow struct array to iceberg struct value array. -/// This function will use field id to find the corresponding field in arrow struct array. -pub fn arrow_struct_to_literal_from_field_id( - struct_array: &StructArray, - ty: StructType, +/// This function will assume the schema of arrow struct array is the same as iceberg struct type. +pub fn arrow_struct_to_literal( + struct_array: &ArrayRef, + ty: &StructType, ) -> Result>> { - visit_arrow_struct_array_from_field_id(struct_array, &ty, &ArrowArrayConvert) + visit_struct_with_partner( + ty, + struct_array, + &mut ArrowArrayConverter, + &ArrowArrayAccessor, + ) } #[cfg(test)] @@ -687,6 +551,7 @@ mod test { Time64MicrosecondArray, TimestampMicrosecondArray, TimestampNanosecondArray, }; use arrow_schema::{DataType, Field, Fields, TimeUnit}; + use parquet::arrow::PARQUET_FIELD_ID_META_KEY; use super::*; use crate::spec::{Literal, NestedField, PrimitiveType, StructType, Type}; @@ -718,7 +583,7 @@ mod test { let binary_array = BinaryArray::from(vec![Some(b"abc".as_ref()), Some(b"def".as_ref()), None]); - let struct_array = StructArray::from(vec![ + let struct_array = Arc::new(StructArray::from(vec![ ( Arc::new(Field::new("bool_field", DataType::Boolean, true)), Arc::new(bool_array) as ArrayRef, @@ -787,7 +652,7 @@ mod test { Arc::new(Field::new("binary_field", DataType::Binary, true)), Arc::new(binary_array) as ArrayRef, ), - ]); + ])) as ArrayRef; let iceberg_struct_type = StructType::new(vec![ Arc::new(NestedField::optional( @@ -860,7 +725,7 @@ mod test { 
)), ]); - let result = arrow_struct_to_literal(&struct_array, iceberg_struct_type).unwrap(); + let result = arrow_struct_to_literal(&struct_array, &iceberg_struct_type).unwrap(); assert_eq!(result, vec![ Some(Literal::Struct(Struct::from_iter(vec![ @@ -901,24 +766,24 @@ mod test { #[test] fn test_single_column_nullable_struct() { - let struct_array = StructArray::new_null( + let struct_array = Arc::new(StructArray::new_null( Fields::from(vec![Field::new("bool_field", DataType::Boolean, true)]), 3, - ); + )) as ArrayRef; let iceberg_struct_type = StructType::new(vec![Arc::new(NestedField::optional( 0, "bool_field", Type::Primitive(PrimitiveType::Boolean), ))]); - let result = arrow_struct_to_literal(&struct_array, iceberg_struct_type).unwrap(); + let result = arrow_struct_to_literal(&struct_array, &iceberg_struct_type).unwrap(); assert_eq!(result, vec![None; 3]); } #[test] fn test_empty_struct() { - let struct_array = StructArray::new_null(Fields::empty(), 3); + let struct_array = Arc::new(StructArray::new_null(Fields::empty(), 3)) as ArrayRef; let iceberg_struct_type = StructType::new(vec![]); - let result = arrow_struct_to_literal(&struct_array, iceberg_struct_type).unwrap(); + let result = arrow_struct_to_literal(&struct_array, &iceberg_struct_type).unwrap(); assert_eq!(result, vec![None; 0]); } @@ -929,7 +794,7 @@ mod test { let int32_array = Int32Array::from(vec![Some(3), Some(4), None]); let int64_array = Int64Array::from(vec![Some(5), Some(6), None]); let float32_array = Float32Array::from(vec![Some(1.1), Some(2.2), None]); - let struct_array = StructArray::from(vec![ + let struct_array = Arc::new(StructArray::from(vec![ ( Arc::new( Field::new("bool_field", DataType::Boolean, true).with_metadata(HashMap::from( @@ -970,7 +835,7 @@ mod test { ), Arc::new(float32_array) as ArrayRef, ), - ]); + ])) as ArrayRef; let struct_type = StructType::new(vec![ Arc::new(NestedField::optional( 1, @@ -993,7 +858,7 @@ mod test { Type::Primitive(PrimitiveType::Int), )), ]); - 
let result = arrow_struct_to_literal_from_field_id(&struct_array, struct_type).unwrap(); + let result = arrow_struct_to_literal(&struct_array, &struct_type).unwrap(); assert_eq!(result, vec![ Some(Literal::Struct(Struct::from_iter(vec![ Some(Literal::int(1)), diff --git a/crates/iceberg/src/spec/schema/visitor.rs b/crates/iceberg/src/spec/schema/visitor.rs index 8c6c4a7470..4f9cf4ebf2 100644 --- a/crates/iceberg/src/spec/schema/visitor.rs +++ b/crates/iceberg/src/spec/schema/visitor.rs @@ -121,3 +121,210 @@ pub fn visit_schema(schema: &Schema, visitor: &mut V) -> Resul let result = visit_struct(&schema.r#struct, visitor)?; visitor.schema(schema, result) } + +/// A post order schema visitor with partner. +/// +/// For order of methods called, please refer to [`visit_schema_with_partner`]. +pub trait SchemaWithPartnerVisitor

{ + /// Return type of this visitor. + type T; + + /// Called before struct field. + fn before_struct_field(&mut self, _field: &NestedFieldRef, _partner: &P) -> Result<()> { + Ok(()) + } + /// Called after struct field. + fn after_struct_field(&mut self, _field: &NestedFieldRef, _partner: &P) -> Result<()> { + Ok(()) + } + /// Called before list field. + fn before_list_element(&mut self, _field: &NestedFieldRef, _partner: &P) -> Result<()> { + Ok(()) + } + /// Called after list field. + fn after_list_element(&mut self, _field: &NestedFieldRef, _partner: &P) -> Result<()> { + Ok(()) + } + /// Called before map key field. + fn before_map_key(&mut self, _field: &NestedFieldRef, _partner: &P) -> Result<()> { + Ok(()) + } + /// Called after map key field. + fn after_map_key(&mut self, _field: &NestedFieldRef, _partner: &P) -> Result<()> { + Ok(()) + } + /// Called before map value field. + fn before_map_value(&mut self, _field: &NestedFieldRef, _partner: &P) -> Result<()> { + Ok(()) + } + /// Called after map value field. + fn after_map_value(&mut self, _field: &NestedFieldRef, _partner: &P) -> Result<()> { + Ok(()) + } + + /// Called before every type, if this function return `Some`, the following visiting will be skipped. + /// This function used to implement early return. + fn visit_type_before(&mut self, _ty: &Type, _partner: &P) -> Result> { + return Ok(None); + } + + /// Called after schema's type visited. + fn schema(&mut self, schema: &Schema, partner: &P, value: Self::T) -> Result; + /// Called after struct's field type visited. + fn field(&mut self, field: &NestedFieldRef, partner: &P, value: Self::T) -> Result; + /// Called after struct's fields visited. + fn r#struct( + &mut self, + r#struct: &StructType, + partner: &P, + results: Vec, + ) -> Result; + /// Called after list fields visited. + fn list(&mut self, list: &ListType, partner: &P, value: Vec) -> Result; + /// Called after map's key and value fields visited. 
+ fn map( + &mut self, + map: &MapType, + partner: &P, + key_value: Vec, + value: Vec, + ) -> Result; + /// Called when see a primitive type. + fn primitive(&mut self, p: &PrimitiveType, partner: &P) -> Result; +} + +/// Accessor used to get child partner from parent partner. +pub trait PartnerAccessor

{ + /// List partner iterator. + type L: ListPartnerIterator

; + /// Map partner iterator. + type M: MapPartnerIterator

; + + /// Get the struct partner from schema partner. + fn struct_parner<'a>(&self, schema_partner: &'a P) -> Result<&'a P>; + /// Get the field partner from struct partner. + fn field_partner<'a>(&self, struct_partner: &'a P, field_id: i32, field: &str) + -> Result<&'a P>; + /// Get the list element partner from list partner. + fn list_element_partner<'a>(&self, list_partner: &'a P) -> Result; + /// Get the map key partner from map partner. + fn map_element_partner<'a>(&self, map_partner: &'a P) -> Result; +} + +/// Iterator for list partner. +pub trait ListPartnerIterator

{ + /// Get the next partner. + fn next(&mut self) -> Option

; +} + +/// Iterator for map partner. +pub trait MapPartnerIterator

{ + /// Get the next partner. + fn next(&mut self) -> Option<(P, P)>; +} + +/// Visiting a type in post order. +pub fn visit_type_with_partner, A: PartnerAccessor

>( + r#type: &Type, + partner: &P, + visitor: &mut V, + accessor: &A, +) -> Result { + if let Some(res) = visitor.visit_type_before(r#type, partner)? { + return Ok(res); + } + match r#type { + Type::Primitive(p) => visitor.primitive(p, partner), + Type::List(list) => { + let mut results = Vec::new(); + let mut list_element_partner_iter = accessor.list_element_partner(partner)?; + if let Some(list_element_partner) = list_element_partner_iter.next() { + visitor.before_list_element(&list.element_field, &list_element_partner)?; + let value = visit_type_with_partner( + &list.element_field.field_type, + &list_element_partner, + visitor, + accessor, + )?; + visitor.after_list_element(&list.element_field, &list_element_partner)?; + results.push(value); + } + visitor.list(list, partner, results) + } + Type::Map(map) => { + let mut k_results = Vec::new(); + let mut v_results = Vec::new(); + let mut kv_partner_iter = accessor.map_element_partner(partner)?; + if let Some((k_partner, v_partner)) = kv_partner_iter.next() { + let key_result = { + visitor.before_map_key(&map.key_field, &k_partner)?; + let ret = visit_type_with_partner( + &map.key_field.field_type, + &k_partner, + visitor, + accessor, + )?; + visitor.after_map_key(&map.key_field, &k_partner)?; + ret + }; + + let value_result = { + visitor.before_map_value(&map.value_field, &v_partner)?; + let ret = visit_type_with_partner( + &map.value_field.field_type, + &v_partner, + visitor, + accessor, + )?; + visitor.after_map_value(&map.value_field, &v_partner)?; + ret + }; + + k_results.push(key_result); + v_results.push(value_result); + } + + visitor.map(map, partner, k_results, v_results) + } + Type::Struct(s) => visit_struct_with_partner(s, partner, visitor, accessor), + } +} + +/// Visit struct type in post order. +pub fn visit_struct_with_partner, A: PartnerAccessor

>( + s: &StructType, + partner: &P, + visitor: &mut V, + accessor: &A, +) -> Result { + if let Some(res) = visitor.visit_type_before(&Type::Struct(s.clone()), partner)? { + return Ok(res); + } + let mut results = Vec::with_capacity(s.fields().len()); + for field in s.fields() { + let field_partner = accessor.field_partner(partner, field.id, &field.name)?; + visitor.before_struct_field(field, field_partner)?; + let result = visit_type_with_partner(&field.field_type, field_partner, visitor, accessor)?; + visitor.after_struct_field(field, field_partner)?; + let result = visitor.field(field, field_partner, result)?; + results.push(result); + } + + visitor.r#struct(s, partner, results) +} + +/// Visit schema in post order. +pub fn visit_schema_with_partner, A: PartnerAccessor

>( + schema: &Schema, + partner: &P, + visitor: &mut V, + accessor: &A, +) -> Result { + let result = visit_struct_with_partner( + &schema.r#struct, + accessor.struct_parner(partner)?, + visitor, + accessor, + )?; + visitor.schema(schema, partner, result) +} \ No newline at end of file From 6bcc840c66eb764a9d36cad0371f6d323e780a40 Mon Sep 17 00:00:00 2001 From: ZENOTME Date: Tue, 11 Feb 2025 01:58:26 +0800 Subject: [PATCH 08/11] refine SchemaWithPartnerVisitor design --- crates/iceberg/src/arrow/value.rs | 876 +++++++++++++++++++++--------- 1 file changed, 609 insertions(+), 267 deletions(-) diff --git a/crates/iceberg/src/arrow/value.rs b/crates/iceberg/src/arrow/value.rs index 3df12e88cd..d08d8f2d16 100644 --- a/crates/iceberg/src/arrow/value.rs +++ b/crates/iceberg/src/arrow/value.rs @@ -18,7 +18,7 @@ use arrow_array::{ Array, ArrayRef, BinaryArray, BooleanArray, Date32Array, Decimal128Array, FixedSizeBinaryArray, FixedSizeListArray, Float32Array, Float64Array, Int32Array, Int64Array, LargeBinaryArray, - LargeListArray, LargeStringArray, ListArray, MapArray, NullArray, StringArray, StructArray, + LargeListArray, LargeStringArray, ListArray, MapArray, StringArray, StructArray, Time64MicrosecondArray, TimestampMicrosecondArray, TimestampNanosecondArray, }; use arrow_schema::DataType; @@ -26,8 +26,8 @@ use uuid::Uuid; use super::get_field_id; use crate::spec::{ - visit_struct_with_partner, ListPartnerIterator, Literal, Map, MapPartnerIterator, - PartnerAccessor, PrimitiveType, SchemaWithPartnerVisitor, Struct, StructType, + visit_struct_with_partner, Literal, Map, PartnerAccessor, PrimitiveType, + SchemaWithPartnerVisitor, Struct, StructType, }; use crate::{Error, ErrorKind, Result}; @@ -64,7 +64,7 @@ impl SchemaWithPartnerVisitor for ArrowArrayConverter { fn r#struct( &mut self, _struct: &StructType, - _partner: &ArrayRef, + array: &ArrayRef, results: Vec>>, ) -> Result>> { let row_len = results.first().map(|column| column.len()).unwrap_or(0); @@ -81,12 +81,16 @@ 
impl SchemaWithPartnerVisitor for ArrowArrayConverter { .map(|column| column.into_iter()) .collect::>(); - for _ in 0..row_len { + for i in 0..row_len { let mut literals = Vec::with_capacity(columns_iters.len()); for column_iter in columns_iters.iter_mut() { literals.push(column_iter.next().unwrap()); } - struct_literals.push(Some(Literal::Struct(Struct::from_iter(literals)))); + if array.is_null(i) { + struct_literals.push(None); + } else { + struct_literals.push(Some(Literal::Struct(Struct::from_iter(literals)))); + } } Ok(struct_literals) @@ -95,29 +99,75 @@ impl SchemaWithPartnerVisitor for ArrowArrayConverter { fn list( &mut self, list: &crate::spec::ListType, - _partner: &ArrayRef, - results: Vec>>, + array: &ArrayRef, + elements: Vec>, ) -> Result>> { - if list.element_field.required { - if results.iter().any(|row| row.iter().any(Option::is_none)) { - return Err(Error::new( - ErrorKind::DataInvalid, - "The list should not have null value", - )); + if list.element_field.required && elements.iter().any(Option::is_none) { + return Err(Error::new( + ErrorKind::DataInvalid, + "The list should not have null value", + )); + } + match array.data_type() { + DataType::List(_) => { + let offset = array + .as_any() + .downcast_ref::() + .ok_or_else(|| { + Error::new(ErrorKind::DataInvalid, "The partner is not a list array") + })? + .offsets(); + // combine the result according to the offset + let mut result = Vec::with_capacity(offset.len() - 1); + for i in 0..offset.len() - 1 { + let start = offset[i] as usize; + let end = offset[i + 1] as usize; + result.push(Some(Literal::List(elements[start..end].to_vec()))); + } + Ok(result) + } + DataType::LargeList(_) => { + let offset = array + .as_any() + .downcast_ref::() + .ok_or_else(|| { + Error::new( + ErrorKind::DataInvalid, + "The partner is not a large list array", + ) + })? 
+ .offsets(); + // combine the result according to the offset + let mut result = Vec::with_capacity(offset.len() - 1); + for i in 0..offset.len() - 1 { + let start = offset[i] as usize; + let end = offset[i + 1] as usize; + result.push(Some(Literal::List(elements[start..end].to_vec()))); + } + Ok(result) + } + DataType::FixedSizeList(_, len) => { + let mut result = Vec::with_capacity(elements.len() / *len as usize); + for i in 0..elements.len() / *len as usize { + let start = i * *len as usize; + let end = (i + 1) * *len as usize; + result.push(Some(Literal::List(elements[start..end].to_vec()))); + } + Ok(result) } + _ => Err(Error::new( + ErrorKind::DataInvalid, + "The partner is not a list type", + )), } - Ok(results - .into_iter() - .map(|row| Some(Literal::List(row))) - .collect()) } fn map( &mut self, - map: &crate::spec::MapType, - _partner: &ArrayRef, - key_values: Vec>>, - values: Vec>>, + _map: &crate::spec::MapType, + partner: &ArrayRef, + key_values: Vec>, + values: Vec>, ) -> Result>> { // Make sure key_value and value have the same row length if key_values.len() != values.len() { @@ -127,45 +177,26 @@ impl SchemaWithPartnerVisitor for ArrowArrayConverter { )); } - let mut result = Vec::with_capacity(key_values.len()); - for (key, value) in key_values.into_iter().zip(values.into_iter()) { - // Make sure key_value and value have the same length - if key.len() != value.len() { - return Err(Error::new( - ErrorKind::DataInvalid, - "The key value and value of map should have the same length", - )); - } - // Make sure no null value in key_value - if key.iter().any(Option::is_none) { - return Err(Error::new( - ErrorKind::DataInvalid, - "The key value of map should not have null value", - )); - } - - // Make sure no null value in value if value field is required - if map.value_field.required && value.iter().any(Option::is_none) { - return Err(Error::new( - ErrorKind::DataInvalid, - "The value of map should not have null value", - )); - } - + let offsets = 
partner + .as_any() + .downcast_ref::() + .ok_or_else(|| Error::new(ErrorKind::DataInvalid, "The partner is not a map array"))? + .offsets(); + // combine the result according to the offset + let mut result = Vec::with_capacity(offsets.len() - 1); + for i in 0..offsets.len() - 1 { + let start = offsets[i] as usize; + let end = offsets[i + 1] as usize; let mut map = Map::new(); - for (k, v) in key.into_iter().zip(value.into_iter()) { - map.insert(k.unwrap(), v.clone()); + for (key, value) in key_values[start..end].iter().zip(values[start..end].iter()) { + map.insert(key.clone().unwrap(), value.clone()); } result.push(Some(Literal::Map(map))); } - Ok(result) } fn primitive(&mut self, p: &PrimitiveType, partner: &ArrayRef) -> Result>> { - if let Some(_) = partner.as_any().downcast_ref::() { - return Ok(vec![None; partner.len()]); - } match p { PrimitiveType::Boolean => { let array = partner @@ -344,10 +375,10 @@ impl SchemaWithPartnerVisitor for ArrowArrayConverter { }) .collect::>>()?) } else { - return Err(Error::new( + Err(Error::new( ErrorKind::DataInvalid, "The partner is not a uuid array", - )); + )) } } PrimitiveType::Fixed(len) => { @@ -388,25 +419,11 @@ impl SchemaWithPartnerVisitor for ArrowArrayConverter { } } } - - fn visit_type_before( - &mut self, - _ty: &crate::spec::Type, - partner: &ArrayRef, - ) -> Result>>> { - if let Some(_) = partner.as_any().downcast_ref::() { - return Ok(Some(vec![None; partner.len()])); - } - Ok(None) - } } struct ArrowArrayAccessor; impl PartnerAccessor for ArrowArrayAccessor { - type L = ArrowArrayListIterator; - type M = ArrowArrayMapIterator; - fn struct_parner<'a>(&self, schema_partner: &'a ArrayRef) -> Result<&'a ArrayRef> { if !matches!(schema_partner.data_type(), DataType::Struct(_)) { return Err(Error::new( @@ -449,80 +466,69 @@ impl PartnerAccessor for ArrowArrayAccessor { Ok(struct_array.column(field_pos)) } - fn list_element_partner<'a>( - &self, - list_partner: &'a ArrayRef, - ) -> Result { - if !matches!( - 
list_partner.data_type(), - DataType::List(_) | DataType::LargeList(_) | DataType::FixedSizeList(_, _) - ) { - return Err(Error::new( + fn list_element_partner<'a>(&self, list_partner: &'a ArrayRef) -> Result<&'a ArrayRef> { + match list_partner.data_type() { + DataType::List(_) => { + let list_array = list_partner + .as_any() + .downcast_ref::() + .ok_or_else(|| { + Error::new( + ErrorKind::DataInvalid, + "The list partner is not a list array", + ) + })?; + Ok(list_array.values()) + } + DataType::LargeList(_) => { + let list_array = list_partner + .as_any() + .downcast_ref::() + .ok_or_else(|| { + Error::new( + ErrorKind::DataInvalid, + "The list partner is not a large list array", + ) + })?; + Ok(list_array.values()) + } + DataType::FixedSizeList(_, _) => { + let list_array = list_partner + .as_any() + .downcast_ref::() + .ok_or_else(|| { + Error::new( + ErrorKind::DataInvalid, + "The list partner is not a fixed size list array", + ) + })?; + Ok(list_array.values()) + } + _ => Err(Error::new( ErrorKind::DataInvalid, "The list partner is not a list type", - )); - } - Ok(ArrowArrayListIterator { - array: list_partner.clone(), - index: 0, - }) - } - - fn map_element_partner<'a>(&self, map_partner: &'a ArrayRef) -> Result { - if !matches!(map_partner.data_type(), DataType::Map(_, _)) { - return Err(Error::new( - ErrorKind::DataInvalid, - "The map partner is not a map type", - )); + )), } - Ok(ArrowArrayMapIterator { - array: map_partner.clone(), - index: 0, - }) } -} - -struct ArrowArrayListIterator { - array: ArrayRef, - index: usize, -} -impl ListPartnerIterator for ArrowArrayListIterator { - fn next(&mut self) -> Option { - if self.index >= self.array.len() { - return None; - } - if let Some(array) = self.array.as_any().downcast_ref::() { - let result = Some(array.value(self.index)); - self.index += 1; - result - } else if let Some(array) = self.array.as_any().downcast_ref::() { - let result = Some(array.value(self.index)); - self.index += 1; - result - } else if 
let Some(array) = self.array.as_any().downcast_ref::() { - let result = Some(array.value(self.index)); - self.index += 1; - result - } else { - None - } + fn map_key_partner<'a>(&self, map_partner: &'a ArrayRef) -> Result<&'a ArrayRef> { + let map_array = map_partner + .as_any() + .downcast_ref::() + .ok_or_else(|| { + Error::new(ErrorKind::DataInvalid, "The map partner is not a map array") + })?; + Ok(map_array.keys()) } -} - -struct ArrowArrayMapIterator { - array: ArrayRef, - index: usize, -} -impl MapPartnerIterator for ArrowArrayMapIterator { - fn next(&mut self) -> Option<(ArrayRef, ArrayRef)> { - if let Some(array) = self.array.as_any().downcast_ref::() { - let entry = array.value(self.index); - Some((entry.column(0).clone(), entry.column(1).clone())) - } else { - None - } + fn map_value_partner<'a>(&self, map_partner: &'a ArrayRef) -> Result<&'a ArrayRef> { + let map_array = map_partner + .as_any() + .downcast_ref::() + .ok_or_else(|| { + Error::new(ErrorKind::DataInvalid, "The map partner is not a map array") + })?; + Ok(map_array.values()) } } @@ -545,21 +551,21 @@ mod test { use std::collections::HashMap; use std::sync::Arc; + use arrow_array::builder::{Int32Builder, ListBuilder, MapBuilder, StructBuilder}; use arrow_array::{ ArrayRef, BinaryArray, BooleanArray, Date32Array, Decimal128Array, Float32Array, - Float64Array, Int16Array, Int32Array, Int64Array, StringArray, StructArray, - Time64MicrosecondArray, TimestampMicrosecondArray, TimestampNanosecondArray, + Float64Array, Int32Array, Int64Array, StringArray, StructArray, Time64MicrosecondArray, + TimestampMicrosecondArray, TimestampNanosecondArray, }; use arrow_schema::{DataType, Field, Fields, TimeUnit}; use parquet::arrow::PARQUET_FIELD_ID_META_KEY; use super::*; - use crate::spec::{Literal, NestedField, PrimitiveType, StructType, Type}; + use crate::spec::{ListType, Literal, MapType, NestedField, PrimitiveType, StructType, Type}; #[test] fn test_arrow_struct_to_iceberg_struct() { let bool_array = 
BooleanArray::from(vec![Some(true), Some(false), None]); - let int16_array = Int16Array::from(vec![Some(1), Some(2), None]); let int32_array = Int32Array::from(vec![Some(3), Some(4), None]); let int64_array = Int64Array::from(vec![Some(5), Some(6), None]); let float32_array = Float32Array::from(vec![Some(1.1), Some(2.2), None]); @@ -585,71 +591,113 @@ mod test { let struct_array = Arc::new(StructArray::from(vec![ ( - Arc::new(Field::new("bool_field", DataType::Boolean, true)), + Arc::new( + Field::new("bool_field", DataType::Boolean, true).with_metadata(HashMap::from( + [(PARQUET_FIELD_ID_META_KEY.to_string(), "0".to_string())], + )), + ), Arc::new(bool_array) as ArrayRef, ), ( - Arc::new(Field::new("int16_field", DataType::Int16, true)), - Arc::new(int16_array) as ArrayRef, - ), - ( - Arc::new(Field::new("int32_field", DataType::Int32, true)), + Arc::new( + Field::new("int32_field", DataType::Int32, true).with_metadata(HashMap::from( + [(PARQUET_FIELD_ID_META_KEY.to_string(), "2".to_string())], + )), + ), Arc::new(int32_array) as ArrayRef, ), ( - Arc::new(Field::new("int64_field", DataType::Int64, true)), + Arc::new( + Field::new("int64_field", DataType::Int64, true).with_metadata(HashMap::from( + [(PARQUET_FIELD_ID_META_KEY.to_string(), "3".to_string())], + )), + ), Arc::new(int64_array) as ArrayRef, ), ( - Arc::new(Field::new("float32_field", DataType::Float32, true)), + Arc::new( + Field::new("float32_field", DataType::Float32, true).with_metadata( + HashMap::from([(PARQUET_FIELD_ID_META_KEY.to_string(), "4".to_string())]), + ), + ), Arc::new(float32_array) as ArrayRef, ), ( - Arc::new(Field::new("float64_field", DataType::Float64, true)), + Arc::new( + Field::new("float64_field", DataType::Float64, true).with_metadata( + HashMap::from([(PARQUET_FIELD_ID_META_KEY.to_string(), "5".to_string())]), + ), + ), Arc::new(float64_array) as ArrayRef, ), ( - Arc::new(Field::new( - "decimal_field", - DataType::Decimal128(10, 2), - true, - )), + Arc::new( + 
Field::new("decimal_field", DataType::Decimal128(10, 2), true).with_metadata( + HashMap::from([(PARQUET_FIELD_ID_META_KEY.to_string(), "6".to_string())]), + ), + ), Arc::new(decimal_array) as ArrayRef, ), ( - Arc::new(Field::new("date_field", DataType::Date32, true)), + Arc::new( + Field::new("date_field", DataType::Date32, true).with_metadata(HashMap::from( + [(PARQUET_FIELD_ID_META_KEY.to_string(), "7".to_string())], + )), + ), Arc::new(date_array) as ArrayRef, ), ( - Arc::new(Field::new( - "time_field", - DataType::Time64(TimeUnit::Microsecond), - true, - )), + Arc::new( + Field::new("time_field", DataType::Time64(TimeUnit::Microsecond), true) + .with_metadata(HashMap::from([( + PARQUET_FIELD_ID_META_KEY.to_string(), + "8".to_string(), + )])), + ), Arc::new(time_array) as ArrayRef, ), ( - Arc::new(Field::new( - "timestamp_micro_field", - DataType::Timestamp(TimeUnit::Microsecond, None), - true, - )), + Arc::new( + Field::new( + "timestamp_micro_field", + DataType::Timestamp(TimeUnit::Microsecond, None), + true, + ) + .with_metadata(HashMap::from([( + PARQUET_FIELD_ID_META_KEY.to_string(), + "9".to_string(), + )])), + ), Arc::new(timestamp_micro_array) as ArrayRef, ), ( - Arc::new(Field::new( - "timestamp_nano_field", - DataType::Timestamp(TimeUnit::Nanosecond, None), - true, - )), + Arc::new( + Field::new( + "timestamp_nano_field", + DataType::Timestamp(TimeUnit::Nanosecond, None), + true, + ) + .with_metadata(HashMap::from([( + PARQUET_FIELD_ID_META_KEY.to_string(), + "10".to_string(), + )])), + ), Arc::new(timestamp_nano_array) as ArrayRef, ), ( - Arc::new(Field::new("string_field", DataType::Utf8, true)), + Arc::new( + Field::new("string_field", DataType::Utf8, true).with_metadata(HashMap::from( + [(PARQUET_FIELD_ID_META_KEY.to_string(), "11".to_string())], + )), + ), Arc::new(string_array) as ArrayRef, ), ( - Arc::new(Field::new("binary_field", DataType::Binary, true)), + Arc::new( + Field::new("binary_field", DataType::Binary, true).with_metadata( + 
HashMap::from([(PARQUET_FIELD_ID_META_KEY.to_string(), "12".to_string())]), + ), + ), Arc::new(binary_array) as ArrayRef, ), ])) as ArrayRef; @@ -660,11 +708,6 @@ mod test { "bool_field", Type::Primitive(PrimitiveType::Boolean), )), - Arc::new(NestedField::optional( - 1, - "int16_field", - Type::Primitive(PrimitiveType::Int), - )), Arc::new(NestedField::optional( 2, "int32_field", @@ -730,7 +773,6 @@ mod test { assert_eq!(result, vec![ Some(Literal::Struct(Struct::from_iter(vec![ Some(Literal::bool(true)), - Some(Literal::int(1)), Some(Literal::int(3)), Some(Literal::long(5)), Some(Literal::float(1.1)), @@ -745,7 +787,6 @@ mod test { ]))), Some(Literal::Struct(Struct::from_iter(vec![ Some(Literal::bool(false)), - Some(Literal::int(2)), Some(Literal::int(4)), Some(Literal::long(6)), Some(Literal::float(2.2)), @@ -759,24 +800,88 @@ mod test { Some(Literal::binary(b"def".to_vec())), ]))), Some(Literal::Struct(Struct::from_iter(vec![ - None, None, None, None, None, None, None, None, None, None, None, None, None, + None, None, None, None, None, None, None, None, None, None, None, None, ]))), ]); } #[test] - fn test_single_column_nullable_struct() { - let struct_array = Arc::new(StructArray::new_null( - Fields::from(vec![Field::new("bool_field", DataType::Boolean, true)]), - 3, - )) as ArrayRef; - let iceberg_struct_type = StructType::new(vec![Arc::new(NestedField::optional( - 0, - "bool_field", - Type::Primitive(PrimitiveType::Boolean), - ))]); + fn test_nullable_struct() { + // test case that partial columns are null + // [ + // {a: null, b: null} // child column is null + // {a: 1, b: null}, // partial child column is null + // null // parent column is null + // ] + let struct_array = { + let mut builder = StructBuilder::from_fields( + Fields::from(vec![ + Field::new("a", DataType::Int32, true).with_metadata(HashMap::from([( + PARQUET_FIELD_ID_META_KEY.to_string(), + "0".to_string(), + )])), + Field::new("b", DataType::Int32, true).with_metadata(HashMap::from([( + 
PARQUET_FIELD_ID_META_KEY.to_string(), + "1".to_string(), + )])), + ]), + 3, + ); + builder + .field_builder::(0) + .unwrap() + .append_null(); + builder + .field_builder::(1) + .unwrap() + .append_null(); + builder.append(true); + + builder + .field_builder::(0) + .unwrap() + .append_value(1); + builder + .field_builder::(1) + .unwrap() + .append_null(); + builder.append(true); + + builder + .field_builder::(0) + .unwrap() + .append_value(1); + builder + .field_builder::(1) + .unwrap() + .append_value(1); + builder.append_null(); + + Arc::new(builder.finish()) as ArrayRef + }; + + let iceberg_struct_type = StructType::new(vec![ + Arc::new(NestedField::optional( + 0, + "a", + Type::Primitive(PrimitiveType::Int), + )), + Arc::new(NestedField::optional( + 1, + "b", + Type::Primitive(PrimitiveType::Int), + )), + ]); + let result = arrow_struct_to_literal(&struct_array, &iceberg_struct_type).unwrap(); - assert_eq!(result, vec![None; 3]); + assert_eq!(result, vec![ + Some(Literal::Struct(Struct::from_iter(vec![None, None,]))), + Some(Literal::Struct(Struct::from_iter(vec![ + Some(Literal::int(1)), + None, + ]))), + None, + ]); } #[test] @@ -788,92 +893,329 @@ mod test { } #[test] - fn test_arrow_struct_to_iceberg_struct_from_field_id() { - let bool_array = BooleanArray::from(vec![Some(true), Some(false), None]); - let int16_array = Int16Array::from(vec![Some(1), Some(2), None]); - let int32_array = Int32Array::from(vec![Some(3), Some(4), None]); - let int64_array = Int64Array::from(vec![Some(5), Some(6), None]); - let float32_array = Float32Array::from(vec![Some(1.1), Some(2.2), None]); - let struct_array = Arc::new(StructArray::from(vec![ - ( - Arc::new( - Field::new("bool_field", DataType::Boolean, true).with_metadata(HashMap::from( - [(PARQUET_FIELD_ID_META_KEY.to_string(), "2".to_string())], - )), - ), - Arc::new(bool_array) as ArrayRef, - ), - ( - Arc::new( - Field::new("int16_field", DataType::Int16, true).with_metadata(HashMap::from( - 
[(PARQUET_FIELD_ID_META_KEY.to_string(), "1".to_string())], - )), - ), - Arc::new(int16_array) as ArrayRef, - ), - ( - Arc::new( - Field::new("int32_field", DataType::Int32, true).with_metadata(HashMap::from( - [(PARQUET_FIELD_ID_META_KEY.to_string(), "4".to_string())], - )), - ), - Arc::new(int32_array) as ArrayRef, - ), - ( - Arc::new( - Field::new("int64_field", DataType::Int64, true).with_metadata(HashMap::from( - [(PARQUET_FIELD_ID_META_KEY.to_string(), "3".to_string())], - )), - ), - Arc::new(int64_array) as ArrayRef, - ), - ( - Arc::new( - Field::new("float32_field", DataType::Float32, true).with_metadata( - HashMap::from([(PARQUET_FIELD_ID_META_KEY.to_string(), "5".to_string())]), - ), - ), - Arc::new(float32_array) as ArrayRef, - ), - ])) as ArrayRef; + fn test_complex_nested() { + // complex nested type for test + // < + // A: list< struct(a1: int, a2: int) >, + // B: list< map >, + // C: list< list >, + // > let struct_type = StructType::new(vec![ - Arc::new(NestedField::optional( - 1, - "int16_field", - Type::Primitive(PrimitiveType::Int), - )), - Arc::new(NestedField::optional( - 2, - "bool_field", - Type::Primitive(PrimitiveType::Boolean), - )), - Arc::new(NestedField::optional( - 3, - "int64_field", - Type::Primitive(PrimitiveType::Long), + Arc::new(NestedField::required( + 0, + "A", + Type::List(ListType::new(Arc::new(NestedField::required( + 1, + "item", + Type::Struct(StructType::new(vec![ + Arc::new(NestedField::required( + 2, + "a1", + Type::Primitive(PrimitiveType::Int), + )), + Arc::new(NestedField::required( + 3, + "a2", + Type::Primitive(PrimitiveType::Int), + )), + ])), + )))), )), - Arc::new(NestedField::optional( + Arc::new(NestedField::required( 4, - "int32_field", - Type::Primitive(PrimitiveType::Int), + "B", + Type::List(ListType::new(Arc::new(NestedField::required( + 5, + "item", + Type::Map(MapType::new( + NestedField::optional(6, "keys", Type::Primitive(PrimitiveType::Int)) + .into(), + NestedField::optional(7, "values", 
Type::Primitive(PrimitiveType::Int)) + .into(), + )), + )))), + )), + Arc::new(NestedField::required( + 8, + "C", + Type::List(ListType::new(Arc::new(NestedField::required( + 9, + "item", + Type::List(ListType::new(Arc::new(NestedField::optional( + 10, + "item", + Type::Primitive(PrimitiveType::Int), + )))), + )))), )), ]); + + // Generate a complex nested struct array + // [ + // {A: [{a1: 10, a2: 20}, {a1: 11, a2: 21}], B: [{(1,100),(3,300)},{(2,200)}], C: [[100,101,102], [200,201]]}, + // {A: [{a1: 12, a2: 22}, {a1: 13, a2: 23}], B: [{(3,300)},{(4,400)}], C: [[300,301,302], [400,401]]}, + // ] + let struct_array = + { + let a_struct_a1_builder = Int32Builder::new(); + let a_struct_a2_builder = Int32Builder::new(); + let a_struct_builder = + StructBuilder::new( + vec![ + Field::new("a1", DataType::Int32, false).with_metadata(HashMap::from( + [(PARQUET_FIELD_ID_META_KEY.to_string(), "2".to_string())], + )), + Field::new("a2", DataType::Int32, false).with_metadata(HashMap::from( + [(PARQUET_FIELD_ID_META_KEY.to_string(), "3".to_string())], + )), + ], + vec![Box::new(a_struct_a1_builder), Box::new(a_struct_a2_builder)], + ); + let a_builder = ListBuilder::new(a_struct_builder); + + let map_key_builder = Int32Builder::new(); + let map_value_builder = Int32Builder::new(); + let map_builder = MapBuilder::new(None, map_key_builder, map_value_builder); + let b_builder = ListBuilder::new(map_builder); + + let inner_list_item_builder = Int32Builder::new(); + let inner_list_builder = ListBuilder::new(inner_list_item_builder); + let c_builder = ListBuilder::new(inner_list_builder); + + let mut top_struct_builder = { + let a_struct_type = + DataType::Struct(Fields::from(vec![ + Field::new("a1", DataType::Int32, false).with_metadata(HashMap::from( + [(PARQUET_FIELD_ID_META_KEY.to_string(), "2".to_string())], + )), + Field::new("a2", DataType::Int32, false).with_metadata(HashMap::from( + [(PARQUET_FIELD_ID_META_KEY.to_string(), "3".to_string())], + )), + ])); + let a_type = + 
DataType::List(Arc::new(Field::new("item", a_struct_type.clone(), true))); + + let b_map_entry_struct = Field::new( + "entries", + DataType::Struct(Fields::from(vec![ + Field::new("keys", DataType::Int32, false), + Field::new("values", DataType::Int32, true), + ])), + false, + ); + let b_map_type = + DataType::Map(Arc::new(b_map_entry_struct), /* sorted_keys = */ false); + let b_type = + DataType::List(Arc::new(Field::new("item", b_map_type.clone(), true))); + + let c_inner_list_type = + DataType::List(Arc::new(Field::new("item", DataType::Int32, true))); + let c_type = DataType::List(Arc::new(Field::new( + "item", + c_inner_list_type.clone(), + true, + ))); + StructBuilder::new( + Fields::from(vec![ + Field::new("A", a_type.clone(), false).with_metadata(HashMap::from([ + (PARQUET_FIELD_ID_META_KEY.to_string(), "0".to_string()), + ])), + Field::new("B", b_type.clone(), false).with_metadata(HashMap::from([ + (PARQUET_FIELD_ID_META_KEY.to_string(), "4".to_string()), + ])), + Field::new("C", c_type.clone(), false).with_metadata(HashMap::from([ + (PARQUET_FIELD_ID_META_KEY.to_string(), "8".to_string()), + ])), + ]), + vec![ + Box::new(a_builder), + Box::new(b_builder), + Box::new(c_builder), + ], + ) + }; + + // first row + // {A: [{a1: 10, a2: 20}, {a1: 11, a2: 21}], B: [{(1,100),(3,300)},{(2,200)}], C: [[100,101,102], [200,201]]}, + { + let a_builder = top_struct_builder + .field_builder::>(0) + .unwrap(); + let struct_builder = a_builder.values(); + struct_builder + .field_builder::(0) + .unwrap() + .append_value(10); + struct_builder + .field_builder::(1) + .unwrap() + .append_value(20); + struct_builder.append(true); + let struct_builder = a_builder.values(); + struct_builder + .field_builder::(0) + .unwrap() + .append_value(11); + struct_builder + .field_builder::(1) + .unwrap() + .append_value(21); + struct_builder.append(true); + a_builder.append(true); + } + { + let b_builder = top_struct_builder + .field_builder::>>(1) + .unwrap(); + let map_builder = 
b_builder.values(); + map_builder.keys().append_value(1); + map_builder.values().append_value(100); + map_builder.keys().append_value(3); + map_builder.values().append_value(300); + map_builder.append(true).unwrap(); + + map_builder.keys().append_value(2); + map_builder.values().append_value(200); + map_builder.append(true).unwrap(); + + b_builder.append(true); + } + { + let c_builder = top_struct_builder + .field_builder::>>(2) + .unwrap(); + let inner_list_builder = c_builder.values(); + inner_list_builder.values().append_value(100); + inner_list_builder.values().append_value(101); + inner_list_builder.values().append_value(102); + inner_list_builder.append(true); + let inner_list_builder = c_builder.values(); + inner_list_builder.values().append_value(200); + inner_list_builder.values().append_value(201); + inner_list_builder.append(true); + c_builder.append(true); + } + top_struct_builder.append(true); + + // second row + // {A: [{a1: 12, a2: 22}, {a1: 13, a2: 23}], B: [{(3,300)}], C: [[300,301,302], [400,401]]}, + { + let a_builder = top_struct_builder + .field_builder::>(0) + .unwrap(); + let struct_builder = a_builder.values(); + struct_builder + .field_builder::(0) + .unwrap() + .append_value(12); + struct_builder + .field_builder::(1) + .unwrap() + .append_value(22); + struct_builder.append(true); + let struct_builder = a_builder.values(); + struct_builder + .field_builder::(0) + .unwrap() + .append_value(13); + struct_builder + .field_builder::(1) + .unwrap() + .append_value(23); + struct_builder.append(true); + a_builder.append(true); + } + { + let b_builder = top_struct_builder + .field_builder::>>(1) + .unwrap(); + let map_builder = b_builder.values(); + map_builder.keys().append_value(3); + map_builder.values().append_value(300); + map_builder.append(true).unwrap(); + + b_builder.append(true); + } + { + let c_builder = top_struct_builder + .field_builder::>>(2) + .unwrap(); + let inner_list_builder = c_builder.values(); + 
inner_list_builder.values().append_value(300); + inner_list_builder.values().append_value(301); + inner_list_builder.values().append_value(302); + inner_list_builder.append(true); + let inner_list_builder = c_builder.values(); + inner_list_builder.values().append_value(400); + inner_list_builder.values().append_value(401); + inner_list_builder.append(true); + c_builder.append(true); + } + top_struct_builder.append(true); + + Arc::new(top_struct_builder.finish()) as ArrayRef + }; + let result = arrow_struct_to_literal(&struct_array, &struct_type).unwrap(); assert_eq!(result, vec![ Some(Literal::Struct(Struct::from_iter(vec![ - Some(Literal::int(1)), - Some(Literal::bool(true)), - Some(Literal::long(5)), - Some(Literal::int(3)), - ]))), - Some(Literal::Struct(Struct::from_iter(vec![ - Some(Literal::int(2)), - Some(Literal::bool(false)), - Some(Literal::long(6)), - Some(Literal::int(4)), + Some(Literal::List(vec![ + Some(Literal::Struct(Struct::from_iter(vec![ + Some(Literal::int(10)), + Some(Literal::int(20)), + ]))), + Some(Literal::Struct(Struct::from_iter(vec![ + Some(Literal::int(11)), + Some(Literal::int(21)), + ]))), + ])), + Some(Literal::List(vec![ + Some(Literal::Map(Map::from_iter(vec![ + (Literal::int(1), Some(Literal::int(100))), + (Literal::int(3), Some(Literal::int(300))), + ]))), + Some(Literal::Map(Map::from_iter(vec![( + Literal::int(2), + Some(Literal::int(200)) + ),]))), + ])), + Some(Literal::List(vec![ + Some(Literal::List(vec![ + Some(Literal::int(100)), + Some(Literal::int(101)), + Some(Literal::int(102)), + ])), + Some(Literal::List(vec![ + Some(Literal::int(200)), + Some(Literal::int(201)), + ])), + ])), ]))), Some(Literal::Struct(Struct::from_iter(vec![ - None, None, None, None, + Some(Literal::List(vec![ + Some(Literal::Struct(Struct::from_iter(vec![ + Some(Literal::int(12)), + Some(Literal::int(22)), + ]))), + Some(Literal::Struct(Struct::from_iter(vec![ + Some(Literal::int(13)), + Some(Literal::int(23)), + ]))), + ])), + 
Some(Literal::List(vec![Some(Literal::Map(Map::from_iter( + vec![(Literal::int(3), Some(Literal::int(300))),] + ))),])), + Some(Literal::List(vec![ + Some(Literal::List(vec![ + Some(Literal::int(300)), + Some(Literal::int(301)), + Some(Literal::int(302)), + ])), + Some(Literal::List(vec![ + Some(Literal::int(400)), + Some(Literal::int(401)), + ])), + ])), ]))), ]); } From 3e958e2e1d48bf0322d23aa35788056c88fa35e4 Mon Sep 17 00:00:00 2001 From: ZENOTME Date: Mon, 24 Feb 2025 18:24:06 +0800 Subject: [PATCH 09/11] refine --- crates/iceberg/src/arrow/value.rs | 35 +++++++++++++++++-------------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/crates/iceberg/src/arrow/value.rs b/crates/iceberg/src/arrow/value.rs index d08d8f2d16..d78c4f4400 100644 --- a/crates/iceberg/src/arrow/value.rs +++ b/crates/iceberg/src/arrow/value.rs @@ -26,14 +26,14 @@ use uuid::Uuid; use super::get_field_id; use crate::spec::{ - visit_struct_with_partner, Literal, Map, PartnerAccessor, PrimitiveType, - SchemaWithPartnerVisitor, Struct, StructType, + visit_struct_with_partner, ListType, Literal, Map, MapType, NestedField, PartnerAccessor, + PrimitiveType, SchemaWithPartnerVisitor, Struct, StructType, }; use crate::{Error, ErrorKind, Result}; -struct ArrowArrayConverter; +struct ArrowArrayToIcebergStructConverter; -impl SchemaWithPartnerVisitor for ArrowArrayConverter { +impl SchemaWithPartnerVisitor for ArrowArrayToIcebergStructConverter { type T = Vec>; fn schema( @@ -56,7 +56,9 @@ impl SchemaWithPartnerVisitor for ArrowArrayConverter { return Err(Error::new( ErrorKind::DataInvalid, "The field is required but has null value", - )); + ) + .with_context("field_id", field.id.to_string()) + .with_context("field_name", &field.name)); } Ok(value) } @@ -68,11 +70,13 @@ impl SchemaWithPartnerVisitor for ArrowArrayConverter { results: Vec>>, ) -> Result>> { let row_len = results.first().map(|column| column.len()).unwrap_or(0); - if results.iter().any(|column| column.len() != row_len) { + 
if let Some(col) = results.iter().find(|col| col.len() != row_len) { return Err(Error::new( ErrorKind::DataInvalid, "The struct columns have different row length", - )); + ) + .with_context("first col length", row_len.to_string()) + .with_context("actual col length", col.len().to_string())); } let mut struct_literals = Vec::with_capacity(row_len); @@ -98,7 +102,7 @@ impl SchemaWithPartnerVisitor for ArrowArrayConverter { fn list( &mut self, - list: &crate::spec::ListType, + list: &ListType, array: &ArrayRef, elements: Vec>, ) -> Result>> { @@ -164,7 +168,7 @@ impl SchemaWithPartnerVisitor for ArrowArrayConverter { fn map( &mut self, - _map: &crate::spec::MapType, + _map: &MapType, partner: &ArrayRef, key_values: Vec>, values: Vec>, @@ -437,8 +441,7 @@ impl PartnerAccessor for ArrowArrayAccessor { fn field_partner<'a>( &self, struct_partner: &'a ArrayRef, - field_id: i32, - _field_name: &str, + field: &NestedField, ) -> Result<&'a ArrayRef> { let struct_array = struct_partner .as_any() @@ -452,15 +455,15 @@ impl PartnerAccessor for ArrowArrayAccessor { let field_pos = struct_array .fields() .iter() - .position(|field| { - get_field_id(field) - .map(|id| id == field_id) + .position(|arrow_field| { + get_field_id(arrow_field) + .map(|id| id == field.id) .unwrap_or(false) }) .ok_or_else(|| { Error::new( ErrorKind::DataInvalid, - format!("Field id {} not found in struct array", field_id), + format!("Field id {} not found in struct array", field.id), ) })?; Ok(struct_array.column(field_pos)) @@ -541,7 +544,7 @@ pub fn arrow_struct_to_literal( visit_struct_with_partner( ty, struct_array, - &mut ArrowArrayConverter, + &mut ArrowArrayToIcebergStructConverter, &ArrowArrayAccessor, ) } From df3dd40f6d9d2e47794b8f8a5e32de34fe5919af Mon Sep 17 00:00:00 2001 From: ZENOTME Date: Mon, 24 Feb 2025 18:35:49 +0800 Subject: [PATCH 10/11] fix rebase --- crates/iceberg/src/spec/schema/mod.rs | 2 +- crates/iceberg/src/spec/schema/visitor.rs | 118 +++++++--------------- 2 files changed, 
36 insertions(+), 84 deletions(-) diff --git a/crates/iceberg/src/spec/schema/mod.rs b/crates/iceberg/src/spec/schema/mod.rs index 092fa25162..b95244f42d 100644 --- a/crates/iceberg/src/spec/schema/mod.rs +++ b/crates/iceberg/src/spec/schema/mod.rs @@ -23,7 +23,7 @@ use std::sync::Arc; mod utils; mod visitor; -pub use self::visitor::{visit_schema, visit_struct, visit_type, SchemaVisitor}; +pub use self::visitor::*; pub(super) mod _serde; mod id_reassigner; mod index; diff --git a/crates/iceberg/src/spec/schema/visitor.rs b/crates/iceberg/src/spec/schema/visitor.rs index 4f9cf4ebf2..0008635bfa 100644 --- a/crates/iceberg/src/spec/schema/visitor.rs +++ b/crates/iceberg/src/spec/schema/visitor.rs @@ -162,12 +162,6 @@ pub trait SchemaWithPartnerVisitor

<P> { Ok(()) } - /// Called before every type, if this function return `Some`, the following visiting will be skipped. - /// This function used to implement early return. - fn visit_type_before(&mut self, _ty: &Type, _partner: &P) -> Result<Option<Self::T>> { - return Ok(None); - } - /// Called after schema's type visited. fn schema(&mut self, schema: &Schema, partner: &P, value: Self::T) -> Result<Self::T>; /// Called after struct's field type visited. @@ -180,14 +174,14 @@ pub trait SchemaWithPartnerVisitor

<P> { results: Vec<Self::T>, ) -> Result<Self::T>; /// Called after list fields visited. - fn list(&mut self, list: &ListType, partner: &P, value: Vec<Self::T>) -> Result<Self::T>; + fn list(&mut self, list: &ListType, partner: &P, value: Self::T) -> Result<Self::T>; /// Called after map's key and value fields visited. fn map( &mut self, map: &MapType, partner: &P, - key_value: Vec<Self::T>, - value: Vec<Self::T>, + key_value: Self::T, + value: Self::T, ) -> Result<Self::T>; /// Called when see a primitive type. fn primitive(&mut self, p: &PrimitiveType, partner: &P) -> Result<Self::T>; @@ -195,32 +189,16 @@ pub trait SchemaWithPartnerVisitor

<P> { /// Accessor used to get child partner from parent partner. pub trait PartnerAccessor<P> { - /// List partner iterator. - type L: ListPartnerIterator<P>; - /// Map partner iterator. - type M: MapPartnerIterator<P>; - /// Get the struct partner from schema partner. fn struct_parner<'a>(&self, schema_partner: &'a P) -> Result<&'a P>; /// Get the field partner from struct partner. - fn field_partner<'a>(&self, struct_partner: &'a P, field_id: i32, field: &str) - -> Result<&'a P>; + fn field_partner<'a>(&self, struct_partner: &'a P, field: &NestedField) -> Result<&'a P>; /// Get the list element partner from list partner. - fn list_element_partner<'a>(&self, list_partner: &'a P) -> Result<Self::L>; + fn list_element_partner<'a>(&self, list_partner: &'a P) -> Result<&'a P>; /// Get the map key partner from map partner. - fn map_element_partner<'a>(&self, map_partner: &'a P) -> Result<Self::M>; -} - -/// Iterator for list partner. -pub trait ListPartnerIterator

<P> { - /// Get the next partner. - fn next(&mut self) -> Option<P>; -} - -/// Iterator for map partner. -pub trait MapPartnerIterator<P>
{ - /// Get the next partner. - fn next(&mut self) -> Option<(P, P)>; + fn map_key_partner<'a>(&self, map_partner: &'a P) -> Result<&'a P>; + /// Get the map value partner from map partner. + fn map_value_partner<'a>(&self, map_partner: &'a P) -> Result<&'a P>; } /// Visiting a type in post order. @@ -230,61 +208,38 @@ pub fn visit_type_with_partner, A: PartnerAcce visitor: &mut V, accessor: &A, ) -> Result { - if let Some(res) = visitor.visit_type_before(r#type, partner)? { - return Ok(res); - } match r#type { Type::Primitive(p) => visitor.primitive(p, partner), Type::List(list) => { - let mut results = Vec::new(); - let mut list_element_partner_iter = accessor.list_element_partner(partner)?; - if let Some(list_element_partner) = list_element_partner_iter.next() { - visitor.before_list_element(&list.element_field, &list_element_partner)?; - let value = visit_type_with_partner( - &list.element_field.field_type, - &list_element_partner, - visitor, - accessor, - )?; - visitor.after_list_element(&list.element_field, &list_element_partner)?; - results.push(value); - } - visitor.list(list, partner, results) + let list_element_partner = accessor.list_element_partner(partner)?; + visitor.before_list_element(&list.element_field, list_element_partner)?; + let element_results = visit_type_with_partner( + &list.element_field.field_type, + list_element_partner, + visitor, + accessor, + )?; + visitor.after_list_element(&list.element_field, list_element_partner)?; + visitor.list(list, partner, element_results) } Type::Map(map) => { - let mut k_results = Vec::new(); - let mut v_results = Vec::new(); - let mut kv_partner_iter = accessor.map_element_partner(partner)?; - if let Some((k_partner, v_partner)) = kv_partner_iter.next() { - let key_result = { - visitor.before_map_key(&map.key_field, &k_partner)?; - let ret = visit_type_with_partner( - &map.key_field.field_type, - &k_partner, - visitor, - accessor, - )?; - visitor.after_map_key(&map.key_field, &k_partner)?; - ret - }; - - 
let value_result = { - visitor.before_map_value(&map.value_field, &v_partner)?; - let ret = visit_type_with_partner( - &map.value_field.field_type, - &v_partner, - visitor, - accessor, - )?; - visitor.after_map_value(&map.value_field, &v_partner)?; - ret - }; + let key_partner = accessor.map_key_partner(partner)?; + visitor.before_map_key(&map.key_field, key_partner)?; + let key_result = + visit_type_with_partner(&map.key_field.field_type, key_partner, visitor, accessor)?; + visitor.after_map_key(&map.key_field, key_partner)?; - k_results.push(key_result); - v_results.push(value_result); - } + let value_partner = accessor.map_value_partner(partner)?; + visitor.before_map_value(&map.value_field, value_partner)?; + let value_result = visit_type_with_partner( + &map.value_field.field_type, + value_partner, + visitor, + accessor, + )?; + visitor.after_map_value(&map.value_field, value_partner)?; - visitor.map(map, partner, k_results, v_results) + visitor.map(map, partner, key_result, value_result) } Type::Struct(s) => visit_struct_with_partner(s, partner, visitor, accessor), } @@ -297,12 +252,9 @@ pub fn visit_struct_with_partner, A: PartnerAc visitor: &mut V, accessor: &A, ) -> Result { - if let Some(res) = visitor.visit_type_before(&Type::Struct(s.clone()), partner)? 
{ - return Ok(res); - } let mut results = Vec::with_capacity(s.fields().len()); for field in s.fields() { - let field_partner = accessor.field_partner(partner, field.id, &field.name)?; + let field_partner = accessor.field_partner(partner, field)?; visitor.before_struct_field(field, field_partner)?; let result = visit_type_with_partner(&field.field_type, field_partner, visitor, accessor)?; visitor.after_struct_field(field, field_partner)?; @@ -327,4 +279,4 @@ pub fn visit_schema_with_partner, A: PartnerAc accessor, )?; visitor.schema(schema, partner, result) -} \ No newline at end of file +} From 6ddf16014818ab5ead052e08ecc6b82ca4ef23d9 Mon Sep 17 00:00:00 2001 From: ZENOTME Date: Tue, 25 Feb 2025 12:34:38 +0800 Subject: [PATCH 11/11] mark private function --- crates/iceberg/src/spec/schema/visitor.rs | 4 ++-- crates/iceberg/src/spec/values.rs | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/crates/iceberg/src/spec/schema/visitor.rs b/crates/iceberg/src/spec/schema/visitor.rs index 0008635bfa..ebb9b86bba 100644 --- a/crates/iceberg/src/spec/schema/visitor.rs +++ b/crates/iceberg/src/spec/schema/visitor.rs @@ -72,7 +72,7 @@ pub trait SchemaVisitor { } /// Visiting a type in post order. -pub fn visit_type(r#type: &Type, visitor: &mut V) -> Result { +pub(crate) fn visit_type(r#type: &Type, visitor: &mut V) -> Result { match r#type { Type::Primitive(p) => visitor.primitive(p), Type::List(list) => { @@ -202,7 +202,7 @@ pub trait PartnerAccessor

<P> { } /// Visiting a type in post order. -pub fn visit_type_with_partner<P, V: SchemaWithPartnerVisitor<P>, A: PartnerAccessor<P>>( +pub(crate) fn visit_type_with_partner<P, V: SchemaWithPartnerVisitor<P>, A: PartnerAccessor<P>
>( r#type: &Type, partner: &P, visitor: &mut V, diff --git a/crates/iceberg/src/spec/values.rs b/crates/iceberg/src/spec/values.rs index 1d464be591..839d21f06b 100644 --- a/crates/iceberg/src/spec/values.rs +++ b/crates/iceberg/src/spec/values.rs @@ -1565,12 +1565,12 @@ impl Literal { } /// Creates a timestamp from unix epoch in nanoseconds. - pub fn timestamp_nano(value: i64) -> Self { + pub(crate) fn timestamp_nano(value: i64) -> Self { Self::Primitive(PrimitiveLiteral::Long(value)) } /// Creates a timestamp with timezone from unix epoch in nanoseconds. - pub fn timestamptz_nano(value: i64) -> Self { + pub(crate) fn timestamptz_nano(value: i64) -> Self { Self::Primitive(PrimitiveLiteral::Long(value)) }