From 1927dbca92982cbc5b5505f7d5e554b1f63b4321 Mon Sep 17 00:00:00 2001 From: xxchan Date: Thu, 19 Sep 2024 16:24:54 +0800 Subject: [PATCH 1/3] feat: expose arrow type <-> iceberg type Previously we only exposed the schema conversion. Signed-off-by: xxchan --- crates/iceberg/src/arrow/schema.rs | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/crates/iceberg/src/arrow/schema.rs b/crates/iceberg/src/arrow/schema.rs index 2ff43e0f07..a5c93a5384 100644 --- a/crates/iceberg/src/arrow/schema.rs +++ b/crates/iceberg/src/arrow/schema.rs @@ -171,7 +171,6 @@ fn visit_type(r#type: &DataType, visitor: &mut V) -> Resu } /// Visit list types in post order. -#[allow(dead_code)] fn visit_list( data_type: &DataType, element_field: &Field, @@ -184,7 +183,6 @@ fn visit_list( } /// Visit struct type in post order. -#[allow(dead_code)] fn visit_struct(fields: &Fields, visitor: &mut V) -> Result { let mut results = Vec::with_capacity(fields.len()); for field in fields { @@ -198,7 +196,6 @@ fn visit_struct(fields: &Fields, visitor: &mut V) -> Resu } /// Visit schema in post order. -#[allow(dead_code)] fn visit_schema(schema: &ArrowSchema, visitor: &mut V) -> Result { let mut results = Vec::with_capacity(schema.fields().len()); for field in schema.fields() { @@ -211,12 +208,17 @@ fn visit_schema(schema: &ArrowSchema, visitor: &mut V) -> } /// Convert Arrow schema to ceberg schema. -#[allow(dead_code)] pub fn arrow_schema_to_schema(schema: &ArrowSchema) -> Result { let mut visitor = ArrowSchemaConverter::new(); visit_schema(schema, &mut visitor) } +/// Convert Arrow type to iceberg type. +pub fn arrow_type_to_type(ty: &DataType) -> Result { + let mut visitor = ArrowSchemaConverter::new(); + visit_type(ty, &mut visitor) +} + const ARROW_FIELD_DOC_KEY: &str = "doc"; fn get_field_id(field: &Field) -> Result { @@ -246,7 +248,6 @@ fn get_field_doc(field: &Field) -> Option { struct ArrowSchemaConverter; impl ArrowSchemaConverter { - #[allow(dead_code)] fn new() -> Self { Self {} } @@ -615,6 +616,15 @@ pub fn schema_to_arrow_schema(schema: &crate::spec::Schema) -> crate::Result crate::Result { + let mut converter = ToArrowSchemaConverter; + match crate::spec::visit_type(ty, &mut converter)? { + ArrowSchemaOrFieldOrType::Type(ty) => Ok(ty), + _ => unreachable!(), + } +} + /// Convert Iceberg Datum to Arrow Datum. pub(crate) fn get_arrow_datum(datum: &Datum) -> Result> { match (datum.data_type(), datum.literal()) { From 586bd51dfc361411aac4ac46c87e6beccd1c1df2 Mon Sep 17 00:00:00 2001 From: xxchan Date: Fri, 20 Sep 2024 14:17:18 +0800 Subject: [PATCH 2/3] add tests Signed-off-by: xxchan --- crates/iceberg/src/arrow/schema.rs | 88 +++++++++++++++++++++++++++++- 1 file changed, 87 insertions(+), 1 deletion(-) diff --git a/crates/iceberg/src/arrow/schema.rs b/crates/iceberg/src/arrow/schema.rs index a5c93a5384..08600664e8 100644 --- a/crates/iceberg/src/arrow/schema.rs +++ b/crates/iceberg/src/arrow/schema.rs @@ -789,7 +789,7 @@ mod tests { use arrow_schema::{DataType, Field, Schema as ArrowSchema, TimeUnit}; use super::*; - use crate::spec::Schema; + use crate::spec::{Literal, Schema}; /// Create a simple field with metadata. fn simple_field(name: &str, ty: DataType, nullable: bool, value: &str) -> Field { @@ -1375,4 +1375,90 @@ mod tests { let converted_arrow_schema = schema_to_arrow_schema(&schema).unwrap(); assert_eq!(converted_arrow_schema, arrow_schema); } + + #[test] + fn test_type_conversion() { + // test primitive type + { + let arrow_type = DataType::Int32; + let iceberg_type = Type::Primitive(PrimitiveType::Int); + assert_eq!(arrow_type, type_to_arrow_type(&iceberg_type).unwrap()); + assert_eq!(iceberg_type, arrow_type_to_type(&arrow_type).unwrap()); + } + + // test struct type + { + // no metadata will cause error + let arrow_type = DataType::Struct(Fields::from(vec![ + Field::new("a", DataType::Int64, false), + Field::new("b", DataType::Utf8, true), + ])); + assert_eq!( + &arrow_type_to_type(&arrow_type).unwrap_err().to_string(), + "DataInvalid => Field id not found in metadata" + ); + + let arrow_type = DataType::Struct(Fields::from(vec![ + Field::new("a", DataType::Int64, false).with_metadata(HashMap::from_iter([( + PARQUET_FIELD_ID_META_KEY.to_string(), + 1.to_string(), + )])), + Field::new("b", DataType::Utf8, true).with_metadata(HashMap::from_iter([( + PARQUET_FIELD_ID_META_KEY.to_string(), + 2.to_string(), + )])), + ])); + let iceberg_type = Type::Struct(StructType::new(vec![ + NestedField { + id: 1, + doc: None, + name: "a".to_string(), + required: true, + field_type: Box::new(Type::Primitive(PrimitiveType::Long)), + initial_default: None, + write_default: None, + } + .into(), + NestedField { + id: 2, + doc: None, + name: "b".to_string(), + required: false, + field_type: Box::new(Type::Primitive(PrimitiveType::String)), + initial_default: None, + write_default: None, + } + .into(), + ])); + assert_eq!(iceberg_type, arrow_type_to_type(&arrow_type).unwrap()); + assert_eq!(arrow_type, type_to_arrow_type(&iceberg_type).unwrap()); + + // initial_default and write_default is ignored + let iceberg_type = Type::Struct(StructType::new(vec![ + NestedField { + id: 1, + doc: None, + name: "a".to_string(), + required: true, + field_type: Box::new(Type::Primitive(PrimitiveType::Long)), + initial_default: Some(Literal::Primitive(PrimitiveLiteral::Int(114514))), + write_default: None, + } + .into(), + NestedField { + id: 2, + doc: None, + name: "b".to_string(), + required: false, + field_type: Box::new(Type::Primitive(PrimitiveType::String)), + initial_default: None, + write_default: Some(Literal::Primitive(PrimitiveLiteral::String( + "514".to_string(), + ))), + } + .into(), + ])); + assert_eq!(arrow_type, type_to_arrow_type(&iceberg_type).unwrap()); + } + } } From 470951c382d0debe3b69f20c8ec8f7fbec0c4b27 Mon Sep 17 00:00:00 2001 From: xxchan Date: Fri, 20 Sep 2024 17:26:39 +0800 Subject: [PATCH 3/3] add Sync to TransformFunction Signed-off-by: xxchan --- crates/iceberg/src/transform/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/iceberg/src/transform/mod.rs b/crates/iceberg/src/transform/mod.rs index 72b1797547..81449da559 100644 --- a/crates/iceberg/src/transform/mod.rs +++ b/crates/iceberg/src/transform/mod.rs @@ -29,7 +29,7 @@ mod truncate; mod void; /// TransformFunction is a trait that defines the interface for all transform functions. -pub trait TransformFunction: Send { +pub trait TransformFunction: Send + Sync { /// transform will take an input array and transform it into a new array. /// The implementation of this function will need to check and downcast the input to specific /// type.