From 1eb69504b6b8d59fe5513a2bbef9c2b861848559 Mon Sep 17 00:00:00 2001 From: Christian Thiel Date: Sun, 8 Sep 2024 11:33:18 +0200 Subject: [PATCH 1/6] Reassign field ids for schema --- crates/iceberg/src/spec/datatypes.rs | 6 + crates/iceberg/src/spec/schema.rs | 312 ++++++++++++++++++++++++++- 2 files changed, 315 insertions(+), 3 deletions(-) diff --git a/crates/iceberg/src/spec/datatypes.rs b/crates/iceberg/src/spec/datatypes.rs index d382459600..bce10ad5f7 100644 --- a/crates/iceberg/src/spec/datatypes.rs +++ b/crates/iceberg/src/spec/datatypes.rs @@ -668,6 +668,12 @@ impl NestedField { self.write_default = Some(value); self } + + /// Set the id of the field. + pub(crate) fn with_id(mut self, id: i32) -> Self { + self.id = id; + self + } } impl fmt::Display for NestedField { diff --git a/crates/iceberg/src/spec/schema.rs b/crates/iceberg/src/spec/schema.rs index 106bfb1d84..7b24aa25fd 100644 --- a/crates/iceberg/src/spec/schema.rs +++ b/crates/iceberg/src/spec/schema.rs @@ -39,7 +39,7 @@ use crate::{ensure_data_valid, Error, ErrorKind}; pub type SchemaId = i32; /// Reference to [`Schema`]. pub type SchemaRef = Arc; -const DEFAULT_SCHEMA_ID: SchemaId = 0; +pub(crate) const DEFAULT_SCHEMA_ID: SchemaId = 0; /// Defines schema in iceberg. #[derive(Debug, Serialize, Deserialize, Clone)] @@ -77,6 +77,7 @@ pub struct SchemaBuilder { fields: Vec, alias_to_id: BiHashMap, identifier_field_ids: HashSet, + reassign_field_ids_from: Option, } impl SchemaBuilder { @@ -86,6 +87,16 @@ impl SchemaBuilder { self } + /// Reassign all field-ids (nested) on build. + /// If `start_from` is provided, it will start reassigning from that id (inclusive). + /// If not provided, it will start from 0. + /// + /// All specified aliases and identifier fields will be updated to the new field-ids. + pub fn with_reassigned_field_ids(mut self, start_from: Option) -> Self { + self.reassign_field_ids_from = Some(start_from.unwrap_or(0)); + self + } + /// Set schema id. pub fn with_schema_id(mut self, schema_id: i32) -> Self { self.schema_id = schema_id; @@ -105,13 +116,24 @@ impl SchemaBuilder { } /// Builds the schema. - pub fn build(self) -> Result { - let highest_field_id = self.fields.iter().map(|f| f.id).max().unwrap_or(0); + pub fn build(mut self) -> Result { + let mut highest_field_id = None; + if let Some(start_from) = self.reassign_field_ids_from { + let mut id_reassigner = ReassignFieldIds::new(start_from); + self.fields = id_reassigner.reassign_field_ids(self.fields); + highest_field_id = Some(id_reassigner.next_field_id - 1); + + self.identifier_field_ids = + id_reassigner.apply_to_identifier_fields(self.identifier_field_ids)?; + self.alias_to_id = id_reassigner.apply_to_aliases(self.alias_to_id)?; + } let field_id_to_accessor = self.build_accessors(); let r#struct = StructType::new(self.fields); let id_to_field = index_by_id(&r#struct)?; + let highest_field_id = + highest_field_id.unwrap_or(id_to_field.keys().max().cloned().unwrap_or(0)); Self::validate_identifier_ids( &r#struct, @@ -266,6 +288,7 @@ impl Schema { fields: vec![], identifier_field_ids: HashSet::default(), alias_to_id: BiHashMap::default(), + reassign_field_ids_from: None, } } @@ -276,6 +299,7 @@ impl Schema { fields: self.r#struct.fields().to_vec(), alias_to_id: self.alias_to_id, identifier_field_ids: self.identifier_field_ids, + reassign_field_ids_from: None, } } @@ -944,6 +968,122 @@ impl SchemaVisitor for PruneColumn { } } +struct ReassignFieldIds { + next_field_id: i32, + old_to_new_id: HashMap, +} + +// We are not using the visitor here, as post order traversal is not desired. +// Instead we want to re-assign all fields on one level first before diving deeper. +impl ReassignFieldIds { + fn new(start_from: i32) -> Self { + Self { + next_field_id: start_from, + old_to_new_id: HashMap::new(), + } + } + + fn reassign_field_ids(&mut self, fields: Vec) -> Vec { + // Visit fields on the same level first + let outer_fields = fields + .into_iter() + .map(|field| { + self.old_to_new_id.insert(field.id, self.next_field_id); + let new_field = Arc::unwrap_or_clone(field).with_id(self.next_field_id); + self.next_field_id += 1; + Arc::new(new_field) + }) + .collect::>(); + + // Now visit nested fields + outer_fields + .into_iter() + .map(|field| { + if field.field_type.is_primitive() { + field + } else { + let mut new_field = Arc::unwrap_or_clone(field); + *new_field.field_type = self.reassign_ids_visit_type(*new_field.field_type); + Arc::new(new_field) + } + }) + .collect() + } + + fn reassign_ids_visit_type(&mut self, field_type: Type) -> Type { + match field_type { + Type::Primitive(s) => Type::Primitive(s), + Type::Struct(s) => { + let new_fields = self.reassign_field_ids(s.fields().to_vec()); + Type::Struct(StructType::new(new_fields)) + } + Type::List(l) => { + self.old_to_new_id + .insert(l.element_field.id, self.next_field_id); + let mut element_field = Arc::unwrap_or_clone(l.element_field); + element_field.id = self.next_field_id; + self.next_field_id += 1; + *element_field.field_type = self.reassign_ids_visit_type(*element_field.field_type); + Type::List(ListType { + element_field: Arc::new(element_field), + }) + } + Type::Map(m) => { + self.old_to_new_id + .insert(m.key_field.id, self.next_field_id); + let mut key_field = Arc::unwrap_or_clone(m.key_field); + key_field.id = self.next_field_id; + self.next_field_id += 1; + *key_field.field_type = self.reassign_ids_visit_type(*key_field.field_type); + + self.old_to_new_id + .insert(m.value_field.id, self.next_field_id); + let mut value_field = Arc::unwrap_or_clone(m.value_field); + value_field.id = self.next_field_id; + self.next_field_id += 1; + *value_field.field_type = self.reassign_ids_visit_type(*value_field.field_type); + + Type::Map(MapType { + key_field: Arc::new(key_field), + value_field: Arc::new(value_field), + }) + } + } + } + + fn apply_to_identifier_fields(&self, field_ids: HashSet) -> Result> { + field_ids + .into_iter() + .map(|id| { + self.old_to_new_id.get(&id).copied().ok_or_else(|| { + Error::new( + ErrorKind::DataInvalid, + format!("Identifier Field ID {} not found", id), + ) + }) + }) + .collect() + } + + fn apply_to_aliases(&self, alias: BiHashMap) -> Result> { + alias + .into_iter() + .map(|(name, id)| { + self.old_to_new_id + .get(&id) + .copied() + .ok_or_else(|| { + Error::new( + ErrorKind::DataInvalid, + format!("Field with id {} for alias {} not found", id, name), + ) + }) + .map(|new_id| (name, new_id)) + }) + .collect() + } +} + pub(super) mod _serde { /// This is a helper module that defines types to help with serialization/deserialization. /// For deserialization the input first gets read into either the [SchemaV1] or [SchemaV2] struct @@ -1063,6 +1203,8 @@ pub(super) mod _serde { mod tests { use std::collections::{HashMap, HashSet}; + use bimap::BiHashMap; + use super::DEFAULT_SCHEMA_ID; use crate::spec::datatypes::Type::{List, Map, Primitive, Struct}; use crate::spec::datatypes::{ @@ -1335,6 +1477,12 @@ table { assert_eq!(original_schema, schema); } + #[test] + fn test_highest_field_id() { + let schema = table_schema_nested(); + assert_eq!(17, schema.highest_field_id()); + } + #[test] fn test_schema_index_by_name() { let expected_name_to_id = HashMap::from( @@ -2229,4 +2377,162 @@ table { assert!(result.is_ok()); assert_eq!(result.unwrap(), Type::Struct(schema.as_struct().clone())); } + + #[test] + fn test_reassign_ids() { + let schema = Schema::builder() + .with_schema_id(1) + .with_identifier_field_ids(vec![3]) + .with_alias(BiHashMap::from_iter(vec![("bar_alias".to_string(), 3)])) + .with_fields(vec![ + NestedField::optional(5, "foo", Type::Primitive(PrimitiveType::String)).into(), + NestedField::required(3, "bar", Type::Primitive(PrimitiveType::Int)).into(), + NestedField::optional(4, "baz", Type::Primitive(PrimitiveType::Boolean)).into(), + ]) + .build() + .unwrap(); + + let reassigned_schema = schema + .into_builder() + .with_reassigned_field_ids(Some(0)) + .build() + .unwrap(); + + let expected = Schema::builder() + .with_schema_id(1) + .with_identifier_field_ids(vec![1]) + .with_alias(BiHashMap::from_iter(vec![("bar_alias".to_string(), 1)])) + .with_fields(vec![ + NestedField::optional(0, "foo", Type::Primitive(PrimitiveType::String)).into(), + NestedField::required(1, "bar", Type::Primitive(PrimitiveType::Int)).into(), + NestedField::optional(2, "baz", Type::Primitive(PrimitiveType::Boolean)).into(), + ]) + .build() + .unwrap(); + + pretty_assertions::assert_eq!(expected, reassigned_schema); + assert_eq!(reassigned_schema.highest_field_id(), 2); + } + + #[test] + fn test_reassigned_ids_nested() { + let schema = table_schema_nested(); + let reassigned_schema = schema + .into_builder() + .with_alias(BiHashMap::from_iter(vec![("bar_alias".to_string(), 2)])) + .with_reassigned_field_ids(Some(0)) + .build() + .unwrap(); + + let expected = Schema::builder() + .with_schema_id(1) + .with_identifier_field_ids(vec![1]) + .with_alias(BiHashMap::from_iter(vec![("bar_alias".to_string(), 1)])) + .with_fields(vec![ + NestedField::optional(0, "foo", Type::Primitive(PrimitiveType::String)).into(), + NestedField::required(1, "bar", Type::Primitive(PrimitiveType::Int)).into(), + NestedField::optional(2, "baz", Type::Primitive(PrimitiveType::Boolean)).into(), + NestedField::required( + 3, + "qux", + Type::List(ListType { + element_field: NestedField::list_element( + 7, + Type::Primitive(PrimitiveType::String), + true, + ) + .into(), + }), + ) + .into(), + NestedField::required( + 4, + "quux", + Type::Map(MapType { + key_field: NestedField::map_key_element( + 8, + Type::Primitive(PrimitiveType::String), + ) + .into(), + value_field: NestedField::map_value_element( + 9, + Type::Map(MapType { + key_field: NestedField::map_key_element( + 10, + Type::Primitive(PrimitiveType::String), + ) + .into(), + value_field: NestedField::map_value_element( + 11, + Type::Primitive(PrimitiveType::Int), + true, + ) + .into(), + }), + true, + ) + .into(), + }), + ) + .into(), + NestedField::required( + 5, + "location", + Type::List(ListType { + element_field: NestedField::list_element( + 12, + Type::Struct(StructType::new(vec![ + NestedField::optional( + 13, + "latitude", + Type::Primitive(PrimitiveType::Float), + ) + .into(), + NestedField::optional( + 14, + "longitude", + Type::Primitive(PrimitiveType::Float), + ) + .into(), + ])), + true, + ) + .into(), + }), + ) + .into(), + NestedField::optional( + 6, + "person", + Type::Struct(StructType::new(vec![ + NestedField::optional(15, "name", Type::Primitive(PrimitiveType::String)) + .into(), + NestedField::required(16, "age", Type::Primitive(PrimitiveType::Int)) + .into(), + ])), + ) + .into(), + ]) + .build() + .unwrap(); + + pretty_assertions::assert_eq!(expected, reassigned_schema); + assert_eq!(reassigned_schema.highest_field_id(), 16); + assert_eq!(reassigned_schema.field_by_id(6).unwrap().name, "person"); + assert_eq!(reassigned_schema.field_by_id(16).unwrap().name, "age"); + } + + #[test] + fn test_reassign_ids_empty_schema() { + let schema = Schema::builder().with_schema_id(1).build().unwrap(); + let reassigned_schema = schema + .clone() + .into_builder() + .with_reassigned_field_ids(Some(0)) + .build() + .unwrap(); + + assert_eq!(schema, reassigned_schema); + assert_eq!(schema.highest_field_id(), 0); + } } From 0bbe8689482529a78f1e9ccf66acea6b48a944a6 Mon Sep 17 00:00:00 2001 From: Christian Thiel Date: Wed, 25 Sep 2024 12:24:19 +0200 Subject: [PATCH 2/6] Address comments --- crates/iceberg/src/spec/schema.rs | 133 +++++++++++++++++++++--------- 1 file changed, 92 insertions(+), 41 deletions(-) diff --git a/crates/iceberg/src/spec/schema.rs b/crates/iceberg/src/spec/schema.rs index d0fa3f25d9..430744ba90 100644 --- a/crates/iceberg/src/spec/schema.rs +++ b/crates/iceberg/src/spec/schema.rs @@ -88,12 +88,12 @@ impl SchemaBuilder { } /// Reassign all field-ids (nested) on build. - /// If `start_from` is provided, it will start reassigning from that id (inclusive). - /// If not provided, it will start from 0. + /// Reassignment starts from the field-id specified in `start_from` (inclusive). /// /// All specified aliases and identifier fields will be updated to the new field-ids. - pub fn with_reassigned_field_ids(mut self, start_from: Option) -> Self { - self.reassign_field_ids_from = Some(start_from.unwrap_or(0)); + #[allow(dead_code)] // Will be needed in TableMetadataBuilder + pub(crate) fn with_reassigned_field_ids(mut self, start_from: u32) -> Self { + self.reassign_field_ids_from = Some(start_from.try_into().unwrap_or(i32::MAX)); self } @@ -116,16 +116,7 @@ impl SchemaBuilder { } /// Builds the schema. - pub fn build(mut self) -> Result { - if let Some(start_from) = self.reassign_field_ids_from { - let mut id_reassigner = ReassignFieldIds::new(start_from); - self.fields = id_reassigner.reassign_field_ids(self.fields); - - self.identifier_field_ids = - id_reassigner.apply_to_identifier_fields(self.identifier_field_ids)?; - self.alias_to_id = id_reassigner.apply_to_aliases(self.alias_to_id)?; - } - + pub fn build(self) -> Result { let field_id_to_accessor = self.build_accessors(); let r#struct = StructType::new(self.fields); @@ -150,7 +141,7 @@ impl SchemaBuilder { let highest_field_id = id_to_field.keys().max().cloned().unwrap_or(0); - Ok(Schema { + let mut schema = Schema { r#struct, schema_id: self.schema_id, highest_field_id, @@ -163,7 +154,26 @@ impl SchemaBuilder { id_to_name, field_id_to_accessor, - }) + }; + + if let Some(start_from) = self.reassign_field_ids_from { + let mut id_reassigner = ReassignFieldIds::new(start_from); + let mew_fields = id_reassigner.reassign_field_ids(schema.r#struct.fields().to_vec())?; + let new_identifier_field_ids = + id_reassigner.apply_to_identifier_fields(schema.identifier_field_ids)?; + let new_alias_to_id = id_reassigner.apply_to_aliases(schema.alias_to_id.clone())?; + + schema = SchemaBuilder { + schema_id: schema.schema_id, + fields: mew_fields, + alias_to_id: new_alias_to_id, + identifier_field_ids: new_identifier_field_ids, + reassign_field_ids_from: None, + } + .build()?; + } + + Ok(schema) } fn build_accessors(&self) -> HashMap> { @@ -980,74 +990,95 @@ impl ReassignFieldIds { } } - fn reassign_field_ids(&mut self, fields: Vec) -> Vec { + fn reassign_field_ids(&mut self, fields: Vec) -> Result> { // Visit fields on the same level first let outer_fields = fields .into_iter() .map(|field| { - self.old_to_new_id.insert(field.id, self.next_field_id); + self + .old_to_new_id + .insert(field.id, self.next_field_id) + .map_or_else(|| Ok(()), |_| Err(Error::new( + ErrorKind::DataInvalid, + format!( + "Error reassigning field ids: Found duplicate 'field.id' {}. Field ids must be unique.", + field.id + )) + ))?; + let new_field = Arc::unwrap_or_clone(field).with_id(self.next_field_id); - self.next_field_id += 1; - Arc::new(new_field) + self.increase_next_field_id()?; + Ok(Arc::new(new_field)) }) - .collect::>(); + .collect::>>()?; // Now visit nested fields outer_fields .into_iter() .map(|field| { if field.field_type.is_primitive() { - field + Ok(field) } else { let mut new_field = Arc::unwrap_or_clone(field); - *new_field.field_type = self.reassign_ids_visit_type(*new_field.field_type); - Arc::new(new_field) + *new_field.field_type = self.reassign_ids_visit_type(*new_field.field_type)?; + Ok(Arc::new(new_field)) } }) .collect() } - fn reassign_ids_visit_type(&mut self, field_type: Type) -> Type { + fn reassign_ids_visit_type(&mut self, field_type: Type) -> Result { match field_type { - Type::Primitive(s) => Type::Primitive(s), + Type::Primitive(s) => Ok(Type::Primitive(s)), Type::Struct(s) => { - let new_fields = self.reassign_field_ids(s.fields().to_vec()); - Type::Struct(StructType::new(new_fields)) + let new_fields = self.reassign_field_ids(s.fields().to_vec())?; + Ok(Type::Struct(StructType::new(new_fields))) } Type::List(l) => { self.old_to_new_id .insert(l.element_field.id, self.next_field_id); let mut element_field = Arc::unwrap_or_clone(l.element_field); element_field.id = self.next_field_id; - self.next_field_id += 1; - *element_field.field_type = self.reassign_ids_visit_type(*element_field.field_type); - Type::List(ListType { + self.increase_next_field_id()?; + *element_field.field_type = + self.reassign_ids_visit_type(*element_field.field_type)?; + Ok(Type::List(ListType { element_field: Arc::new(element_field), - }) + })) } Type::Map(m) => { self.old_to_new_id .insert(m.key_field.id, self.next_field_id); let mut key_field = Arc::unwrap_or_clone(m.key_field); key_field.id = self.next_field_id; - self.next_field_id += 1; - *key_field.field_type = self.reassign_ids_visit_type(*key_field.field_type); + self.increase_next_field_id()?; + *key_field.field_type = self.reassign_ids_visit_type(*key_field.field_type)?; self.old_to_new_id .insert(m.value_field.id, self.next_field_id); let mut value_field = Arc::unwrap_or_clone(m.value_field); value_field.id = self.next_field_id; - self.next_field_id += 1; - *value_field.field_type = self.reassign_ids_visit_type(*value_field.field_type); + self.increase_next_field_id()?; + *value_field.field_type = self.reassign_ids_visit_type(*value_field.field_type)?; - Type::Map(MapType { + Ok(Type::Map(MapType { key_field: Arc::new(key_field), value_field: Arc::new(value_field), - }) + })) } } } + fn increase_next_field_id(&mut self) -> Result<()> { + self.next_field_id = self.next_field_id.checked_add(1).ok_or_else(|| { + Error::new( + ErrorKind::DataInvalid, + "Field ID overflowed, cannot add more fields", + ) + })?; + Ok(()) + } + fn apply_to_identifier_fields(&self, field_ids: HashSet) -> Result> { field_ids .into_iter() @@ -2400,7 +2431,7 @@ table { let reassigned_schema = schema .into_builder() - .with_reassigned_field_ids(Some(0)) + .with_reassigned_field_ids(0) .build() .unwrap(); @@ -2426,7 +2457,7 @@ table { let reassigned_schema = schema .into_builder() .with_alias(BiHashMap::from_iter(vec![("bar_alias".to_string(), 2)])) - .with_reassigned_field_ids(Some(0)) + .with_reassigned_field_ids(0) .build() .unwrap(); @@ -2528,13 +2559,33 @@ table { assert_eq!(reassigned_schema.field_by_id(16).unwrap().name, "age"); } + #[test] + fn test_reassign_ids_fails_with_duplicate_ids() { + let reassigned_schema = Schema::builder() + .with_schema_id(1) + .with_identifier_field_ids(vec![3]) + .with_alias(BiHashMap::from_iter(vec![("bar_alias".to_string(), 3)])) + .with_fields(vec![ + NestedField::optional(5, "foo", Type::Primitive(PrimitiveType::String)).into(), + NestedField::required(3, "bar", Type::Primitive(PrimitiveType::Int)).into(), + NestedField::optional(3, "baz", Type::Primitive(PrimitiveType::Boolean)).into(), + ]) + .with_reassigned_field_ids(0) + .build() + .unwrap_err(); + + assert!(reassigned_schema + .message() + .contains("Found duplicate 'field.id' 3")); + } + #[test] fn test_reassign_ids_empty_schema() { let schema = Schema::builder().with_schema_id(1).build().unwrap(); let reassigned_schema = schema .clone() .into_builder() - .with_reassigned_field_ids(Some(0)) + .with_reassigned_field_ids(0) .build() .unwrap(); From 8009a4635c58d6838f140e7560f007f1c40cbcd4 Mon Sep 17 00:00:00 2001 From: Christian Thiel Date: Wed, 25 Sep 2024 13:18:04 +0200 Subject: [PATCH 3/6] Schema ensure unique field ids --- crates/iceberg/src/spec/schema.rs | 70 ++++++++++++++++++++----------- 1 file changed, 45 insertions(+), 25 deletions(-) diff --git a/crates/iceberg/src/spec/schema.rs b/crates/iceberg/src/spec/schema.rs index 430744ba90..d5ae40d79e 100644 --- a/crates/iceberg/src/spec/schema.rs +++ b/crates/iceberg/src/spec/schema.rs @@ -507,8 +507,7 @@ pub fn index_by_id(r#struct: &StructType) -> Result } fn field(&mut self, field: &NestedFieldRef, _value: ()) -> Result<()> { - self.0.insert(field.id, field.clone()); - Ok(()) + try_insert_field(&mut self.0, field.id, field.clone()) } fn r#struct(&mut self, _struct: &StructType, _results: Vec) -> Result { @@ -516,15 +515,16 @@ pub fn index_by_id(r#struct: &StructType) -> Result } fn list(&mut self, list: &ListType, _value: Self::T) -> Result { - self.0 - .insert(list.element_field.id, list.element_field.clone()); - Ok(()) + try_insert_field( + &mut self.0, + list.element_field.id, + list.element_field.clone(), + ) } fn map(&mut self, map: &MapType, _key_value: Self::T, _value: Self::T) -> Result { - self.0.insert(map.key_field.id, map.key_field.clone()); - self.0.insert(map.value_field.id, map.value_field.clone()); - Ok(()) + try_insert_field(&mut self.0, map.key_field.id, map.key_field.clone())?; + try_insert_field(&mut self.0, map.value_field.id, map.value_field.clone()) } fn primitive(&mut self, _: &PrimitiveType) -> Result { @@ -980,6 +980,21 @@ struct ReassignFieldIds { old_to_new_id: HashMap, } +fn try_insert_field(map: &mut HashMap, field_id: i32, value: V) -> Result<()> { + map.insert(field_id, value).map_or_else( + || Ok(()), + |_| { + Err(Error::new( + ErrorKind::DataInvalid, + format!( + "Found duplicate 'field.id' {}. Field ids must be unique.", + field_id + ), + )) + }, + ) +} + // We are not using the visitor here, as post order traversal is not desired. // Instead we want to re-assign all fields on one level first before diving deeper. impl ReassignFieldIds { @@ -995,17 +1010,7 @@ impl ReassignFieldIds { let outer_fields = fields .into_iter() .map(|field| { - self - .old_to_new_id - .insert(field.id, self.next_field_id) - .map_or_else(|| Ok(()), |_| Err(Error::new( - ErrorKind::DataInvalid, - format!( - "Error reassigning field ids: Found duplicate 'field.id' {}. Field ids must be unique.", - field.id - )) - ))?; - + try_insert_field(&mut self.old_to_new_id, field.id, self.next_field_id)?; let new_field = Arc::unwrap_or_clone(field).with_id(self.next_field_id); self.increase_next_field_id()?; Ok(Arc::new(new_field)) @@ -2563,20 +2568,35 @@ table { fn test_reassign_ids_fails_with_duplicate_ids() { let reassigned_schema = Schema::builder() .with_schema_id(1) - .with_identifier_field_ids(vec![3]) + .with_identifier_field_ids(vec![5]) .with_alias(BiHashMap::from_iter(vec![("bar_alias".to_string(), 3)])) .with_fields(vec![ - NestedField::optional(5, "foo", Type::Primitive(PrimitiveType::String)).into(), - NestedField::required(3, "bar", Type::Primitive(PrimitiveType::Int)).into(), + NestedField::required(5, "foo", Type::Primitive(PrimitiveType::String)).into(), + NestedField::optional(3, "bar", Type::Primitive(PrimitiveType::Int)).into(), NestedField::optional(3, "baz", Type::Primitive(PrimitiveType::Boolean)).into(), ]) .with_reassigned_field_ids(0) .build() .unwrap_err(); - assert!(reassigned_schema - .message() - .contains("Found duplicate 'field.id' 3")); + assert!(reassigned_schema.message().contains("'field.id' 3")); + } + + #[test] + fn test_field_ids_must_be_unique() { + let reassigned_schema = Schema::builder() + .with_schema_id(1) + .with_identifier_field_ids(vec![5]) + .with_alias(BiHashMap::from_iter(vec![("bar_alias".to_string(), 3)])) + .with_fields(vec![ + NestedField::required(5, "foo", Type::Primitive(PrimitiveType::String)).into(), + NestedField::optional(3, "bar", Type::Primitive(PrimitiveType::Int)).into(), + NestedField::optional(3, "baz", Type::Primitive(PrimitiveType::Boolean)).into(), + ]) + .build() + .unwrap_err(); + + assert!(reassigned_schema.message().contains("'field.id' 3")); } #[test] From efbd43676c33d2abf2a365d201bbf13373b41498 Mon Sep 17 00:00:00 2001 From: Christian Thiel Date: Wed, 25 Sep 2024 13:59:45 +0200 Subject: [PATCH 4/6] Fix tests with duplicate nested field ids --- crates/iceberg/src/arrow/schema.rs | 20 ++++++++++---------- crates/iceberg/src/spec/values.rs | 22 +++++++++++----------- 2 files changed, 21 insertions(+), 21 deletions(-) diff --git a/crates/iceberg/src/arrow/schema.rs b/crates/iceberg/src/arrow/schema.rs index 2ff43e0f07..d6f7c220bd 100644 --- a/crates/iceberg/src/arrow/schema.rs +++ b/crates/iceberg/src/arrow/schema.rs @@ -791,8 +791,8 @@ mod tests { fn arrow_schema_for_arrow_schema_to_schema_test() -> ArrowSchema { let fields = Fields::from(vec![ - simple_field("key", DataType::Int32, false, "17"), - simple_field("value", DataType::Utf8, true, "18"), + simple_field("key", DataType::Int32, false, "28"), + simple_field("value", DataType::Utf8, true, "29"), ]); let r#struct = DataType::Struct(fields); @@ -1022,9 +1022,9 @@ mod tests { "required": true, "type": { "type": "map", - "key-id": 17, + "key-id": 28, "key": "int", - "value-id": 18, + "value-id": 29, "value-required": false, "value": "string" } @@ -1075,8 +1075,8 @@ mod tests { fn arrow_schema_for_schema_to_arrow_schema_test() -> ArrowSchema { let fields = Fields::from(vec![ - simple_field("key", DataType::Int32, false, "17"), - simple_field("value", DataType::Utf8, true, "18"), + simple_field("key", DataType::Int32, false, "28"), + simple_field("value", DataType::Utf8, true, "29"), ]); let r#struct = DataType::Struct(fields); @@ -1165,7 +1165,7 @@ mod tests { ), simple_field("map", map, false, "16"), simple_field("struct", r#struct, false, "17"), - simple_field("uuid", DataType::FixedSizeBinary(16), false, "26"), + simple_field("uuid", DataType::FixedSizeBinary(16), false, "30"), ]) } @@ -1309,9 +1309,9 @@ mod tests { "required": true, "type": { "type": "map", - "key-id": 17, + "key-id": 28, "key": "int", - "value-id": 18, + "value-id": 29, "value-required": false, "value": "string" } @@ -1345,7 +1345,7 @@ mod tests { } }, { - "id":26, + "id":30, "name":"uuid", "required":true, "type":"uuid" diff --git a/crates/iceberg/src/spec/values.rs b/crates/iceberg/src/spec/values.rs index 3568d3dcde..3c6e2aa680 100644 --- a/crates/iceberg/src/spec/values.rs +++ b/crates/iceberg/src/spec/values.rs @@ -3192,10 +3192,10 @@ mod tests { (Literal::Primitive(PrimitiveLiteral::Int(3)), None), ])), &Type::Map(MapType { - key_field: NestedField::map_key_element(0, Type::Primitive(PrimitiveType::Int)) + key_field: NestedField::map_key_element(2, Type::Primitive(PrimitiveType::Int)) .into(), value_field: NestedField::map_value_element( - 1, + 3, Type::Primitive(PrimitiveType::Long), false, ) @@ -3219,10 +3219,10 @@ mod tests { ), ])), &Type::Map(MapType { - key_field: NestedField::map_key_element(0, Type::Primitive(PrimitiveType::Int)) + key_field: NestedField::map_key_element(2, Type::Primitive(PrimitiveType::Int)) .into(), value_field: NestedField::map_value_element( - 1, + 3, Type::Primitive(PrimitiveType::Long), true, ) @@ -3249,10 +3249,10 @@ mod tests { ), ])), &Type::Map(MapType { - key_field: NestedField::map_key_element(0, Type::Primitive(PrimitiveType::String)) + key_field: NestedField::map_key_element(2, Type::Primitive(PrimitiveType::String)) .into(), value_field: NestedField::map_value_element( - 1, + 3, Type::Primitive(PrimitiveType::Int), false, ) @@ -3276,10 +3276,10 @@ mod tests { ), ])), &Type::Map(MapType { - key_field: NestedField::map_key_element(0, Type::Primitive(PrimitiveType::String)) + key_field: NestedField::map_key_element(2, Type::Primitive(PrimitiveType::String)) .into(), value_field: NestedField::map_value_element( - 1, + 3, Type::Primitive(PrimitiveType::Int), true, ) @@ -3299,9 +3299,9 @@ mod tests { None, ])), &Type::Struct(StructType::new(vec![ - NestedField::required(1, "id", Type::Primitive(PrimitiveType::Int)).into(), - NestedField::optional(2, "name", Type::Primitive(PrimitiveType::String)).into(), - NestedField::optional(3, "address", Type::Primitive(PrimitiveType::String)).into(), + NestedField::required(2, "id", Type::Primitive(PrimitiveType::Int)).into(), + NestedField::optional(3, "name", Type::Primitive(PrimitiveType::String)).into(), + NestedField::optional(4, "address", Type::Primitive(PrimitiveType::String)).into(), ])), ); } From 15b54558b4aa84a504fe306fca63f126e94c19fb Mon Sep 17 00:00:00 2001 From: Christian Thiel Date: Tue, 1 Oct 2024 08:58:07 +0200 Subject: [PATCH 5/6] Use Schema::builder() for reassigned ids --- crates/iceberg/src/spec/schema.rs | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/crates/iceberg/src/spec/schema.rs b/crates/iceberg/src/spec/schema.rs index d5ae40d79e..02d42dd4ae 100644 --- a/crates/iceberg/src/spec/schema.rs +++ b/crates/iceberg/src/spec/schema.rs @@ -158,19 +158,17 @@ impl SchemaBuilder { if let Some(start_from) = self.reassign_field_ids_from { let mut id_reassigner = ReassignFieldIds::new(start_from); - let mew_fields = id_reassigner.reassign_field_ids(schema.r#struct.fields().to_vec())?; + let new_fields = id_reassigner.reassign_field_ids(schema.r#struct.fields().to_vec())?; let new_identifier_field_ids = id_reassigner.apply_to_identifier_fields(schema.identifier_field_ids)?; let new_alias_to_id = id_reassigner.apply_to_aliases(schema.alias_to_id.clone())?; - schema = SchemaBuilder { - schema_id: schema.schema_id, - fields: mew_fields, - alias_to_id: new_alias_to_id, - identifier_field_ids: new_identifier_field_ids, - reassign_field_ids_from: None, - } - .build()?; + schema = Schema::builder() + .with_schema_id(schema.schema_id) + .with_fields(new_fields) + .with_identifier_field_ids(new_identifier_field_ids) + .with_alias(new_alias_to_id) + .build()?; } Ok(schema) From 4b2eb5408b45ec1890101d437748c14e8f9b6893 Mon Sep 17 00:00:00 2001 From: Christian Thiel Date: Tue, 1 Oct 2024 09:00:11 +0200 Subject: [PATCH 6/6] Better docs --- crates/iceberg/src/spec/schema.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/iceberg/src/spec/schema.rs b/crates/iceberg/src/spec/schema.rs index 02d42dd4ae..cf86874dc8 100644 --- a/crates/iceberg/src/spec/schema.rs +++ b/crates/iceberg/src/spec/schema.rs @@ -87,7 +87,7 @@ impl SchemaBuilder { self } - /// Reassign all field-ids (nested) on build. + /// Reassign all field-ids (including nested) on build. /// Reassignment starts from the field-id specified in `start_from` (inclusive). /// /// All specified aliases and identifier fields will be updated to the new field-ids.