From 92ab3c4356aadb73e433407524567fff7b4083e7 Mon Sep 17 00:00:00 2001 From: feniljain Date: Sat, 14 Dec 2024 12:00:12 +0530 Subject: [PATCH 1/3] fix: set key_metadata to Null by default --- .../src/expr/visitors/expression_evaluator.rs | 4 ++-- .../visitors/inclusive_metrics_evaluator.rs | 12 +++++------ crates/iceberg/src/io/object_cache.rs | 1 + crates/iceberg/src/scan.rs | 1 + crates/iceberg/src/spec/manifest.rs | 21 ++++++++++--------- .../src/writer/file_writer/parquet_writer.rs | 2 +- 6 files changed, 22 insertions(+), 19 deletions(-) diff --git a/crates/iceberg/src/expr/visitors/expression_evaluator.rs b/crates/iceberg/src/expr/visitors/expression_evaluator.rs index 2add5761f8..38ad7a82b2 100644 --- a/crates/iceberg/src/expr/visitors/expression_evaluator.rs +++ b/crates/iceberg/src/expr/visitors/expression_evaluator.rs @@ -338,7 +338,7 @@ mod tests { nan_value_counts: HashMap::new(), lower_bounds: HashMap::new(), upper_bounds: HashMap::new(), - key_metadata: vec![], + key_metadata: Some(vec![]), split_offsets: vec![], equality_ids: vec![], sort_order_id: None, @@ -361,7 +361,7 @@ mod tests { nan_value_counts: HashMap::new(), lower_bounds: HashMap::new(), upper_bounds: HashMap::new(), - key_metadata: vec![], + key_metadata: Some(vec![]), split_offsets: vec![], equality_ids: vec![], sort_order_id: None, diff --git a/crates/iceberg/src/expr/visitors/inclusive_metrics_evaluator.rs b/crates/iceberg/src/expr/visitors/inclusive_metrics_evaluator.rs index 1cdc757714..3fd7fcd396 100644 --- a/crates/iceberg/src/expr/visitors/inclusive_metrics_evaluator.rs +++ b/crates/iceberg/src/expr/visitors/inclusive_metrics_evaluator.rs @@ -1991,7 +1991,7 @@ mod test { nan_value_counts: Default::default(), lower_bounds: Default::default(), upper_bounds: Default::default(), - key_metadata: vec![], + key_metadata: Some(vec![]), split_offsets: vec![], equality_ids: vec![], sort_order_id: None, @@ -2012,7 +2012,7 @@ mod test { nan_value_counts: Default::default(), lower_bounds: Default::default(), upper_bounds: Default::default(), - key_metadata: vec![], + key_metadata: Some(vec![]), split_offsets: vec![], equality_ids: vec![], sort_order_id: None, @@ -2069,7 +2069,7 @@ mod test { ]), column_sizes: Default::default(), - key_metadata: vec![], + key_metadata: Some(vec![]), split_offsets: vec![], equality_ids: vec![], sort_order_id: None, @@ -2095,7 +2095,7 @@ mod test { upper_bounds: HashMap::from([(3, Datum::string("dC"))]), column_sizes: Default::default(), - key_metadata: vec![], + key_metadata: Some(vec![]), split_offsets: vec![], equality_ids: vec![], sort_order_id: None, @@ -2122,7 +2122,7 @@ mod test { upper_bounds: HashMap::from([(3, Datum::string("3str3"))]), column_sizes: Default::default(), - key_metadata: vec![], + key_metadata: Some(vec![]), split_offsets: vec![], equality_ids: vec![], sort_order_id: None, @@ -2149,7 +2149,7 @@ mod test { upper_bounds: HashMap::from([(3, Datum::string("イロハニホヘト"))]), column_sizes: Default::default(), - key_metadata: vec![], + key_metadata: Some(vec![]), split_offsets: vec![], equality_ids: vec![], sort_order_id: None, diff --git a/crates/iceberg/src/io/object_cache.rs b/crates/iceberg/src/io/object_cache.rs index 88e2d0e2d5..809db33f56 100644 --- a/crates/iceberg/src/io/object_cache.rs +++ b/crates/iceberg/src/io/object_cache.rs @@ -278,6 +278,7 @@ mod tests { .file_size_in_bytes(100) .record_count(1) .partition(Struct::from_iter([Some(Literal::long(100))])) + .key_metadata(None) .build() .unwrap(), ) diff --git a/crates/iceberg/src/scan.rs b/crates/iceberg/src/scan.rs index 8f0bc38f67..89cc21bbf9 100644 --- a/crates/iceberg/src/scan.rs +++ b/crates/iceberg/src/scan.rs @@ -1073,6 +1073,7 @@ mod tests { .file_size_in_bytes(100) .record_count(1) .partition(Struct::from_iter([Some(Literal::long(100))])) + .key_metadata(None) .build() .unwrap(), ) diff --git a/crates/iceberg/src/spec/manifest.rs b/crates/iceberg/src/spec/manifest.rs index a4b4d7cdb5..11802d7cec 100644 --- a/crates/iceberg/src/spec/manifest.rs +++ b/crates/iceberg/src/spec/manifest.rs @@ -1074,7 +1074,7 @@ pub struct DataFile { /// /// Implementation-specific key metadata for encryption #[builder(default)] - pub(crate) key_metadata: Vec, + pub(crate) key_metadata: Option>, /// field id: 132 /// element field id: 133 /// @@ -1164,7 +1164,7 @@ impl DataFile { &self.upper_bounds } /// Get the Implementation-specific key metadata for the data file. - pub fn key_metadata(&self) -> &[u8] { + pub fn key_metadata(&self) -> &Option> { &self.key_metadata } /// Get the split offsets of the data file. @@ -1378,12 +1378,13 @@ mod _serde { nan_value_counts: Some(to_i64_entry(value.nan_value_counts)?), lower_bounds: Some(to_bytes_entry(value.lower_bounds)?), upper_bounds: Some(to_bytes_entry(value.upper_bounds)?), - key_metadata: Some(serde_bytes::ByteBuf::from(value.key_metadata)), + key_metadata: value.key_metadata.map(serde_bytes::ByteBuf::from), split_offsets: Some(value.split_offsets), equality_ids: Some(value.equality_ids), sort_order_id: value.sort_order_id, }) } + pub fn try_into( self, partition_type: &StructType, @@ -1441,7 +1442,7 @@ mod _serde { .map(|v| parse_bytes_entry(v, schema)) .transpose()? .unwrap_or_default(), - key_metadata: self.key_metadata.map(|v| v.to_vec()).unwrap_or_default(), + key_metadata: self.key_metadata.map(|v| v.to_vec()), split_offsets: self.split_offsets.unwrap_or_default(), equality_ids: self.equality_ids.unwrap_or_default(), sort_order_id: self.sort_order_id, @@ -1657,7 +1658,7 @@ mod tests { nan_value_counts: HashMap::new(), lower_bounds: HashMap::new(), upper_bounds: HashMap::new(), - key_metadata: Vec::new(), + key_metadata: Some(Vec::new()), split_offsets: vec![4], equality_ids: Vec::new(), sort_order_id: None, @@ -1813,7 +1814,7 @@ mod tests { nan_value_counts: HashMap::new(), lower_bounds: HashMap::new(), upper_bounds: HashMap::new(), - key_metadata: vec![], + key_metadata: Some(vec![]), split_offsets: vec![4], equality_ids: vec![], sort_order_id: None, @@ -1880,7 +1881,7 @@ mod tests { nan_value_counts: HashMap::new(), lower_bounds: HashMap::from([(1,Datum::int(1)),(2,Datum::string("a")),(3,Datum::string("AC/DC"))]), upper_bounds: HashMap::from([(1,Datum::int(1)),(2,Datum::string("a")),(3,Datum::string("AC/DC"))]), - key_metadata: vec![], + key_metadata: Some(vec![]), split_offsets: vec![4], equality_ids: vec![], sort_order_id: Some(0), @@ -1960,7 +1961,7 @@ mod tests { (2, Datum::string("a")), (3, Datum::string("x")) ]), - key_metadata: vec![], + key_metadata: Some(vec![]), split_offsets: vec![4], equality_ids: vec![], sort_order_id: Some(0), @@ -2035,7 +2036,7 @@ mod tests { (2, Datum::int(2)), (3, Datum::string("x")) ]), - key_metadata: vec![], + key_metadata: Some(vec![]), split_offsets: vec![4], equality_ids: vec![], sort_order_id: None, @@ -2105,7 +2106,7 @@ mod tests { (1, Datum::long(1)), (2, Datum::int(2)), ]), - key_metadata: vec![], + key_metadata: Some(vec![]), split_offsets: vec![4], equality_ids: vec![], sort_order_id: None, diff --git a/crates/iceberg/src/writer/file_writer/parquet_writer.rs b/crates/iceberg/src/writer/file_writer/parquet_writer.rs index 09f9a7057a..596228f7c0 100644 --- a/crates/iceberg/src/writer/file_writer/parquet_writer.rs +++ b/crates/iceberg/src/writer/file_writer/parquet_writer.rs @@ -381,7 +381,7 @@ impl ParquetWriter { // # TODO(#417) // - nan_value_counts // - distinct_counts - .key_metadata(metadata.footer_signing_key_metadata.unwrap_or_default()) + .key_metadata(metadata.footer_signing_key_metadata) .split_offsets( metadata .row_groups From 5a7d5abe050396282b143a2e41bd6cd1bec7383c Mon Sep 17 00:00:00 2001 From: feniljain Date: Sat, 14 Dec 2024 13:06:34 +0530 Subject: [PATCH 2/3] fix: return Option<&[u8]> instead of &Option> for key_metadata * test: use `None` instead of `Some` for key_metadata fields --- .../src/expr/visitors/expression_evaluator.rs | 4 ++-- .../visitors/inclusive_metrics_evaluator.rs | 12 +++++----- crates/iceberg/src/spec/manifest.rs | 24 +++++++++---------- 3 files changed, 20 insertions(+), 20 deletions(-) diff --git a/crates/iceberg/src/expr/visitors/expression_evaluator.rs b/crates/iceberg/src/expr/visitors/expression_evaluator.rs index 38ad7a82b2..d451401c18 100644 --- a/crates/iceberg/src/expr/visitors/expression_evaluator.rs +++ b/crates/iceberg/src/expr/visitors/expression_evaluator.rs @@ -338,7 +338,7 @@ mod tests { nan_value_counts: HashMap::new(), lower_bounds: HashMap::new(), upper_bounds: HashMap::new(), - key_metadata: Some(vec![]), + key_metadata: None, split_offsets: vec![], equality_ids: vec![], sort_order_id: None, @@ -361,7 +361,7 @@ mod tests { nan_value_counts: HashMap::new(), lower_bounds: HashMap::new(), upper_bounds: HashMap::new(), - key_metadata: Some(vec![]), + key_metadata: None, split_offsets: vec![], equality_ids: vec![], sort_order_id: None, diff --git a/crates/iceberg/src/expr/visitors/inclusive_metrics_evaluator.rs b/crates/iceberg/src/expr/visitors/inclusive_metrics_evaluator.rs index 3fd7fcd396..7b04fae3a2 100644 --- a/crates/iceberg/src/expr/visitors/inclusive_metrics_evaluator.rs +++ b/crates/iceberg/src/expr/visitors/inclusive_metrics_evaluator.rs @@ -1991,7 +1991,7 @@ mod test { nan_value_counts: Default::default(), lower_bounds: Default::default(), upper_bounds: Default::default(), - key_metadata: Some(vec![]), + key_metadata: None, split_offsets: vec![], equality_ids: vec![], sort_order_id: None, @@ -2012,7 +2012,7 @@ mod test { nan_value_counts: Default::default(), lower_bounds: Default::default(), upper_bounds: Default::default(), - key_metadata: Some(vec![]), + key_metadata: None, split_offsets: vec![], equality_ids: vec![], sort_order_id: None, @@ -2069,7 +2069,7 @@ mod test { ]), column_sizes: Default::default(), - key_metadata: Some(vec![]), + key_metadata: None, split_offsets: vec![], equality_ids: vec![], sort_order_id: None, @@ -2095,7 +2095,7 @@ mod test { upper_bounds: HashMap::from([(3, Datum::string("dC"))]), column_sizes: Default::default(), - key_metadata: Some(vec![]), + key_metadata: None, split_offsets: vec![], equality_ids: vec![], sort_order_id: None, @@ -2122,7 +2122,7 @@ mod test { upper_bounds: HashMap::from([(3, Datum::string("3str3"))]), column_sizes: Default::default(), - key_metadata: Some(vec![]), + key_metadata: None, split_offsets: vec![], equality_ids: vec![], sort_order_id: None, @@ -2149,7 +2149,7 @@ mod test { upper_bounds: HashMap::from([(3, Datum::string("イロハニホヘト"))]), column_sizes: Default::default(), - key_metadata: Some(vec![]), + key_metadata: None, split_offsets: vec![], equality_ids: vec![], sort_order_id: None, diff --git a/crates/iceberg/src/spec/manifest.rs b/crates/iceberg/src/spec/manifest.rs index 11802d7cec..8a40b822bb 100644 --- a/crates/iceberg/src/spec/manifest.rs +++ b/crates/iceberg/src/spec/manifest.rs @@ -1164,8 +1164,8 @@ impl DataFile { &self.upper_bounds } /// Get the Implementation-specific key metadata for the data file. - pub fn key_metadata(&self) -> &Option> { - &self.key_metadata + pub fn key_metadata(&self) -> Option<&[u8]> { + self.key_metadata.as_ref().map(|x| x as &[u8]) } /// Get the split offsets of the data file. /// For example, all row group offsets in a Parquet file. @@ -1658,7 +1658,7 @@ mod tests { nan_value_counts: HashMap::new(), lower_bounds: HashMap::new(), upper_bounds: HashMap::new(), - key_metadata: Some(Vec::new()), + key_metadata: None, split_offsets: vec![4], equality_ids: Vec::new(), sort_order_id: None, @@ -1814,7 +1814,7 @@ mod tests { nan_value_counts: HashMap::new(), lower_bounds: HashMap::new(), upper_bounds: HashMap::new(), - key_metadata: Some(vec![]), + key_metadata: None, split_offsets: vec![4], equality_ids: vec![], sort_order_id: None, @@ -1881,7 +1881,7 @@ mod tests { nan_value_counts: HashMap::new(), lower_bounds: HashMap::from([(1,Datum::int(1)),(2,Datum::string("a")),(3,Datum::string("AC/DC"))]), upper_bounds: HashMap::from([(1,Datum::int(1)),(2,Datum::string("a")),(3,Datum::string("AC/DC"))]), - key_metadata: Some(vec![]), + key_metadata: None, split_offsets: vec![4], equality_ids: vec![], sort_order_id: Some(0), @@ -1961,7 +1961,7 @@ mod tests { (2, Datum::string("a")), (3, Datum::string("x")) ]), - key_metadata: Some(vec![]), + key_metadata: None, split_offsets: vec![4], equality_ids: vec![], sort_order_id: Some(0), @@ -2036,7 +2036,7 @@ mod tests { (2, Datum::int(2)), (3, Datum::string("x")) ]), - key_metadata: Some(vec![]), + key_metadata: None, split_offsets: vec![4], equality_ids: vec![], sort_order_id: None, @@ -2106,7 +2106,7 @@ mod tests { (1, Datum::long(1)), (2, Datum::int(2)), ]), - key_metadata: Some(vec![]), + key_metadata: None, split_offsets: vec![4], equality_ids: vec![], sort_order_id: None, @@ -2184,7 +2184,7 @@ mod tests { nan_value_counts: HashMap::new(), lower_bounds: HashMap::new(), upper_bounds: HashMap::new(), - key_metadata: Vec::new(), + key_metadata: None, split_offsets: vec![4], equality_ids: Vec::new(), sort_order_id: None, @@ -2215,7 +2215,7 @@ mod tests { nan_value_counts: HashMap::new(), lower_bounds: HashMap::new(), upper_bounds: HashMap::new(), - key_metadata: Vec::new(), + key_metadata: None, split_offsets: vec![4], equality_ids: Vec::new(), sort_order_id: None, @@ -2247,7 +2247,7 @@ mod tests { nan_value_counts: HashMap::new(), lower_bounds: HashMap::new(), upper_bounds: HashMap::new(), - key_metadata: Vec::new(), + key_metadata: None, split_offsets: vec![4], equality_ids: Vec::new(), sort_order_id: None, @@ -2279,7 +2279,7 @@ mod tests { nan_value_counts: HashMap::new(), lower_bounds: HashMap::new(), upper_bounds: HashMap::new(), - key_metadata: Vec::new(), + key_metadata: None, split_offsets: vec![4], equality_ids: Vec::new(), sort_order_id: None, From a86b3f635cc532211eeaa14098d9b01637381804 Mon Sep 17 00:00:00 2001 From: feniljain Date: Sat, 14 Dec 2024 15:40:57 +0530 Subject: [PATCH 3/3] refactor: use as_deref instead of explicit ref/deref using map --- crates/iceberg/src/spec/manifest.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/iceberg/src/spec/manifest.rs b/crates/iceberg/src/spec/manifest.rs index 8a40b822bb..60f5469cd7 100644 --- a/crates/iceberg/src/spec/manifest.rs +++ b/crates/iceberg/src/spec/manifest.rs @@ -1165,7 +1165,7 @@ impl DataFile { } /// Get the Implementation-specific key metadata for the data file. pub fn key_metadata(&self) -> Option<&[u8]> { - self.key_metadata.as_ref().map(|x| x as &[u8]) + self.key_metadata.as_deref() } /// Get the split offsets of the data file. /// For example, all row group offsets in a Parquet file.