From 47b22958ac1fae90d93a9f2ba0f3e07efdabd889 Mon Sep 17 00:00:00 2001
From: Shawn Chang
Date: Tue, 29 Jul 2025 17:25:47 -0700
Subject: [PATCH 1/8] Expose DataFile serde functions

---
 crates/iceberg/src/spec/manifest/_serde.rs    |  14 +-
 crates/iceberg/src/spec/manifest/data_file.rs |   8 +-
 crates/iceberg/src/spec/manifest/mod.rs       | 153 +++++++++++++++++-
 3 files changed, 167 insertions(+), 8 deletions(-)

diff --git a/crates/iceberg/src/spec/manifest/_serde.rs b/crates/iceberg/src/spec/manifest/_serde.rs
index fd7bc2e69a..e9124faf67 100644
--- a/crates/iceberg/src/spec/manifest/_serde.rs
+++ b/crates/iceberg/src/spec/manifest/_serde.rs
@@ -21,7 +21,7 @@ use serde_derive::{Deserialize, Serialize};
 use serde_with::serde_as;
 
 use super::{Datum, ManifestEntry, Schema, Struct};
-use crate::spec::{Literal, RawLiteral, StructType, Type};
+use crate::spec::{FormatVersion, Literal, RawLiteral, StructType, Type};
 use crate::{Error, ErrorKind};
 
 #[derive(Serialize, Deserialize)]
@@ -40,7 +40,7 @@ impl ManifestEntryV2 {
             snapshot_id: value.snapshot_id,
             sequence_number: value.sequence_number,
             file_sequence_number: value.file_sequence_number,
-            data_file: DataFileSerde::try_from(value.data_file, partition_type, false)?,
+            data_file: DataFileSerde::try_from(value.data_file, partition_type, FormatVersion::V2)?,
         })
     }
 
@@ -74,7 +74,7 @@ impl ManifestEntryV1 {
         Ok(Self {
             status: value.status as i32,
             snapshot_id: value.snapshot_id.unwrap_or_default(),
-            data_file: DataFileSerde::try_from(value.data_file, partition_type, true)?,
+            data_file: DataFileSerde::try_from(value.data_file, partition_type, FormatVersion::V1)?,
         })
     }
 
@@ -129,9 +129,13 @@ impl DataFileSerde {
     pub fn try_from(
         value: super::DataFile,
         partition_type: &StructType,
-        is_version_1: bool,
+        format_version: FormatVersion,
     ) -> Result<Self, Error> {
-        let block_size_in_bytes = if is_version_1 { Some(0) } else { None };
+        let block_size_in_bytes = if format_version == FormatVersion::V1 {
+            Some(0)
+        } else {
+            None
+        };
         Ok(Self {
             content: value.content as i32,
             file_path: value.file_path,
diff --git a/crates/iceberg/src/spec/manifest/data_file.rs b/crates/iceberg/src/spec/manifest/data_file.rs
index 1de59a3874..9ea1fcd0a7 100644
--- a/crates/iceberg/src/spec/manifest/data_file.rs
+++ b/crates/iceberg/src/spec/manifest/data_file.rs
@@ -297,8 +297,12 @@ pub fn write_data_files_to_avro(
     let mut writer = AvroWriter::new(&avro_schema, writer);
 
     for data_file in data_files {
-        let value = to_value(DataFileSerde::try_from(data_file, partition_type, true)?)?
-            .resolve(&avro_schema)?;
+        let value = to_value(DataFileSerde::try_from(
+            data_file,
+            partition_type,
+            FormatVersion::V1,
+        )?)?
+        .resolve(&avro_schema)?;
         writer.append(value)?;
     }
 
diff --git a/crates/iceberg/src/spec/manifest/mod.rs b/crates/iceberg/src/spec/manifest/mod.rs
index 33b7d38706..127e725dc5 100644
--- a/crates/iceberg/src/spec/manifest/mod.rs
+++ b/crates/iceberg/src/spec/manifest/mod.rs
@@ -34,6 +34,7 @@ use super::{
     UNASSIGNED_SEQUENCE_NUMBER,
 };
 use crate::error::Result;
+use crate::{Error, ErrorKind};
 
 /// A manifest contains metadata and a list of entries.
 #[derive(Debug, PartialEq, Eq, Clone)]
@@ -119,12 +120,46 @@ impl Manifest {
     }
 }
 
+/// Serialize a DataFile to a JSON string.
+pub fn serialize_data_file_to_json(
+    data_file: DataFile,
+    partition_type: &super::StructType,
+    format_version: FormatVersion,
+) -> Result<String> {
+    let serde = _serde::DataFileSerde::try_from(data_file, partition_type, format_version)?;
+    serde_json::to_string(&serde).map_err(|e| {
+        Error::new(
+            ErrorKind::DataInvalid,
+            "Failed to serialize DataFile to JSON!".to_string(),
+        )
+        .with_source(e)
+    })
+}
+
+/// Deserialize a DataFile from a JSON string.
+pub fn deserialize_data_file_from_json(
+    json: &str,
+    partition_spec_id: i32,
+    partition_type: &super::StructType,
+    schema: &Schema,
+) -> Result<DataFile> {
+    let serde = serde_json::from_str::<_serde::DataFileSerde>(json).map_err(|e| {
+        Error::new(
+            ErrorKind::DataInvalid,
+            "Failed to deserialize JSON to DataFile!".to_string(),
+        )
+        .with_source(e)
+    })?;
+
+    serde.try_into(partition_spec_id, partition_type, schema)
+}
+
 #[cfg(test)]
 mod tests {
     use std::collections::HashMap;
     use std::fs;
     use std::sync::Arc;
-
+    use arrow_array::StringArray;
     use tempfile::TempDir;
 
     use super::*;
@@ -1056,4 +1091,120 @@ mod tests {
         assert!(!partitions[2].clone().contains_null);
         assert_eq!(partitions[2].clone().contains_nan, Some(false));
     }
+
+    #[test]
+    fn test_data_file_serialization() {
+        // Create a simple schema
+        let schema = Schema::builder()
+            .with_schema_id(1)
+            .with_identifier_field_ids(vec![1])
+            .with_fields(vec![
+                NestedField::required(1, "id", Type::Primitive(PrimitiveType::Long))
+                    .into(),
+                NestedField::required(
+                    2,
+                    "name",
+                    Type::Primitive(PrimitiveType::String),
+                )
+                .into(),
+            ])
+            .build()
+            .unwrap();
+
+        // Create a partition spec
+        let partition_spec = PartitionSpec::builder(schema.clone())
+            .with_spec_id(1)
+            .add_partition_field("id", "id_partition", Transform::Identity)
+            .unwrap()
+            .build()
+            .unwrap();
+
+        // Get partition type from the partition spec
+        let partition_type = partition_spec.partition_type(&schema).unwrap();
+
+        // Create a vector of DataFile objects
+        let data_files = vec![
+            DataFileBuilder::default()
+                .content(DataContentType::Data)
+                .file_format(DataFileFormat::Parquet)
+                .file_path("path/to/file1.parquet".to_string())
+                .file_size_in_bytes(1024)
+                .record_count(100)
+                .partition_spec_id(1)
+                .partition(Struct::empty())
+                .column_sizes(HashMap::from([(1, 512), (2, 512)]))
+                .value_counts(HashMap::from([(1, 100), (2, 100)]))
+                .null_value_counts(HashMap::from([(1, 0), (2, 0)]))
+                .build()
+                .unwrap(),
+            DataFileBuilder::default()
+                .content(crate::spec::DataContentType::Data)
+                .file_format(DataFileFormat::Parquet)
+                .file_path("path/to/file2.parquet".to_string())
+                .file_size_in_bytes(2048)
+                .record_count(200)
+                .partition_spec_id(1)
+                .partition(Struct::empty())
+                .column_sizes(HashMap::from([(1, 1024), (2, 1024)]))
+                .value_counts(HashMap::from([(1, 200), (2, 200)]))
+                .null_value_counts(HashMap::from([(1, 10), (2, 5)]))
+                .build()
+                .unwrap(),
+        ];
+
+        // Serialize the DataFile objects
+        let serialized_files = data_files
+            .into_iter()
+            .map(|f| {
+                let json =
+                    serialize_data_file_to_json(f, &partition_type, FormatVersion::V2).unwrap();
+                println!("Test serialized data file: {}", json);
+                json
+            })
+            .collect::<Vec<String>>();
+
+        // Verify we have the expected number of serialized files
+        assert_eq!(serialized_files.len(), 2);
+
+        // Verify each serialized file contains expected data
+        for json in &serialized_files {
+            assert!(json.contains("path/to/file"));
+            assert!(json.contains("parquet"));
+            assert!(json.contains("record_count"));
+            assert!(json.contains("file_size_in_bytes"));
+        }
+
+        // Convert Vec<String> to StringArray and print it
+        let string_array = StringArray::from(serialized_files.clone());
+        println!("StringArray: {:?}", string_array);
+
+        // Now deserialize the JSON strings back into DataFile objects
+        println!("\nDeserializing back to DataFile objects:");
+        let deserialized_files: Vec<DataFile> = serialized_files
+            .into_iter()
+            .map(|json| {
+                let data_file = deserialize_data_file_from_json(
+                    &json,
+                    partition_spec.spec_id(),
+                    &partition_type,
+                    &schema,
+                )
+                    .unwrap();
+
+                println!("Deserialized DataFile: {:?}", data_file);
+                data_file
+            })
+            .collect();
+
+        // Verify we have the expected number of deserialized files
+        assert_eq!(deserialized_files.len(), 2);
+
+        // Verify the deserialized files have the expected properties
+        for file in &deserialized_files {
+            assert_eq!(file.content_type(), DataContentType::Data);
+            assert_eq!(file.file_format(), DataFileFormat::Parquet);
+            assert!(file.file_path().contains("path/to/file"));
+            assert!(file.record_count() == 100 || file.record_count() == 200);
+        }
+    }
 }

From a80a206f2fd314f3d357f11164fb1dbfe70900cf Mon Sep 17 00:00:00 2001
From: Shawn Chang
Date: Tue, 29 Jul 2025 18:01:42 -0700
Subject: [PATCH 2/8] fmt

---
 crates/iceberg/src/spec/manifest/mod.rs | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)

diff --git a/crates/iceberg/src/spec/manifest/mod.rs b/crates/iceberg/src/spec/manifest/mod.rs
index 127e725dc5..b8aa4baa11 100644
--- a/crates/iceberg/src/spec/manifest/mod.rs
+++ b/crates/iceberg/src/spec/manifest/mod.rs
@@ -159,6 +159,7 @@ mod tests {
     use std::collections::HashMap;
     use std::fs;
     use std::sync::Arc;
+
     use arrow_array::StringArray;
     use tempfile::TempDir;
 
@@ -1099,14 +1100,8 @@ mod tests {
             .with_schema_id(1)
             .with_identifier_field_ids(vec![1])
             .with_fields(vec![
-                NestedField::required(1, "id", Type::Primitive(PrimitiveType::Long))
-                    .into(),
-                NestedField::required(
-                    2,
-                    "name",
-                    Type::Primitive(PrimitiveType::String),
-                )
-                .into(),
+                NestedField::required(1, "id", Type::Primitive(PrimitiveType::Long)).into(),
+                NestedField::required(2, "name", Type::Primitive(PrimitiveType::String)).into(),
             ])
             .build()
             .unwrap();
@@ -1189,7 +1184,7 @@ mod tests {
                     &partition_type,
                     &schema,
                 )
-                    .unwrap();
+                .unwrap();
 
                 println!("Deserialized DataFile: {:?}", data_file);
                 data_file

From 384ec011722f58dd97a72a4b055205eb411841ac Mon Sep 17 00:00:00 2001
From: Shawn Chang
Date: Tue, 29 Jul 2025 21:52:03 -0700
Subject: [PATCH 3/8] use expect in ut

---
 crates/iceberg/src/spec/manifest/mod.rs | 124 ++++++++++++++++--------
 1 file changed, 83 insertions(+), 41 deletions(-)

diff --git a/crates/iceberg/src/spec/manifest/mod.rs b/crates/iceberg/src/spec/manifest/mod.rs
index b8aa4baa11..02bf89d250 100644
--- a/crates/iceberg/src/spec/manifest/mod.rs
+++ b/crates/iceberg/src/spec/manifest/mod.rs
@@ -160,7 +160,8 @@ mod tests {
     use std::fs;
     use std::sync::Arc;
 
-    use arrow_array::StringArray;
+    use expect_test::expect;
+    use serde_json::Value;
     use tempfile::TempDir;
 
     use super::*;
@@ -1127,22 +1128,22 @@ mod tests {
                 .record_count(100)
                 .partition_spec_id(1)
                 .partition(Struct::empty())
-                .column_sizes(HashMap::from([(1, 512), (2, 512)]))
-                .value_counts(HashMap::from([(1, 100), (2, 100)]))
-                .null_value_counts(HashMap::from([(1, 0), (2, 0)]))
+                .column_sizes(HashMap::from([(1, 512)]))
+                .value_counts(HashMap::from([(1, 100)]))
+                .null_value_counts(HashMap::from([(1, 0)]))
                 .build()
                 .unwrap(),
             DataFileBuilder::default()
-                .content(crate::spec::DataContentType::Data)
+                .content(DataContentType::Data)
                 .file_format(DataFileFormat::Parquet)
.file_path("path/to/file2.parquet".to_string()) .file_size_in_bytes(2048) .record_count(200) .partition_spec_id(1) .partition(Struct::empty()) - .column_sizes(HashMap::from([(1, 1024), (2, 1024)])) - .value_counts(HashMap::from([(1, 200), (2, 200)])) - .null_value_counts(HashMap::from([(1, 10), (2, 5)])) + .column_sizes(HashMap::from([(1, 1024)])) + .value_counts(HashMap::from([(1, 200)])) + .null_value_counts(HashMap::from([(1, 10)])) .build() .unwrap(), ]; @@ -1150,56 +1151,97 @@ mod tests { // Serialize the DataFile objects let serialized_files = data_files .into_iter() - .map(|f| { - let json = - serialize_data_file_to_json(f, &partition_type, FormatVersion::V2).unwrap(); - println!("Test serialized data file: {}", json); - json - }) + .map(|f| serialize_data_file_to_json(f, &partition_type, FormatVersion::V2).unwrap()) .collect::>(); - // Verify we have the expected number of serialized files + // Verify we have the expected serialized files assert_eq!(serialized_files.len(), 2); - - // Verify each serialized file contains expected data - for json in &serialized_files { - assert!(json.contains("path/to/file")); - assert!(json.contains("parquet")); - assert!(json.contains("record_count")); - assert!(json.contains("file_size_in_bytes")); - } - - // Convert Vec to StringArray and print it - let string_array = StringArray::from(serialized_files.clone()); - println!("StringArray: {:?}", string_array); + let pretty_json1: Value = serde_json::from_str(&serialized_files.get(0).unwrap()).unwrap(); + let pretty_json2: Value = serde_json::from_str(&serialized_files.get(1).unwrap()).unwrap(); + let expected_serialized_file1 = serde_json::json!({ + "content": 0, + "file_path": "path/to/file1.parquet", + "file_format": "PARQUET", + "partition": {}, + "record_count": 100, + "file_size_in_bytes": 1024, + "column_sizes": [ + { "key": 1, "value": 512 } + ], + "value_counts": [ + { "key": 1, "value": 100 } + ], + "null_value_counts": [ + { "key": 1, "value": 0 } + ], + "nan_value_counts": [], + "lower_bounds": [], + "upper_bounds": [], + "key_metadata": null, + "split_offsets": [], + "equality_ids": [], + "sort_order_id": null, + "first_row_id": null, + "referenced_data_file": null, + "content_offset": null, + "content_size_in_bytes": null + }); + let expected_serialized_file2 = serde_json::json!({ + "content": 0, + "file_path": "path/to/file2.parquet", + "file_format": "PARQUET", + "partition": {}, + "record_count": 200, + "file_size_in_bytes": 2048, + "column_sizes": [ + { "key": 1, "value": 1024 } + ], + "value_counts": [ + { "key": 1, "value": 200 } + ], + "null_value_counts": [ + { "key": 1, "value": 10 } + ], + "nan_value_counts": [], + "lower_bounds": [], + "upper_bounds": [], + "key_metadata": null, + "split_offsets": [], + "equality_ids": [], + "sort_order_id": null, + "first_row_id": null, + "referenced_data_file": null, + "content_offset": null, + "content_size_in_bytes": null + }); + assert_eq!(pretty_json1, expected_serialized_file1); + assert_eq!(pretty_json2, expected_serialized_file2); // Now deserialize the JSON strings back into DataFile objects - println!("\nDeserializing back to DataFile objects:"); let deserialized_files: Vec = serialized_files .into_iter() .map(|json| { - let data_file = deserialize_data_file_from_json( + deserialize_data_file_from_json( &json, partition_spec.spec_id(), &partition_type, &schema, ) - .unwrap(); - - println!("Deserialized DataFile: {:?}", data_file); - data_file + .unwrap() }) .collect(); // Verify we have the expected number of deserialized files 
         assert_eq!(deserialized_files.len(), 2);
-
-        // Verify the deserialized files have the expected properties
-        for file in &deserialized_files {
-            assert_eq!(file.content_type(), DataContentType::Data);
-            assert_eq!(file.file_format(), DataFileFormat::Parquet);
-            assert!(file.file_path().contains("path/to/file"));
-            assert!(file.record_count() == 100 || file.record_count() == 200);
-        }
+        let deserialized_data_file1 = deserialized_files.get(0).unwrap();
+        let deserialized_data_file2 = deserialized_files.get(1).unwrap();
+        let expected_deserialized_file1 = expect![[
+            r#"DataFile { content: Data, file_path: "path/to/file1.parquet", file_format: Parquet, partition: Struct { fields: [] }, record_count: 100, file_size_in_bytes: 1024, column_sizes: {1: 512}, value_counts: {1: 100}, null_value_counts: {1: 0}, nan_value_counts: {}, lower_bounds: {}, upper_bounds: {}, key_metadata: None, split_offsets: [], equality_ids: [], sort_order_id: None, first_row_id: None, partition_spec_id: 1, referenced_data_file: None, content_offset: None, content_size_in_bytes: None }"#
+        ]];
+        let expected_deserialized_file2 = expect![[
+            r#"DataFile { content: Data, file_path: "path/to/file2.parquet", file_format: Parquet, partition: Struct { fields: [] }, record_count: 200, file_size_in_bytes: 2048, column_sizes: {1: 1024}, value_counts: {1: 200}, null_value_counts: {1: 10}, nan_value_counts: {}, lower_bounds: {}, upper_bounds: {}, key_metadata: None, split_offsets: [], equality_ids: [], sort_order_id: None, first_row_id: None, partition_spec_id: 1, referenced_data_file: None, content_offset: None, content_size_in_bytes: None }"#
+        ]];
+        expected_deserialized_file1.assert_eq(&format!("{:?}", deserialized_data_file1));
+        expected_deserialized_file2.assert_eq(&format!("{:?}", deserialized_data_file2));
     }
 }
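The next patch answers its own subject line: clippy's `get_first` lint fires on `slice.get(0)` because `slice.first()` expresses the same access more clearly and returns the same `Option<&T>`. A standalone illustration of the lint (plain Rust, editor's sketch, not part of the patch series):

fn main() {
    let files = vec!["file1.parquet", "file2.parquet"];

    // clippy::get_first warns on the `get(0)` form below; `first()` is
    // the preferred spelling and behaves identically.
    assert_eq!(files.get(0), Some(&"file1.parquet")); // lint target
    assert_eq!(files.first(), Some(&"file1.parquet")); // preferred

    // `get(1)` is untouched by the lint; there is no dedicated accessor
    // for nonzero indices.
    assert_eq!(files.get(1), Some(&"file2.parquet"));
}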
From a761cc8599de92641d904ca6fb3ffb08b4f9557f Mon Sep 17 00:00:00 2001
From: Shawn Chang
Date: Tue, 29 Jul 2025 21:59:57 -0700
Subject: [PATCH 4/8] why first? clippy is harsh

---
 crates/iceberg/src/spec/manifest/mod.rs | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/crates/iceberg/src/spec/manifest/mod.rs b/crates/iceberg/src/spec/manifest/mod.rs
index 02bf89d250..8e2455c310 100644
--- a/crates/iceberg/src/spec/manifest/mod.rs
+++ b/crates/iceberg/src/spec/manifest/mod.rs
@@ -1156,8 +1156,8 @@ mod tests {
 
         // Verify we have the expected serialized files
         assert_eq!(serialized_files.len(), 2);
-        let pretty_json1: Value = serde_json::from_str(&serialized_files.get(0).unwrap()).unwrap();
-        let pretty_json2: Value = serde_json::from_str(&serialized_files.get(1).unwrap()).unwrap();
+        let pretty_json1: Value = serde_json::from_str(serialized_files.first().unwrap()).unwrap();
+        let pretty_json2: Value = serde_json::from_str(serialized_files.get(1).unwrap()).unwrap();
         let expected_serialized_file1 = serde_json::json!({
@@ -1233,7 +1233,7 @@ mod tests {
         // Verify we have the expected number of deserialized files
         assert_eq!(deserialized_files.len(), 2);
-        let deserialized_data_file1 = deserialized_files.get(0).unwrap();
+        let deserialized_data_file1 = deserialized_files.first().unwrap();
         let deserialized_data_file2 = deserialized_files.get(1).unwrap();

From f16cc38dc8bcac3f5133a8dc305a0bbb9dfcac13 Mon Sep 17 00:00:00 2001
From: Shawn Chang
Date: Fri, 1 Aug 2025 15:55:28 -0700
Subject: [PATCH 5/8] address randomness

---
 crates/iceberg/src/spec/manifest/_serde.rs | 10 ++++-
 crates/iceberg/src/spec/manifest/mod.rs    | 45 ++++++++++++----------
 2 files changed, 32 insertions(+), 23 deletions(-)

diff --git a/crates/iceberg/src/spec/manifest/_serde.rs b/crates/iceberg/src/spec/manifest/_serde.rs
index e9124faf67..24e49a8a20 100644
--- a/crates/iceberg/src/spec/manifest/_serde.rs
+++ b/crates/iceberg/src/spec/manifest/_serde.rs
@@ -297,7 +297,7 @@ fn parse_i64_entry(v: Vec<I64Entry>) -> Result<HashMap<i32, u64>, Error> {
 }
 
 fn to_i64_entry(entries: HashMap<i32, u64>) -> Result<Vec<I64Entry>, Error> {
-    entries
+    let mut i64_entries = entries
         .iter()
         .map(|e| {
             Ok(I64Entry {
@@ -305,7 +305,13 @@ fn to_i64_entry(entries: HashMap<i32, u64>) -> Result<Vec<I64Entry>, Error> {
                 value: (*e.1).try_into()?,
             })
         })
-        .collect()
+        .collect::<Result<Vec<I64Entry>, Error>>()?;
+
+    if cfg!(test) {
+        i64_entries.sort_by_key(|e| e.key);
+    }
+
+    Ok(i64_entries)
 }
 
 #[cfg(test)]
diff --git a/crates/iceberg/src/spec/manifest/mod.rs b/crates/iceberg/src/spec/manifest/mod.rs
index 8e2455c310..da03977324 100644
--- a/crates/iceberg/src/spec/manifest/mod.rs
+++ b/crates/iceberg/src/spec/manifest/mod.rs
@@ -160,7 +160,6 @@ mod tests {
     use std::fs;
     use std::sync::Arc;
 
-    use expect_test::expect;
     use serde_json::Value;
     use tempfile::TempDir;
 
     use super::*;
@@ -1128,9 +1127,9 @@ mod tests {
                 .record_count(100)
                 .partition_spec_id(1)
                 .partition(Struct::empty())
-                .column_sizes(HashMap::from([(1, 512)]))
-                .value_counts(HashMap::from([(1, 100)]))
-                .null_value_counts(HashMap::from([(1, 0)]))
+                .column_sizes(HashMap::from([(1, 512), (2, 1024)]))
+                .value_counts(HashMap::from([(1, 100), (2, 500)]))
+                .null_value_counts(HashMap::from([(1, 0), (2, 1)]))
                 .build()
                 .unwrap(),
             DataFileBuilder::default()
@@ -1140,15 +1139,16 @@ mod tests {
                 .record_count(200)
                 .partition_spec_id(1)
                 .partition(Struct::empty())
-                .column_sizes(HashMap::from([(1, 1024)]))
-                .value_counts(HashMap::from([(1, 200)]))
-                .null_value_counts(HashMap::from([(1, 10)]))
+                .column_sizes(HashMap::from([(1, 1024), (2, 2048)]))
+                .value_counts(HashMap::from([(1, 200), (2, 600)]))
+                .null_value_counts(HashMap::from([(1, 10), (2, 999)]))
                 .build()
                 .unwrap(),
         ];
 
         // Serialize the DataFile objects
         let serialized_files = data_files
+            .clone()
             .into_iter()
             .map(|f| serialize_data_file_to_json(f, &partition_type, FormatVersion::V2).unwrap())
             .collect::<Vec<String>>();
@@ -1166,14 +1166,17 @@ mod tests {
             "record_count": 100,
             "file_size_in_bytes": 1024,
             "column_sizes": [
-                { "key": 1, "value": 512 }
+                { "key": 1, "value": 512 },
+                { "key": 2, "value": 1024 }
             ],
             "value_counts": [
-                { "key": 1, "value": 100 }
+                { "key": 1, "value": 100 },
+                { "key": 2, "value": 500 }
             ],
             "null_value_counts": [
-                { "key": 1, "value": 0 }
+                { "key": 1, "value": 0 },
+                { "key": 2, "value": 1 }
             ],
             "nan_value_counts": [],
             "lower_bounds": [],
             "upper_bounds": [],
@@ -1197,14 +1200,17 @@ mod tests {
             "record_count": 200,
             "file_size_in_bytes": 2048,
             "column_sizes": [
-                { "key": 1, "value": 1024 }
+                { "key": 1, "value": 1024 },
+                { "key": 2, "value": 2048 }
             ],
             "value_counts": [
-                { "key": 1, "value": 200 }
+                { "key": 1, "value": 200 },
+                { "key": 2, "value": 600 }
             ],
             "null_value_counts": [
-                { "key": 1, "value": 10 }
+                { "key": 1, "value": 10 },
+                { "key": 2, "value": 999 }
             ],
             "nan_value_counts": [],
             "lower_bounds": [],
             "upper_bounds": [],
@@ -1234,14 +1240,11 @@ mod tests {
         // Verify we have the expected number of deserialized files
         assert_eq!(deserialized_files.len(), 2);
         let deserialized_data_file1 = deserialized_files.first().unwrap();
         let deserialized_data_file2 = deserialized_files.get(1).unwrap();
-        let expected_deserialized_file1 = expect![[
-            r#"DataFile { content: Data, file_path: "path/to/file1.parquet", file_format: Parquet, partition: Struct { fields: [] }, record_count: 100, file_size_in_bytes: 1024, column_sizes: {1: 512}, value_counts: {1: 100}, null_value_counts: {1: 0}, nan_value_counts: {}, lower_bounds: {}, upper_bounds: {}, key_metadata: None, split_offsets: [], equality_ids: [], sort_order_id: None, first_row_id: None, partition_spec_id: 1, referenced_data_file: None, content_offset: None, content_size_in_bytes: None }"#
-        ]];
-        let expected_deserialized_file2 = expect![[
-            r#"DataFile { content: Data, file_path: "path/to/file2.parquet", file_format: Parquet, partition: Struct { fields: [] }, record_count: 200, file_size_in_bytes: 2048, column_sizes: {1: 1024}, value_counts: {1: 200}, null_value_counts: {1: 10}, nan_value_counts: {}, lower_bounds: {}, upper_bounds: {}, key_metadata: None, split_offsets: [], equality_ids: [], sort_order_id: None, first_row_id: None, partition_spec_id: 1, referenced_data_file: None, content_offset: None, content_size_in_bytes: None }"#
-        ]];
-        expected_deserialized_file1.assert_eq(&format!("{:?}", deserialized_data_file1));
-        expected_deserialized_file2.assert_eq(&format!("{:?}", deserialized_data_file2));
+        let original_data_file1 = data_files.first().unwrap();
+        let original_data_file2 = data_files.get(1).unwrap();
+
+        assert_eq!(deserialized_data_file1, original_data_file1);
+        assert_eq!(deserialized_data_file2, original_data_file2);
     }
 }
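The sort introduced above exists because `HashMap` iteration order in Rust is randomized per process, so the `column_sizes`/`value_counts`/`null_value_counts` entries could serialize in either order and break exact JSON comparisons in the test. A standalone sketch of the failure mode and the fix (std only, editor's sketch, not part of the patch series):

use std::collections::HashMap;

fn main() {
    let column_sizes: HashMap<i32, u64> = HashMap::from([(1, 512), (2, 1024)]);

    // Direct iteration is not deterministic: this Vec may come out as
    // [(1, 512), (2, 1024)] or [(2, 1024), (1, 512)] depending on the run,
    // so any serialization that walks the map directly is not byte-stable.
    let entries: Vec<(i32, u64)> = column_sizes.iter().map(|(k, v)| (*k, *v)).collect();

    // Sorting by key, as to_i64_entry now does for test builds, pins the order.
    let mut sorted = entries.clone();
    sorted.sort_by_key(|(k, _)| *k);
    assert_eq!(sorted, vec![(1, 512), (2, 1024)]);
}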
From b2c14bda936ba0ee5c0574c4344ba0c0b4cd450b Mon Sep 17 00:00:00 2001
From: Shawn Chang
Date: Fri, 1 Aug 2025 16:32:16 -0700
Subject: [PATCH 6/8] minor

---
 crates/iceberg/src/spec/manifest/_serde.rs | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/crates/iceberg/src/spec/manifest/_serde.rs b/crates/iceberg/src/spec/manifest/_serde.rs
index 24e49a8a20..c3a7939a3d 100644
--- a/crates/iceberg/src/spec/manifest/_serde.rs
+++ b/crates/iceberg/src/spec/manifest/_serde.rs
@@ -307,9 +307,9 @@ fn to_i64_entry(entries: HashMap<i32, u64>) -> Result<Vec<I64Entry>, Error> {
         })
         .collect::<Result<Vec<I64Entry>, Error>>()?;
 
-    if cfg!(test) {
-        i64_entries.sort_by_key(|e| e.key);
-    }
+    // Ensure that the order is deterministic during testing
+    #[cfg(test)]
+    i64_entries.sort_by_key(|e| e.key);
 
     Ok(i64_entries)
 }

From 474c0fd0fa168a279bd237cc2d3fb6b430194eed Mon Sep 17 00:00:00 2001
From: Shawn Chang
Date: Sun, 3 Aug 2025 22:51:16 -0700
Subject: [PATCH 7/8] smart clippy

---
 crates/iceberg/src/spec/manifest/_serde.rs | 1 +
 1 file changed, 1 insertion(+)

diff --git a/crates/iceberg/src/spec/manifest/_serde.rs b/crates/iceberg/src/spec/manifest/_serde.rs
index c3a7939a3d..4cac67772a 100644
--- a/crates/iceberg/src/spec/manifest/_serde.rs
+++ b/crates/iceberg/src/spec/manifest/_serde.rs
@@ -296,6 +296,7 @@ fn parse_i64_entry(v: Vec<I64Entry>) -> Result<HashMap<i32, u64>, Error> {
     Ok(m)
 }
 
+#[cfg_attr(test, allow(unused_mut))]
 fn to_i64_entry(entries: HashMap<i32, u64>) -> Result<Vec<I64Entry>, Error> {
     let mut i64_entries = entries
         .iter()

From c90fe15d22a9e71beec53137431eef320d464d5c Mon Sep 17 00:00:00 2001
From: Shawn Chang
Date: Sun, 3 Aug 2025 22:56:06 -0700
Subject: [PATCH 8/8] not so smart clippy

---
 crates/iceberg/src/spec/manifest/_serde.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/crates/iceberg/src/spec/manifest/_serde.rs b/crates/iceberg/src/spec/manifest/_serde.rs
index 4cac67772a..99d3128db5 100644
--- a/crates/iceberg/src/spec/manifest/_serde.rs
+++ b/crates/iceberg/src/spec/manifest/_serde.rs
@@ -296,7 +296,7 @@ fn parse_i64_entry(v: Vec<I64Entry>) -> Result<HashMap<i32, u64>, Error> {
     Ok(m)
 }
 
-#[cfg_attr(test, allow(unused_mut))]
+#[allow(unused_mut)]
 fn to_i64_entry(entries: HashMap<i32, u64>) -> Result<Vec<I64Entry>, Error> {
     let mut i64_entries = entries
         .iter()
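Taken together, the series leaves `serialize_data_file_to_json` and `deserialize_data_file_from_json` as public helpers. A minimal round-trip sketch of how a caller might use them, mirroring the unit test above (editor's sketch, not part of the patch series; it assumes the helpers are re-exported through `iceberg::spec` like the other manifest items, and that `iceberg::Result` is the crate's root result alias):

use std::collections::HashMap;

use iceberg::Result;
use iceberg::spec::{
    DataContentType, DataFileBuilder, DataFileFormat, FormatVersion, NestedField, PartitionSpec,
    PrimitiveType, Schema, Struct, Transform, Type, deserialize_data_file_from_json,
    serialize_data_file_to_json,
};

fn main() -> Result<()> {
    // Schema and partition spec, built the same way as in the test.
    let schema = Schema::builder()
        .with_schema_id(1)
        .with_fields(vec![
            NestedField::required(1, "id", Type::Primitive(PrimitiveType::Long)).into(),
        ])
        .build()?;
    let partition_spec = PartitionSpec::builder(schema.clone())
        .with_spec_id(1)
        .add_partition_field("id", "id_partition", Transform::Identity)?
        .build()?;
    let partition_type = partition_spec.partition_type(&schema)?;

    let data_file = DataFileBuilder::default()
        .content(DataContentType::Data)
        .file_format(DataFileFormat::Parquet)
        .file_path("path/to/file1.parquet".to_string())
        .file_size_in_bytes(1024)
        .record_count(100)
        .partition_spec_id(partition_spec.spec_id())
        .partition(Struct::empty())
        .column_sizes(HashMap::from([(1, 512)]))
        .value_counts(HashMap::from([(1, 100)]))
        .null_value_counts(HashMap::from([(1, 0)]))
        .build()
        .unwrap(); // builder error type differs from iceberg::Error, as in the test

    // Serialize with the v2 field layout, then parse the JSON back.
    let json = serialize_data_file_to_json(data_file.clone(), &partition_type, FormatVersion::V2)?;
    let round_tripped =
        deserialize_data_file_from_json(&json, partition_spec.spec_id(), &partition_type, &schema)?;

    // DataFile implements PartialEq, which is what patch 5 relies on.
    assert_eq!(round_tripped, data_file);
    Ok(())
}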