Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
58 changes: 54 additions & 4 deletions crates/iceberg/src/spec/manifest/_serde.rs
Original file line number Diff line number Diff line change
Expand Up @@ -245,7 +245,7 @@ struct BytesEntry {
fn parse_bytes_entry(v: Vec<BytesEntry>, schema: &Schema) -> Result<HashMap<i32, Datum>, Error> {
let mut m = HashMap::with_capacity(v.len());
for entry in v {
// We ignore the entry if the field is not found in the schema, due to schema evolution.
// Try to find the field in the schema to get proper type information
if let Some(field) = schema.field_by_id(entry.key) {
let data_type = field
.field_type
Expand All @@ -258,6 +258,10 @@ fn parse_bytes_entry(v: Vec<BytesEntry>, schema: &Schema) -> Result<HashMap<i32,
})?
.clone();
m.insert(entry.key, Datum::try_from_bytes(&entry.value, data_type)?);
} else {
// Field is not in current schema (e.g., dropped field due to schema evolution).
// Store the statistic as binary data to preserve it even though we don't know its type.
m.insert(entry.key, Datum::binary(entry.value.to_vec()));
Copy link
Contributor

@liurenjie1024 liurenjie1024 Nov 13, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This fix may lead to silent errors in user applications, because users assume that the returned statistics in the Manifest are all parsed, and they will see a type mismatch. I think there are two ways to do this fix:

  1. We add a new enum:
enum Statistic {
   Parsed(Datum),
   Raw(Vec<u8>)
}

And change DataFile's lower/upper bounds to HashMap<i32, Statistic>. This will lead to a breaking API change, but users will be aware of this, and will not see silent breaking of their application.

  1. We pass TableMetadata into this parsing function, and search for the missing field id in all schemas in TableMetadata. This approach may slow down deserialization a little when encountering field ids that are missing due to schema evolution, but it will not lead to any API change.

Personally I prefer approach 2, WDYT?

}
}
Ok(m)
Expand Down Expand Up @@ -320,11 +324,11 @@ mod tests {
use std::io::Cursor;
use std::sync::Arc;

use crate::spec::manifest::_serde::{I64Entry, parse_i64_entry};
use crate::spec::manifest::_serde::{BytesEntry, I64Entry, parse_bytes_entry, parse_i64_entry};
use crate::spec::{
DataContentType, DataFile, DataFileFormat, Datum, FormatVersion, NestedField,
PrimitiveType, Schema, Struct, StructType, Type, read_data_files_from_avro,
write_data_files_to_avro,
PrimitiveLiteral, PrimitiveType, Schema, Struct, StructType, Type,
read_data_files_from_avro, write_data_files_to_avro,
};

#[test]
Expand Down Expand Up @@ -590,4 +594,50 @@ mod tests {
assert_eq!(data_file.file_size_in_bytes, 2048);
assert_eq!(data_file.partition_spec_id, 0);
}

#[test]
fn test_parse_bytes_entry_preserves_dropped_field_statistics() {
    use serde_bytes::ByteBuf;

    // Build a schema that knows only about field ID 1; field ID 2 is absent
    // (simulating a column dropped by schema evolution).
    let current_schema = Schema::builder()
        .with_fields(vec![Arc::new(NestedField::required(
            1,
            "existing_field",
            Type::Primitive(PrimitiveType::Int),
        ))])
        .build()
        .unwrap();

    // One entry matches the schema (id 1), one does not (id 2).
    let input = vec![
        BytesEntry {
            key: 1,
            value: ByteBuf::from(vec![42, 0, 0, 0]), // little-endian encoding of int 42
        },
        BytesEntry {
            key: 2,                                  // not present in the schema
            value: ByteBuf::from(vec![1, 2, 3, 4]),  // opaque bytes for the dropped field
        },
    ];

    let parsed = parse_bytes_entry(input, &current_schema).unwrap();

    // Neither statistic may be silently discarded.
    assert_eq!(parsed.len(), 2, "Both statistics should be preserved");

    // The in-schema field decodes with its real type.
    let known = parsed.get(&1).expect("Field 1 should exist");
    assert_eq!(known.data_type(), &PrimitiveType::Int);
    assert_eq!(known.to_string(), "42");

    // The dropped field falls back to raw binary so no information is lost.
    let dropped = parsed.get(&2).expect("Field 2 should be preserved");
    assert_eq!(dropped.data_type(), &PrimitiveType::Binary);
    match dropped.literal() {
        PrimitiveLiteral::Binary(bytes) => assert_eq!(bytes, &vec![1, 2, 3, 4]),
        _ => panic!("Field 2 should be stored as Binary"),
    }
}
}
6 changes: 4 additions & 2 deletions crates/iceberg/src/spec/manifest/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -783,8 +783,8 @@ mod tests {
Manifest::parse_avro(fs::read(path).expect("read_file must succeed").as_slice())
.unwrap();

// Compared with original manifest, the lower_bounds and upper_bounds no longer has data for field 3, and
// other parts should be same.
// Compared with the original manifest, the lower_bounds and upper_bounds now PRESERVE data for field 3
// as binary (since field 3 is not in the current schema), and the other parts should be the same.
// The snapshot id is assigned when the entry is added to the manifest.
let schema = Arc::new(
Schema::builder()
Expand Down Expand Up @@ -834,10 +834,12 @@ mod tests {
lower_bounds: HashMap::from([
(1, Datum::long(1)),
(2, Datum::int(2)),
(3, Datum::binary(vec![120])), // Field 3 preserved as binary
]),
upper_bounds: HashMap::from([
(1, Datum::long(1)),
(2, Datum::int(2)),
(3, Datum::binary(vec![120])), // Field 3 preserved as binary
]),
key_metadata: None,
split_offsets: vec![4],
Expand Down