Skip to content

Commit 7b18773

Browse files
committed
fix nullable field of equality delete writer
1 parent 10e9a61 commit 7b18773

File tree

4 files changed

+70
-10
lines changed

4 files changed

+70
-10
lines changed

Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ apache-avro = "0.17"
4343
array-init = "2"
4444
arrow-arith = { version = "53" }
4545
arrow-array = { version = "53" }
46+
arrow-buffer = { version = "53" }
4647
arrow-cast = { version = "53" }
4748
arrow-ord = { version = "53" }
4849
arrow-schema = { version = "53" }

crates/iceberg/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ apache-avro = { workspace = true }
4646
array-init = { workspace = true }
4747
arrow-arith = { workspace = true }
4848
arrow-array = { workspace = true }
49+
arrow-buffer = { workspace = true }
4950
arrow-cast = { workspace = true }
5051
arrow-ord = { workspace = true }
5152
arrow-schema = { workspace = true }

crates/iceberg/src/arrow/record_batch_projector.rs

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,8 @@
1717

1818
use std::sync::Arc;
1919

20-
use arrow_array::{ArrayRef, RecordBatch, StructArray};
20+
use arrow_array::{make_array, ArrayRef, RecordBatch, StructArray};
21+
use arrow_buffer::NullBuffer;
2122
use arrow_schema::{DataType, Field, FieldRef, Fields, Schema, SchemaRef};
2223

2324
use crate::error::Result;
@@ -138,6 +139,7 @@ impl RecordBatchProjector {
138139
fn get_column_by_field_index(batch: &[ArrayRef], field_index: &[usize]) -> Result<ArrayRef> {
139140
let mut rev_iterator = field_index.iter().rev();
140141
let mut array = batch[*rev_iterator.next().unwrap()].clone();
142+
let mut null_buffer = array.logical_nulls();
141143
for idx in rev_iterator {
142144
array = array
143145
.as_any()
@@ -148,8 +150,11 @@ impl RecordBatchProjector {
148150
))?
149151
.column(*idx)
150152
.clone();
153+
null_buffer = NullBuffer::union(null_buffer.as_ref(), array.logical_nulls().as_ref());
151154
}
152-
Ok(array)
155+
Ok(make_array(
156+
array.to_data().into_builder().nulls(null_buffer).build()?,
157+
))
153158
}
154159
}
155160

crates/iceberg/src/writer/base_writer/equality_delete_writer.rs

Lines changed: 61 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -67,13 +67,12 @@ impl EqualityDeleteWriterConfig {
6767
original_arrow_schema,
6868
&equality_ids,
6969
// The following rules come from https://iceberg.apache.org/spec/#identifier-field-ids
70+
// and https://iceberg.apache.org/spec/#equality-delete-files
7071
// - The identifier field ids must refer to primitive types.
7172
// - The identifier field ids must not be used for floating point types or nullable fields (nullability is not rejected by the check below).
72-
// - The identifier field ids can be nested field of struct but not nested field of nullable struct.
7373
|field| {
7474
// Only primitive types are allowed to be used for identifier field ids
75-
if field.is_nullable()
76-
|| field.data_type().is_nested()
75+
if field.data_type().is_nested()
7776
|| matches!(
7877
field.data_type(),
7978
DataType::Float16 | DataType::Float32 | DataType::Float64
@@ -92,7 +91,7 @@ impl EqualityDeleteWriterConfig {
9291
.map_err(|e| Error::new(ErrorKind::Unexpected, e.to_string()))?,
9392
))
9493
},
95-
|field: &Field| !field.is_nullable(),
94+
|_field: &Field| true,
9695
)?;
9796
Ok(Self {
9897
equality_ids,
@@ -172,6 +171,7 @@ mod test {
172171

173172
use arrow_array::types::Int32Type;
174173
use arrow_array::{ArrayRef, BooleanArray, Int32Array, Int64Array, RecordBatch, StructArray};
174+
use arrow_buffer::NullBuffer;
175175
use arrow_schema::DataType;
176176
use arrow_select::concat::concat_batches;
177177
use itertools::Itertools;
@@ -484,14 +484,10 @@ mod test {
484484
// Float and Double are not allowed to be used for equality delete
485485
assert!(EqualityDeleteWriterConfig::new(vec![0], schema.clone(), None).is_err());
486486
assert!(EqualityDeleteWriterConfig::new(vec![1], schema.clone(), None).is_err());
487-
// Int is nullable, not allowed to be used for equality delete
488-
assert!(EqualityDeleteWriterConfig::new(vec![2], schema.clone(), None).is_err());
489487
// Struct is not allowed to be used for equality delete
490488
assert!(EqualityDeleteWriterConfig::new(vec![3], schema.clone(), None).is_err());
491489
// Nested field of struct is allowed to be used for equality delete
492490
assert!(EqualityDeleteWriterConfig::new(vec![4], schema.clone(), None).is_ok());
493-
// Nested field of optional struct is not allowed to be used for equality delete
494-
assert!(EqualityDeleteWriterConfig::new(vec![6], schema.clone(), None).is_err());
495491
// Nested field of map is not allowed to be used for equality delete
496492
assert!(EqualityDeleteWriterConfig::new(vec![7], schema.clone(), None).is_err());
497493
assert!(EqualityDeleteWriterConfig::new(vec![8], schema.clone(), None).is_err());
@@ -657,4 +653,61 @@ mod test {
657653

658654
Ok(())
659655
}
656+
657+
#[tokio::test]
658+
async fn test_equality_delete_with_nullable_field() -> Result<(), anyhow::Error> {
659+
// prepare data
660+
// Int, Struct(Int)
661+
let schema = Schema::builder()
662+
.with_schema_id(1)
663+
.with_fields(vec![
664+
NestedField::optional(0, "col0", Type::Primitive(PrimitiveType::Int)).into(),
665+
NestedField::optional(
666+
1,
667+
"col1",
668+
Type::Struct(StructType::new(vec![NestedField::optional(
669+
2,
670+
"sub_col",
671+
Type::Primitive(PrimitiveType::Int),
672+
)
673+
.into()])),
674+
)
675+
.into(),
676+
])
677+
.build()
678+
.unwrap();
679+
let arrow_schema = Arc::new(schema_to_arrow_schema(&schema).unwrap());
680+
// null 1
681+
// 2 null(struct)
682+
// 3 null(field)
683+
let col0 = Arc::new(Int32Array::from(vec![None, Some(2), Some(3)])) as ArrayRef;
684+
let nulls = NullBuffer::from(vec![true, false, true]);
685+
let col1 = Arc::new(StructArray::new(
686+
if let DataType::Struct(fields) = arrow_schema.fields.get(1).unwrap().data_type() {
687+
fields.clone()
688+
} else {
689+
unreachable!()
690+
},
691+
vec![Arc::new(Int32Array::from(vec![Some(1), Some(2), None]))],
692+
Some(nulls),
693+
));
694+
let columns = vec![col0, col1];
695+
696+
let to_write = RecordBatch::try_new(arrow_schema.clone(), columns).unwrap();
697+
let equality_ids = vec![0_i32, 2];
698+
let equality_config =
699+
EqualityDeleteWriterConfig::new(equality_ids, Arc::new(schema), None).unwrap();
700+
let projector = equality_config.projector.clone();
701+
702+
// check
703+
let to_write_projected = projector.project_bacth(to_write)?;
704+
let expect_batch =
705+
RecordBatch::try_new(equality_config.projected_arrow_schema_ref().clone(), vec![
706+
Arc::new(Int32Array::from(vec![None, Some(2), Some(3)])) as ArrayRef,
707+
Arc::new(Int32Array::from(vec![Some(1), None, None])) as ArrayRef,
708+
])
709+
.unwrap();
710+
assert_eq!(to_write_projected, expect_batch);
711+
Ok(())
712+
}
660713
}

0 commit comments

Comments
 (0)