 // specific language governing permissions and limitations
 // under the License.

-use std::collections::HashMap;
+use std::collections::{HashMap, HashSet};
+use std::ops::Not;

-use arrow_array::{Int64Array, StringArray};
+use arrow_array::{
+    Array, ArrayRef, BooleanArray, Date32Array, Float32Array, Float64Array, Int32Array, Int64Array,
+    StringArray, Time64MicrosecondArray, TimestampMicrosecondArray, TimestampNanosecondArray,
+};
 use futures::{StreamExt, TryStreamExt};
+use itertools::Itertools;
 use tokio::sync::oneshot::{Receiver, channel};

 use super::delete_filter::DeleteFilter;
+use crate::arrow::arrow_schema_to_schema;
 use crate::arrow::delete_file_loader::BasicDeleteFileLoader;
 use crate::delete_vector::DeleteVector;
-use crate::expr::Predicate;
+use crate::expr::Predicate::AlwaysTrue;
+use crate::expr::{Predicate, Reference};
 use crate::io::FileIO;
 use crate::scan::{ArrowRecordBatchStream, FileScanTaskDeleteFile};
-use crate::spec::{DataContentType, SchemaRef};
+use crate::spec::{DataContentType, Datum, NestedFieldRef, PrimitiveType, SchemaRef};
 use crate::{Error, ErrorKind, Result};

 #[derive(Clone, Debug)]
@@ -43,6 +50,7 @@ enum DeleteFileContext {
     PosDels(ArrowRecordBatchStream),
     FreshEqDel {
         batch_stream: ArrowRecordBatchStream,
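+        // ids of the fields on which this delete file's equality comparisons
+        // are made, taken from the delete file's metadata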
+        equality_ids: HashSet<i32>,
         sender: tokio::sync::oneshot::Sender<Predicate>,
     },
 }
@@ -224,6 +232,7 @@ impl CachingDeleteFileLoader {
                 )
                 .await?,
                 sender,
+                equality_ids: HashSet::from_iter(task.equality_ids.clone()),
             })
         }

@@ -247,9 +256,11 @@ impl CachingDeleteFileLoader {
             DeleteFileContext::FreshEqDel {
                 sender,
                 batch_stream,
+                equality_ids,
             } => {
                 let predicate =
-                    Self::parse_equality_deletes_record_batch_stream(batch_stream).await?;
+                    Self::parse_equality_deletes_record_batch_stream(batch_stream, equality_ids)
+                        .await?;

                 sender
                     .send(predicate)
@@ -308,28 +319,231 @@ impl CachingDeleteFileLoader {
         Ok(result)
     }

-    /// Parses record batch streams from individual equality delete files
-    ///
-    /// Returns an unbound Predicate for each batch stream
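+    /// Parses a record batch stream coming from an equality delete file,
+    /// returning an unbound `Predicate` that evaluates to false for every
+    /// row the file deletes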
     async fn parse_equality_deletes_record_batch_stream(
-        streams: ArrowRecordBatchStream,
+        mut stream: ArrowRecordBatchStream,
+        equality_ids: HashSet<i32>,
     ) -> Result<Predicate> {
-        // TODO
+        let mut result_predicate = AlwaysTrue;
+
+        while let Some(record_batch) = stream.next().await {
+            let record_batch = record_batch?;
+
+            if record_batch.num_columns() == 0 {
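+                // a batch with no columns can't encode any delete predicates;
+                // short-circuit to "keep everything"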
+                return Ok(AlwaysTrue);
+            }
+
+            let batch_schema_arrow = record_batch.schema();
+            let batch_schema_iceberg = arrow_schema_to_schema(batch_schema_arrow.as_ref())?;
+
+            let mut datum_columns_with_names: Vec<_> = record_batch
+                .columns()
+                .iter()
+                .zip(batch_schema_iceberg.as_struct().fields())
+                // only use columns that are in the set of equality_ids for this delete file
+                .filter(|(_, field)| equality_ids.contains(&field.id))
+                .map(|(column, field)| {
+                    let col_as_datum_vec = arrow_array_to_datum_iterator(column, field);
+                    col_as_datum_vec.map(|c| (c, field.name.to_string()))
+                })
+                .try_collect()?;
+
+            // consume all the iterators in lockstep, creating per-row predicates that get combined
+            // into a single final predicate
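+            // e.g. delete rows (y=1, z=100) and (y=2, z=NULL) produce
+            //     NOT(y = 1 AND z = 100) AND NOT(y = 2);
+            // NULL cells contribute no equality term, and `rewrite_not` below
+            // pushes the NOTs inward: ((y != 1) OR (z != 100)) AND (y != 2)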
+
+            // (2025-06-12) can't use `is_empty` as it depends on unstable library feature `exact_size_is_empty`
+            #[allow(clippy::len_zero)]
+            while datum_columns_with_names[0].0.len() > 0 {
+                let mut row_predicate = AlwaysTrue;
+                for &mut (ref mut column, ref field_name) in &mut datum_columns_with_names {
+                    if let Some(item) = column.next() {
+                        if let Some(datum) = item? {
+                            row_predicate = row_predicate
+                                .and(Reference::new(field_name.clone()).equal_to(datum.clone()));
+                        }
+                    }
+                }
+                result_predicate = result_predicate.and(row_predicate.not());
+            }
+        }
+        Ok(result_predicate.rewrite_not())
+    }
+}
+
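+/// Downcasts `$column` to the concrete Arrow array type `$arr` and yields an
+/// iterator of `Result<Option<Datum>>`, converting non-null values with the
+/// `Datum` constructor `$dat`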
+macro_rules! prim_to_datum {
+    ($column:ident, $arr:ty, $dat:path) => {{
+        let arr = $column.as_any().downcast_ref::<$arr>().ok_or(Error::new(
+            ErrorKind::Unexpected,
+            format!("could not downcast ArrayRef to {}", stringify!($arr)),
+        ))?;
+        Ok(Box::new(arr.iter().map(|val| Ok(val.map($dat)))))
+    }};
+}
+
+fn eq_col_unsupported(ty: &str) -> Error {
+    Error::new(
+        ErrorKind::FeatureUnsupported,
+        format!(
+            "Equality deletes where a predicate acts upon a {} column are not yet supported",
+            ty
+        ),
+    )
+}

-        Err(Error::new(
-            ErrorKind::FeatureUnsupported,
-            "parsing of equality deletes is not yet supported",
-        ))
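+/// Lazily converts an Arrow column into an iterator of `Datum`s, dispatching
+/// on the Iceberg primitive type of the corresponding schema field;
+/// non-primitive and not-yet-supported types return a `FeatureUnsupported` error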
+fn arrow_array_to_datum_iterator<'a>(
+    column: &'a ArrayRef,
+    field: &NestedFieldRef,
+) -> Result<Box<dyn ExactSizeIterator<Item = Result<Option<Datum>>> + 'a>> {
+    match field.field_type.as_primitive_type() {
+        Some(primitive_type) => match primitive_type {
+            PrimitiveType::Int => prim_to_datum!(column, Int32Array, Datum::int),
+            PrimitiveType::Boolean => {
+                prim_to_datum!(column, BooleanArray, Datum::bool)
+            }
+            PrimitiveType::Long => prim_to_datum!(column, Int64Array, Datum::long),
+            PrimitiveType::Float => {
+                prim_to_datum!(column, Float32Array, Datum::float)
+            }
+            PrimitiveType::Double => {
+                prim_to_datum!(column, Float64Array, Datum::double)
+            }
+            PrimitiveType::String => {
+                prim_to_datum!(column, StringArray, Datum::string)
+            }
+            PrimitiveType::Date => prim_to_datum!(column, Date32Array, Datum::date),
+            PrimitiveType::Timestamp => {
+                prim_to_datum!(column, TimestampMicrosecondArray, Datum::timestamp_micros)
+            }
+            PrimitiveType::Timestamptz => {
+                prim_to_datum!(column, TimestampMicrosecondArray, Datum::timestamptz_micros)
+            }
+            PrimitiveType::TimestampNs => {
+                prim_to_datum!(column, TimestampNanosecondArray, Datum::timestamp_nanos)
+            }
+            PrimitiveType::TimestamptzNs => {
+                prim_to_datum!(column, TimestampNanosecondArray, Datum::timestamptz_nanos)
+            }
+            PrimitiveType::Time => {
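+                // `Datum::time_micros` is fallible, so this arm can't use the
+                // infallible mapping in `prim_to_datum!` and is written out by hand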
+                let arr = column
+                    .as_any()
+                    .downcast_ref::<Time64MicrosecondArray>()
+                    .ok_or(Error::new(
+                        ErrorKind::Unexpected,
+                        "could not downcast ArrayRef to Time64MicrosecondArray",
+                    ))?;
+                Ok(Box::new(arr.iter().map(|val| match val {
+                    None => Ok(None),
+                    Some(val) => Datum::time_micros(val).map(Some),
+                })))
+            }
+            PrimitiveType::Decimal { .. } => Err(eq_col_unsupported("Decimal")),
+            PrimitiveType::Uuid => Err(eq_col_unsupported("Uuid")),
+            PrimitiveType::Fixed(_) => Err(eq_col_unsupported("Fixed")),
+            PrimitiveType::Binary => Err(eq_col_unsupported("Binary")),
+        },
+        None => Err(eq_col_unsupported(
+            "non-primitive (i.e. Struct, List, or Map)",
+        )),
     }
 }

 #[cfg(test)]
 mod tests {
+    use std::collections::HashMap;
+    use std::fs::File;
+    use std::sync::Arc;
+
+    use arrow_array::{Int64Array, RecordBatch, StringArray};
+    use parquet::arrow::{ArrowWriter, PARQUET_FIELD_ID_META_KEY};
+    use parquet::basic::Compression;
+    use parquet::file::properties::WriterProperties;
     use tempfile::TempDir;

     use super::*;
     use crate::arrow::delete_filter::tests::setup;

+    #[tokio::test]
+    async fn test_delete_file_loader_parse_equality_deletes() {
+        let tmp_dir = TempDir::new().unwrap();
+        let table_location = tmp_dir.path().as_os_str().to_str().unwrap();
+        let file_io = FileIO::from_path(table_location).unwrap().build().unwrap();
+
+        let eq_delete_file_path = setup_write_equality_delete_file_1(table_location);
+
+        let basic_delete_file_loader = BasicDeleteFileLoader::new(file_io.clone());
+        let record_batch_stream = basic_delete_file_loader
+            .parquet_to_batch_stream(&eq_delete_file_path)
+            .await
+            .expect("could not get batch stream");
+
+        let eq_ids = HashSet::from_iter(vec![2, 3, 4]);
+
+        let parsed_eq_delete = CachingDeleteFileLoader::parse_equality_deletes_record_batch_stream(
+            record_batch_stream,
+            eq_ids,
+        )
+        .await
+        .expect("error parsing batch stream");
+        println!("{}", parsed_eq_delete);
+
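+        // the delete rows written below, (1, 100, "HELP") and (2, NULL, NULL),
+        // become NOT(y=1 AND z=100 AND a="HELP") AND NOT(y=2), which
+        // `rewrite_not` renders as: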
+        let expected = "(((y != 1) OR (z != 100)) OR (a != \"HELP\")) AND (y != 2)".to_string();

+        assert_eq!(parsed_eq_delete.to_string(), expected);
+    }
+
+    fn setup_write_equality_delete_file_1(table_location: &str) -> String {
+        let col_y_vals = vec![1, 2];
+        let col_y = Arc::new(Int64Array::from(col_y_vals)) as ArrayRef;
+
+        let col_z_vals = vec![Some(100), None];
+        let col_z = Arc::new(Int64Array::from(col_z_vals)) as ArrayRef;
+
+        let col_a_vals = vec![Some("HELP"), None];
+        let col_a = Arc::new(StringArray::from(col_a_vals)) as ArrayRef;
+
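+        // the Parquet field-id metadata below maps columns y, z and a to the
+        // Iceberg field ids 2, 3 and 4, the same ids the test passes as eq_ids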
+        let equality_delete_schema = {
+            let fields = vec![
+                arrow_schema::Field::new("y", arrow_schema::DataType::Int64, true).with_metadata(
+                    HashMap::from([(PARQUET_FIELD_ID_META_KEY.to_string(), "2".to_string())]),
+                ),
+                arrow_schema::Field::new("z", arrow_schema::DataType::Int64, true).with_metadata(
+                    HashMap::from([(PARQUET_FIELD_ID_META_KEY.to_string(), "3".to_string())]),
+                ),
+                arrow_schema::Field::new("a", arrow_schema::DataType::Utf8, true).with_metadata(
+                    HashMap::from([(PARQUET_FIELD_ID_META_KEY.to_string(), "4".to_string())]),
+                ),
+            ];
+            Arc::new(arrow_schema::Schema::new(fields))
+        };
+
+        let equality_deletes_to_write =
+            RecordBatch::try_new(equality_delete_schema.clone(), vec![col_y, col_z, col_a])
+                .unwrap();
+
+        let path = format!("{}/equality-deletes-1.parquet", &table_location);
+
+        let file = File::create(&path).unwrap();
+
+        let props = WriterProperties::builder()
+            .set_compression(Compression::SNAPPY)
+            .build();
+
+        let mut writer = ArrowWriter::try_new(
+            file,
+            equality_deletes_to_write.schema(),
+            Some(props.clone()),
+        )
+        .unwrap();
+
+        writer
+            .write(&equality_deletes_to_write)
+            .expect("Writing batch");
+
+        // writer must be closed to write footer
+        writer.close().unwrap();
+
+        path
+    }
+
     #[tokio::test]
     async fn test_caching_delete_file_loader_load_deletes() {
         let tmp_dir = TempDir::new().unwrap();