@@ -760,6 +760,7 @@ impl BoundPredicateVisitor for PageIndexEvaluator<'_> {
760760/// returned: NYYYYYNNYYNYN
761761///
762762/// This can be removed from here once RowSelection::union is in parquet::arrow
763+ /// (Hopefully once https://github.com/apache/arrow-rs/pull/6308 gets merged)
763764fn union_row_selections ( left : & RowSelection , right : & RowSelection ) -> RowSelection {
764765 let mut l_iter = left. iter ( ) . copied ( ) . peekable ( ) ;
765766 let mut r_iter = right. iter ( ) . copied ( ) . peekable ( ) ;
@@ -840,9 +841,24 @@ fn union_row_selections(left: &RowSelection, right: &RowSelection) -> RowSelecti
840841
841842#[ cfg( test) ]
842843mod tests {
843- use parquet:: arrow:: arrow_reader:: { RowSelection , RowSelector } ;
844+ use std:: collections:: HashMap ;
845+ use std:: sync:: Arc ;
844846
845- use crate :: expr:: visitors:: page_index_evaluator:: union_row_selections;
847+ use parquet:: arrow:: arrow_reader:: { RowSelection , RowSelector } ;
848+ use parquet:: basic:: { LogicalType as ParquetLogicalType , Type as ParquetPhysicalType } ;
849+ use parquet:: file:: metadata:: { ColumnChunkMetaData , RowGroupMetaData } ;
850+ use parquet:: file:: page_index:: index:: { Index , NativeIndex } ;
851+ use parquet:: file:: statistics:: Statistics ;
852+ use parquet:: format:: { BoundaryOrder , PageLocation } ;
853+ use parquet:: schema:: types:: {
854+ ColumnDescriptor , ColumnPath , SchemaDescriptor , Type as parquetSchemaType,
855+ } ;
856+
857+ use super :: { union_row_selections, PageIndexEvaluator } ;
858+ // use rand::{thread_rng, Rng};
859+ use crate :: expr:: { Bind , Reference } ;
860+ use crate :: spec:: { Datum , NestedField , PrimitiveType , Schema , Type } ;
861+ use crate :: Result ;
846862
847863 #[ test]
848864 fn test_union_row_selections ( ) {
@@ -873,4 +889,167 @@ mod tests {
873889 & RowSelector :: select( 40 )
874890 ] ) ;
875891 }
892+
/// Evaluating `col_float > 1.0` against a row group that was built with
/// zero rows, zero values per column and no statistics must produce an
/// empty selection.
#[test]
fn eval_matches_no_rows_for_empty_row_group() -> Result<()> {
    // Fixtures: an empty row group, the (empty) page index, and the schema
    // together with its field-id map.
    let empty_row_group = create_row_group_metadata(0, 0, None, 0, None)?;
    let (col_idx, off_idx) = create_page_index()?;
    let (schema, field_ids) = build_iceberg_schema_and_field_map()?;

    // Build the predicate first, then bind it to the schema.
    let unbound = Reference::new("col_float").greater_than(Datum::float(1.0));
    let predicate = unbound.bind(schema.clone(), false)?;

    let selected = PageIndexEvaluator::eval(
        &predicate,
        &col_idx,
        &off_idx,
        &empty_row_group,
        &field_ids,
        schema.as_ref(),
    )?;

    // Nothing may be selected.
    assert_eq!(selected, vec![]);

    Ok(())
}
919+
/// Evaluates `col_float IS NULL` against a row group built with zero rows,
/// zero values per column and no statistics, and asserts that the resulting
/// selection is empty.
///
/// NOTE(review): the test name talks about selecting all rows, yet the
/// fixture is an empty row group and the assertion expects an empty
/// selection. The fixture and/or the expectation look unfinished; confirm
/// the intended scenario before relying on this test.
#[test]
fn eval_is_null_none_null_select_all_rows() -> Result<()> {
    let empty_row_group = create_row_group_metadata(0, 0, None, 0, None)?;
    let (col_idx, off_idx) = create_page_index()?;
    let (schema, field_ids) = build_iceberg_schema_and_field_map()?;

    // Build the predicate first, then bind it to the schema.
    let unbound = Reference::new("col_float").is_null();
    let predicate = unbound.bind(schema.clone(), false)?;

    let selected = PageIndexEvaluator::eval(
        &predicate,
        &col_idx,
        &off_idx,
        &empty_row_group,
        &field_ids,
        schema.as_ref(),
    )?;

    assert_eq!(selected, vec![]);

    Ok(())
}
946+
947+ fn build_iceberg_schema_and_field_map ( ) -> Result < ( Arc < Schema > , HashMap < i32 , usize > ) > {
948+ let iceberg_schema = Schema :: builder ( )
949+ . with_fields ( [
950+ Arc :: new ( NestedField :: new (
951+ 1 ,
952+ "col_float" ,
953+ Type :: Primitive ( PrimitiveType :: Float ) ,
954+ false ,
955+ ) ) ,
956+ Arc :: new ( NestedField :: new (
957+ 2 ,
958+ "col_string" ,
959+ Type :: Primitive ( PrimitiveType :: String ) ,
960+ false ,
961+ ) ) ,
962+ ] )
963+ . build ( ) ?;
964+ let iceberg_schema_ref = Arc :: new ( iceberg_schema) ;
965+
966+ let field_id_map = HashMap :: from_iter ( [ ( 1 , 0 ) , ( 2 , 1 ) ] ) ;
967+
968+ Ok ( ( iceberg_schema_ref, field_id_map) )
969+ }
970+
971+ fn build_parquet_schema_descriptor ( ) -> Result < Arc < SchemaDescriptor > > {
972+ let field_1 = Arc :: new (
973+ parquetSchemaType:: primitive_type_builder ( "col_float" , ParquetPhysicalType :: FLOAT )
974+ . with_id ( Some ( 1 ) )
975+ . build ( ) ?,
976+ ) ;
977+
978+ let field_2 = Arc :: new (
979+ parquetSchemaType:: primitive_type_builder (
980+ "col_string" ,
981+ ParquetPhysicalType :: BYTE_ARRAY ,
982+ )
983+ . with_id ( Some ( 2 ) )
984+ . with_logical_type ( Some ( ParquetLogicalType :: String ) )
985+ . build ( ) ?,
986+ ) ;
987+
988+ let group_type = Arc :: new (
989+ parquetSchemaType:: group_type_builder ( "all" )
990+ . with_id ( Some ( 1000 ) )
991+ . with_fields ( vec ! [ field_1, field_2] )
992+ . build ( ) ?,
993+ ) ;
994+
995+ let schema_descriptor = SchemaDescriptor :: new ( group_type) ;
996+ let schema_descriptor_arc = Arc :: new ( schema_descriptor) ;
997+ Ok ( schema_descriptor_arc)
998+ }
999+
1000+ fn create_row_group_metadata (
1001+ num_rows : i64 ,
1002+ col_1_num_vals : i64 ,
1003+ col_1_stats : Option < Statistics > ,
1004+ col_2_num_vals : i64 ,
1005+ col_2_stats : Option < Statistics > ,
1006+ ) -> Result < RowGroupMetaData > {
1007+ let schema_descriptor_arc = build_parquet_schema_descriptor ( ) ?;
1008+
1009+ let column_1_desc_ptr = Arc :: new ( ColumnDescriptor :: new (
1010+ schema_descriptor_arc. column ( 0 ) . self_type_ptr ( ) ,
1011+ 1 ,
1012+ 1 ,
1013+ ColumnPath :: new ( vec ! [ "col_float" . to_string( ) ] ) ,
1014+ ) ) ;
1015+
1016+ let column_2_desc_ptr = Arc :: new ( ColumnDescriptor :: new (
1017+ schema_descriptor_arc. column ( 1 ) . self_type_ptr ( ) ,
1018+ 1 ,
1019+ 1 ,
1020+ ColumnPath :: new ( vec ! [ "col_string" . to_string( ) ] ) ,
1021+ ) ) ;
1022+
1023+ let mut col_1_meta =
1024+ ColumnChunkMetaData :: builder ( column_1_desc_ptr) . set_num_values ( col_1_num_vals) ;
1025+ if let Some ( stats1) = col_1_stats {
1026+ col_1_meta = col_1_meta. set_statistics ( stats1)
1027+ }
1028+
1029+ let mut col_2_meta =
1030+ ColumnChunkMetaData :: builder ( column_2_desc_ptr) . set_num_values ( col_2_num_vals) ;
1031+ if let Some ( stats2) = col_2_stats {
1032+ col_2_meta = col_2_meta. set_statistics ( stats2)
1033+ }
1034+
1035+ let row_group_metadata = RowGroupMetaData :: builder ( schema_descriptor_arc)
1036+ . set_num_rows ( num_rows)
1037+ . set_column_metadata ( vec ! [
1038+ col_1_meta. build( ) ?,
1039+ // .set_statistics(Statistics::float(None, None, None, 1, false))
1040+ col_2_meta. build( ) ?,
1041+ ] )
1042+ . build ( ) ;
1043+
1044+ Ok ( row_group_metadata?)
1045+ }
1046+
1047+ fn create_page_index ( ) -> Result < ( Vec < Index > , Vec < Vec < PageLocation > > ) > {
1048+ let idx = Index :: FLOAT ( NativeIndex :: < f32 > {
1049+ indexes : vec ! [ ] ,
1050+ boundary_order : BoundaryOrder ( 0 ) , // UNORDERED
1051+ } ) ;
1052+
1053+ Ok ( ( vec ! [ ] , vec ! [ ] ) )
1054+ }
8761055}
0 commit comments