From 8b029a959058b5560af5babc0b3333054517bfe5 Mon Sep 17 00:00:00 2001
From: itsjunetime
Date: Thu, 29 Aug 2024 12:06:20 -0600
Subject: [PATCH 1/3] Extract drive-by fixes from PR 12135 for easier reviewing

---
 .../core/src/datasource/listing/helpers.rs    | 16 ++--
 .../core/src/datasource/listing/table.rs      |  7 +-
 .../datasource/physical_plan/parquet/mod.rs   | 13 +--
 .../physical_plan/parquet/row_filter.rs       | 95 +++++++-------
 .../physical-expr/src/expressions/binary.rs   | 20 ++--
 .../physical-plan/src/execution_plan.rs       |  2 +-
 6 files changed, 64 insertions(+), 89 deletions(-)

diff --git a/datafusion/core/src/datasource/listing/helpers.rs b/datafusion/core/src/datasource/listing/helpers.rs
index dbeaf5dfcc36..33a16237e162 100644
--- a/datafusion/core/src/datasource/listing/helpers.rs
+++ b/datafusion/core/src/datasource/listing/helpers.rs
@@ -51,12 +51,12 @@ use object_store::{ObjectMeta, ObjectStore};
 /// - the table provider can filter the table partition values with this expression
 /// - the expression can be marked as `TableProviderFilterPushDown::Exact` once this filtering
 ///   was performed
-pub fn expr_applicable_for_cols(col_names: &[String], expr: &Expr) -> bool {
+pub fn expr_applicable_for_cols(col_names: &[&str], expr: &Expr) -> bool {
     let mut is_applicable = true;
     expr.apply(|expr| {
         match expr {
             Expr::Column(Column { ref name, .. }) => {
-                is_applicable &= col_names.contains(name);
+                is_applicable &= col_names.contains(&name.as_str());
                 if is_applicable {
                     Ok(TreeNodeRecursion::Jump)
                 } else {
@@ -745,27 +745,27 @@ mod tests {
     #[test]
     fn test_expr_applicable_for_cols() {
         assert!(expr_applicable_for_cols(
-            &[String::from("c1")],
+            &["c1"],
             &Expr::eq(col("c1"), lit("value"))
         ));
         assert!(!expr_applicable_for_cols(
-            &[String::from("c1")],
+            &["c1"],
             &Expr::eq(col("c2"), lit("value"))
         ));
         assert!(!expr_applicable_for_cols(
-            &[String::from("c1")],
+            &["c1"],
             &Expr::eq(col("c1"), col("c2"))
         ));
         assert!(expr_applicable_for_cols(
-            &[String::from("c1"), String::from("c2")],
+            &["c1", "c2"],
             &Expr::eq(col("c1"), col("c2"))
         ));
         assert!(expr_applicable_for_cols(
-            &[String::from("c1"), String::from("c2")],
+            &["c1", "c2"],
             &(Expr::eq(col("c1"), col("c2").alias("c2_alias"))).not()
         ));
         assert!(expr_applicable_for_cols(
-            &[String::from("c1"), String::from("c2")],
+            &["c1", "c2"],
             &(case(col("c1"))
                 .when(lit("v1"), lit(true))
                 .otherwise(lit(false))
diff --git a/datafusion/core/src/datasource/listing/table.rs b/datafusion/core/src/datasource/listing/table.rs
index 1f5fa738b253..a24ee9d92c28 100644
--- a/datafusion/core/src/datasource/listing/table.rs
+++ b/datafusion/core/src/datasource/listing/table.rs
@@ -826,7 +826,7 @@ impl TableProvider for ListingTable {
         &self,
         filters: &[&Expr],
     ) -> Result<Vec<TableProviderFilterPushDown>> {
-        let support: Vec<_> = filters
+        Ok(filters
             .iter()
             .map(|filter| {
                 if expr_applicable_for_cols(
                     &self
                         .options
                         .table_partition_cols
                         .iter()
-                        .map(|x| x.0.clone())
+                        .map(|x| x.0.as_str())
                         .collect::<Vec<_>>(),
                     filter,
                 ) {
@@ -846,8 +846,7 @@ impl TableProvider for ListingTable {
                     TableProviderFilterPushDown::Inexact
                 }
             })
-            .collect();
-        Ok(support)
+            .collect())
     }
 
     fn get_table_definition(&self) -> Option<&str> {
diff --git a/datafusion/core/src/datasource/physical_plan/parquet/mod.rs b/datafusion/core/src/datasource/physical_plan/parquet/mod.rs
index 85d6f8db2373..b2f86db742f2 100644
--- a/datafusion/core/src/datasource/physical_plan/parquet/mod.rs
+++ b/datafusion/core/src/datasource/physical_plan/parquet/mod.rs
@@ -685,10 +685,12 @@ impl 
ExecutionPlan for ParquetExec {
         partition_index: usize,
         ctx: Arc<TaskContext>,
     ) -> Result<SendableRecordBatchStream> {
-        let projection = match self.base_config.file_column_projection_indices() {
-            Some(proj) => proj,
-            None => (0..self.base_config.file_schema.fields().len()).collect(),
-        };
+        let projection = self
+            .base_config
+            .file_column_projection_indices()
+            .unwrap_or_else(|| {
+                (0..self.base_config.file_schema.fields().len()).collect()
+            });
 
         let parquet_file_reader_factory = self
             .parquet_file_reader_factory
@@ -698,8 +700,7 @@ impl ExecutionPlan for ParquetExec {
                 ctx.runtime_env()
                     .object_store(&self.base_config.object_store_url)
                     .map(|store| {
-                        Arc::new(DefaultParquetFileReaderFactory::new(store))
-                            as Arc<dyn ParquetFileReaderFactory>
+                        Arc::new(DefaultParquetFileReaderFactory::new(store)) as _
                     })
             })?;
 
diff --git a/datafusion/core/src/datasource/physical_plan/parquet/row_filter.rs b/datafusion/core/src/datasource/physical_plan/parquet/row_filter.rs
index 23fdadc2cdee..59d23fd68c31 100644
--- a/datafusion/core/src/datasource/physical_plan/parquet/row_filter.rs
+++ b/datafusion/core/src/datasource/physical_plan/parquet/row_filter.rs
@@ -59,6 +59,7 @@
 //! the unsorted predicates. Within each partition, predicates are
 //! still sorted by size.
 
+use std::cmp::Ordering;
 use std::collections::BTreeSet;
 use std::sync::Arc;
 
@@ -129,7 +130,7 @@ impl DatafusionArrowPredicate {
         // on the order they appear in the file
         let projection = match candidate.projection.len() {
             0 | 1 => vec![],
-            _ => remap_projection(&candidate.projection),
+            2.. => remap_projection(&candidate.projection),
         };
 
         Ok(Self {
@@ -151,32 +152,31 @@ impl ArrowPredicate for DatafusionArrowPredicate {
         &self.projection_mask
     }
 
-    fn evaluate(&mut self, batch: RecordBatch) -> ArrowResult<BooleanArray> {
-        let batch = match self.projection.is_empty() {
-            true => batch,
-            false => batch.project(&self.projection)?,
+    fn evaluate(&mut self, mut batch: RecordBatch) -> ArrowResult<BooleanArray> {
+        if !self.projection.is_empty() {
+            batch = batch.project(&self.projection)?;
         };
 
         let batch = self.schema_mapping.map_partial_batch(batch)?;
 
         // scoped timer updates on drop
         let mut timer = self.time.timer();
-        match self
-            .physical_expr
+
+        self.physical_expr
             .evaluate(&batch)
             .and_then(|v| v.into_array(batch.num_rows()))
-        {
-            Ok(array) => {
+            .and_then(|array| {
                 let bool_arr = as_boolean_array(&array)?.clone();
                 let num_filtered = bool_arr.len() - bool_arr.true_count();
                 self.rows_filtered.add(num_filtered);
                 timer.stop();
                 Ok(bool_arr)
-            }
-            Err(e) => Err(ArrowError::ComputeError(format!(
-                "Error evaluating filter predicate: {e:?}"
-            ))),
-        }
+            })
+            .map_err(|e| {
+                ArrowError::ComputeError(format!(
+                    "Error evaluating filter predicate: {e:?}"
+                ))
+            })
     }
 }
 
@@ -453,62 +453,33 @@ pub fn build_row_filter(
 
     // no candidates
     if candidates.is_empty() {
-        Ok(None)
-    } else if reorder_predicates {
-        // attempt to reorder the predicates by size and whether they are sorted
-        candidates.sort_by_key(|c| c.required_bytes);
-
-        let (indexed_candidates, other_candidates): (Vec<_>, Vec<_>) =
-            candidates.into_iter().partition(|c| c.can_use_index);
-
-        let mut filters: Vec<Box<dyn ArrowPredicate>> = vec![];
-
-        for candidate in indexed_candidates {
-            let filter = DatafusionArrowPredicate::try_new(
-                candidate,
-                file_schema,
-                metadata,
-                rows_filtered.clone(),
-                time.clone(),
-                Arc::clone(&schema_mapping),
-            )?;
-
-            filters.push(Box::new(filter));
-        }
-
-        for candidate in other_candidates {
-            let filter = DatafusionArrowPredicate::try_new(
-                candidate,
-                file_schema,
-                metadata,
-                rows_filtered.clone(),
-                time.clone(),
-                Arc::clone(&schema_mapping),
-            )?;
+        return 
Ok(None);
+    }
 
-            filters.push(Box::new(filter));
-        }
+    if reorder_predicates {
+        candidates.sort_unstable_by(|c1, c2| {
+            match c1.can_use_index.cmp(&c2.can_use_index) {
+                Ordering::Equal => c1.required_bytes.cmp(&c2.required_bytes),
+                ord => ord,
+            }
+        });
+    }
 
-        Ok(Some(RowFilter::new(filters)))
-    } else {
-        // otherwise evaluate the predicates in the order they appeared in the
-        // original expressions
-        let mut filters: Vec<Box<dyn ArrowPredicate>> = vec![];
-        for candidate in candidates {
-            let filter = DatafusionArrowPredicate::try_new(
+    candidates
+        .into_iter()
+        .map(|candidate| {
+            DatafusionArrowPredicate::try_new(
                 candidate,
                 file_schema,
                 metadata,
                 rows_filtered.clone(),
                 time.clone(),
                 Arc::clone(&schema_mapping),
-            )?;
-
-            filters.push(Box::new(filter));
-        }
-
-        Ok(Some(RowFilter::new(filters)))
-    }
+            )
+            .map(|pred| Box::new(pred) as _)
+        })
+        .collect::<Result<Vec<_>, _>>()
+        .map(|filters| Some(RowFilter::new(filters)))
 }
 
 #[cfg(test)]
diff --git a/datafusion/physical-expr/src/expressions/binary.rs b/datafusion/physical-expr/src/expressions/binary.rs
index 2680a7930ff1..08c133d7193a 100644
--- a/datafusion/physical-expr/src/expressions/binary.rs
+++ b/datafusion/physical-expr/src/expressions/binary.rs
@@ -33,6 +33,7 @@ use arrow::compute::kernels::comparison::{
 use arrow::compute::kernels::concat_elements::concat_elements_utf8;
 use arrow::compute::{cast, ilike, like, nilike, nlike};
 use arrow::datatypes::*;
+use arrow_schema::ArrowError;
 use datafusion_common::cast::as_boolean_array;
 use datafusion_common::{internal_err, Result, ScalarValue};
 use datafusion_expr::interval_arithmetic::{apply_operator, Interval};
@@ -133,12 +134,15 @@ impl std::fmt::Display for BinaryExpr {
 }
 
 /// Invoke a boolean kernel on a pair of arrays
-macro_rules! boolean_op {
-    ($LEFT:expr, $RIGHT:expr, $OP:ident) => {{
-        let ll = as_boolean_array($LEFT).expect("boolean_op failed to downcast array");
-        let rr = as_boolean_array($RIGHT).expect("boolean_op failed to downcast array");
-        Ok(Arc::new($OP(&ll, &rr)?))
-    }};
+#[inline]
+fn boolean_op(
+    left: &dyn Array,
+    right: &dyn Array,
+    op: impl FnOnce(&BooleanArray, &BooleanArray) -> Result<BooleanArray, ArrowError>,
+) -> Result<Arc<dyn Array>, ArrowError> {
+    let ll = as_boolean_array(left).expect("boolean_op failed to downcast left array");
+    let rr = as_boolean_array(right).expect("boolean_op failed to downcast right array");
+    op(ll, rr).map(|t| Arc::new(t) as _)
 }
 
 macro_rules! binary_string_array_flag_op {
@@ -596,7 +600,7 @@ impl BinaryExpr {
             | NotLikeMatch | NotILikeMatch => unreachable!(),
             And => {
                 if left_data_type == &DataType::Boolean {
-                    boolean_op!(&left, &right, and_kleene)
+                    Ok(boolean_op(&left, &right, and_kleene)?)
                 } else {
                     internal_err!(
                         "Cannot evaluate binary expression {:?} with types {:?} and {:?}",
@@ -608,7 +612,7 @@ impl BinaryExpr {
             }
             Or => {
                 if left_data_type == &DataType::Boolean {
-                    boolean_op!(&left, &right, or_kleene)
+                    Ok(boolean_op(&left, &right, or_kleene)?)
                 } else {
                     internal_err!(
                         "Cannot evaluate binary expression {:?} with types {:?} and {:?}",
diff --git a/datafusion/physical-plan/src/execution_plan.rs b/datafusion/physical-plan/src/execution_plan.rs
index c1c66f6d3923..b7cb0ac651f0 100644
--- a/datafusion/physical-plan/src/execution_plan.rs
+++ b/datafusion/physical-plan/src/execution_plan.rs
@@ -699,7 +699,7 @@ pub fn execute_stream(
     match plan.output_partitioning().partition_count() {
         0 => Ok(Box::pin(EmptyRecordBatchStream::new(plan.schema()))),
         1 => plan.execute(0, context),
-        _ => {
+        2.. 
=> {
             // merge into a single partition
             let plan = CoalescePartitionsExec::new(Arc::clone(&plan));
             // CoalescePartitionsExec must produce a single partition

From 2129092941c1527f692c78a3d81fcfec09b09fef Mon Sep 17 00:00:00 2001
From: itsjunetime
Date: Thu, 29 Aug 2024 12:28:17 -0600
Subject: [PATCH 2/3] Add a few more cfgs to silence warnings with different
 feature sets

---
 datafusion/common/src/hash_utils.rs          |  2 ++
 datafusion/core/src/datasource/statistics.rs | 24 ++++++++++++++-----
 .../src/execution/session_state_defaults.rs  |  3 +++
 .../core/src/physical_optimizer/pruning.rs   |  4 ++++
 4 files changed, 27 insertions(+), 6 deletions(-)

diff --git a/datafusion/common/src/hash_utils.rs b/datafusion/common/src/hash_utils.rs
index f3d2a0a4f9ab..72cfeafd0bfe 100644
--- a/datafusion/common/src/hash_utils.rs
+++ b/datafusion/common/src/hash_utils.rs
@@ -245,6 +245,8 @@ fn hash_struct_array(
     Ok(())
 }
 
+// only adding this `cfg` b/c this function is only used with this `cfg`
+#[cfg(not(feature = "force_hash_collisions"))]
 fn hash_map_array(
     array: &MapArray,
     random_state: &RandomState,
diff --git a/datafusion/core/src/datasource/statistics.rs b/datafusion/core/src/datasource/statistics.rs
index 6f89657defd3..37ac8a4cb9c1 100644
--- a/datafusion/core/src/datasource/statistics.rs
+++ b/datafusion/core/src/datasource/statistics.rs
@@ -18,16 +18,21 @@
 use std::mem;
 use std::sync::Arc;
 
-use arrow_schema::DataType;
 use futures::{Stream, StreamExt};
 
 use datafusion_common::stats::Precision;
 use datafusion_common::ScalarValue;
 
-use crate::arrow::datatypes::{Schema, SchemaRef};
+use crate::arrow::datatypes::SchemaRef;
 use crate::error::Result;
-use crate::functions_aggregate::min_max::{MaxAccumulator, MinAccumulator};
-use crate::physical_plan::{Accumulator, ColumnStatistics, Statistics};
+use crate::physical_plan::{ColumnStatistics, Statistics};
+
+#[cfg(feature = "parquet")]
+use crate::{
+    functions_aggregate::min_max::{MaxAccumulator, MinAccumulator},
+    physical_plan::Accumulator,
+    arrow::datatypes::Schema
+};
 
 use super::listing::PartitionedFile;
 
@@ -144,6 +149,8 @@ pub async fn get_statistics_with_limit(
     Ok((result_files, statistics))
 }
 
+// only adding this cfg b/c this is the only feature it's used with currently
+#[cfg(feature = "parquet")]
 pub(crate) fn create_max_min_accs(
     schema: &Schema,
 ) -> (Vec<Option<MaxAccumulator>>, Vec<Option<MinAccumulator>>) {
@@ -175,6 +182,8 @@ fn add_row_stats(
     }
 }
 
+// only adding this cfg b/c this is the only feature it's used with currently
+#[cfg(feature = "parquet")]
 pub(crate) fn get_col_stats(
     schema: &Schema,
     null_counts: Vec<Precision<usize>>,
@@ -205,8 +214,11 @@ pub(crate) fn get_col_stats(
 // (aka non Dictionary) output. We need to adjust the output data type to reflect this.
 // The reason the min/max aggregate produces unpacked output is that there is only one
 // min/max value per group; there is no need to keep them Dictionary encoded
-fn min_max_aggregate_data_type(input_type: &DataType) -> &DataType {
-    if let DataType::Dictionary(_, value_type) = input_type {
+//
+// only adding this cfg b/c this is the only feature it's used with currently
+#[cfg(feature = "parquet")]
+fn min_max_aggregate_data_type(input_type: &arrow_schema::DataType) -> &arrow_schema::DataType {
+    if let arrow_schema::DataType::Dictionary(_, value_type) = input_type {
         value_type.as_ref()
     } else {
         input_type
diff --git a/datafusion/core/src/execution/session_state_defaults.rs b/datafusion/core/src/execution/session_state_defaults.rs
index bc7e194caeae..b5370efa0a97 100644
--- a/datafusion/core/src/execution/session_state_defaults.rs
+++ b/datafusion/core/src/execution/session_state_defaults.rs
@@ -100,7 +100,9 @@ impl SessionStateDefaults {
 
     /// returns the list of default [`ScalarUDF`]s
     pub fn default_scalar_functions() -> Vec<Arc<ScalarUDF>> {
+        #[cfg_attr(not(feature = "nested_expressions"), allow(unused_mut))]
         let mut functions: Vec<Arc<ScalarUDF>> = functions::all_default_functions();
+
         #[cfg(feature = "nested_expressions")]
         functions.append(&mut functions_nested::all_default_nested_functions());
 
@@ -144,6 +146,7 @@ impl SessionStateDefaults {
     }
 
     /// registers all the builtin array functions
+    #[cfg_attr(not(feature = "nested_expressions"), allow(unused_variables))]
     pub fn register_array_functions(state: &mut SessionState) {
         // register crate of array expressions (if enabled)
         #[cfg(feature = "nested_expressions")]
diff --git a/datafusion/core/src/physical_optimizer/pruning.rs b/datafusion/core/src/physical_optimizer/pruning.rs
index a16abc607ee6..9bc2bb1d1db9 100644
--- a/datafusion/core/src/physical_optimizer/pruning.rs
+++ b/datafusion/core/src/physical_optimizer/pruning.rs
@@ -615,6 +615,8 @@ impl PruningPredicate {
         is_always_true(&self.predicate_expr) && self.literal_guarantees.is_empty()
     }
 
+    // this is only used by the `parquet` feature right now
+    #[allow(dead_code)]
     pub(crate) fn required_columns(&self) -> &RequiredColumns {
         &self.required_columns
     }
@@ -746,6 +748,8 @@ impl RequiredColumns {
     /// * `a > 5 OR a < 10` returns `Some(a)`
     /// * `a > 5 OR b < 10` returns `None`
     /// * `true` returns `None`
+    #[allow(dead_code)]
+    // this fn is only used by the `parquet` feature right now, thus the `allow(dead_code)`
     pub(crate) fn single_column(&self) -> Option<&phys_expr::Column> {
         if self.columns.windows(2).all(|w| {
             // check if all columns are the same (ignoring statistics and field)

From 3fccee3ca4afa4269385a9e9a061f0bb1fa75c4e Mon Sep 17 00:00:00 2001
From: itsjunetime
Date: Thu, 29 Aug 2024 12:37:24 -0600
Subject: [PATCH 3/3] fmt

---
 datafusion/core/src/datasource/statistics.rs | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/datafusion/core/src/datasource/statistics.rs b/datafusion/core/src/datasource/statistics.rs
index 37ac8a4cb9c1..201bbfd5c007 100644
--- a/datafusion/core/src/datasource/statistics.rs
+++ b/datafusion/core/src/datasource/statistics.rs
@@ -29,9 +29,9 @@ use crate::physical_plan::{ColumnStatistics, Statistics};
 
 #[cfg(feature = "parquet")]
 use crate::{
+    arrow::datatypes::Schema,
     functions_aggregate::min_max::{MaxAccumulator, MinAccumulator},
     physical_plan::Accumulator,
-    arrow::datatypes::Schema
 };
 
 use super::listing::PartitionedFile;
@@ -217,7 +217,9 @@
 //
 // only adding this cfg b/c this is the only feature it's used with currently
 #[cfg(feature = 
"parquet")] -fn min_max_aggregate_data_type(input_type: &arrow_schema::DataType) -> &arrow_schema::DataType { +fn min_max_aggregate_data_type( + input_type: &arrow_schema::DataType, +) -> &arrow_schema::DataType { if let arrow_schema::DataType::Dictionary(_, value_type) = input_type { value_type.as_ref() } else {