|
17 | 17 |
|
18 | 18 | //! Optimizer rule for reordering joins to minimize query execution cost |
19 | 19 |
|
20 | | -use std::rc::Rc; |
21 | | - |
22 | | -use datafusion_common::tree_node::{Transformed, TreeNode, TreeNodeRecursion}; |
23 | | -use datafusion_common::Result; |
24 | | -use datafusion_expr::LogicalPlan; |
25 | | - |
26 | | -use crate::optimizer::ApplyOrder; |
27 | | -use crate::{OptimizerConfig, OptimizerRule}; |
28 | | - |
29 | 20 | pub mod cost; |
30 | 21 | pub mod left_deep_join_plan; |
31 | 22 | pub mod query_graph; |
32 | | - |
33 | | -use cost::JoinCostEstimator; |
34 | | -use left_deep_join_plan::optimal_left_deep_join_plan; |
35 | | -use query_graph::{contains_join, QueryGraph}; |
36 | | - |
37 | | -/// Optimizer rule that reorders joins to minimize query execution cost. |
38 | | -/// |
39 | | -/// This rule identifies consecutive join operations in a query plan and reorders |
40 | | -/// them using the Ibaraki-Kameda algorithm. The algorithm: |
41 | | -/// |
42 | | -/// 1. Converts a join subtree into a query graph representation |
43 | | -/// 2. Builds precedence trees to explore different join orderings |
44 | | -/// 3. Normalizes and denormalizes the trees to find optimal orderings |
45 | | -/// 4. Selects the ordering with the lowest estimated cost |
46 | | -/// |
47 | | -/// The rule only reorders inner joins and requires all joins to be consecutive |
48 | | -/// in the plan tree (no other operations between them). |
49 | | -/// |
50 | | -/// # Example |
51 | | -/// |
52 | | -/// Given a query plan like: |
53 | | -/// ```text |
54 | | -/// Join(customer.c_custkey = orders.o_custkey) |
55 | | -/// Join(orders.o_orderkey = lineitem.l_orderkey) |
56 | | -/// TableScan(customer) |
57 | | -/// TableScan(orders) |
58 | | -/// TableScan(lineitem) |
59 | | -/// ``` |
60 | | -/// |
61 | | -/// The optimizer will evaluate different join orderings and select the one |
62 | | -/// that minimizes intermediate result sizes and overall execution cost. |
63 | | -#[derive(Debug)] |
64 | | -pub struct JoinReorder { |
65 | | - cost_estimator: Rc<dyn JoinCostEstimator>, |
66 | | -} |
67 | | - |
68 | | -impl JoinReorder { |
69 | | - /// Creates a new join reorder optimizer rule with the given cost estimator |
70 | | - pub fn new(cost_estimator: Rc<dyn JoinCostEstimator>) -> Self { |
71 | | - Self { cost_estimator } |
72 | | - } |
73 | | -} |
74 | | - |
75 | | -impl Default for JoinReorder { |
76 | | - fn default() -> Self { |
77 | | - Self { |
78 | | - cost_estimator: Rc::new(cost::DefaultCostEstimator), |
79 | | - } |
80 | | - } |
81 | | -} |
82 | | - |
83 | | -impl OptimizerRule for JoinReorder { |
84 | | - fn name(&self) -> &str { |
85 | | - "join_reorder" |
86 | | - } |
87 | | - |
88 | | - fn apply_order(&self) -> Option<ApplyOrder> { |
89 | | - // We need bottom-up traversal to process join subtrees from leaves to root |
90 | | - Some(ApplyOrder::BottomUp) |
91 | | - } |
92 | | - |
93 | | - fn rewrite( |
94 | | - &self, |
95 | | - plan: LogicalPlan, |
96 | | - _config: &dyn OptimizerConfig, |
97 | | - ) -> Result<Transformed<LogicalPlan>> { |
98 | | - // Only try to reorder if this is a join node |
99 | | - if !matches!(plan, LogicalPlan::Join(_)) { |
100 | | - return Ok(Transformed::no(plan)); |
101 | | - } |
102 | | - |
103 | | - // Check if this join is the root of a consecutive join subtree |
104 | | - // (i.e., each of its children is either a join or a subtree that contains no joins) |
105 | | - if !is_join_subtree_root(&plan) { |
106 | | - return Ok(Transformed::no(plan)); |
107 | | - } |
108 | | - |
109 | | - // Try to convert the join subtree to a query graph and optimize it |
110 | | - match optimize_join_subtree(plan.clone(), Rc::clone(&self.cost_estimator)) { |
111 | | - Ok(optimized_plan) => Ok(Transformed::yes(optimized_plan)), |
112 | | - Err(_) => { |
113 | | - // If optimization fails (e.g., unsupported join type), return original plan |
114 | | - Ok(Transformed::no(plan)) |
115 | | - } |
116 | | - } |
117 | | - } |
118 | | -} |
119 | | - |
120 | | -/// Checks if a plan node is the root of a consecutive join subtree. |
121 | | -/// |
122 | | -/// A node is considered a join subtree root if: |
123 | | -/// - It is a Join node |
124 | | -/// - Each of its direct children is either a Join node or a subtree that contains no joins |
125 | | -/// |
126 | | -/// This ensures we only try to optimize complete join subtrees that can be |
127 | | -/// safely reordered without breaking other operators. |
128 | | -fn is_join_subtree_root(plan: &LogicalPlan) -> bool { |
129 | | - if !matches!(plan, LogicalPlan::Join(_)) { |
130 | | - return false; |
131 | | - } |
132 | | - |
133 | | - // Check if all children either are joins themselves or don't contain any joins |
134 | | - let mut all_valid = true; |
135 | | - let _ = plan.apply_children(|child| { |
136 | | - if matches!(child, LogicalPlan::Join(_)) { |
137 | | - // This child is itself a join, so the consecutive join pattern is still intact |
138 | | - Ok(TreeNodeRecursion::Continue) |
139 | | - } else if !contains_join(child) { |
140 | | - // This child doesn't contain any joins - it's a leaf subtree |
141 | | - Ok(TreeNodeRecursion::Continue) |
142 | | - } else { |
143 | | - // Found a non-join node that contains joins - this breaks the consecutive join pattern |
144 | | - all_valid = false; |
145 | | - Ok(TreeNodeRecursion::Stop) |
146 | | - } |
147 | | - }); |
148 | | - |
149 | | - all_valid |
150 | | -} |
151 | | - |
152 | | -/// Optimizes a join subtree by converting it to a query graph and finding |
153 | | -/// the optimal join ordering. |
154 | | -/// |
155 | | -/// # Arguments |
156 | | -/// |
157 | | -/// * `plan` - The join subtree to optimize (must be a Join node at the root) |
158 | | -/// * `cost_estimator` - The cost estimator to use for optimization |
159 | | -/// |
160 | | -/// # Returns |
161 | | -/// |
162 | | -/// Returns an optimized LogicalPlan with joins reordered for minimal cost. |
163 | | -/// |
164 | | -/// # Errors |
165 | | -/// |
166 | | -/// Returns an error if: |
167 | | -/// - The plan cannot be converted to a query graph |
168 | | -/// - The optimization algorithm fails |
169 | | -fn optimize_join_subtree( |
170 | | - plan: LogicalPlan, |
171 | | - cost_estimator: Rc<dyn JoinCostEstimator>, |
172 | | -) -> Result<LogicalPlan> { |
173 | | - // Convert the join subtree to a query graph |
174 | | - let query_graph = QueryGraph::try_from(plan)?; |
175 | | - |
176 | | - // Use the Ibaraki-Kameda algorithm to find the optimal join ordering |
177 | | - optimal_left_deep_join_plan(query_graph, cost_estimator) |
178 | | -} |
0 commit comments