Skip to content

Commit 16797d2

Browse files
author
Zhenhua Wang
committed
fix more comments
1 parent e1669ed commit 16797d2

File tree

3 files changed

+53
-76
lines changed

3 files changed

+53
-76
lines changed

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/EstimationUtils.scala

Lines changed: 14 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -236,28 +236,8 @@ object EstimationUtils {
236236
// Only collect overlapped ranges.
237237
if (left.lo <= right.hi && left.hi >= right.lo) {
238238
// Collect overlapped ranges.
239-
val range = if (left.lo == left.hi) {
240-
// Case1: the left bin has only one value
241-
OverlappedRange(
242-
lo = left.lo,
243-
hi = left.lo,
244-
leftNdv = 1,
245-
rightNdv = 1,
246-
leftNumRows = leftHeight,
247-
rightNumRows = rightHeight / right.ndv
248-
)
249-
} else if (right.lo == right.hi) {
250-
// Case2: the right bin has only one value
251-
OverlappedRange(
252-
lo = right.lo,
253-
hi = right.lo,
254-
leftNdv = 1,
255-
rightNdv = 1,
256-
leftNumRows = leftHeight / left.ndv,
257-
rightNumRows = rightHeight
258-
)
259-
} else if (right.lo >= left.lo && right.hi >= left.hi) {
260-
// Case3: the left bin is "smaller" than the right bin
239+
val range = if (right.lo >= left.lo && right.hi >= left.hi) {
240+
// Case1: the left bin is "smaller" than the right bin
261241
// left.lo right.lo left.hi right.hi
262242
// --------+------------------+------------+----------------+------->
263243
if (left.hi == right.lo) {
@@ -283,7 +263,7 @@ object EstimationUtils {
283263
)
284264
}
285265
} else if (right.lo <= left.lo && right.hi <= left.hi) {
286-
// Case4: the left bin is "larger" than the right bin
266+
// Case2: the left bin is "larger" than the right bin
287267
// right.lo left.lo right.hi left.hi
288268
// --------+------------------+------------+----------------+------->
289269
if (right.hi == left.lo) {
@@ -309,7 +289,7 @@ object EstimationUtils {
309289
)
310290
}
311291
} else if (right.lo >= left.lo && right.hi <= left.hi) {
312-
// Case5: the left bin contains the right bin
292+
// Case3: the left bin contains the right bin
313293
// left.lo right.lo right.hi left.hi
314294
// --------+------------------+------------+----------------+------->
315295
val leftRatio = (right.hi - right.lo) / (left.hi - left.lo)
@@ -323,7 +303,7 @@ object EstimationUtils {
323303
)
324304
} else {
325305
assert(right.lo <= left.lo && right.hi >= left.hi)
326-
// Case6: the right bin contains the left bin
306+
// Case4: the right bin contains the left bin
327307
// right.lo left.lo left.hi right.hi
328308
// --------+------------------+------------+----------------+------->
329309
val rightRatio = (left.hi - left.lo) / (right.hi - right.lo)
@@ -346,6 +326,11 @@ object EstimationUtils {
346326
/**
347327
* Given an original bin and a value range [lowerBound, upperBound], returns the trimmed part
348328
* of the bin in that range and its number of rows.
329+
* @param bin the input histogram bin.
330+
* @param height the number of rows of the given histogram bin inside an equi-height histogram.
331+
* @param lowerBound lower bound of the given range.
332+
* @param upperBound upper bound of the given range.
333+
* @return trimmed part of the given bin and its number of rows.
349334
*/
350335
def trimBin(bin: HistogramBin, height: Double, lowerBound: Double, upperBound: Double)
351336
: (HistogramBin, Double) = {
@@ -364,14 +349,15 @@ object EstimationUtils {
364349
} else {
365350
// lowerBound bin.lo bin.hi upperBound
366351
// --------+------------------+------------+-------------+------->
352+
assert(bin.lo >= lowerBound && bin.hi <= upperBound)
367353
(bin.lo, bin.hi)
368354
}
369355

370-
if (bin.hi == bin.lo) {
371-
(bin, height)
372-
} else if (hi == lo) {
356+
if (hi == lo) {
357+
// Note that bin.hi == bin.lo also falls into this branch.
373358
(HistogramBin(lo, hi, 1), height / bin.ndv)
374359
} else {
360+
assert(bin.hi != bin.lo)
375361
val ratio = (hi - lo) / (bin.hi - bin.lo)
376362
(HistogramBin(lo, hi, math.ceil(bin.ndv * ratio).toLong), height * ratio)
377363
}

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/JoinEstimation.scala

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -193,7 +193,7 @@ case class JoinEstimation(join: Join) extends Logging {
193193
val (newMin, newMax) = ValueInterval.intersect(lInterval, rInterval, leftKey.dataType)
194194
val (card, joinStat) = (leftKeyStat.histogram, rightKeyStat.histogram) match {
195195
case (Some(l: Histogram), Some(r: Histogram)) =>
196-
computeByEquiHeightHistogram(leftKey, rightKey, l, r, newMin, newMax)
196+
computeByHistogram(leftKey, rightKey, l, r, newMin, newMax)
197197
case _ =>
198198
computeByNdv(leftKey, rightKey, newMin, newMax)
199199
}
@@ -237,7 +237,7 @@ case class JoinEstimation(join: Join) extends Logging {
237237
}
238238

239239
/** Compute join cardinality using equi-height histograms. */
240-
private def computeByEquiHeightHistogram(
240+
private def computeByHistogram(
241241
leftKey: AttributeReference,
242242
rightKey: AttributeReference,
243243
leftHistogram: Histogram,

sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/JoinEstimationSuite.scala

Lines changed: 37 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -136,24 +136,22 @@ class JoinEstimationSuite extends StatsEstimationTestBase {
136136

137137
val expectedRanges = Seq(
138138
// histogram1.bins(0) overlaps t0
139-
OverlappedRange(10, 30, 10, 40*1/2, 300, 80*1/2),
139+
OverlappedRange(10, 30, 10, 40 * 1 / 2, 300, 80 * 1 / 2),
140140
// histogram1.bins(1) overlaps t0
141-
OverlappedRange(30, 50, 30*2/3, 40*1/2, 300*2/3, 80*1/2),
141+
OverlappedRange(30, 50, 30 * 2 / 3, 40 * 1 / 2, 300 * 2 / 3, 80 * 1 / 2),
142142
// histogram1.bins(1) overlaps t1
143-
OverlappedRange(50, 60, 30*1/3, 8, 300*1/3, 20)
143+
OverlappedRange(50, 60, 30 * 1 / 3, 8, 300 * 1 / 3, 20)
144144
)
145145
assert(expectedRanges.equals(
146-
getOverlappedRanges(histogram1, histogram2, lowerBound = 10D, upperBound = 60D)))
146+
getOverlappedRanges(histogram1, histogram2, lowerBound = 10, upperBound = 60)))
147147

148148
estimateByHistogram(
149149
leftHistogram = histogram1,
150150
rightHistogram = histogram2,
151-
expectedMin = 10D,
152-
expectedMax = 60D,
153-
// 10 + 20 + 8
154-
expectedNdv = 38L,
155-
// 300*40/20 + 200*40/20 + 100*20/10
156-
expectedRows = 1200L)
151+
expectedMin = 10,
152+
expectedMax = 60,
153+
expectedNdv = 10 + 20 + 8,
154+
expectedRows = 300 * 40 / 20 + 200 * 40 / 20 + 100 * 20 / 10)
157155
}
158156

159157
test("equi-height histograms: a bin has only one value after trimming") {
@@ -169,24 +167,22 @@ class JoinEstimationSuite extends StatsEstimationTestBase {
169167

170168
val expectedRanges = Seq(
171169
// histogram1.bins(0) overlaps t0
172-
OverlappedRange(50, 50, 1, 1, 300/10, 2),
170+
OverlappedRange(50, 50, 1, 1, 300 / 10, 2),
173171
// histogram1.bins(0) overlaps t1
174-
OverlappedRange(50, 60, 10, 20*10/25, 300, 50*10/25),
172+
OverlappedRange(50, 60, 10, 20 * 10 / 25, 300, 50 * 10 / 25),
175173
// histogram1.bins(1) overlaps t1
176-
OverlappedRange(60, 75, 3, 20*15/25, 300, 50*15/25)
174+
OverlappedRange(60, 75, 3, 20 * 15 / 25, 300, 50 * 15 / 25)
177175
)
178176
assert(expectedRanges.equals(
179-
getOverlappedRanges(histogram1, histogram2, lowerBound = 50D, upperBound = 75D)))
177+
getOverlappedRanges(histogram1, histogram2, lowerBound = 50, upperBound = 75)))
180178

181179
estimateByHistogram(
182180
leftHistogram = histogram1,
183181
rightHistogram = histogram2,
184-
expectedMin = 50D,
185-
expectedMax = 75D,
186-
// 1 + 8 + 3
187-
expectedNdv = 12L,
188-
// 30*2/1 + 300*20/10 + 300*30/12
189-
expectedRows = 1410L)
182+
expectedMin = 50,
183+
expectedMax = 75,
184+
expectedNdv = 1 + 8 + 3,
185+
expectedRows = 30 * 2 / 1 + 300 * 20 / 10 + 300 * 30 / 12)
190186
}
191187

192188
test("equi-height histograms: skew distribution (some bins have only one value)") {
@@ -203,23 +199,21 @@ class JoinEstimationSuite extends StatsEstimationTestBase {
203199
assert(t1 ==HistogramBin(lo = 50, hi = 60, ndv = 8) && h1 == 20)
204200

205201
val expectedRanges = Seq(
206-
OverlappedRange(30, 30, 1, 1, 300, 40/20),
207-
OverlappedRange(30, 30, 1, 1, 300, 40/20),
208-
OverlappedRange(30, 50, 30*2/3, 20, 300*2/3, 40),
209-
OverlappedRange(50, 60, 30*1/3, 8, 300*1/3, 20)
202+
OverlappedRange(30, 30, 1, 1, 300, 40 / 20),
203+
OverlappedRange(30, 30, 1, 1, 300, 40 / 20),
204+
OverlappedRange(30, 50, 30 * 2 / 3, 20, 300 * 2 / 3, 40),
205+
OverlappedRange(50, 60, 30 * 1 / 3, 8, 300 * 1 / 3, 20)
210206
)
211207
assert(expectedRanges.equals(
212-
getOverlappedRanges(histogram1, histogram2, lowerBound = 30D, upperBound = 60D)))
208+
getOverlappedRanges(histogram1, histogram2, lowerBound = 30, upperBound = 60)))
213209

214210
estimateByHistogram(
215211
leftHistogram = histogram1,
216212
rightHistogram = histogram2,
217-
expectedMin = 30D,
218-
expectedMax = 60D,
219-
// 1 + 20 + 8
220-
expectedNdv = 29L,
221-
// 300*2/1 + 300*2/1 + 200*40/20 + 100*20/10
222-
expectedRows = 1800L)
213+
expectedMin = 30,
214+
expectedMax = 60,
215+
expectedNdv = 1 + 20 + 8,
216+
expectedRows = 300 * 2 / 1 + 300 * 2 / 1 + 200 * 40 / 20 + 100 * 20 / 10)
223217
}
224218

225219
test("equi-height histograms: skew distribution (histograms have different skewed values") {
@@ -234,22 +228,20 @@ class JoinEstimationSuite extends StatsEstimationTestBase {
234228
assert(t1 == HistogramBin(lo = 30, hi = 50, ndv = 20) && h1 == 40)
235229

236230
val expectedRanges = Seq(
237-
OverlappedRange(30, 30, 1, 1, 300, 40/20),
231+
OverlappedRange(30, 30, 1, 1, 300, 40 / 20),
238232
OverlappedRange(30, 50, 20, 20, 200, 40),
239-
OverlappedRange(50, 50, 1, 1, 200/20, 100)
233+
OverlappedRange(50, 50, 1, 1, 200 / 20, 100)
240234
)
241235
assert(expectedRanges.equals(
242-
getOverlappedRanges(histogram1, histogram2, lowerBound = 30D, upperBound = 50D)))
236+
getOverlappedRanges(histogram1, histogram2, lowerBound = 30, upperBound = 50)))
243237

244238
estimateByHistogram(
245239
leftHistogram = histogram1,
246240
rightHistogram = histogram2,
247-
expectedMin = 30D,
248-
expectedMax = 50D,
249-
// 1 + 20
250-
expectedNdv = 21L,
251-
// 300*2/1 + 200*40/20 + 10*100/1
252-
expectedRows = 2000L)
241+
expectedMin = 30,
242+
expectedMax = 50,
243+
expectedNdv = 1 + 20,
244+
expectedRows = 300 * 2 / 1 + 200 * 40 / 20 + 10 * 100 / 1)
253245
}
254246

255247
test("equi-height histograms: skew distribution (both histograms have the same skewed value") {
@@ -270,17 +262,16 @@ class JoinEstimationSuite extends StatsEstimationTestBase {
270262
OverlappedRange(30, 30, 1, 1, 10, 150)
271263
)
272264
assert(expectedRanges.equals(
273-
getOverlappedRanges(histogram1, histogram2, lowerBound = 30D, upperBound = 30D)))
265+
getOverlappedRanges(histogram1, histogram2, lowerBound = 30, upperBound = 30)))
274266

275267
estimateByHistogram(
276268
leftHistogram = histogram1,
277269
rightHistogram = histogram2,
278-
expectedMin = 30D,
279-
expectedMax = 30D,
270+
expectedMin = 30,
271+
expectedMax = 30,
280272
// only one value: 30
281-
expectedNdv = 1L,
282-
// 300*5/1 + 300*150/1 + 10*5/1 + 10*150/1
283-
expectedRows = 48050L)
273+
expectedNdv = 1,
274+
expectedRows = 300 * 5 / 1 + 300 * 150 / 1 + 10 * 5 / 1 + 10 * 150 / 1)
284275
}
285276

286277
test("cross join") {

0 commit comments

Comments
 (0)