Skip to content

Commit bdb5e55

Browse files
committed
[SPARK-21322][SQL][FOLLOWUP] support histogram in filter cardinality estimation
## What changes were proposed in this pull request? some code cleanup/refactor and naming improvement. ## How was this patch tested? existing tests Author: Wenchen Fan <[email protected]> Closes #19952 from cloud-fan/minor.
1 parent 13e489b commit bdb5e55

File tree

2 files changed

+134
-127
lines changed

2 files changed

+134
-127
lines changed

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/EstimationUtils.scala

Lines changed: 56 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -115,14 +115,10 @@ object EstimationUtils {
115115
}
116116

117117
/**
118-
* Returns the number of the first bin into which a column value falls for a specified
118+
* Returns the index of the first bin into which the given value falls for a specified
119119
* numeric equi-height histogram.
120-
*
121-
* @param value a literal value of a column
122-
* @param bins an array of bins for a given numeric equi-height histogram
123-
* @return the id of the first bin into which a column value falls.
124120
*/
125-
def findFirstBinForValue(value: Double, bins: Array[HistogramBin]): Int = {
121+
private def findFirstBinForValue(value: Double, bins: Array[HistogramBin]): Int = {
126122
var i = 0
127123
while ((i < bins.length) && (value > bins(i).hi)) {
128124
i += 1
@@ -131,14 +127,10 @@ object EstimationUtils {
131127
}
132128

133129
/**
134-
* Returns the number of the last bin into which a column value falls for a specified
130+
* Returns the index of the last bin into which the given value falls for a specified
135131
* numeric equi-height histogram.
136-
*
137-
* @param value a literal value of a column
138-
* @param bins an array of bins for a given numeric equi-height histogram
139-
* @return the id of the last bin into which a column value falls.
140132
*/
141-
def findLastBinForValue(value: Double, bins: Array[HistogramBin]): Int = {
133+
private def findLastBinForValue(value: Double, bins: Array[HistogramBin]): Int = {
142134
var i = bins.length - 1
143135
while ((i >= 0) && (value < bins(i).lo)) {
144136
i -= 1
@@ -147,65 +139,76 @@ object EstimationUtils {
147139
}
148140

149141
/**
150-
* Returns a percentage of a bin holding values for column value in the range of
151-
* [lowerValue, higherValue]
152-
*
153-
* @param higherValue a given upper bound value of a specified column value range
154-
* @param lowerValue a given lower bound value of a specified column value range
155-
* @param bin a single histogram bin
156-
* @return the percentage of a single bin holding values in [lowerValue, higherValue].
142+
* Returns the possibility of the given histogram bin holding values within the given range
143+
* [lowerBound, upperBound].
157144
*/
158-
private def getOccupation(
159-
higherValue: Double,
160-
lowerValue: Double,
145+
private def binHoldingRangePossibility(
146+
upperBound: Double,
147+
lowerBound: Double,
161148
bin: HistogramBin): Double = {
162-
assert(bin.lo <= lowerValue && lowerValue <= higherValue && higherValue <= bin.hi)
149+
assert(bin.lo <= lowerBound && lowerBound <= upperBound && upperBound <= bin.hi)
163150
if (bin.hi == bin.lo) {
164151
// the entire bin is covered in the range
165152
1.0
166-
} else if (higherValue == lowerValue) {
153+
} else if (upperBound == lowerBound) {
167154
// set percentage to 1/NDV
168155
1.0 / bin.ndv.toDouble
169156
} else {
170157
// Use proration since the range falls inside this bin.
171-
math.min((higherValue - lowerValue) / (bin.hi - bin.lo), 1.0)
158+
math.min((upperBound - lowerBound) / (bin.hi - bin.lo), 1.0)
172159
}
173160
}
174161

175162
/**
176-
* Returns the number of bins for column values in [lowerValue, higherValue].
177-
* The column value distribution is saved in an equi-height histogram. The return values is a
178-
* double value is because we may return a portion of a bin. For example, a predicate
179-
* "column = 8" may return the number of bins 0.2 if the holding bin has 5 distinct values.
163+
* Returns the number of histogram bins holding values within the given range
164+
* [lowerBound, upperBound].
165+
*
166+
* Note that the returned value is double type, because the range boundaries usually occupy a
167+
* portion of a bin. An extreme case is [value, value] which is generated by equal predicate
168+
* `col = value`, we can get higher accuracy by allowing returning portion of histogram bins.
180169
*
181-
* @param higherId id of the high end bin holding the high end value of a column range
182-
* @param lowerId id of the low end bin holding the low end value of a column range
183-
* @param higherEnd a given upper bound value of a specified column value range
184-
* @param lowerEnd a given lower bound value of a specified column value range
185-
* @param histogram a numeric equi-height histogram
186-
* @return the number of bins for column values in [lowerEnd, higherEnd].
170+
* @param upperBound the highest value of the given range
171+
* @param upperBoundInclusive whether the upperBound is included in the range
172+
* @param lowerBound the lowest value of the given range
173+
* @param lowerBoundInclusive whether the lowerBound is included in the range
174+
* @param bins an array of bins for a given numeric equi-height histogram
187175
*/
188-
def getOccupationBins(
189-
higherId: Int,
190-
lowerId: Int,
191-
higherEnd: Double,
192-
lowerEnd: Double,
193-
histogram: Histogram): Double = {
194-
assert(lowerId <= higherId)
195-
196-
if (lowerId == higherId) {
197-
val curBin = histogram.bins(lowerId)
198-
getOccupation(higherEnd, lowerEnd, curBin)
176+
def numBinsHoldingRange(
177+
upperBound: Double,
178+
upperBoundInclusive: Boolean,
179+
lowerBound: Double,
180+
lowerBoundInclusive: Boolean,
181+
bins: Array[HistogramBin]): Double = {
182+
assert(bins.head.lo <= lowerBound && lowerBound <= upperBound && upperBound <= bins.last.hi,
183+
"Given range does not fit in the given histogram.")
184+
assert(upperBound != lowerBound || upperBoundInclusive || lowerBoundInclusive,
185+
s"'$lowerBound < value < $upperBound' is an invalid range.")
186+
187+
val upperBinIndex = if (upperBoundInclusive) {
188+
findLastBinForValue(upperBound, bins)
189+
} else {
190+
findFirstBinForValue(upperBound, bins)
191+
}
192+
val lowerBinIndex = if (lowerBoundInclusive) {
193+
findFirstBinForValue(lowerBound, bins)
194+
} else {
195+
findLastBinForValue(lowerBound, bins)
196+
}
197+
assert(lowerBinIndex <= upperBinIndex, "Invalid histogram data.")
198+
199+
200+
if (lowerBinIndex == upperBinIndex) {
201+
binHoldingRangePossibility(upperBound, lowerBound, bins(lowerBinIndex))
199202
} else {
200-
// compute how much lowerEnd/higherEnd occupies its bin
201-
val lowerCurBin = histogram.bins(lowerId)
202-
val lowerPart = getOccupation(lowerCurBin.hi, lowerEnd, lowerCurBin)
203+
// Computes the occupied portion of bins of the upperBound and lowerBound.
204+
val lowerBin = bins(lowerBinIndex)
205+
val lowerPart = binHoldingRangePossibility(lowerBin.hi, lowerBound, lowerBin)
203206

204-
val higherCurBin = histogram.bins(higherId)
205-
val higherPart = getOccupation(higherEnd, higherCurBin.lo, higherCurBin)
207+
val higherBin = bins(upperBinIndex)
208+
val higherPart = binHoldingRangePossibility(upperBound, higherBin.lo, higherBin)
206209

207-
// the total length is lowerPart + higherPart + bins between them
208-
lowerPart + higherPart + higherId - lowerId - 1
210+
// The total number of bins is lowerPart + higherPart + bins between them
211+
lowerPart + higherPart + upperBinIndex - lowerBinIndex - 1
209212
}
210213
}
211214

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/FilterEstimation.scala

Lines changed: 78 additions & 74 deletions
Original file line numberDiff line numberDiff line change
@@ -336,43 +336,12 @@ case class FilterEstimation(plan: Filter) extends Logging {
336336
// returns 1/ndv if there is no histogram
337337
Some(1.0 / BigDecimal(ndv))
338338
} else {
339-
// We compute filter selectivity using Histogram information.
340-
val datum = EstimationUtils.toDecimal(literal.value, literal.dataType).toDouble
341-
val histogram = colStat.histogram.get
342-
val hgmBins = histogram.bins
343-
344-
// find bins where column's current min and max locate. Note that a column's [min, max]
345-
// range may change due to another condition applied earlier.
346-
val min = EstimationUtils.toDecimal(colStat.min.get, literal.dataType).toDouble
347-
val max = EstimationUtils.toDecimal(colStat.max.get, literal.dataType).toDouble
348-
val minBinId = EstimationUtils.findFirstBinForValue(min, hgmBins)
349-
val maxBinId = EstimationUtils.findLastBinForValue(max, hgmBins)
350-
351-
// compute how many bins the column's current valid range [min, max] occupies.
352-
// Note that a column's [min, max] range may vary after we apply some filter conditions.
353-
val validRangeBins = EstimationUtils.getOccupationBins(maxBinId, minBinId, max,
354-
min, histogram)
355-
356-
val lowerBinId = EstimationUtils.findFirstBinForValue(datum, hgmBins)
357-
val higherBinId = EstimationUtils.findLastBinForValue(datum, hgmBins)
358-
assert(lowerBinId <= higherBinId)
359-
val lowerBinNdv = hgmBins(lowerBinId).ndv
360-
val higherBinNdv = hgmBins(higherBinId).ndv
361-
// assume uniform distribution in each bin
362-
val occupiedBins = if (lowerBinId == higherBinId) {
363-
1.0 / lowerBinNdv
364-
} else {
365-
(1.0 / lowerBinNdv) + // lowest bin
366-
(higherBinId - lowerBinId - 1) + // middle bins
367-
(1.0 / higherBinNdv) // highest bin
368-
}
369-
Some(occupiedBins / validRangeBins)
339+
Some(computeEqualityPossibilityByHistogram(literal, colStat))
370340
}
371341

372342
} else { // not in interval
373343
Some(0.0)
374344
}
375-
376345
}
377346

378347
/**
@@ -542,11 +511,7 @@ case class FilterEstimation(plan: Filter) extends Logging {
542511
}
543512
}
544513
} else {
545-
val numericHistogram = colStat.histogram.get
546-
val datum = EstimationUtils.toDecimal(literal.value, literal.dataType).toDouble
547-
val max = EstimationUtils.toDecimal(colStat.max.get, literal.dataType).toDouble
548-
val min = EstimationUtils.toDecimal(colStat.min.get, literal.dataType).toDouble
549-
percent = computePercentByEquiHeightHgm(op, numericHistogram, max, min, datum)
514+
percent = computeComparisonPossibilityByHistogram(op, literal, colStat)
550515
}
551516

552517
if (update) {
@@ -574,51 +539,90 @@ case class FilterEstimation(plan: Filter) extends Logging {
574539
}
575540

576541
/**
577-
* Returns the selectivity percentage for binary condition in the column's
578-
* current valid range [min, max]
579-
*
580-
* @param op a binary comparison operator
581-
* @param histogram a numeric equi-height histogram
582-
* @param max the upper bound of the current valid range for a given column
583-
* @param min the lower bound of the current valid range for a given column
584-
* @param datumNumber the numeric value of a literal
585-
* @return the selectivity percentage for a condition in the current range.
542+
* Computes the possibility of an equality predicate using histogram.
586543
*/
544+
private def computeEqualityPossibilityByHistogram(
545+
literal: Literal, colStat: ColumnStat): Double = {
546+
val datum = EstimationUtils.toDecimal(literal.value, literal.dataType).toDouble
547+
val histogram = colStat.histogram.get
587548

588-
def computePercentByEquiHeightHgm(
589-
op: BinaryComparison,
590-
histogram: Histogram,
591-
max: Double,
592-
min: Double,
593-
datumNumber: Double): Double = {
594549
// find bins where column's current min and max locate. Note that a column's [min, max]
595550
// range may change due to another condition applied earlier.
596-
val minBinId = EstimationUtils.findFirstBinForValue(min, histogram.bins)
597-
val maxBinId = EstimationUtils.findLastBinForValue(max, histogram.bins)
551+
val min = EstimationUtils.toDecimal(colStat.min.get, literal.dataType).toDouble
552+
val max = EstimationUtils.toDecimal(colStat.max.get, literal.dataType).toDouble
598553

599554
// compute how many bins the column's current valid range [min, max] occupies.
600-
// Note that a column's [min, max] range may vary after we apply some filter conditions.
601-
val minToMaxLength = EstimationUtils.getOccupationBins(maxBinId, minBinId, max, min, histogram)
602-
603-
val datumInBinId = op match {
604-
case LessThan(_, _) | GreaterThanOrEqual(_, _) =>
605-
EstimationUtils.findFirstBinForValue(datumNumber, histogram.bins)
606-
case LessThanOrEqual(_, _) | GreaterThan(_, _) =>
607-
EstimationUtils.findLastBinForValue(datumNumber, histogram.bins)
608-
}
555+
val numBinsHoldingEntireRange = EstimationUtils.numBinsHoldingRange(
556+
upperBound = max,
557+
upperBoundInclusive = true,
558+
lowerBound = min,
559+
lowerBoundInclusive = true,
560+
histogram.bins)
561+
562+
val numBinsHoldingDatum = EstimationUtils.numBinsHoldingRange(
563+
upperBound = datum,
564+
upperBoundInclusive = true,
565+
lowerBound = datum,
566+
lowerBoundInclusive = true,
567+
histogram.bins)
568+
569+
numBinsHoldingDatum / numBinsHoldingEntireRange
570+
}
609571

610-
op match {
611-
// LessThan and LessThanOrEqual share the same logic,
612-
// but their datumInBinId may be different
613-
case LessThan(_, _) | LessThanOrEqual(_, _) =>
614-
EstimationUtils.getOccupationBins(datumInBinId, minBinId, datumNumber, min,
615-
histogram) / minToMaxLength
616-
// GreaterThan and GreaterThanOrEqual share the same logic,
617-
// but their datumInBinId may be different
618-
case GreaterThan(_, _) | GreaterThanOrEqual(_, _) =>
619-
EstimationUtils.getOccupationBins(maxBinId, datumInBinId, max, datumNumber,
620-
histogram) / minToMaxLength
572+
/**
573+
* Computes the possibility of a comparison predicate using histogram.
574+
*/
575+
private def computeComparisonPossibilityByHistogram(
576+
op: BinaryComparison, literal: Literal, colStat: ColumnStat): Double = {
577+
val datum = EstimationUtils.toDecimal(literal.value, literal.dataType).toDouble
578+
val histogram = colStat.histogram.get
579+
580+
// find bins where column's current min and max locate. Note that a column's [min, max]
581+
// range may change due to another condition applied earlier.
582+
val min = EstimationUtils.toDecimal(colStat.min.get, literal.dataType).toDouble
583+
val max = EstimationUtils.toDecimal(colStat.max.get, literal.dataType).toDouble
584+
585+
// compute how many bins the column's current valid range [min, max] occupies.
586+
val numBinsHoldingEntireRange = EstimationUtils.numBinsHoldingRange(
587+
max, upperBoundInclusive = true, min, lowerBoundInclusive = true, histogram.bins)
588+
589+
val numBinsHoldingRange = op match {
590+
// LessThan and LessThanOrEqual share the same logic, the only difference is whether to
591+
// include the upperBound in the range.
592+
case _: LessThan =>
593+
EstimationUtils.numBinsHoldingRange(
594+
upperBound = datum,
595+
upperBoundInclusive = false,
596+
lowerBound = min,
597+
lowerBoundInclusive = true,
598+
histogram.bins)
599+
case _: LessThanOrEqual =>
600+
EstimationUtils.numBinsHoldingRange(
601+
upperBound = datum,
602+
upperBoundInclusive = true,
603+
lowerBound = min,
604+
lowerBoundInclusive = true,
605+
histogram.bins)
606+
607+
// GreaterThan and GreaterThanOrEqual share the same logic, the only difference is whether to
608+
// include the lowerBound in the range.
609+
case _: GreaterThan =>
610+
EstimationUtils.numBinsHoldingRange(
611+
upperBound = max,
612+
upperBoundInclusive = true,
613+
lowerBound = datum,
614+
lowerBoundInclusive = false,
615+
histogram.bins)
616+
case _: GreaterThanOrEqual =>
617+
EstimationUtils.numBinsHoldingRange(
618+
upperBound = max,
619+
upperBoundInclusive = true,
620+
lowerBound = datum,
621+
lowerBoundInclusive = true,
622+
histogram.bins)
621623
}
624+
625+
numBinsHoldingRange / numBinsHoldingEntireRange
622626
}
623627

624628
/**

0 commit comments

Comments
 (0)