@@ -336,43 +336,12 @@ case class FilterEstimation(plan: Filter) extends Logging {
336336 // returns 1/ndv if there is no histogram
337337 Some (1.0 / BigDecimal (ndv))
338338 } else {
339- // We compute filter selectivity using Histogram information.
340- val datum = EstimationUtils .toDecimal(literal.value, literal.dataType).toDouble
341- val histogram = colStat.histogram.get
342- val hgmBins = histogram.bins
343-
344- // find bins where column's current min and max locate. Note that a column's [min, max]
345- // range may change due to another condition applied earlier.
346- val min = EstimationUtils .toDecimal(colStat.min.get, literal.dataType).toDouble
347- val max = EstimationUtils .toDecimal(colStat.max.get, literal.dataType).toDouble
348- val minBinId = EstimationUtils .findFirstBinForValue(min, hgmBins)
349- val maxBinId = EstimationUtils .findLastBinForValue(max, hgmBins)
350-
351- // compute how many bins the column's current valid range [min, max] occupies.
352- // Note that a column's [min, max] range may vary after we apply some filter conditions.
353- val validRangeBins = EstimationUtils .getOccupationBins(maxBinId, minBinId, max,
354- min, histogram)
355-
356- val lowerBinId = EstimationUtils .findFirstBinForValue(datum, hgmBins)
357- val higherBinId = EstimationUtils .findLastBinForValue(datum, hgmBins)
358- assert(lowerBinId <= higherBinId)
359- val lowerBinNdv = hgmBins(lowerBinId).ndv
360- val higherBinNdv = hgmBins(higherBinId).ndv
361- // assume uniform distribution in each bin
362- val occupiedBins = if (lowerBinId == higherBinId) {
363- 1.0 / lowerBinNdv
364- } else {
365- (1.0 / lowerBinNdv) + // lowest bin
366- (higherBinId - lowerBinId - 1 ) + // middle bins
367- (1.0 / higherBinNdv) // highest bin
368- }
369- Some (occupiedBins / validRangeBins)
339+ Some (computeEqualityPossibilityByHistogram(literal, colStat))
370340 }
371341
372342 } else { // not in interval
373343 Some (0.0 )
374344 }
375-
376345 }
377346
378347 /**
@@ -542,11 +511,7 @@ case class FilterEstimation(plan: Filter) extends Logging {
542511 }
543512 }
544513 } else {
545- val numericHistogram = colStat.histogram.get
546- val datum = EstimationUtils .toDecimal(literal.value, literal.dataType).toDouble
547- val max = EstimationUtils .toDecimal(colStat.max.get, literal.dataType).toDouble
548- val min = EstimationUtils .toDecimal(colStat.min.get, literal.dataType).toDouble
549- percent = computePercentByEquiHeightHgm(op, numericHistogram, max, min, datum)
514+ percent = computeComparisonPossibilityByHistogram(op, literal, colStat)
550515 }
551516
552517 if (update) {
@@ -574,51 +539,90 @@ case class FilterEstimation(plan: Filter) extends Logging {
574539 }
575540
/**
 * Estimates how likely an equality predicate (column = literal) is to hold, using the
 * column's histogram.
 *
 * The estimate is the value `EstimationUtils.numBinsHoldingRange` reports for the closed
 * range [literal, literal], divided by the value it reports for the column's current closed
 * range [min, max].
 *
 * @param literal the literal the column is compared with; it is converted to a double
 *                according to its own data type
 * @param colStat the column's statistics; `histogram`, `min` and `max` are all read with
 *                `.get`, so each of them must be defined
 * @return the ratio of the two values described above
 */
private def computeEqualityPossibilityByHistogram(
    literal: Literal, colStat: ColumnStat): Double = {
  val dataType = literal.dataType
  val target = EstimationUtils.toDecimal(literal.value, dataType).toDouble
  val bins = colStat.histogram.get.bins

  // The column's [min, max] may already have been narrowed by a condition applied earlier,
  // so the bounds are taken from the current statistics.
  val lowest = EstimationUtils.toDecimal(colStat.min.get, dataType).toDouble
  val highest = EstimationUtils.toDecimal(colStat.max.get, dataType).toDouble

  // Both lookups below use a range that is closed on both ends.
  def binsCovering(from: Double, to: Double) =
    EstimationUtils.numBinsHoldingRange(
      upperBound = to,
      upperBoundInclusive = true,
      lowerBound = from,
      lowerBoundInclusive = true,
      bins)

  // Denominator: the column's whole current range.
  val wholeRangeBins = binsCovering(lowest, highest)
  // Numerator: the degenerate range that contains only the literal.
  val targetBins = binsCovering(target, target)

  targetBins / wholeRangeBins
}

/**
 * Estimates how likely a comparison predicate (<, <=, >, >=) between a column and a literal
 * is to hold, using the column's histogram.
 *
 * The estimate is the value `EstimationUtils.numBinsHoldingRange` reports for the part of
 * the column's range selected by the predicate, divided by the value it reports for the
 * column's current closed range [min, max].
 *
 * @param op      the comparison; only `LessThan`, `LessThanOrEqual`, `GreaterThan` and
 *                `GreaterThanOrEqual` are matched, any other operator raises a `MatchError`
 * @param literal the literal the column is compared with; it is converted to a double
 *                according to its own data type
 * @param colStat the column's statistics; `histogram`, `min` and `max` are all read with
 *                `.get`, so each of them must be defined
 * @return the ratio of the two values described above
 */
private def computeComparisonPossibilityByHistogram(
    op: BinaryComparison, literal: Literal, colStat: ColumnStat): Double = {
  val dataType = literal.dataType
  val pivot = EstimationUtils.toDecimal(literal.value, dataType).toDouble
  val bins = colStat.histogram.get.bins

  // The column's [min, max] may already have been narrowed by a condition applied earlier,
  // so the bounds are taken from the current statistics.
  val lowest = EstimationUtils.toDecimal(colStat.min.get, dataType).toDouble
  val highest = EstimationUtils.toDecimal(colStat.max.get, dataType).toDouble

  // Single entry point for every range lookup made against this histogram.
  def binsCovering(from: Double, fromInclusive: Boolean, to: Double, toInclusive: Boolean) =
    EstimationUtils.numBinsHoldingRange(
      upperBound = to,
      upperBoundInclusive = toInclusive,
      lowerBound = from,
      lowerBoundInclusive = fromInclusive,
      bins)

  // Denominator: the column's whole current range, closed on both ends.
  val wholeRangeBins =
    binsCovering(lowest, fromInclusive = true, highest, toInclusive = true)

  // Numerator: the sub-range picked out by the predicate. The strict and non-strict variants
  // differ only in whether the literal itself belongs to the range.
  val selectedBins = op match {
    case _: LessThan =>
      binsCovering(lowest, fromInclusive = true, pivot, toInclusive = false)
    case _: LessThanOrEqual =>
      binsCovering(lowest, fromInclusive = true, pivot, toInclusive = true)
    case _: GreaterThan =>
      binsCovering(pivot, fromInclusive = false, highest, toInclusive = true)
    case _: GreaterThanOrEqual =>
      binsCovering(pivot, fromInclusive = true, highest, toInclusive = true)
  }

  selectedBins / wholeRangeBins
}
623627
624628 /**
0 commit comments