
Commit 4201ddc

[SPARK-17768][CORE] Small (Sum,Count,Mean)Evaluator problems and suboptimalities
## What changes were proposed in this pull request?

Fix:

- GroupedMeanEvaluator and GroupedSumEvaluator are unused, as is the StudentTCacher support class
- CountEvaluator can return a lower bound < 0, when counts can't be negative
- MeanEvaluator will actually fail on exactly 1 datum (yields a t-test with 0 degrees of freedom)
- CountEvaluator uses a normal distribution, which may be an inappropriate approximation (leading to the above)
- The test for SumEvaluator asserts incorrect expected sums – e.g. after observing that 10% of the data has a sum of 2, the expectation should be 20, not 38
- CountEvaluator and MeanEvaluator have no unit tests to catch these
- Distribution code is duplicated across CountEvaluator and GroupedCountEvaluator
- The stats in each could use a bit of documentation, as I had to guess at them
- (The code could use a few cleanups and optimizations too)

## How was this patch tested?

Existing and new tests.

Author: Sean Owen <[email protected]>

Closes #15341 from srowen/SPARK-17768.
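As a quick sanity check of the SumEvaluator bullet above, here is the corrected expectation worked out with the hypothetical values from that bullet (illustrative only, not code from the patch):

```scala
// After observing a fraction p = 0.1 of the data with an observed sum of 2,
// the natural extrapolation of the total sum is sum / p:
val p = 0.1
val observedSum = 2.0
val expectedTotal = observedSum / p  // 20.0 -- the value the test should assert, not 38
```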
1 parent 362ba4b commit 4201ddc

File tree

10 files changed: +203 -332 lines changed

core/src/main/scala/org/apache/spark/partial/CountEvaluator.scala

Lines changed: 36 additions & 17 deletions
```diff
@@ -17,40 +17,59 @@
 
 package org.apache.spark.partial
 
-import org.apache.commons.math3.distribution.NormalDistribution
+import org.apache.commons.math3.distribution.{PascalDistribution, PoissonDistribution}
 
 /**
  * An ApproximateEvaluator for counts.
- *
- * TODO: There's currently a lot of shared code between this and GroupedCountEvaluator. It might
- * be best to make this a special case of GroupedCountEvaluator with one group.
  */
 private[spark] class CountEvaluator(totalOutputs: Int, confidence: Double)
   extends ApproximateEvaluator[Long, BoundedDouble] {
 
-  var outputsMerged = 0
-  var sum: Long = 0
+  private var outputsMerged = 0
+  private var sum: Long = 0
 
-  override def merge(outputId: Int, taskResult: Long) {
+  override def merge(outputId: Int, taskResult: Long): Unit = {
     outputsMerged += 1
     sum += taskResult
   }
 
   override def currentResult(): BoundedDouble = {
     if (outputsMerged == totalOutputs) {
       new BoundedDouble(sum, 1.0, sum, sum)
-    } else if (outputsMerged == 0) {
-      new BoundedDouble(0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity)
+    } else if (outputsMerged == 0 || sum == 0) {
+      new BoundedDouble(0, 0.0, 0.0, Double.PositiveInfinity)
     } else {
       val p = outputsMerged.toDouble / totalOutputs
-      val mean = (sum + 1 - p) / p
-      val variance = (sum + 1) * (1 - p) / (p * p)
-      val stdev = math.sqrt(variance)
-      val confFactor = new NormalDistribution().
-        inverseCumulativeProbability(1 - (1 - confidence) / 2)
-      val low = mean - confFactor * stdev
-      val high = mean + confFactor * stdev
-      new BoundedDouble(mean, confidence, low, high)
+      CountEvaluator.bound(confidence, sum, p)
     }
   }
 }
+
+private[partial] object CountEvaluator {
+
+  def bound(confidence: Double, sum: Long, p: Double): BoundedDouble = {
+    // Let the total count be N. A fraction p has been counted already, with sum 'sum',
+    // as if each element from the total data set had been seen with probability p.
+    val dist =
+      if (sum <= 10000) {
+        // The remaining count, k=N-sum, may be modeled as negative binomial (aka Pascal),
+        // where there have been 'sum' successes of probability p already. (There are several
+        // conventions, but this is the one followed by Commons Math3.)
+        new PascalDistribution(sum.toInt, p)
+      } else {
+        // For large 'sum' (certainly, > Int.MaxValue!), use a Poisson approximation, which has
+        // a different interpretation. "sum" elements have been observed having scanned a fraction
+        // p of the data. This suggests data is counted at a rate of sum / p across the whole data
+        // set. The total expected count from the rest is distributed as
+        // (1-p) Poisson(sum / p) = Poisson(sum*(1-p)/p)
+        new PoissonDistribution(sum * (1 - p) / p)
+      }
+    // Not quite symmetric; calculate interval straight from discrete distribution
+    val low = dist.inverseCumulativeProbability((1 - confidence) / 2)
+    val high = dist.inverseCumulativeProbability((1 + confidence) / 2)
+    // Add 'sum' to each because distribution is just of remaining count, not observed
+    new BoundedDouble(sum + dist.getNumericalMean, confidence, sum + low, sum + high)
+  }
+
+
+}
```
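For readers skimming the diff, the new bounding logic can be exercised on its own. A minimal sketch, assuming only commons-math3 on the classpath (the helper name and tuple return type are illustrative; the real code returns a BoundedDouble):

```scala
import org.apache.commons.math3.distribution.{PascalDistribution, PoissonDistribution}

// Given 'sum' elements counted after seeing a fraction p of the data,
// bound the total count at the given confidence.
def approxCountBound(confidence: Double, sum: Long, p: Double): (Double, Long, Long) = {
  val dist =
    if (sum <= 10000) {
      // Remaining count modeled as negative binomial: 'sum' successes of probability p
      new PascalDistribution(sum.toInt, p)
    } else {
      // Poisson approximation of the remaining count, with mean sum * (1 - p) / p
      new PoissonDistribution(sum * (1 - p) / p)
    }
  val low = dist.inverseCumulativeProbability((1 - confidence) / 2)
  val high = dist.inverseCumulativeProbability((1 + confidence) / 2)
  // The distribution covers only the unseen remainder, so shift everything by 'sum'
  (sum + dist.getNumericalMean, sum + low, sum + high)
}

// e.g. 2 elements counted after 10% of the data: Pascal mean = 2 * 0.9 / 0.1 = 18,
// so approxCountBound(0.95, 2L, 0.1) gives a mean of 20, with a wide interval.
```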

core/src/main/scala/org/apache/spark/partial/GroupedCountEvaluator.scala

Lines changed: 5 additions & 25 deletions
```diff
@@ -17,15 +17,10 @@
 
 package org.apache.spark.partial
 
-import java.util.{HashMap => JHashMap}
-
-import scala.collection.JavaConverters._
 import scala.collection.Map
 import scala.collection.mutable.HashMap
 import scala.reflect.ClassTag
 
-import org.apache.commons.math3.distribution.NormalDistribution
-
 import org.apache.spark.util.collection.OpenHashMap
 
 /**
@@ -34,10 +29,10 @@ import org.apache.spark.util.collection.OpenHashMap
 private[spark] class GroupedCountEvaluator[T : ClassTag](totalOutputs: Int, confidence: Double)
   extends ApproximateEvaluator[OpenHashMap[T, Long], Map[T, BoundedDouble]] {
 
-  var outputsMerged = 0
-  var sums = new OpenHashMap[T, Long]() // Sum of counts for each key
+  private var outputsMerged = 0
+  private val sums = new OpenHashMap[T, Long]() // Sum of counts for each key
 
-  override def merge(outputId: Int, taskResult: OpenHashMap[T, Long]) {
+  override def merge(outputId: Int, taskResult: OpenHashMap[T, Long]): Unit = {
     outputsMerged += 1
     taskResult.foreach { case (key, value) =>
       sums.changeValue(key, value, _ + value)
@@ -46,27 +41,12 @@ private[spark] class GroupedCountEvaluator[T : ClassTag](totalOutputs: Int, conf
 
   override def currentResult(): Map[T, BoundedDouble] = {
     if (outputsMerged == totalOutputs) {
-      val result = new JHashMap[T, BoundedDouble](sums.size)
-      sums.foreach { case (key, sum) =>
-        result.put(key, new BoundedDouble(sum, 1.0, sum, sum))
-      }
-      result.asScala
+      sums.map { case (key, sum) => (key, new BoundedDouble(sum, 1.0, sum, sum)) }.toMap
     } else if (outputsMerged == 0) {
       new HashMap[T, BoundedDouble]
     } else {
       val p = outputsMerged.toDouble / totalOutputs
-      val confFactor = new NormalDistribution().
-        inverseCumulativeProbability(1 - (1 - confidence) / 2)
-      val result = new JHashMap[T, BoundedDouble](sums.size)
-      sums.foreach { case (key, sum) =>
-        val mean = (sum + 1 - p) / p
-        val variance = (sum + 1) * (1 - p) / (p * p)
-        val stdev = math.sqrt(variance)
-        val low = mean - confFactor * stdev
-        val high = mean + confFactor * stdev
-        result.put(key, new BoundedDouble(mean, confidence, low, high))
-      }
-      result.asScala
+      sums.map { case (key, sum) => (key, CountEvaluator.bound(confidence, sum, p)) }.toMap
     }
   }
 }
```
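These evaluators sit behind Spark's public approximate-count APIs: CountEvaluator backs RDD.countApprox, and GroupedCountEvaluator backs the per-key variants such as countByValueApprox. A hedged usage sketch in local mode (the timeout and confidence values are arbitrary):

```scala
import org.apache.spark.{SparkConf, SparkContext}

val sc = new SparkContext(new SparkConf().setAppName("approx-demo").setMaster("local[*]"))
val rdd = sc.parallelize(1 to 1000000, numSlices = 100)

// Ask for a count within 2 seconds at 95% confidence; the result is a
// PartialResult[BoundedDouble] whose bounds come from the code above.
val partialCount = rdd.countApprox(timeout = 2000, confidence = 0.95)
println(partialCount.initialValue)  // BoundedDouble: mean with [low, high] bounds

// Per-key variant, bounded key by key via CountEvaluator.bound
val partialByValue = rdd.map(_ % 10).countByValueApprox(timeout = 2000, confidence = 0.95)
println(partialByValue.initialValue)  // Map of value -> BoundedDouble

sc.stop()
```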

core/src/main/scala/org/apache/spark/partial/GroupedMeanEvaluator.scala

Lines changed: 0 additions & 80 deletions
This file was deleted.

core/src/main/scala/org/apache/spark/partial/GroupedSumEvaluator.scala

Lines changed: 0 additions & 88 deletions
This file was deleted.

core/src/main/scala/org/apache/spark/partial/MeanEvaluator.scala

Lines changed: 14 additions & 9 deletions
```diff
@@ -27,30 +27,35 @@ import org.apache.spark.util.StatCounter
 private[spark] class MeanEvaluator(totalOutputs: Int, confidence: Double)
   extends ApproximateEvaluator[StatCounter, BoundedDouble] {
 
-  var outputsMerged = 0
-  var counter = new StatCounter
+  private var outputsMerged = 0
+  private val counter = new StatCounter()
 
-  override def merge(outputId: Int, taskResult: StatCounter) {
+  override def merge(outputId: Int, taskResult: StatCounter): Unit = {
     outputsMerged += 1
     counter.merge(taskResult)
   }
 
   override def currentResult(): BoundedDouble = {
     if (outputsMerged == totalOutputs) {
       new BoundedDouble(counter.mean, 1.0, counter.mean, counter.mean)
-    } else if (outputsMerged == 0) {
+    } else if (outputsMerged == 0 || counter.count == 0) {
       new BoundedDouble(0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity)
+    } else if (counter.count == 1) {
+      new BoundedDouble(counter.mean, confidence, Double.NegativeInfinity, Double.PositiveInfinity)
     } else {
       val mean = counter.mean
       val stdev = math.sqrt(counter.sampleVariance / counter.count)
-      val confFactor = {
-        if (counter.count > 100) {
-          new NormalDistribution().inverseCumulativeProbability(1 - (1 - confidence) / 2)
+      val confFactor = if (counter.count > 100) {
+          // For large n, the normal distribution is a good approximation to t-distribution
+          new NormalDistribution().inverseCumulativeProbability((1 + confidence) / 2)
         } else {
+          // t-distribution describes distribution of actual population mean
+          // note that if this goes to 0, TDistribution will throw an exception.
+          // Hence special casing 1 above.
           val degreesOfFreedom = (counter.count - 1).toInt
-          new TDistribution(degreesOfFreedom).inverseCumulativeProbability(1 - (1 - confidence) / 2)
+          new TDistribution(degreesOfFreedom).inverseCumulativeProbability((1 + confidence) / 2)
         }
-      }
+      // Symmetric, so confidence interval is symmetric about mean of distribution
       val low = mean - confFactor * stdev
       val high = mean + confFactor * stdev
       new BoundedDouble(mean, confidence, low, high)
```
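The interval logic in isolation: for n observations with sample mean m and sample variance s², the two-sided interval is m ± q·√(s²/n), where q is the (1 + confidence)/2 quantile of t(n−1), or of N(0, 1) once n is large enough that the two nearly coincide. A sketch assuming only commons-math3 (the function name and signature are illustrative):

```scala
import org.apache.commons.math3.distribution.{NormalDistribution, TDistribution}

def meanInterval(n: Long, mean: Double, sampleVariance: Double, confidence: Double): (Double, Double) = {
  // A t-based interval needs n >= 2, hence the count == 1 special case in the diff above
  require(n >= 2, "need at least 2 observations")
  val stdev = math.sqrt(sampleVariance / n)
  val confFactor = if (n > 100) {
    // Large n: normal approximation to the t-distribution
    new NormalDistribution().inverseCumulativeProbability((1 + confidence) / 2)
  } else {
    // Small n: t-distribution with n - 1 degrees of freedom
    new TDistribution((n - 1).toDouble).inverseCumulativeProbability((1 + confidence) / 2)
  }
  (mean - confFactor * stdev, mean + confFactor * stdev)
}
```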

core/src/main/scala/org/apache/spark/partial/StudentTCacher.scala

Lines changed: 0 additions & 46 deletions
This file was deleted.
