addressed comments

brkyvz · brkyvz · commit 34b22e8ef94a · 2015-08-10T14:27:39.000-07:00
diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py
@@ -87,7 +87,10 @@ def _():
     'sum': 'Aggregate function: returns the sum of all values in the expression.',
     'avg': 'Aggregate function: returns the average of the values in a group.',
     'mean': 'Aggregate function: returns the average of the values in a group.',
-    'sumDistinct': 'Aggregate function: returns the sum of distinct values in the expression.',
+    'stddev': 'Aggregate function: returns the sample standard deviation in a group.',
+    'stddevSamp': 'Aggregate function: returns the sample standard deviation in a group.',
+    'stddevPop': 'Aggregate function: returns the population standard deviation in a group.',
+    'sumDistinct': 'Aggregate function: returns the sum of distinct values in the expression.'
 }
 
 _functions_1_4 = {
diff --git a/python/pyspark/sql/group.py b/python/pyspark/sql/group.py
@@ -154,6 +154,48 @@ def min(self, *cols):
         [Row(min(age)=2, min(height)=80)]
         """
 
+    @df_varargs_api
+    @since(1.5)
+    def stddev(self, *cols):
+        """Computes the sample standard deviation for each numeric column for each group.
+        Alias for :func:`stddevSamp`.
+
+        :param cols: list of column names (string). Non-numeric columns are ignored.
+
+        >>> df.groupBy().stddev('age').collect()
+        [Row(stddev_samp(age)=2.12...)]
+        >>> df3.groupBy().stddev('age', 'height').collect()
+        [Row(stddev_samp(age)=2.12..., stddev_samp(height)=3.53...)]
+        """
+
+    @df_varargs_api
+    @since(1.5)
+    def stddevPop(self, *cols):
+        """Computes the population standard deviation for each numeric column for each group.
+        Unlike :func:`stddevSamp`, the divisor is the group size n rather than n - 1.
+
+        :param cols: list of column names (string). Non-numeric columns are ignored.
+
+        >>> df.groupBy().stddevPop('age').collect()
+        [Row(stddev_pop(age)=1.5...)]
+        >>> df3.groupBy().stddevPop('age', 'height').collect()
+        [Row(stddev_pop(age)=1.5..., stddev_pop(height)=2.5...)]
+        """
+
+    @df_varargs_api
+    @since(1.5)
+    def stddevSamp(self, *cols):
+        """Computes the sample standard deviation for each numeric column for each group.
+        :func:`stddev` is an alias for this method.
+
+        :param cols: list of column names (string). Non-numeric columns are ignored.
+
+        >>> df.groupBy().stddevSamp('age').collect()
+        [Row(stddev_samp(age)=2.12...)]
+        >>> df3.groupBy().stddevSamp('age', 'height').collect()
+        [Row(stddev_samp(age)=2.12..., stddev_samp(height)=3.53...)]
+        """
+
     @df_varargs_api
     @since(1.3)
     def sum(self, *cols):
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/functions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/functions.scala
@@ -304,10 +304,11 @@ case class Sum(child: Expression) extends AlgebraicAggregate {
 }
 
 /**
- * Calculates the unbiased Standard Deviation using the online formula here:
+ * Calculates the Standard Deviation using the online formula here:
  * https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm
+ * If sample is true, then we will return the unbiased standard deviation.
  */
-case class StandardDeviation(child: Expression) extends AlgebraicAggregate {
+case class StandardDeviation(child: Expression, sample: Boolean) extends AlgebraicAggregate {
 
   override def children: Seq[Expression] = child :: Nil
 
@@ -388,8 +389,14 @@ case class StandardDeviation(child: Expression) extends AlgebraicAggregate {
   }
 
   override lazy val evaluateExpression = {
-    val count = If(EqualTo(currentCount, Cast(Literal(0L), LongType)),
-      currentCount, currentCount - Cast(Literal(1L), LongType))
+    val count =
+      if (sample) {
+        If(EqualTo(currentCount, Cast(Literal(0L), LongType)), currentCount,
+          currentCount - Cast(Literal(1L), LongType))
+      } else {
+        currentCount
+      }
+
     child.dataType match {
       case DecimalType.Fixed(p, s) =>
         // increase the precision and scale to prevent precision loss
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/utils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/utils.scala
@@ -170,11 +170,13 @@ object Utils {
    * format, but standard deviation uses the new format directly. We wrap it here in one place,
    * and use an Alias so that the column name looks pretty as well instead of a long identifier.
    */
-  def standardDeviation(e: Expression): Expression = {
+  def standardDeviation(e: Expression, sample: Boolean, name: String): Expression = {
     val std = aggregate.AggregateExpression2(
-      aggregateFunction = aggregate.StandardDeviation(e),
+      aggregateFunction = aggregate.StandardDeviation(e, sample),
       mode = aggregate.Complete,
       isDistinct = false)
-    Alias(std, s"std(${e.prettyString})")()
+    Alias(std, s"$name(${e.prettyString})")()
   }
+
+  def sampleStandardDeviation(e: Expression): Expression = standardDeviation(e, true, "stddev_samp")
 }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
@@ -1273,7 +1273,7 @@ class DataFrame private[sql](
     val statistics = List[(String, Expression => Expression)](
       "count" -> Count,
       "mean" -> Average,
-      "stddev" -> aggregate.Utils.standardDeviation,
+      "stddev" -> aggregate.Utils.sampleStandardDeviation,
       "min" -> Min,
       "max" -> Max)
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/GroupedData.scala b/sql/core/src/main/scala/org/apache/spark/sql/GroupedData.scala
@@ -291,8 +291,33 @@ class GroupedData protected[sql](
    * @since 1.5.0
    */
   @scala.annotation.varargs
-  def std(colNames: String*): DataFrame = {
-    aggregateNumericColumns(colNames : _*)(aggregate.Utils.standardDeviation)
+  def stddev(colNames: String*): DataFrame = {
+    stddevSamp(colNames : _*)
+  }
+
+  /**
+   * Compute the population standard deviation for each numeric column for each group.
+   * The resulting [[DataFrame]] will also contain the grouping columns.
+   * When column names are given, only compute it for them; results are named `stddev_pop(col)`.
+   *
+   * @since 1.5.0
+   */
+  @scala.annotation.varargs
+  def stddevPop(colNames: String*): DataFrame = {
+    aggregateNumericColumns(colNames : _*)(aggregate.Utils.standardDeviation(_, sample = false,
+      "stddev_pop"))
+  }
+
+  /**
+   * Compute the sample standard deviation for each numeric column for each group.
+   * The resulting [[DataFrame]] will also contain the grouping columns.
+   * When column names are given, only compute it for them; results are named `stddev_samp(col)`.
+   *
+   * @since 1.5.0
+   */
+  @scala.annotation.varargs
+  def stddevSamp(colNames: String*): DataFrame = {
+    aggregateNumericColumns(colNames : _*)(aggregate.Utils.sampleStandardDeviation)
   }
 
   /**
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
@@ -22,7 +22,6 @@ import scala.reflect.runtime.universe.{TypeTag, typeTag}
 import scala.util.Try
 
 import org.apache.spark.annotation.Experimental
-import org.apache.spark.sql.catalyst.expressions.aggregate.StandardDeviation
 import org.apache.spark.sql.catalyst.{SqlParser, ScalaReflection}
 import org.apache.spark.sql.catalyst.analysis.{UnresolvedFunction, Star}
 import org.apache.spark.sql.catalyst.expressions._
@@ -297,19 +296,53 @@ object functions {
 
   /**
    * Aggregate function: returns the sample standard deviation of the values in a group.
+   * Alias for stddevSamp.
    *
    * @group agg_funcs
    * @since 1.5.0
    */
-  def std(e: Column): Column = aggregate.Utils.standardDeviation(e.expr)
+  def stddev(e: Column): Column = stddevSamp(e)
 
   /**
    * Aggregate function: returns the sample standard deviation of the values in a group.
+   * Alias for stddevSamp.
    *
    * @group agg_funcs
    * @since 1.5.0
    */
-  def std(columnName: String): Column = std(Column(columnName))
+  def stddev(columnName: String): Column = stddev(Column(columnName))
+
+  /**
+   * Aggregate function: returns the population standard deviation of the values in a group.
+   *
+   * @group agg_funcs
+   * @since 1.5.0
+   */
+  def stddevPop(e: Column): Column = aggregate.Utils.standardDeviation(e.expr, false, "stddev_pop")
+
+  /**
+   * Aggregate function: returns the population standard deviation of the values in a group.
+   *
+   * @group agg_funcs
+   * @since 1.5.0
+   */
+  def stddevPop(columnName: String): Column = stddevPop(Column(columnName))
+
+  /**
+   * Aggregate function: returns the sample standard deviation of the values in a group.
+   *
+   * @group agg_funcs
+   * @since 1.5.0
+   */
+  def stddevSamp(e: Column): Column = aggregate.Utils.sampleStandardDeviation(e.expr)
+
+  /**
+   * Aggregate function: returns the sample standard deviation of the values in a group.
+   *
+   * @group agg_funcs
+   * @since 1.5.0
+   */
+  def stddevSamp(columnName: String): Column = stddevSamp(Column(columnName))
 
   /**
    * Aggregate function: returns the sum of all values in the expression.
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala
@@ -116,8 +116,6 @@ object QueryTest {
         Row.fromSeq(s.toSeq.map {
           case d: java.math.BigDecimal => BigDecimal(d)
           case b: Array[Byte] => b.toSeq
-          case d: Double if !d.isNaN && !d.isInfinity => 
-            BigDecimal(d).setScale(10, BigDecimal.RoundingMode.HALF_UP)
           case o => o
         })
       }
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/AggregationQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/AggregationQuerySuite.scala
@@ -22,7 +22,7 @@ import org.apache.spark.sql.hive.test.TestHive
 import org.apache.spark.sql.test.SQLTestUtils
 import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
 import org.apache.spark.sql._
-import org.apache.spark.sql.functions.std
+import org.apache.spark.sql.functions.{stddev, stddevPop}
 import org.scalatest.BeforeAndAfterAll
 import _root_.test.org.apache.spark.sql.hive.aggregate.{MyDoubleAvg, MyDoubleSum}
 
@@ -285,30 +285,63 @@ abstract class AggregationQuerySuite extends QueryTest with SQLTestUtils with Be
       Row(11.125) :: Nil)
   }
 
+  /** For resilience against rounding mismatches. */
+  private def about(d: Double): BigDecimal = BigDecimal(d).setScale(10, BigDecimal.RoundingMode.HALF_UP)
+
   test("test standard deviation") {
     // All results generated in R. Comparisons will be performed up to 10 digits of precision.
     val df = Seq.tabulate(10)(i => (i, 1)).toDF("val", "key")
     checkAnswer(
-      df.select(std("val")),
-      Row(3.0276503540974917) :: Nil)
+      df.select(stddev("val").cast("decimal(12, 10)")),
+      Row(about(3.0276503540974917)) :: Nil)
+
+    checkAnswer(
+      df.select(stddevPop("val").cast("decimal(12, 10)")),
+      Row(about(2.8722813232690148)) :: Nil)
+
+    checkAnswer(
+      sqlContext.table("agg1").groupBy("key").stddev("value")
+        .select($"key", $"stddev_samp(value)".cast("decimal(12, 10)")),
+      Row(1, about(10.0)) :: Row(2, about(0.7071067811865476)) :: Row(3, null) ::
+        Row(null, about(81.8535277187245)) :: Nil)
+
+    checkAnswer(
+      sqlContext.table("agg1").groupBy("key").stddevPop("value")
+        .select($"key", $"stddev_pop(value)".cast("decimal(12, 10)")),
+      Row(1, about(8.16496580927726)) :: Row(2, about(0.5)) :: Row(3, null) ::
+        Row(null, about(66.83312551921139)) :: Nil)
 
     checkAnswer(
-      sqlContext.table("agg1").groupBy("key").std("value"),
-      Row(1, 10.0) :: Row(2, 0.7071067811865476) :: Row(3, null) ::
-        Row(null, 81.8535277187245) :: Nil)
+      sqlContext.table("agg1").select(stddev("key").cast("decimal(12, 10)"),
+        stddev("value").cast("decimal(12, 10)")),
+      Row(about(0.7817359599705717), about(44.898098909801135)) :: Nil)
 
     checkAnswer(
-      sqlContext.table("agg1").select(std("key"), std("value")),
-      Row(0.7817359599705717, 44.898098909801135) :: Nil)
+      sqlContext.table("agg1").select(stddevPop("key").cast("decimal(12, 10)"),
+        stddevPop("value").cast("decimal(12, 10)")),
+      Row(about(0.7370277311900889), about(41.99832585949111)) :: Nil)
 
     checkAnswer(
-      sqlContext.table("agg2").groupBy("key", "value1").std("value2"),
-      Row(1, 10, null) :: Row(1, 30, 42.42640687119285) :: Row(2, -1, null) ::
-        Row(2, 1, 0.0) :: Row(2, null, null) :: Row(3, null, null) :: Row(null, -10, null) ::
+      sqlContext.table("agg2").groupBy("key", "value1").stddev("value2")
+        .select($"key", $"value1", $"stddev_samp(value2)".cast("decimal(12, 10)")),
+      Row(1, 10, null) :: Row(1, 30, about(42.42640687119285)) :: Row(2, -1, null) ::
+        Row(2, 1, about(0.0)) :: Row(2, null, null) :: Row(3, null, null) :: Row(null, -10, null) ::
         Row(null, -60, null) :: Row(null, 100, null) :: Row(null, null, null) :: Nil)
 
     checkAnswer(
-      sqlContext.table("emptyTable").select(std("value")),
+      sqlContext.table("agg2").groupBy("key", "value1").stddevPop("value2")
+        .select($"key", $"value1", $"stddev_pop(value2)".cast("decimal(12, 10)")),
+      Row(1, 10, about(0.0)) :: Row(1, 30, about(30.0)) :: Row(2, -1, null) ::
+        Row(2, 1, about(0.0)) :: Row(2, null, about(0.0)) :: Row(3, null, about(0.0)) ::
+        Row(null, -10, about(0.0)) :: Row(null, -60, about(0.0)) :: Row(null, 100, about(0.0)) :: 
+        Row(null, null, null) :: Nil)
+
+    checkAnswer(
+      sqlContext.table("emptyTable").select(stddev("value")),
+      Row(null) :: Nil)
+
+    checkAnswer(
+      sqlContext.table("emptyTable").select(stddevPop("value")),
       Row(null) :: Nil)
   }
 

Original file line number	Diff line number	Diff line change
`@@ -116,8 +116,6 @@ object QueryTest {`
`116`	`116`	`Row.fromSeq(s.toSeq.map {`
`117`	`117`	`case d: java.math.BigDecimal => BigDecimal(d)`
`118`	`118`	`case b: Array[Byte] => b.toSeq`
`119`		`- case d: Double if !d.isNaN && !d.isInfinity =>`
`120`		`- BigDecimal(d).setScale(10, BigDecimal.RoundingMode.HALF_UP)`
`121`	`119`	`case o => o`
`122`	`120`	`})`
`123`	`121`	`}`