
Commit e3b0b85

committed: addressed comments v0.1
1 parent a7115f1

File tree: 4 files changed, +16 -25 lines


python/pyspark/sql/__init__.py

Lines changed: 3 additions & 2 deletions
@@ -53,8 +53,9 @@
 
 from pyspark.sql.types import Row
 from pyspark.sql.context import SQLContext, HiveContext
-from pyspark.sql.dataframe import DataFrame, GroupedData, Column, SchemaRDD, DataFrameNaFunctions
+from pyspark.sql.dataframe import DataFrame, GroupedData, Column, SchemaRDD, DataFrameNaFunctions, DataFrameStatFunctions
 
 __all__ = [
-    'SQLContext', 'HiveContext', 'DataFrame', 'GroupedData', 'Column', 'Row', 'DataFrameNaFunctions'
+    'SQLContext', 'HiveContext', 'DataFrame', 'GroupedData', 'Column', 'Row',
+    'DataFrameNaFunctions', 'DataFrameStatFunctions'
 ]
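
With DataFrameStatFunctions added to __all__, the class becomes part of the package's public surface. A hypothetical one-line smoke test (assuming this build of PySpark is on the path):

# The new name is now importable from the package root alongside the others.
from pyspark.sql import DataFrame, DataFrameNaFunctions, DataFrameStatFunctions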

python/pyspark/sql/dataframe.py

Lines changed: 4 additions & 11 deletions
@@ -877,12 +877,11 @@ def fillna(self, value, subset=None):
 
     def cov(self, col1, col2):
         """
-        Calculate the covariance for the given columns, specified by their names.
-        alias for ``stat.cov()``.
+        Calculate the covariance for the given columns, specified by their names as a double value.
+        :func:`DataFrame.cov` and :func:`DataFrameStatFunctions.cov` are aliases.
 
         :param col1: The name of the first column
         :param col2: The name of the second column
-        :return: the covariance of the columns
         """
         return self.stat.cov(col1, col2)
 
@@ -1337,19 +1336,13 @@ def __init__(self, df):
         self.df = df
 
     def cov(self, col1, col2):
-        """
-        Calculate the covariance for the given columns, specified by their names.
-
-        :param col1: The name of the first column
-        :param col2: The name of the second column
-        :return: the covariance of the columns
-        """
         if not isinstance(col1, str):
             raise ValueError("col1 should be a string.")
         if not isinstance(col2, str):
             raise ValueError("col2 should be a string.")
         return self.df._jdf.stat().cov(col1, col2)
-
+
+    cov.__doc__ = DataFrame.cov.__doc__
 
 def _test():
     import doctest
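
The cov.__doc__ = DataFrame.cov.__doc__ assignment keeps one source of truth for the documentation: DataFrame.cov stays a thin delegate to stat.cov(), and help() shows identical text for both entry points. A standalone sketch of the pattern (class names are illustrative, not from PySpark):

class Frame(object):
    def cov(self, col1, col2):
        """Calculate the covariance for the given columns.

        Frame.cov and Stats.cov are aliases.
        """
        return self.stat.cov(col1, col2)

class Stats(object):
    def cov(self, col1, col2):
        return 0.0  # the real computation would live here

    # Borrow the public docstring so both entry points document identically.
    cov.__doc__ = Frame.cov.__doc__

print(Stats.cov.__doc__ == Frame.cov.__doc__)  # True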

sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala

Lines changed: 9 additions & 10 deletions
@@ -17,8 +17,9 @@
 
 package org.apache.spark.sql.execution.stat
 
+import org.apache.spark.sql.catalyst.expressions.Cast
 import org.apache.spark.sql.{Column, DataFrame}
-import org.apache.spark.sql.types.NumericType
+import org.apache.spark.sql.types.{DoubleType, NumericType}
 
 private[sql] object StatFunctions {
 
@@ -29,15 +30,12 @@ private[sql] object StatFunctions {
     var Ck = 0.0
     var count = 0
     // add an example to the calculation
-    def add(x: Number, y: Number): this.type = {
+    def add(x: Double, y: Double): this.type = {
       val oldX = xAvg
-      val otherX = x.doubleValue()
-      val otherY = y.doubleValue()
       count += 1
-      xAvg += (otherX - xAvg) / count
-      yAvg += (otherY - yAvg) / count
-      println(oldX)
-      Ck += (otherY - yAvg) * (otherX - oldX)
+      xAvg += (x - xAvg) / count
+      yAvg += (y - yAvg) / count
+      Ck += (y - yAvg) * (x - oldX)
       this
     }
     // merge counters from other partitions
@@ -68,9 +66,10 @@ private[sql] object StatFunctions {
       require(data.get.dataType.isInstanceOf[NumericType], "Covariance calculation for columns " +
        s"with dataType ${data.get.dataType} not supported.")
     }
-    val counts = df.select(cols.map(Column(_)):_*).rdd.aggregate(new CovarianceCounter)(
+    val columns = cols.map(n => Column(Cast(Column(n).expr, DoubleType)))
+    val counts = df.select(columns:_*).rdd.aggregate(new CovarianceCounter)(
       seqOp = (counter, row) => {
-        counter.add(row.getAs[Number](0), row.getAs[Number](1))
+        counter.add(row.getDouble(0), row.getDouble(1))
       },
       combOp = (baseCounter, other) => {
         baseCounter.merge(other)
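
The rewritten add() is the textbook single-pass co-moment update: stash the previous x mean, advance both running means, then accumulate Ck from the updated y mean and the old x mean. Casting the input columns to DoubleType up front is what lets the seqOp call row.getDouble directly instead of boxing through Number. A minimal standalone Python sketch of the same arithmetic (an illustration, not the Spark class; the merge body is the standard pairwise formula and is an assumption here, since the diff only shows the "// merge counters from other partitions" comment):

# Standalone sketch of the CovarianceCounter update (illustration only).
class CovarianceCounter(object):
    def __init__(self):
        self.xAvg = 0.0  # running mean of the first column
        self.yAvg = 0.0  # running mean of the second column
        self.Ck = 0.0    # running co-moment: sum of (x - xAvg) * (y - yAvg)
        self.count = 0   # observations folded in so far

    def add(self, x, y):
        # Same order of operations as the Scala add(): remember the old x
        # mean, bump the count, update both means, then update Ck from the
        # *updated* yAvg and the *old* xAvg.
        old_x = self.xAvg
        self.count += 1
        self.xAvg += (x - self.xAvg) / self.count
        self.yAvg += (y - self.yAvg) / self.count
        self.Ck += (y - self.yAvg) * (x - old_x)
        return self

    def merge(self, other):
        # Pairwise merge for per-partition counters -- assumed standard
        # formula; the body is not shown in this hunk.
        total = float(self.count + other.count)
        self.Ck += other.Ck + ((self.xAvg - other.xAvg) *
                               (self.yAvg - other.yAvg) *
                               self.count * other.count / total)
        self.xAvg = (self.xAvg * self.count + other.xAvg * other.count) / total
        self.yAvg = (self.yAvg * self.count + other.yAvg * other.count) / total
        self.count = int(total)
        return self

c = CovarianceCounter()
for i in range(10):
    c.add(float(i), 2.0 * i)  # the same pairs the test suite tabulates below
print(c.Ck / c.count)         # ~16.5, within the 1e-6 tolerance the test uses

Dividing Ck by count yields the population covariance, which is what the updated test below expects (16.5 for these pairs).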

sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala

Lines changed: 0 additions & 2 deletions
@@ -49,10 +49,8 @@ class DataFrameStatSuite extends FunSuite {
   test("covariance") {
     val rows = Array.tabulate(10)(i => (i, 2.0 * i, toLetter(i)))
     val df = sqlCtx.sparkContext.parallelize(rows).toDF("singles", "doubles", "letters")
-    df.show()
 
     val results = df.stat.cov("singles", "doubles")
-    println(results)
     assert(math.abs(results - 16.5) < 1e-6)
     intercept[IllegalArgumentException] {
       df.stat.cov("singles", "letters") // doesn't accept non-numerical dataTypes
