Address comments and change Python API too.

viirya · viirya · commit 75edcb1a657e · 2016-04-07T10:35:15.000Z
diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py
@@ -1162,7 +1162,7 @@ def replace(self, to_replace, value, subset=None):
             self._jdf.na().replace(self._jseq(subset), self._jmap(rep_dict)), self.sql_ctx)
 
     @since(2.0)
-    def approxQuantile(self, col, probabilities, relativeError):
+    def approxQuantile(self, cols, probabilities, relativeError):
         """
         Calculates the approximate quantiles of a numerical column of a
         DataFrame.
@@ -1181,7 +1181,7 @@ def approxQuantile(self, col, probabilities, relativeError):
         Space-efficient Online Computation of Quantile Summaries]]
         by Greenwald and Khanna.
 
-        :param col: the name of the numerical column
+        :param cols: the name(s) of the numerical column(s)
         :param probabilities: a list of quantile probabilities
           Each number must belong to [0, 1].
           For example 0 is the minimum, 0.5 is the median, 1 is the maximum.
@@ -1191,8 +1191,13 @@ def approxQuantile(self, col, probabilities, relativeError):
           accepted but give the same result as 1.
         :return:  the approximate quantiles at the given probabilities
         """
-        if not isinstance(col, str):
-            raise ValueError("col should be a string.")
+        if not isinstance(cols, (str, list, tuple)):
+            raise ValueError("col should be a string, list or tuple.")
+
+        if isinstance(cols, tuple):
+            cols = list(cols)
+        if isinstance(cols, list):
+            cols = _to_list(self._sc, cols)
 
         if not isinstance(probabilities, (list, tuple)):
             raise ValueError("probabilities should be a list or tuple")
@@ -1207,8 +1212,12 @@ def approxQuantile(self, col, probabilities, relativeError):
             raise ValueError("relativeError should be numerical (float, int, long) >= 0.")
         relativeError = float(relativeError)
 
-        jaq = self._jdf.stat().approxQuantile(col, probabilities, relativeError)
-        return list(jaq)
+        jaq = self._jdf.stat().approxQuantile(cols, probabilities, relativeError)
+        jaq = list(jaq)
+        for idx, a in enumerate(jaq):
+            if not isinstance(a, (list, float)):
+                jaq[idx] = list(a)
+        return jaq
 
     @since(1.4)
     def corr(self, col1, col2, method=None):
@@ -1440,8 +1449,8 @@ class DataFrameStatFunctions(object):
     def __init__(self, df):
         self.df = df
 
-    def approxQuantile(self, col, probabilities, relativeError):
-        return self.df.approxQuantile(col, probabilities, relativeError)
+    def approxQuantile(self, cols, probabilities, relativeError):
+        return self.df.approxQuantile(cols, probabilities, relativeError)
 
     approxQuantile.__doc__ = DataFrame.approxQuantile.__doc__
 
diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py
@@ -702,6 +702,14 @@ def test_approxQuantile(self):
         self.assertEqual(len(aq), 3)
         self.assertTrue(all(isinstance(q, float) for q in aq))
 
+        aqs = df.stat.approxQuantile(["a", "a"], [0.1, 0.5, 0.9], 0.1)
+        self.assertTrue(isinstance(aqs[0], list))
+        self.assertEqual(len(aqs[0]), 3)
+        self.assertTrue(all(isinstance(q, float) for q in aqs[0]))
+        self.assertTrue(isinstance(aqs[1], list))
+        self.assertEqual(len(aqs[1]), 3)
+        self.assertTrue(all(isinstance(q, float) for q in aqs[1]))
+
     def test_corr(self):
         import math
         df = self.sc.parallelize([Row(a=i, b=math.sqrt(i)) for i in range(10)]).toDF()
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala
@@ -72,6 +72,7 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) {
 
   /**
    * Calculates the approximate quantiles of numerical columns of a DataFrame.
+   * @see the single-column approxQuantile overload for a detailed description.
    *
    * @param cols the names of the numerical columns.
    * @param probabilities a list of quantile probabilities
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala
@@ -149,19 +149,15 @@ class DataFrameStatSuite extends QueryTest with SharedSQLContext {
       assert(math.abs(s2 - q2 * n) < error_single)
       assert(math.abs(d1 - 2 * q1 * n) < error_double)
       assert(math.abs(d2 - 2 * q2 * n) < error_double)
-    }
-
-    for (epsilon <- epsilons) {
-      val Array(Array(s1, s2), Array(d1, d2)) = df.stat.approxQuantile(Array("singles", "doubles"),
-        Array(q1, q2), epsilon)
 
-      val error_single = 2 * 1000 * epsilon
-      val error_double = 2 * 2000 * epsilon
+      // Multiple columns
+      val Array(Array(ms1, ms2), Array(md1, md2)) =
+        df.stat.approxQuantile(Array("singles", "doubles"), Array(q1, q2), epsilon)
 
-      assert(math.abs(s1 - q1 * n) < error_single)
-      assert(math.abs(s2 - q2 * n) < error_single)
-      assert(math.abs(d1 - 2 * q1 * n) < error_double)
-      assert(math.abs(d2 - 2 * q2 * n) < error_double)
+      assert(math.abs(ms1 - q1 * n) < error_single)
+      assert(math.abs(ms2 - q2 * n) < error_single)
+      assert(math.abs(md1 - 2 * q1 * n) < error_double)
+      assert(math.abs(md2 - 2 * q2 * n) < error_double)
     }
   }
 

Original file line number	Diff line number	Diff line change
`@@ -72,6 +72,7 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) {`
`72`	`72`
`73`	`73`	`/**`
`74`	`74`	`* Calculates the approximate quantiles of numerical columns of a DataFrame.`
	`75`	`+ * @see the single-column approxQuantile overload for a detailed description.`
`75`	`76`	`*`
`76`	`77`	`* @param cols the names of the numerical columns.`
`77`	`78`	`* @param probabilities a list of quantile probabilities`