From 4d42cfdd3c656d0e42155ad65c42be072f1d4d9c Mon Sep 17 00:00:00 2001 From: Zheng RuiFeng Date: Thu, 2 Feb 2017 17:39:02 +0800 Subject: [PATCH 01/14] create pr --- .../spark/sql/DataFrameStatFunctions.scala | 7 ++- .../sql/execution/stat/StatFunctions.scala | 4 +- .../apache/spark/sql/DataFrameStatSuite.scala | 53 ++++++++++++++++--- 3 files changed, 51 insertions(+), 13 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala index 2b782fd75c6a..3fd6eed64015 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala @@ -80,13 +80,12 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) { * @see [[DataFrameStatsFunctions.approxQuantile(col:Str* approxQuantile]] for * detailed description. * - * Note that rows containing any null or NaN values values will be removed before - * calculation. + * Note that rows containing any null or NaN values will be removed before calculation. * @param cols the names of the numerical columns * @param probabilities a list of quantile probabilities * Each number must belong to [0, 1]. * For example 0 is the minimum, 0.5 is the median, 1 is the maximum. - * @param relativeError The relative target precision to achieve (>= 0). + * @param relativeError The relative target precision to achieve (greater or equal to 0). * If set to zero, the exact quantiles are computed, which could be very expensive. * Note that values greater than 1 are accepted but give the same result as 1. 
* @return the approximate quantiles at the given probabilities of each column @@ -112,7 +111,7 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) { probabilities: List[Double], relativeError: Double): java.util.List[java.util.List[Double]] = { approxQuantile(cols.toArray, probabilities.toArray, relativeError) - .map(_.toList.asJava).toList.asJava + .map(_.toList.asJava).toList.asJava } /** diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala index 2b2e706125ed..2a1755e2ad76 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala @@ -49,7 +49,7 @@ object StatFunctions extends Logging { * @param probabilities a list of quantile probabilities * Each number must belong to [0, 1]. * For example 0 is the minimum, 0.5 is the median, 1 is the maximum. - * @param relativeError The relative target precision to achieve (>= 0). + * @param relativeError The relative target precision to achieve (greater or equal 0). * If set to zero, the exact quantiles are computed, which could be very expensive. * Note that values greater than 1 are accepted but give the same result as 1. 
* @@ -60,6 +60,8 @@ object StatFunctions extends Logging { cols: Seq[String], probabilities: Seq[Double], relativeError: Double): Seq[Seq[Double]] = { + require(relativeError >= 0, + s"Relative Error must be non-negative but got $relativeError") val columns: Seq[Column] = cols.map { colName => val field = df.schema(colName) require(field.dataType.isInstanceOf[NumericType], diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala index f52b18e27b5f..a2de2556b285 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala @@ -25,7 +25,7 @@ import org.apache.spark.internal.Logging import org.apache.spark.sql.execution.stat.StatFunctions import org.apache.spark.sql.functions.col import org.apache.spark.sql.test.SharedSQLContext -import org.apache.spark.sql.types.DoubleType +import org.apache.spark.sql.types.{DoubleType, StructField, StructType} class DataFrameStatSuite extends QueryTest with SharedSQLContext { import testImplicits._ @@ -159,16 +159,53 @@ class DataFrameStatSuite extends QueryTest with SharedSQLContext { assert(math.abs(md1 - 2 * q1 * n) < error_double) assert(math.abs(md2 - 2 * q2 * n) < error_double) } - // test approxQuantile on NaN values - val dfNaN = Seq(Double.NaN, 1.0, Double.NaN, Double.NaN).toDF("input") - val resNaN = dfNaN.stat.approxQuantile("input", Array(q1, q2), epsilons.head) + + // test relativeError greater than 1 return the same result as 1 + val Array(single1_1) = df.stat.approxQuantile("singles", Array(q1), 1.0) + val Array(s1_1, s2_1) = df.stat.approxQuantile("singles", Array(q1, q2), 1.0) + val Array(Array(ms1_1, ms2_1), Array(md1_1, md2_1)) = + df.stat.approxQuantile(Array("singles", "doubles"), Array(q1, q2), 1.0) + for (epsilon <- Seq(2.0, 100.0)) { + val Array(single1) = df.stat.approxQuantile("singles", Array(q1), epsilon) + val 
Array(s1, s2) = df.stat.approxQuantile("singles", Array(q1, q2), epsilon) + val Array(Array(ms1, ms2), Array(md1, md2)) = + df.stat.approxQuantile(Array("singles", "doubles"), Array(q1, q2), epsilon) + assert(single1_1 === single1) + assert(s1_1 === s1) + assert(s2_1 === s2) + assert(ms1_1 === ms1) + assert(ms2_1 === ms2) + assert(md1_1 === md1) + assert(md2_1 === md2) + } + + // quantile should be in the range [0.0, 1.0] + val e: IllegalArgumentException = intercept[IllegalArgumentException] { + df.stat.approxQuantile(Array("singles", "doubles"), Array(q1, q2, -0.1), epsilons.head) + } + assert(e.getMessage.contains("quantile should be in the range [0.0, 1.0]")) + + // relativeError should be non-negative + val e2: IllegalArgumentException = intercept[IllegalArgumentException] { + df.stat.approxQuantile(Array("singles", "doubles"), Array(q1, q2), -1.0) + } + assert(e2.getMessage.contains("Relative Error must be non-negative")) + + // test approxQuantile on NaN and null values + val rows = spark.sparkContext.parallelize(Seq(Row(Double.NaN, 1.0), Row(1.0, 1.0), + Row(-1.0, Double.NaN), Row(Double.NaN, Double.NaN), Row(null, null), Row(null, 1.0), + Row(-1.0, null), Row(Double.NaN, null))) + val schema = StructType(Seq(StructField("input1", DoubleType, nullable = true), + StructField("input2", DoubleType, nullable = true))) + val dfNaN = spark.createDataFrame(rows, schema) + val resNaN = dfNaN.stat.approxQuantile("input1", Array(q1, q2), epsilons.head) assert(resNaN.count(_.isNaN) === 0) - // test approxQuantile on multi-column NaN values - val dfNaN2 = Seq((Double.NaN, 1.0), (1.0, 1.0), (-1.0, Double.NaN), (Double.NaN, Double.NaN)) - .toDF("input1", "input2") - val resNaN2 = dfNaN2.stat.approxQuantile(Array("input1", "input2"), + assert(resNaN.count(_ == null) === 0) + + val resNaN2 = dfNaN.stat.approxQuantile(Array("input1", "input2"), Array(q1, q2), epsilons.head) assert(resNaN2.flatten.count(_.isNaN) === 0) + assert(resNaN2.flatten.count(_ == null) === 0) } 
test("crosstab") { From f259543ddc65f54d43f72b7b245d925f98a9fa97 Mon Sep 17 00:00:00 2001 From: Zheng RuiFeng Date: Thu, 2 Feb 2017 17:58:55 +0800 Subject: [PATCH 02/14] update doc --- .../scala/org/apache/spark/sql/DataFrameStatFunctions.scala | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala index 3fd6eed64015..66b581e5e749 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala @@ -63,7 +63,7 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) { * Note that values greater than 1 are accepted but give the same result as 1. * @return the approximate quantiles at the given probabilities * - * @note NaN values will be removed from the numerical column before calculation + * @note null and NaN values will be removed from the numerical column before calculation * * @since 2.0.0 */ @@ -80,7 +80,6 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) { * @see [[DataFrameStatsFunctions.approxQuantile(col:Str* approxQuantile]] for * detailed description. * - * Note that rows containing any null or NaN values will be removed before calculation. * @param cols the names of the numerical columns * @param probabilities a list of quantile probabilities * Each number must belong to [0, 1]. @@ -90,7 +89,7 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) { * Note that values greater than 1 are accepted but give the same result as 1. 
* @return the approximate quantiles at the given probabilities of each column * - * @note Rows containing any NaN values will be removed before calculation + * @note Rows containing any null or NaN values will be removed before calculation * * @since 2.2.0 */ From 5f456b22c1268a31d945f8e0a1fadc66ee6d31ae Mon Sep 17 00:00:00 2001 From: Zheng RuiFeng Date: Fri, 3 Feb 2017 16:49:03 +0800 Subject: [PATCH 03/14] update tests --- .../apache/spark/sql/DataFrameStatSuite.scala | 47 ++++++++++++------- 1 file changed, 30 insertions(+), 17 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala index a2de2556b285..7779c19c5cf9 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala @@ -160,12 +160,33 @@ class DataFrameStatSuite extends QueryTest with SharedSQLContext { assert(math.abs(md2 - 2 * q2 * n) < error_double) } - // test relativeError greater than 1 return the same result as 1 + // quantile should be in the range [0.0, 1.0] + val e = intercept[IllegalArgumentException] { + df.stat.approxQuantile(Array("singles", "doubles"), Array(q1, q2, -0.1), epsilons.head) + } + assert(e.getMessage.contains("quantile should be in the range [0.0, 1.0]")) + + // relativeError should be non-negative + val e2 = intercept[IllegalArgumentException] { + df.stat.approxQuantile(Array("singles", "doubles"), Array(q1, q2), -1.0) + } + assert(e2.getMessage.contains("Relative Error must be non-negative")) + } + + test("approximate quantile 2: test relativeError greater than 1 return the same result as 1") { + val n = 1000 + val df = Seq.tabulate(n)(i => (i, 2.0 * i)).toDF("singles", "doubles") + + val q1 = 0.5 + val q2 = 0.8 + val epsilons = List(2.0, 5.0, 100.0) + val Array(single1_1) = df.stat.approxQuantile("singles", Array(q1), 1.0) val Array(s1_1, s2_1) = 
df.stat.approxQuantile("singles", Array(q1, q2), 1.0) val Array(Array(ms1_1, ms2_1), Array(md1_1, md2_1)) = df.stat.approxQuantile(Array("singles", "doubles"), Array(q1, q2), 1.0) - for (epsilon <- Seq(2.0, 100.0)) { + + for (epsilon <- epsilons) { val Array(single1) = df.stat.approxQuantile("singles", Array(q1), epsilon) val Array(s1, s2) = df.stat.approxQuantile("singles", Array(q1, q2), epsilon) val Array(Array(ms1, ms2), Array(md1, md2)) = @@ -178,32 +199,24 @@ class DataFrameStatSuite extends QueryTest with SharedSQLContext { assert(md1_1 === md1) assert(md2_1 === md2) } + } - // quantile should be in the range [0.0, 1.0] - val e: IllegalArgumentException = intercept[IllegalArgumentException] { - df.stat.approxQuantile(Array("singles", "doubles"), Array(q1, q2, -0.1), epsilons.head) - } - assert(e.getMessage.contains("quantile should be in the range [0.0, 1.0]")) - - // relativeError should be non-negative - val e2: IllegalArgumentException = intercept[IllegalArgumentException] { - df.stat.approxQuantile(Array("singles", "doubles"), Array(q1, q2), -1.0) - } - assert(e2.getMessage.contains("Relative Error must be non-negative")) - - // test approxQuantile on NaN and null values + test("approximate quantile 3: test on NaN and null values") { + val q1 = 0.5 + val q2 = 0.8 + val epsilon = 0.1 val rows = spark.sparkContext.parallelize(Seq(Row(Double.NaN, 1.0), Row(1.0, 1.0), Row(-1.0, Double.NaN), Row(Double.NaN, Double.NaN), Row(null, null), Row(null, 1.0), Row(-1.0, null), Row(Double.NaN, null))) val schema = StructType(Seq(StructField("input1", DoubleType, nullable = true), StructField("input2", DoubleType, nullable = true))) val dfNaN = spark.createDataFrame(rows, schema) - val resNaN = dfNaN.stat.approxQuantile("input1", Array(q1, q2), epsilons.head) + val resNaN = dfNaN.stat.approxQuantile("input1", Array(q1, q2), epsilon) assert(resNaN.count(_.isNaN) === 0) assert(resNaN.count(_ == null) === 0) val resNaN2 = dfNaN.stat.approxQuantile(Array("input1", 
"input2"), - Array(q1, q2), epsilons.head) + Array(q1, q2), epsilon) assert(resNaN2.flatten.count(_.isNaN) === 0) assert(resNaN2.flatten.count(_ == null) === 0) } From cf0b8088640fde5bf96e90d6a0e75065a0314e0e Mon Sep 17 00:00:00 2001 From: Zheng RuiFeng Date: Sat, 4 Feb 2017 15:58:46 +0800 Subject: [PATCH 04/14] update tests --- .../scala/org/apache/spark/sql/DataFrameStatFunctions.scala | 3 +-- .../scala/org/apache/spark/sql/DataFrameStatSuite.scala | 6 ++++++ 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala index 66b581e5e749..ea63a0c731e6 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala @@ -77,8 +77,7 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) { /** * Calculates the approximate quantiles of numerical columns of a DataFrame. - * @see [[DataFrameStatsFunctions.approxQuantile(col:Str* approxQuantile]] for - * detailed description. + * @see `DataFrameStatsFunctions.approxQuantile` for detailed description. 
 * * @param cols the names of the numerical columns * @param probabilities a list of quantile probabilities diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala index 7779c19c5cf9..41d88843afc6 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala @@ -171,6 +171,12 @@ class DataFrameStatSuite extends QueryTest with SharedSQLContext { df.stat.approxQuantile(Array("singles", "doubles"), Array(q1, q2), -1.0) } assert(e2.getMessage.contains("Relative Error must be non-negative")) + + // dataset should be non-empty + intercept[NoSuchElementException] { + df.selectExpr("*").limit(0) + .stat.approxQuantile(Array("singles", "doubles"), Array(q1, q2), epsilons.head) + } } test("approximate quantile 2: test relativeError greater than 1 return the same result as 1") { From 9a8fc1e5d141e00c8775855e9f9cd3f07f7905d6 Mon Sep 17 00:00:00 2001 From: Zheng RuiFeng Date: Sun, 5 Feb 2017 17:19:44 +0800 Subject: [PATCH 05/14] return null for empty input --- .../spark/sql/DataFrameStatFunctions.scala | 16 ++++++++++++---- .../apache/spark/sql/DataFrameStatSuite.scala | 11 +++++++---- 2 files changed, 19 insertions(+), 8 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala index ea63a0c731e6..e6f238eacd21 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala @@ -71,8 +71,12 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) { col: String, probabilities: Array[Double], relativeError: Double): Array[Double] = { - StatFunctions.multipleApproxQuantiles(df.select(col).na.drop(), - Seq(col), probabilities, relativeError).head.toArray + val res = 
approxQuantile(Array(col), probabilities, relativeError) + if (res != null) { + res.head + } else { + null + } } /** @@ -96,8 +100,12 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) { cols: Array[String], probabilities: Array[Double], relativeError: Double): Array[Array[Double]] = { - StatFunctions.multipleApproxQuantiles(df.select(cols.map(col): _*).na.drop(), cols, - probabilities, relativeError).map(_.toArray).toArray + try { + StatFunctions.multipleApproxQuantiles(df.select(cols.map(col): _*).na.drop(), cols, + probabilities, relativeError).map(_.toArray).toArray + } catch { + case e: NoSuchElementException => null + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala index 41d88843afc6..a05e7ce47d84 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala @@ -173,10 +173,13 @@ class DataFrameStatSuite extends QueryTest with SharedSQLContext { assert(e2.getMessage.contains("Relative Error must be non-negative")) // dataset should be non-empty - intercept[NoSuchElementException] { - df.selectExpr("*").limit(0) - .stat.approxQuantile(Array("singles", "doubles"), Array(q1, q2), epsilons.head) - } + val res1 = df.selectExpr("*").limit(0) + .stat.approxQuantile("singles", Array(q1, q2), epsilons.head) + assert(res1 === null) + + val res2 = df.selectExpr("*").limit(0) + .stat.approxQuantile(Array("singles", "doubles"), Array(q1, q2), epsilons.head) + assert(res2 === null) } test("approximate quantile 2: test relativeError greater than 1 return the same result as 1") { From 434570958ba35426ed5e733bdea063d3cc7bdf51 Mon Sep 17 00:00:00 2001 From: Zheng RuiFeng Date: Sun, 5 Feb 2017 17:20:20 +0800 Subject: [PATCH 06/14] return null for empty input --- .../test/scala/org/apache/spark/sql/DataFrameStatSuite.scala | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala index a05e7ce47d84..d0910e618a04 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala @@ -172,7 +172,7 @@ class DataFrameStatSuite extends QueryTest with SharedSQLContext { } assert(e2.getMessage.contains("Relative Error must be non-negative")) - // dataset should be non-empty + // return null if the dataset is empty val res1 = df.selectExpr("*").limit(0) .stat.approxQuantile("singles", Array(q1, q2), epsilons.head) assert(res1 === null) From 2fbe21a8b9835c15733ef9e3a18ef5c70ceb43a6 Mon Sep 17 00:00:00 2001 From: Zheng RuiFeng Date: Thu, 9 Feb 2017 15:41:27 +0800 Subject: [PATCH 07/14] update doc --- .../org/apache/spark/sql/DataFrameStatFunctions.scala | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala index e6f238eacd21..77429a415aba 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala @@ -58,12 +58,13 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) { * @param probabilities a list of quantile probabilities * Each number must belong to [0, 1]. * For example 0 is the minimum, 0.5 is the median, 1 is the maximum. - * @param relativeError The relative target precision to achieve (greater or equal to 0). + * @param relativeError The relative target precision to achieve (greater than or equal to 0). * If set to zero, the exact quantiles are computed, which could be very expensive. * Note that values greater than 1 are accepted but give the same result as 1. 
* @return the approximate quantiles at the given probabilities * - * @note null and NaN values will be removed from the numerical column before calculation + * @note null and NaN values will be removed from the numerical column before calculation. If + * the dataframe is empty or all rows contain null or NaN, null is returned. * * @since 2.0.0 */ @@ -87,12 +88,13 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) { * @param probabilities a list of quantile probabilities * Each number must belong to [0, 1]. * For example 0 is the minimum, 0.5 is the median, 1 is the maximum. - * @param relativeError The relative target precision to achieve (greater or equal to 0). + * @param relativeError The relative target precision to achieve (greater than or equal to 0). * If set to zero, the exact quantiles are computed, which could be very expensive. * Note that values greater than 1 are accepted but give the same result as 1. * @return the approximate quantiles at the given probabilities of each column * - * @note Rows containing any null or NaN values will be removed before calculation + * @note Rows containing any null or NaN values will be removed before calculation. If + * the dataframe is empty or all rows contain null or NaN, null is returned. 
* * @since 2.2.0 */ From 322141ac5978f66f342e6f6fcbd4f33c39cccbb6 Mon Sep 17 00:00:00 2001 From: Zheng RuiFeng Date: Mon, 13 Feb 2017 09:57:06 +0800 Subject: [PATCH 08/14] add TODO & update @see --- .../org/apache/spark/sql/DataFrameStatFunctions.scala | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala index 77429a415aba..53c72d981ebe 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala @@ -73,16 +73,13 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) { probabilities: Array[Double], relativeError: Double): Array[Double] = { val res = approxQuantile(Array(col), probabilities, relativeError) - if (res != null) { - res.head - } else { - null - } + Option(res).map(_.head).orNull } /** * Calculates the approximate quantiles of numerical columns of a DataFrame. - * @see `DataFrameStatsFunctions.approxQuantile` for detailed description. + * @see `[[DataFrameStatsFunctions.approxQuantile(col:Str* approxQuantile]]` for detailed + * description. 
 * * @param cols the names of the numerical columns * @param probabilities a list of quantile probabilities @@ -102,6 +99,7 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) { cols: Array[String], probabilities: Array[Double], relativeError: Double): Array[Array[Double]] = { + // TODO: Update NaN/null handling to keep consistent with the single-column version try { StatFunctions.multipleApproxQuantiles(df.select(cols.map(col): _*).na.drop(), cols, probabilities, relativeError).map(_.toArray).toArray From e5bac5324048363e3fa214487ca9e75113dc1884 Mon Sep 17 00:00:00 2001 From: Zheng RuiFeng Date: Mon, 13 Feb 2017 09:58:51 +0800 Subject: [PATCH 09/14] fix one nit --- .../org/apache/spark/sql/execution/stat/StatFunctions.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala index 2a1755e2ad76..c3d8859cb7a9 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala @@ -49,7 +49,7 @@ object StatFunctions extends Logging { * @param probabilities a list of quantile probabilities * Each number must belong to [0, 1]. * For example 0 is the minimum, 0.5 is the median, 1 is the maximum. - * @param relativeError The relative target precision to achieve (greater or equal 0). + * @param relativeError The relative target precision to achieve (greater than or equal to 0). * If set to zero, the exact quantiles are computed, which could be very expensive. * Note that values greater than 1 are accepted but give the same result as 1. 
* From fa1069a0ef078c13a76653fcda2540baf272765a Mon Sep 17 00:00:00 2001 From: Ruifeng Zheng Date: Mon, 13 Feb 2017 13:59:48 +0800 Subject: [PATCH 10/14] add ) --- .../scala/org/apache/spark/sql/DataFrameStatFunctions.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala index 53c72d981ebe..11bee00fa6d9 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala @@ -78,7 +78,7 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) { /** * Calculates the approximate quantiles of numerical columns of a DataFrame. - * @see `[[DataFrameStatsFunctions.approxQuantile(col:Str* approxQuantile]]` for detailed + * @see `[[DataFrameStatsFunctions.approxQuantile(col:Str* approxQuantile)]]` for detailed * description. * * @param cols the names of the numerical columns From d3e3ee0601afb80a5202bc978a425306390ad76f Mon Sep 17 00:00:00 2001 From: Zheng RuiFeng Date: Mon, 13 Feb 2017 16:10:42 +0800 Subject: [PATCH 11/14] update doc --- .../scala/org/apache/spark/sql/DataFrameStatFunctions.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala index 11bee00fa6d9..0f36c5fc96e8 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala @@ -78,7 +78,7 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) { /** * Calculates the approximate quantiles of numerical columns of a DataFrame. 
- * @see `[[DataFrameStatsFunctions.approxQuantile(col:Str* approxQuantile)]]` for detailed + * @see `DataFrameStatsFunctions.approxQuantile(col:Str* approxQuantile)` for detailed * description. * * @param cols the names of the numerical columns From db23d1164edab591816a07686558f95ba6ee9721 Mon Sep 17 00:00:00 2001 From: Zheng RuiFeng Date: Tue, 14 Feb 2017 17:31:15 +0800 Subject: [PATCH 12/14] update see annotation --- .../scala/org/apache/spark/sql/DataFrameStatFunctions.scala | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala index 0f36c5fc96e8..0ceca52111f8 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala @@ -78,8 +78,7 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) { /** * Calculates the approximate quantiles of numerical columns of a DataFrame. - * @see `DataFrameStatsFunctions.approxQuantile(col:Str* approxQuantile)` for detailed - * description. + * @see `approxQuantile(String, Array[Double], Double)` for detailed description. 
* * @param cols the names of the numerical columns * @param probabilities a list of quantile probabilities From 7c542341cb086a64b9c78b067c7b167a5a085ab1 Mon Sep 17 00:00:00 2001 From: Zheng RuiFeng Date: Tue, 14 Feb 2017 17:32:17 +0800 Subject: [PATCH 13/14] update see annotation --- .../main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala | 1 - 1 file changed, 1 deletion(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala index 0ceca52111f8..432c0a186a96 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala @@ -78,7 +78,6 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) { /** * Calculates the approximate quantiles of numerical columns of a DataFrame. - * @see `approxQuantile(String, Array[Double], Double)` for detailed description. * * @param cols the names of the numerical columns * @param probabilities a list of quantile probabilities From 2268d360206fd3e262d316ba3e02b35a525796da Mon Sep 17 00:00:00 2001 From: Ruifeng Zheng Date: Thu, 16 Feb 2017 13:17:49 +0800 Subject: [PATCH 14/14] update see --- .../main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala | 1 + 1 file changed, 1 insertion(+) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala index 432c0a186a96..bdcdf0c61ff3 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala @@ -78,6 +78,7 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) { /** * Calculates the approximate quantiles of numerical columns of a DataFrame. + * @see `approxQuantile(col:Str* approxQuantile)` for detailed description. 
* * @param cols the names of the numerical columns * @param probabilities a list of quantile probabilities