From 307ecbf314b1a01a8f5051e5f9aa8b5985b93ac0 Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Sun, 17 May 2015 01:54:12 -0700 Subject: [PATCH 1/4] Add failing regression test for SPARK-7687 --- python/pyspark/sql/dataframe.py | 6 +++--- .../test/scala/org/apache/spark/sql/DataFrameSuite.scala | 5 +++++ 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py index e4a191a9ef07..03f40af5aa00 100644 --- a/python/pyspark/sql/dataframe.py +++ b/python/pyspark/sql/dataframe.py @@ -599,11 +599,11 @@ def describe(self, *cols): +-------+---+ |summary|age| +-------+---+ - | count| 2| + | count|2.0| | mean|3.5| | stddev|1.5| - | min| 2| - | max| 5| + | min|2.0| + | max|5.0| +-------+---+ """ jdf = self._jdf.describe(self._jseq(cols)) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala index 054b23dba84c..8678a715f126 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala @@ -388,6 +388,11 @@ class DataFrameSuite extends QueryTest { val describeTwoCols = describeTestData.describe("age", "height") assert(getSchemaAsSeq(describeTwoCols) === Seq("summary", "age", "height")) checkAnswer(describeTwoCols, describeResult) + // All aggregate value should have been cast to double, including `count` + describeTwoCols.collect().foreach { row => + assert(row.get(1).isInstanceOf[Double], "expected double but found " + row.get(1).getClass) + assert(row.get(2).isInstanceOf[Double], "expected double but found " + row.get(2).getClass) + } val describeAllCols = describeTestData.describe() assert(getSchemaAsSeq(describeAllCols) === Seq("summary", "age", "height")) From f2065809cace506d84daaca7a8fc466742103e99 Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Sun, 17 May 2015 01:54:27 -0700 Subject: [PATCH 2/4] Cast to double to fix SPARK-7687 --- sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala index 27e9af49f066..b7a881c5737f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala @@ -1063,7 +1063,7 @@ class DataFrame private[sql]( val ret: Seq[Row] = if (outputCols.nonEmpty) { val aggExprs = statistics.flatMap { case (_, colToAgg) => - outputCols.map(c => Column(colToAgg(Column(c).expr)).as(c)) + outputCols.map(c => Column(Cast(colToAgg(Column(c).expr), DoubleType)).as(c)) } val row = agg(aggExprs.head, aggExprs.tail: _*).head().toSeq From 2974bd52b5fdc8517cf0568301108cea9ebfc2d6 Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Sun, 17 May 2015 21:30:58 -0700 Subject: [PATCH 3/4] Cast to string type instead --- R/pkg/inst/tests/test_sparkSQL.R | 6 +++--- python/pyspark/sql/dataframe.py | 6 +++--- .../scala/org/apache/spark/sql/DataFrame.scala | 6 +++--- .../org/apache/spark/sql/DataFrameSuite.scala | 18 +++++++++--------- 4 files changed, 18 insertions(+), 18 deletions(-) diff --git a/R/pkg/inst/tests/test_sparkSQL.R b/R/pkg/inst/tests/test_sparkSQL.R index 3e5658eb5b24..32744adaff47 100644 --- a/R/pkg/inst/tests/test_sparkSQL.R +++ b/R/pkg/inst/tests/test_sparkSQL.R @@ -758,11 +758,11 @@ test_that("describe() on a DataFrame", { df <- jsonFile(sqlCtx, jsonPath) stats <- describe(df, "age") expect_true(collect(stats)[1, "summary"] == "count") - expect_true(collect(stats)[2, "age"] == 24.5) - expect_true(collect(stats)[3, "age"] == 5.5) + expect_true(collect(stats)[2, "age"] == "24.5") + expect_true(collect(stats)[3, "age"] == "5.5") stats <- describe(df) expect_true(collect(stats)[4, "name"] == "Andy") - expect_true(collect(stats)[5, "age"] == 30.0) + expect_true(collect(stats)[5, "age"] == "30.0") }) unlink(parquetPath) diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py index 03f40af5aa00..e4a191a9ef07 100644 --- a/python/pyspark/sql/dataframe.py +++ b/python/pyspark/sql/dataframe.py @@ -599,11 +599,11 @@ def describe(self, *cols): +-------+---+ |summary|age| +-------+---+ - | count|2.0| + | count| 2| | mean|3.5| | stddev|1.5| - | min|2.0| - | max|5.0| + | min| 2| + | max| 5| +-------+---+ """ jdf = self._jdf.describe(self._jseq(cols)) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala index b7a881c5737f..adad85806d1e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala @@ -1063,7 +1063,7 @@ class DataFrame private[sql]( val ret: Seq[Row] = if (outputCols.nonEmpty) { val aggExprs = statistics.flatMap { case (_, colToAgg) => - outputCols.map(c => Column(Cast(colToAgg(Column(c).expr), DoubleType)).as(c)) + outputCols.map(c => Column(Cast(colToAgg(Column(c).expr), StringType)).as(c)) } val row = agg(aggExprs.head, aggExprs.tail: _*).head().toSeq @@ -1077,9 +1077,9 @@ class DataFrame private[sql]( statistics.map { case (name, _) => Row(name) } } - // The first column is string type, and the rest are double type. + // All columns are string type val schema = StructType( - StructField("summary", StringType) :: outputCols.map(StructField(_, DoubleType))).toAttributes + StructField("summary", StringType) :: outputCols.map(StructField(_, StringType))).toAttributes LocalRelation(schema, ret) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala index 8678a715f126..a5adb96a08bb 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala @@ -370,14 +370,14 @@ class DataFrameSuite extends QueryTest { ("Amy", 24, 180)).toDF("name", "age", "height") val describeResult = Seq( - Row("count", 4, 4), - Row("mean", 33.0, 178.0), - Row("stddev", 16.583123951777, 10.0), - Row("min", 16, 164), - Row("max", 60, 192)) + Row("count", "4", "4"), + Row("mean", "33.0", "178.0"), + Row("stddev", "16.583123951777", "10.0"), + Row("min", "16", "164"), + Row("max", "60", "192")) val emptyDescribeResult = Seq( - Row("count", 0, 0), + Row("count", "0", "0"), Row("mean", null, null), Row("stddev", null, null), Row("min", null, null), @@ -388,10 +388,10 @@ class DataFrameSuite extends QueryTest { val describeTwoCols = describeTestData.describe("age", "height") assert(getSchemaAsSeq(describeTwoCols) === Seq("summary", "age", "height")) checkAnswer(describeTwoCols, describeResult) - // All aggregate value should have been cast to double, including `count` + // All aggregate value should have been cast to string describeTwoCols.collect().foreach { row => - assert(row.get(1).isInstanceOf[Double], "expected double but found " + row.get(1).getClass) - assert(row.get(2).isInstanceOf[Double], "expected double but found " + row.get(2).getClass) + assert(row.get(1).isInstanceOf[String], "expected string but found " + row.get(1).getClass) + assert(row.get(2).isInstanceOf[String], "expected string but found " + row.get(2).getClass) } val describeAllCols = describeTestData.describe() From 146b615a9e543101f16f3e8e9bfa705abbc4444c Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Mon, 18 May 2015 14:08:26 -0700 Subject: [PATCH 4/4] Fix R test. --- R/pkg/inst/tests/test_sparkSQL.R | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/R/pkg/inst/tests/test_sparkSQL.R b/R/pkg/inst/tests/test_sparkSQL.R index 32744adaff47..1768c57fd02e 100644 --- a/R/pkg/inst/tests/test_sparkSQL.R +++ b/R/pkg/inst/tests/test_sparkSQL.R @@ -757,12 +757,12 @@ test_that("parquetFile works with multiple input paths", { test_that("describe() on a DataFrame", { df <- jsonFile(sqlCtx, jsonPath) stats <- describe(df, "age") - expect_true(collect(stats)[1, "summary"] == "count") - expect_true(collect(stats)[2, "age"] == "24.5") - expect_true(collect(stats)[3, "age"] == "5.5") + expect_equal(collect(stats)[1, "summary"], "count") + expect_equal(collect(stats)[2, "age"], "24.5") + expect_equal(collect(stats)[3, "age"], "5.5") stats <- describe(df) - expect_true(collect(stats)[4, "name"] == "Andy") - expect_true(collect(stats)[5, "age"] == "30.0") + expect_equal(collect(stats)[4, "name"], "Andy") + expect_equal(collect(stats)[5, "age"], "30") }) unlink(parquetPath)