Commit b34ec2d
[SPARK-31735][CORE] Include date/timestamp in the summary report
Currently dates are missing from the export:

```python
from datetime import datetime, timedelta, timezone

from pyspark.sql import Row
from pyspark.sql import functions as F
from pyspark.sql import types as T

START = datetime(2014, 1, 1, tzinfo=timezone.utc)
n_days = 22

date_range = [Row(date=(START + timedelta(days=n))) for n in range(0, n_days)]
schema = T.StructType([T.StructField(name="date", dataType=T.DateType(), nullable=False)])

rdd = spark.sparkContext.parallelize(date_range)
df = spark.createDataFrame(data=rdd, schema=schema)

df.agg(F.max("date")).show()
df.summary().show()
```

The date column is absent from the summary output:

```
+-------+
|summary|
+-------+
|  count|
|   mean|
| stddev|
|    min|
|    25%|
|    50%|
|    75%|
|    max|
+-------+
```

Would be nice to include these as well.

Signed-off-by: Fokko Driesprong <[email protected]>
1 parent 2012d58 commit b34ec2d

File tree

1 file changed: +4 additions, -1 deletion


sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala

Lines changed: 4 additions & 1 deletion
```diff
@@ -264,7 +264,10 @@ object StatFunctions extends Logging {
     }

     val selectedCols = ds.logicalPlan.output
-      .filter(a => a.dataType.isInstanceOf[NumericType] || a.dataType.isInstanceOf[StringType])
+      .filter(a => a.dataType.isInstanceOf[NumericType]
+        || a.dataType.isInstanceOf[StringType]
+        || a.dataType.isInstanceOf[DateType]
+        || a.dataType.isInstanceOf[TimestampType])

     val aggExprs = statisticFns.flatMap { func =>
       selectedCols.map(c => Column(Cast(func(c), StringType)).as(c.name))
```
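The effect of the patched filter can be modeled in plain Python. This is a minimal sketch with hypothetical stand-in classes mirroring Spark's type hierarchy (not the real pyspark API): before the patch only numeric and string columns survived the filter, so a `DateType` column never reached the summary aggregations.

```python
# Hypothetical stand-ins for Spark's data-type hierarchy (illustration only).
class DataType: ...
class NumericType(DataType): ...
class IntegerType(NumericType): ...
class StringType(DataType): ...
class DateType(DataType): ...
class TimestampType(DataType): ...
class BooleanType(DataType): ...

# After the patch, date and timestamp columns pass the filter as well.
SUMMARY_TYPES = (NumericType, StringType, DateType, TimestampType)

def selected_cols(schema):
    """Mirror the patched filter: keep numeric, string, date and timestamp columns."""
    return [name for name, dtype in schema if isinstance(dtype, SUMMARY_TYPES)]

schema = [("id", IntegerType()), ("name", StringType()),
          ("date", DateType()), ("flag", BooleanType())]
print(selected_cols(schema))  # → ['id', 'name', 'date']
```

Before this change the `DateType` check was absent, so `"date"` would have been filtered out, which is exactly the empty summary shown in the commit message.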
