@@ -25,6 +25,7 @@ import org.apache.spark.internal.Logging
 import org.apache.spark.sql.{AnalysisException, Row}
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.expressions.aggregate._
+import org.apache.spark.sql.catalyst.util.DateTimeUtils
 import org.apache.spark.sql.types._
 import org.apache.spark.util.Utils
 
@@ -74,11 +75,10 @@ case class Statistics(
  * Statistics collected for a column.
  *
  * 1. Supported data types are defined in `ColumnStat.supportsType`.
- * 2. The JVM data type stored in min/max is the external data type (used in Row) for the
- *    corresponding Catalyst data type. For example, for DateType we store java.sql.Date, and for
- *    TimestampType we store java.sql.Timestamp.
- * 3. For integral types, they are all upcasted to longs, i.e. shorts are stored as longs.
- * 4. There is no guarantee that the statistics collected are accurate. Approximation algorithms
+ * 2. The JVM data type stored in min/max is the internal data type for the corresponding
+ *    Catalyst data type. For example, the internal type of DateType is Int, and the internal
+ *    type of TimestampType is Long.
+ * 3. There is no guarantee that the statistics collected are accurate. Approximation algorithms
  *    (sketches) might have been used, and the data collected can also be stale.
  *
  * @param distinctCount number of distinct values
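
For context, a rough sketch of what "internal" means here, using the `DateTimeUtils` helpers this patch imports (the values are illustrative, and the timestamp conversion depends on the JVM's default time zone):

```scala
import java.sql.{Date, Timestamp}
import org.apache.spark.sql.catalyst.util.DateTimeUtils

// DateType is represented internally as an Int: days since the Unix epoch.
val days: Int = DateTimeUtils.fromJavaDate(Date.valueOf("2017-01-01"))
assert(DateTimeUtils.toJavaDate(days) == Date.valueOf("2017-01-01"))

// TimestampType is represented internally as a Long: microseconds since the epoch.
val micros: Long = DateTimeUtils.fromJavaTimestamp(Timestamp.valueOf("2017-01-01 00:00:00"))
assert(DateTimeUtils.toJavaTimestamp(micros) == Timestamp.valueOf("2017-01-01 00:00:00"))
```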
@@ -104,22 +104,43 @@ case class ColumnStat(
   /**
    * Returns a map from string to string that can be used to serialize the column stats.
    * The key is the name of the field (e.g. "distinctCount" or "min"), and the value is the string
-   * representation for the value. The deserialization side is defined in [[ColumnStat.fromMap]].
+   * representation for the value. min/max values are converted to the external data type. For
+   * example, for DateType we store java.sql.Date, and for TimestampType we store
+   * java.sql.Timestamp. The deserialization side is defined in [[ColumnStat.fromMap]].
    *
    * As part of the protocol, the returned map always contains a key called "version".
    * In the case min/max values are null (None), they won't appear in the map.
    */
-  def toMap: Map[String, String] = {
+  def toMap(colName: String, dataType: DataType): Map[String, String] = {
     val map = new scala.collection.mutable.HashMap[String, String]
     map.put(ColumnStat.KEY_VERSION, "1")
     map.put(ColumnStat.KEY_DISTINCT_COUNT, distinctCount.toString)
     map.put(ColumnStat.KEY_NULL_COUNT, nullCount.toString)
     map.put(ColumnStat.KEY_AVG_LEN, avgLen.toString)
     map.put(ColumnStat.KEY_MAX_LEN, maxLen.toString)
-    min.foreach { v => map.put(ColumnStat.KEY_MIN_VALUE, v.toString) }
-    max.foreach { v => map.put(ColumnStat.KEY_MAX_VALUE, v.toString) }
+    min.foreach { v => map.put(ColumnStat.KEY_MIN_VALUE, toExternalString(v, colName, dataType)) }
+    max.foreach { v => map.put(ColumnStat.KEY_MAX_VALUE, toExternalString(v, colName, dataType)) }
     map.toMap
   }
+
+  /**
+   * Converts the given Catalyst-internal value to the string representation of the
+   * external data type.
+   */
+  private def toExternalString(v: Any, colName: String, dataType: DataType): String = {
+    val externalValue = dataType match {
+      case DateType => DateTimeUtils.toJavaDate(v.asInstanceOf[Int])
+      case TimestampType => DateTimeUtils.toJavaTimestamp(v.asInstanceOf[Long])
+      case BooleanType | _: IntegralType | FloatType | DoubleType => v
+      case _: DecimalType => v.asInstanceOf[Decimal].toJavaBigDecimal
+      // This version of Spark does not use min/max for binary/string types so we ignore it.
+      case _ =>
+        throw new AnalysisException("Column statistics serialization is not supported for " +
+          s"column $colName of data type: $dataType.")
+    }
+    externalValue.toString
+  }
+
123144}
124145
125146
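To make the new serialization path concrete, here is a hypothetical pass through `toMap` for a DateType column (the stats values are made up; min/max start out as internal Ints and are written out as `java.sql.Date` strings by `toExternalString`):

```scala
import org.apache.spark.sql.catalyst.util.DateTimeUtils
import org.apache.spark.sql.types.DateType

val stat = ColumnStat(
  distinctCount = BigInt(2),
  min = Some(DateTimeUtils.fromJavaDate(java.sql.Date.valueOf("2017-01-01"))),
  max = Some(DateTimeUtils.fromJavaDate(java.sql.Date.valueOf("2017-12-31"))),
  nullCount = BigInt(0),
  avgLen = 4L,
  maxLen = 4L)

val m = stat.toMap(colName = "d", dataType = DateType)
// m("min") == "2017-01-01" and m("max") == "2017-12-31":
// the internal Ints were converted to external java.sql.Date form.
```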
@@ -150,28 +171,15 @@ object ColumnStat extends Logging {
   * Creates a [[ColumnStat]] object from the given map. This is used to deserialize column stats
   * from some external storage. The serialization side is defined in [[ColumnStat.toMap]].
   */
-  def fromMap(table: String, field: StructField, map: Map[String, String])
-    : Option[ColumnStat] = {
-    val str2val: (String => Any) = field.dataType match {
-      case _: IntegralType => _.toLong
-      case _: DecimalType => new java.math.BigDecimal(_)
-      case DoubleType | FloatType => _.toDouble
-      case BooleanType => _.toBoolean
-      case DateType => java.sql.Date.valueOf
-      case TimestampType => java.sql.Timestamp.valueOf
-      // This version of Spark does not use min/max for binary/string types so we ignore it.
-      case BinaryType | StringType => _ => null
-      case _ =>
-        throw new AnalysisException("Column statistics deserialization is not supported for " +
-          s"column ${field.name} of data type: ${field.dataType}.")
-    }
-
+  def fromMap(table: String, field: StructField, map: Map[String, String]): Option[ColumnStat] = {
     try {
       Some(ColumnStat(
         distinctCount = BigInt(map(KEY_DISTINCT_COUNT).toLong),
         // Note that flatMap(Option.apply) turns Option(null) into None.
-        min = map.get(KEY_MIN_VALUE).map(str2val).flatMap(Option.apply),
-        max = map.get(KEY_MAX_VALUE).map(str2val).flatMap(Option.apply),
+        min = map.get(KEY_MIN_VALUE)
+          .map(fromExternalString(_, field.name, field.dataType)).flatMap(Option.apply),
+        max = map.get(KEY_MAX_VALUE)
+          .map(fromExternalString(_, field.name, field.dataType)).flatMap(Option.apply),
         nullCount = BigInt(map(KEY_NULL_COUNT).toLong),
         avgLen = map.getOrElse(KEY_AVG_LEN, field.dataType.defaultSize.toString).toLong,
         maxLen = map.getOrElse(KEY_MAX_LEN, field.dataType.defaultSize.toString).toLong
@@ -183,6 +191,30 @@ object ColumnStat extends Logging {
     }
   }
 
+  /**
+   * Converts the given string representation of the external data type to a value of the
+   * corresponding Catalyst-internal data type.
+   */
+  private def fromExternalString(s: String, name: String, dataType: DataType): Any = {
+    dataType match {
+      case BooleanType => s.toBoolean
+      case DateType => DateTimeUtils.fromJavaDate(java.sql.Date.valueOf(s))
+      case TimestampType => DateTimeUtils.fromJavaTimestamp(java.sql.Timestamp.valueOf(s))
+      case ByteType => s.toByte
+      case ShortType => s.toShort
+      case IntegerType => s.toInt
+      case LongType => s.toLong
+      case FloatType => s.toFloat
+      case DoubleType => s.toDouble
+      case _: DecimalType => Decimal(s)
+      // This version of Spark does not use min/max for binary/string types so we ignore it.
+      case BinaryType | StringType => null
+      case _ =>
+        throw new AnalysisException("Column statistics deserialization is not supported for " +
+          s"column $name of data type: $dataType.")
+    }
+  }
+
   /**
    * Constructs an expression to compute column statistics for a given column.
    *
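
Assuming the `stat` value from the earlier sketch, serialization and deserialization should now round-trip cleanly through the external string form:

```scala
import org.apache.spark.sql.types.{DateType, StructField}

// Internal Int -> "2017-01-01" -> internal Int.
val field = StructField("d", DateType)
val restored = ColumnStat.fromMap("some_table", field, stat.toMap("d", DateType))
assert(restored.contains(stat))
```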
@@ -232,11 +264,14 @@ object ColumnStat extends Logging {
   }
 
   /** Convert a struct for column stats (defined in statExprs) into [[ColumnStat]]. */
-  def rowToColumnStat(row: Row): ColumnStat = {
+  def rowToColumnStat(row: Row, attr: Attribute): ColumnStat = {
     ColumnStat(
       distinctCount = BigInt(row.getLong(0)),
-      min = Option(row.get(1)),  // for string/binary min/max, get should return null
-      max = Option(row.get(2)),
+      // for string/binary min/max, get should return null
+      min = Option(row.get(1))
+        .map(v => fromExternalString(v.toString, attr.name, attr.dataType)).flatMap(Option.apply),
+      max = Option(row.get(2))
+        .map(v => fromExternalString(v.toString, attr.name, attr.dataType)).flatMap(Option.apply),
       nullCount = BigInt(row.getLong(3)),
       avgLen = row.getLong(4),
       maxLen = row.getLong(5)
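
Finally, a hand-built illustration of the new `rowToColumnStat` signature; the row layout follows the index order read above (0 = distinctCount, 1 = min, 2 = max, 3 = nullCount, 4 = avgLen, 5 = maxLen), and the attribute name is made up:

```scala
import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.expressions.AttributeReference
import org.apache.spark.sql.types.LongType

val attr = AttributeReference("x", LongType)()
val statsRow = Row(3L, 1L, 9L, 0L, 8L, 8L)
val colStat = ColumnStat.rowToColumnStat(statsRow, attr)
// min/max are re-parsed via fromExternalString, so colStat.min == Some(1L)
// and colStat.max == Some(9L) in the Catalyst internal representation.
```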