Skip to content

Commit 2f05ab8

Browse files
committed
[SPARK-23436][SQL] Infer partition as Date only if it can be casted to Date
1 parent 44e20c4 commit 2f05ab8

File tree

2 files changed

+38
-12
lines changed

2 files changed

+38
-12
lines changed

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningUtils.scala

Lines changed: 26 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -23,9 +23,7 @@ import java.util.{Locale, TimeZone}
2323

2424
import scala.collection.mutable.ArrayBuffer
2525
import scala.util.Try
26-
2726
import org.apache.hadoop.fs.Path
28-
2927
import org.apache.spark.sql.AnalysisException
3028
import org.apache.spark.sql.catalyst.InternalRow
3129
import org.apache.spark.sql.catalyst.analysis.{Resolver, TypeCoercion}
@@ -34,6 +32,7 @@ import org.apache.spark.sql.catalyst.expressions.{Cast, Literal}
3432
import org.apache.spark.sql.catalyst.util.DateTimeUtils
3533
import org.apache.spark.sql.types._
3634
import org.apache.spark.sql.util.SchemaUtils
35+
import org.apache.spark.unsafe.types.UTF8String
3736

3837
// TODO: We should tighten up visibility of the classes here once we clean up Hive coupling.
3938

@@ -407,6 +406,29 @@ object PartitioningUtils {
407406
Literal(bigDecimal)
408407
}
409408

409+
val dateTry = Try {
410+
// Try to parse the date; if no exception occurs, this is a candidate to be resolved as
411+
// DateType
412+
DateTimeUtils.getThreadLocalDateFormat.parse(raw)
413+
// SPARK-23436: Casting the string to date may still return null if a bad Date is provided.
414+
// We need to check that we can cast the raw string since we later can use Cast to get
415+
// the partition values with the right DataType (see
416+
// org.apache.spark.sql.execution.datasources.PartitioningAwareFileIndex.inferPartitioning)
417+
val dateOption = Option(Cast(Literal(raw), DateType).eval())
418+
Literal.create(dateOption.get, DateType)
419+
}
420+
421+
val timestampTry = Try {
422+
val unescapedRaw = unescapePathName(raw)
423+
// Try to parse the timestamp; if no exception occurs, this is a candidate to be resolved as
424+
// TimestampType
425+
DateTimeUtils.getThreadLocalTimestampFormat(timeZone).parse(unescapedRaw)
426+
// SPARK-23436: see comment for date
427+
val timestampOption = Option(Cast(Literal(unescapedRaw), TimestampType,
428+
Some(timeZone.getID)).eval())
429+
Literal.create(timestampOption.get, TimestampType)
430+
}
431+
410432
if (typeInference) {
411433
// First tries integral types
412434
Try(Literal.create(Integer.parseInt(raw), IntegerType))
@@ -415,16 +437,8 @@ object PartitioningUtils {
415437
// Then falls back to fractional types
416438
.orElse(Try(Literal.create(JDouble.parseDouble(raw), DoubleType)))
417439
// Then falls back to date/timestamp types
418-
.orElse(Try(
419-
Literal.create(
420-
DateTimeUtils.getThreadLocalTimestampFormat(timeZone)
421-
.parse(unescapePathName(raw)).getTime * 1000L,
422-
TimestampType)))
423-
.orElse(Try(
424-
Literal.create(
425-
DateTimeUtils.millisToDays(
426-
DateTimeUtils.getThreadLocalDateFormat.parse(raw).getTime),
427-
DateType)))
440+
.orElse(timestampTry)
441+
.orElse(dateTry)
428442
// Then falls back to string
429443
.getOrElse {
430444
if (raw == DEFAULT_PARTITION_NAME) {

sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetPartitionDiscoverySuite.scala

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1120,4 +1120,16 @@ class ParquetPartitionDiscoverySuite extends QueryTest with ParquetTest with Sha
11201120
Row(3, BigDecimal("2" * 30)) :: Nil)
11211121
}
11221122
}
1123+
1124+
test("SPARK-23436: invalid Dates should be inferred as String in partition inference") {
1125+
withTempPath { path =>
1126+
val data = Seq(("1", "2018-01", "2018-01-01-04", "test"))
1127+
.toDF("id", "date_month", "date_hour", "data")
1128+
1129+
data.write.partitionBy("date_month", "date_hour").parquet(path.getAbsolutePath)
1130+
val input = spark.read.parquet(path.getAbsolutePath)
1131+
checkAnswer(input.select("id", "date_month", "date_hour", "data"),
1132+
data.select("id", "date_month", "date_hour", "data"))
1133+
}
1134+
}
11231135
}

0 commit comments

Comments
 (0)