Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions docs/sql-data-sources-csv.md
Original file line number Diff line number Diff line change
Expand Up @@ -109,9 +109,9 @@ Data source options of CSV can be set via:
<td>read</td>
</tr>
<tr>
<td><code>inferDate</code></td>
<td><code>prefersDate</code></td>
<td>false</td>
<td>Whether or not to infer columns that satisfy the <code>dateFormat</code> option as <code>Date</code>. Requires <code>inferSchema</code> to be <code>true</code>. When <code>false</code>, columns with dates will be inferred as <code>String</code> (or as <code>Timestamp</code> if it fits the <code>timestampFormat</code>).</td>
<td>During schema inference (<code>inferSchema</code>), attempts to infer string columns that contain dates or timestamps as <code>Date</code> if the values satisfy the <code>dateFormat</code> option and fail to be parsed by the respective formatter. With a user-provided schema, attempts to parse timestamp columns as dates using <code>dateFormat</code> if they fail to conform to <code>timestampFormat</code>; in this case, the parsed values are cast to timestamp type afterwards.</td>
<td>read</td>
</tr>
<tr>
Expand Down Expand Up @@ -176,8 +176,8 @@ Data source options of CSV can be set via:
</tr>
<tr>
<td><code>enableDateTimeParsingFallback</code></td>
<td>Enabled if the time parser policy is legacy or no custom date or timestamp pattern was provided</td>
<td>Allows to fall back to the backward compatible (Spark 1.x and 2.0) behavior of parsing dates and timestamps if values do not match the set patterns.</td>
<td>Enabled if the time parser policy has legacy settings or if no custom date or timestamp pattern was provided.</td>
<td>Allows falling back to the backward compatible (Spark 1.x and 2.0) behavior of parsing dates and timestamps if values do not match the set patterns.</td>
<td>read</td>
</tr>
<tr>
Expand Down
4 changes: 2 additions & 2 deletions docs/sql-data-sources-json.md
Original file line number Diff line number Diff line change
Expand Up @@ -204,8 +204,8 @@ Data source options of JSON can be set via:
</tr>
<tr>
<td><code>enableDateTimeParsingFallback</code></td>
<td>Enabled if the time parser policy is legacy or no custom date or timestamp pattern was provided</td>
<td>Allows to fall back to the backward compatible (Spark 1.x and 2.0) behavior of parsing dates and timestamps if values do not match the set patterns.</td>
<td>Enabled if the time parser policy has legacy settings or if no custom date or timestamp pattern was provided.</td>
<td>Allows falling back to the backward compatible (Spark 1.x and 2.0) behavior of parsing dates and timestamps if values do not match the set patterns.</td>
<td>read</td>
</tr>
<tr>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -124,9 +124,9 @@ class CSVInferSchema(val options: CSVOptions) extends Serializable {
case _: DecimalType => tryParseDecimal(field)
case DoubleType => tryParseDouble(field)
case DateType => tryParseDateTime(field)
case TimestampNTZType if options.inferDate => tryParseDateTime(field)
case TimestampNTZType if options.prefersDate => tryParseDateTime(field)
case TimestampNTZType => tryParseTimestampNTZ(field)
case TimestampType if options.inferDate => tryParseDateTime(field)
case TimestampType if options.prefersDate => tryParseDateTime(field)
case TimestampType => tryParseTimestamp(field)
case BooleanType => tryParseBoolean(field)
case StringType => StringType
Expand Down Expand Up @@ -178,7 +178,7 @@ class CSVInferSchema(val options: CSVOptions) extends Serializable {
private def tryParseDouble(field: String): DataType = {
if ((allCatch opt field.toDouble).isDefined || isInfOrNan(field)) {
DoubleType
} else if (options.inferDate) {
} else if (options.prefersDate) {
tryParseDateTime(field)
} else {
tryParseTimestampNTZ(field)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -149,23 +149,29 @@ class CSVOptions(
val locale: Locale = parameters.get("locale").map(Locale.forLanguageTag).getOrElse(Locale.US)

/**
* Infer columns with all valid date entries as date type (otherwise inferred as timestamp type).
* Disabled by default for backwards compatibility and performance. When enabled, date entries in
* timestamp columns will be cast to timestamp upon parsing. Not compatible with
* legacyTimeParserPolicy == LEGACY since legacy date parser will accept extra trailing characters
* Infer columns with all valid date entries as date type (otherwise inferred as timestamp type)
* if schema inference is enabled. When used with a user-provided schema, tries to parse
* timestamp values as dates if the values do not conform to the timestamp formatter before
* falling back to the backward compatible parsing - the parsed values will be cast to timestamp
* afterwards.
*
* Disabled by default for backwards compatibility and performance.
*
* Not compatible with legacyTimeParserPolicy == LEGACY since legacy date parser will accept
* extra trailing characters.
*/
val inferDate = {
val inferDateFlag = getBool("inferDate")
if (SQLConf.get.legacyTimeParserPolicy == LegacyBehaviorPolicy.LEGACY && inferDateFlag) {
val prefersDate = {
val inferDateFlag = getBool("prefersDate")
if (inferDateFlag && SQLConf.get.legacyTimeParserPolicy == LegacyBehaviorPolicy.LEGACY) {
throw QueryExecutionErrors.inferDateWithLegacyTimeParserError()
}
inferDateFlag
}

// Provide a default value for dateFormatInRead when inferDate. This ensures that the
// Provide a default value for dateFormatInRead when prefersDate is enabled. This ensures
// that the Iso8601DateFormatter (with strict date parsing) is used for date inference.
val dateFormatInRead: Option[String] =
if (inferDate) {
if (prefersDate) {
Option(parameters.getOrElse("dateFormat", DateFormatter.defaultPattern))
} else {
parameters.get("dateFormat")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -235,7 +235,7 @@ class UnivocityParser(
} catch {
case NonFatal(e) =>
// There may be date type entries in timestamp column due to schema inference
if (options.inferDate) {
if (options.prefersDate) {
daysToMicros(dateFormatter.parse(datum), options.zoneId)
} else {
// If fails to parse, then tries the way used in 2.0 and 1.x for backwards
Expand All @@ -254,7 +254,7 @@ class UnivocityParser(
try {
timestampNTZFormatter.parseWithoutTimeZone(datum, false)
} catch {
case NonFatal(e) if (options.inferDate) =>
case NonFatal(e) if options.prefersDate =>
daysToMicros(dateFormatter.parse(datum), TimeZoneUTC.toZoneId)
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -201,30 +201,30 @@ class CSVInferSchemaSuite extends SparkFunSuite with SQLHelper {

test("SPARK-39469: inferring date type") {
// "yyyy/MM/dd" format
var options = new CSVOptions(Map("dateFormat" -> "yyyy/MM/dd", "inferDate" -> "true"),
var options = new CSVOptions(Map("dateFormat" -> "yyyy/MM/dd", "prefersDate" -> "true"),
false, "UTC")
var inferSchema = new CSVInferSchema(options)
assert(inferSchema.inferField(NullType, "2018/12/02") == DateType)
// "MMM yyyy" format
options = new CSVOptions(Map("dateFormat" -> "MMM yyyy", "inferDate" -> "true"),
options = new CSVOptions(Map("dateFormat" -> "MMM yyyy", "prefersDate" -> "true"),
false, "GMT")
inferSchema = new CSVInferSchema(options)
assert(inferSchema.inferField(NullType, "Dec 2018") == DateType)
// Field should strictly match date format to infer as date
options = new CSVOptions(
Map("dateFormat" -> "yyyy-MM-dd", "timestampFormat" -> "yyyy-MM-dd'T'HH:mm:ss",
"inferDate" -> "true"),
"prefersDate" -> "true"),
columnPruning = false,
defaultTimeZoneId = "GMT")
inferSchema = new CSVInferSchema(options)
assert(inferSchema.inferField(NullType, "2018-12-03T11:00:00") == TimestampType)
assert(inferSchema.inferField(NullType, "2018-12-03") == DateType)
}

test("SPARK-39469: inferring date and timestamp types in a mixed column with inferDate=true") {
test("SPARK-39469: inferring date and timestamp types in a mixed column with prefersDate=true") {
var options = new CSVOptions(
Map("dateFormat" -> "yyyy_MM_dd", "timestampFormat" -> "yyyy|MM|dd",
"timestampNTZFormat" -> "yyyy/MM/dd", "inferDate" -> "true"),
"timestampNTZFormat" -> "yyyy/MM/dd", "prefersDate" -> "true"),
columnPruning = false,
defaultTimeZoneId = "UTC")
var inferSchema = new CSVInferSchema(options)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -373,10 +373,10 @@ class UnivocityParserSuite extends SparkFunSuite with SQLHelper {
assert(err.getMessage.contains("Illegal pattern character: n"))
}

test("SPARK-39469: dates should be parsed correctly in a timestamp column when inferDate=true") {
test("SPARK-39469: dates should be parsed correctly in timestamp column when prefersDate=true") {
def checkDate(dataType: DataType): Unit = {
val timestampsOptions =
new CSVOptions(Map("inferDate" -> "true", "timestampFormat" -> "dd/MM/yyyy HH:mm",
new CSVOptions(Map("prefersDate" -> "true", "timestampFormat" -> "dd/MM/yyyy HH:mm",
"timestampNTZFormat" -> "dd-MM-yyyy HH:mm", "dateFormat" -> "dd_MM_yyyy"),
false, DateTimeUtils.getZoneId("-08:00").toString)
// Use CSVOption ZoneId="-08:00" (PST) to test that Dates in TimestampNTZ column are always
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2797,13 +2797,13 @@ abstract class CSVSuite
"inferSchema" -> "true",
"timestampFormat" -> "yyyy-MM-dd'T'HH:mm:ss",
"dateFormat" -> "yyyy-MM-dd",
"inferDate" -> "true")
"prefersDate" -> "true")
val options2 = Map(
"header" -> "true",
"inferSchema" -> "true",
"inferDate" -> "true")
"prefersDate" -> "true")

// Error should be thrown when attempting to inferDate with Legacy parser
// Error should be thrown when attempting to use prefersDate with the legacy parser
if (SQLConf.get.legacyTimeParserPolicy == LegacyBehaviorPolicy.LEGACY) {
val msg = intercept[IllegalArgumentException] {
spark.read
Expand Down Expand Up @@ -2840,6 +2840,42 @@ abstract class CSVSuite
}
}

test("SPARK-39904: Parse incorrect timestamp values with prefersDate=true") {
  // Reads a text file through the CSV reader with a user-provided single-column
  // timestamp schema and `prefersDate` switched on, then checks the outcome for
  // both the legacy and the non-legacy time parser policy.
  withTempPath { dir =>
    val location = dir.getAbsolutePath

    // Three rows: a full timestamp, a bare date, and a value that is neither.
    val rawLines = Seq("2020-02-01 12:34:56", "2020-02-02", "invalid")
    rawLines.toDF().repartition(1).write.text(location)

    val tsSchema = new StructType().add("ts", TimestampType)
    val parsed = spark.read
      .schema(tsSchema)
      .option("prefersDate", "true")
      .csv(location)

    val legacyPolicy = SQLConf.get.legacyTimeParserPolicy == LegacyBehaviorPolicy.LEGACY
    if (!legacyPolicy) {
      // Expected rows: the timestamp as written, the bare date at midnight,
      // and null for the unparsable value.
      val expected = Seq(
        Row(Timestamp.valueOf("2020-02-01 12:34:56")),
        Row(Timestamp.valueOf("2020-02-02 00:00:00")),
        Row(null))
      checkAnswer(parsed, expected)
    } else {
      // Under the LEGACY policy, collecting the result is expected to fail with an
      // IllegalArgumentException whose message mentions CANNOT_INFER_DATE.
      val failure = intercept[IllegalArgumentException] {
        parsed.collect()
      }
      assert(failure.getMessage.contains("CANNOT_INFER_DATE"))
    }
  }
}

test("SPARK-39731: Correctly parse dates and timestamps with yyyyMMdd pattern") {
withTempPath { path =>
Seq(
Expand Down