6 changes: 6 additions & 0 deletions core/src/main/resources/error/error-classes.json
@@ -23,6 +23,12 @@
],
"sqlState" : "22005"
},
"CANNOT_INFER_DATE" : {
"message" : [
"Cannot infer date in schema inference when LegacyTimeParserPolicy is \"LEGACY\". Legacy Date formatter does not support strict date format matching which is required to avoid inferring timestamps and other non-date entries to date."
],
"sqlState" : "22007"
},
"CANNOT_PARSE_DECIMAL" : {
"message" : [
"Cannot parse decimal"
6 changes: 6 additions & 0 deletions docs/sql-data-sources-csv.md
@@ -108,6 +108,12 @@ Data source options of CSV can be set via:
<td>Infers the input schema automatically from data. It requires one extra pass over the data. CSV built-in functions ignore this option.</td>
<td>read</td>
</tr>
<tr>
<td><code>inferDate</code></td>
<td>false</td>
<td>Whether or not to infer columns that satisfy the <code>dateFormat</code> option as <code>Date</code>. Requires <code>inferSchema</code> to be <code>true</code>. When <code>false</code>, columns with dates will be inferred as <code>String</code> (or as <code>Timestamp</code> if it fits the <code>timestampFormat</code>).</td>
<td>read</td>
</tr>
<tr>
<td><code>enforceSchema</code></td>
<td>true</td>
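A minimal usage sketch of the option documented above, assuming a running SparkSession named spark (the input path is hypothetical):

val df = spark.read
  .option("header", "true")
  .option("inferSchema", "true")
  .option("inferDate", "true")          // opt in to DateType inference
  .option("dateFormat", "yyyy-MM-dd")   // pattern a value must strictly match to be a date
  .csv("path/to/data.csv")              // hypothetical path
df.printSchema()                        // date columns now print as: date (nullable = true)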
CSVInferSchema.scala
@@ -24,8 +24,8 @@ import scala.util.control.Exception.allCatch
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.analysis.TypeCoercion
import org.apache.spark.sql.catalyst.expressions.ExprUtils
import org.apache.spark.sql.catalyst.util.{DateFormatter, TimestampFormatter}
import org.apache.spark.sql.catalyst.util.LegacyDateFormats.FAST_DATE_FORMAT
import org.apache.spark.sql.catalyst.util.TimestampFormatter
import org.apache.spark.sql.errors.QueryExecutionErrors
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.types._
@@ -46,6 +46,12 @@ class CSVInferSchema(val options: CSVOptions) extends Serializable {
isParsing = true,
forTimestampNTZ = true)

private lazy val dateFormatter = DateFormatter(
options.dateFormatInRead,
options.locale,
legacyFormat = FAST_DATE_FORMAT,
isParsing = true)

private val decimalParser = if (options.locale == Locale.US) {
// Special handling the default locale for backward compatibility
s: String => new java.math.BigDecimal(s)
@@ -117,7 +123,10 @@ class CSVInferSchema(val options: CSVOptions) extends Serializable {
case LongType => tryParseLong(field)
case _: DecimalType => tryParseDecimal(field)
case DoubleType => tryParseDouble(field)
case DateType => tryParseDateTime(field)
Member: It seems changing the method tryParseDouble should be enough.

@Jonathancui123 (Contributor, Author), Jun 23, 2022: I think this change is necessary. Consider a column with a TimestampType entry followed by a DateType entry: we would expect this column to be inferred as TimestampType. typeSoFar will be TimestampType when inferField is called on the second (DateType) entry, so inferField needs logic to try parsing a date even when typeSoFar is a timestamp.

case TimestampNTZType if options.inferDate => tryParseDateTime(field)
Contributor: Shall we call tryParseTimestampNTZ? The type so far is timestamp, and inferring a date is useless since we need to promote it to timestamp anyway.

Contributor Author: Our expected behavior is that a column with TimestampType entries followed by DateType entries is inferred as TimestampType.

Here, tryParseTimestampNTZ and tryParseTimestamp would not be able to parse the DateType entries that show up later in the column, and the column would be promoted to string type. So we must use tryParseDateTime, which gives the date a chance to be parsed.
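A compact sketch of that promotion, in the style of the CSVInferSchemaSuite tests added below (patterns assumed for illustration):

val options = new CSVOptions(
  Map("dateFormat" -> "yyyy-MM-dd", "timestampFormat" -> "yyyy-MM-dd'T'HH:mm:ss",
    "inferDate" -> "true"),
  columnPruning = false, defaultTimeZoneId = "UTC")
val inferSchema = new CSVInferSchema(options)
inferSchema.inferField(NullType, "2018-12-03T11:00:00")  // row 1: TimestampType
inferSchema.inferField(TimestampType, "2018-12-03")      // row 2: the date parses and the column stays TimestampType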

case TimestampNTZType => tryParseTimestampNTZ(field)
case TimestampType if options.inferDate => tryParseDateTime(field)
case TimestampType => tryParseTimestamp(field)
case BooleanType => tryParseBoolean(field)
case StringType => StringType
@@ -169,6 +178,16 @@ class CSVInferSchema(val options: CSVOptions) extends Serializable {
private def tryParseDouble(field: String): DataType = {
if ((allCatch opt field.toDouble).isDefined || isInfOrNan(field)) {
DoubleType
} else if (options.inferDate) {
tryParseDateTime(field)
} else {
tryParseTimestampNTZ(field)
}
}

private def tryParseDateTime(field: String): DataType = {
if ((allCatch opt dateFormatter.parse(field)).isDefined) {
Contributor: This is a very expensive way.

DateType
} else {
tryParseTimestampNTZ(field)
}
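For context on that cost concern: allCatch opt dateFormatter.parse(field) succeeds or fails by throwing and catching an exception for every non-matching field. A hypothetical micro-illustration of the pattern (names invented, not the PR's code):

import scala.util.control.Exception.allCatch

// Every field that is not a date pays for a thrown-and-caught exception here,
// which is far slower on hot inference paths than a failed comparison.
def parses(parse: String => Any)(field: String): Boolean =
  (allCatch opt parse(field)).isDefined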
CSVOptions.scala
@@ -148,7 +148,28 @@ class CSVOptions(
// A language tag in IETF BCP 47 format
val locale: Locale = parameters.get("locale").map(Locale.forLanguageTag).getOrElse(Locale.US)

val dateFormatInRead: Option[String] = parameters.get("dateFormat")
/**
* Infer columns with all valid date entries as date type (otherwise inferred as timestamp type).
* Disabled by default for backwards compatibility and performance. When enabled, date entries in
* timestamp columns will be cast to timestamp upon parsing. Not compatible with
* legacyTimeParserPolicy == LEGACY since legacy date parser will accept extra trailing characters
*/
val inferDate = {
val inferDateFlag = getBool("inferDate")
if (SQLConf.get.legacyTimeParserPolicy == LegacyBehaviorPolicy.LEGACY && inferDateFlag) {
throw QueryExecutionErrors.inferDateWithLegacyTimeParserError()
}
inferDateFlag
}

// Provide a default value for dateFormatInRead when inferDate. This ensures that the
// Iso8601DateFormatter (with strict date parsing) is used for date inference
val dateFormatInRead: Option[String] =
if (inferDate) {
Option(parameters.getOrElse("dateFormat", DateFormatter.defaultPattern))
} else {
parameters.get("dateFormat")
}
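A quick check of the resulting defaults, in the spirit of the option suites below (assuming DateFormatter.defaultPattern is "yyyy-MM-dd", its value in Spark):

val withInfer = new CSVOptions(Map("inferDate" -> "true"), false, "UTC")
assert(withInfer.dateFormatInRead == Some("yyyy-MM-dd"))   // strict ISO-8601 date inference

val withoutInfer = new CSVOptions(Map.empty[String, String], false, "UTC")
assert(withoutInfer.dateFormatInRead == None)              // formatter keeps its lenient default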
val dateFormatInWrite: String = parameters.getOrElse("dateFormat", DateFormatter.defaultPattern)

val timestampFormatInRead: Option[String] =
@@ -195,7 +216,6 @@
*/
val enforceSchema = getBool("enforceSchema", default = true)


/**
* String representation of an empty value in read and in write.
*/
UnivocityParser.scala
@@ -28,6 +28,7 @@ import org.apache.spark.internal.Logging
import org.apache.spark.sql.catalyst.{InternalRow, NoopFilters, OrderedFilters}
import org.apache.spark.sql.catalyst.expressions.{Cast, EmptyRow, ExprUtils, GenericInternalRow, Literal}
import org.apache.spark.sql.catalyst.util._
import org.apache.spark.sql.catalyst.util.DateTimeUtils.{daysToMicros, TimeZoneUTC}
import org.apache.spark.sql.catalyst.util.LegacyDateFormats.FAST_DATE_FORMAT
import org.apache.spark.sql.catalyst.util.ResolveDefaultColumns._
import org.apache.spark.sql.errors.QueryExecutionErrors
@@ -197,34 +198,46 @@ class UnivocityParser(
Decimal(decimalParser(datum), dt.precision, dt.scale)
}

case _: TimestampType => (d: String) =>
case _: DateType => (d: String) =>
nullSafeDatum(d, name, nullable, options) { datum =>
try {
timestampFormatter.parse(datum)
dateFormatter.parse(datum)
} catch {
case NonFatal(e) =>
// If fails to parse, then tries the way used in 2.0 and 1.x for backwards
// compatibility.
val str = DateTimeUtils.cleanLegacyTimestampStr(UTF8String.fromString(datum))
DateTimeUtils.stringToTimestamp(str, options.zoneId).getOrElse(throw e)
DateTimeUtils.stringToDate(str).getOrElse(throw e)
}
}

case _: TimestampNTZType => (d: String) =>
nullSafeDatum(d, name, nullable, options) { datum =>
timestampNTZFormatter.parseWithoutTimeZone(datum, false)
}

case _: DateType => (d: String) =>
case _: TimestampType => (d: String) =>
nullSafeDatum(d, name, nullable, options) { datum =>
try {
dateFormatter.parse(datum)
timestampFormatter.parse(datum)
} catch {
case NonFatal(e) =>
// If fails to parse, then tries the way used in 2.0 and 1.x for backwards
// compatibility.
val str = DateTimeUtils.cleanLegacyTimestampStr(UTF8String.fromString(datum))
DateTimeUtils.stringToDate(str).getOrElse(throw e)
DateTimeUtils.stringToTimestamp(str, options.zoneId).getOrElse {
Contributor: I think the issue here is: if the timestamp parsing fails, maybe it's because this is a date, or maybe it's a legacy timestamp format. We need to define the priority here. Since inferDate is opt-in, I think it makes more sense to try parsing as a date first, then the legacy format.

cc @sadikovi

@MaxGekk (Member), Jul 22, 2022: Just wondering: have all the issues mentioned by @HyukjinKwon in my PR #23202 (comment) been addressed by this PR?

@sadikovi (Contributor), Jul 24, 2022: Agreed. We should address the order; otherwise, it is unclear how to handle the fallback. Fixed here: 10ca4a4.

// There may be date type entries in timestamp column due to schema inference
if (options.inferDate) {
daysToMicros(dateFormatter.parse(datum), options.zoneId)
Comment on lines +225 to +226:

@Jonathancui123 (Contributor, Author), Jun 22, 2022: We do not use the legacy DateFormatter here, to avoid parsing timestamps with invalid suffixes. We want to throw an error when invalid timestamps are given.

For example, the legacy DateFormatter will parse the following string without throwing an error:
dateFormat: yyyy-MM-dd
string: 2001-09-08-randomtext

Member: Yeah, I think it makes sense to throw an exception or to disallow this when the legacy parser (which doesn't care about suffixes) is used.

Contributor: > We do not use the legacy DateFormatter here to avoid parsing timestamps with invalid suffixes.

I think you could still make it work, but you would need a new extension of LegacySimpleDateFormatter (maybe LegacyStrictSimpleDateFormatter), with an override like this:

  def parseToDate(s: String): Date = {
    val pp = new ParsePosition(0)
    val res = sdf.parse(s, pp)
    if (s.length != pp.getIndex) {
      throw new RuntimeException(s"$s is not a date")
    }
    res
  }

2001-09-08-randomtext would not parse, and neither would 2022-01-02 12:56:33, but 2022-01-02 would (assuming a format of yyyy-MM-dd).

I assume it would be slow (but I have not tested it).

Maybe not worth the extra code.
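A self-contained version of that strict check, with a plain java.text.SimpleDateFormat standing in for the formatter's sdf (an illustrative sketch, not the PR's code):

import java.text.{ParsePosition, SimpleDateFormat}
import java.util.Date

val sdf = new SimpleDateFormat("yyyy-MM-dd")

// Strict parse: succeed only if the pattern consumes the entire input string.
def parseToDate(s: String): Date = {
  val pp = new ParsePosition(0)
  val res = sdf.parse(s, pp)
  if (res == null || s.length != pp.getIndex) {
    throw new RuntimeException(s"$s is not a date")
  }
  res
}

parseToDate("2022-01-02")                // parses
// parseToDate("2001-09-08-randomtext")  // would throw: trailing characters unconsumed
// parseToDate("2022-01-02 12:56:33")    // would throw: time suffix unconsumed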

Contributor Author: @bersprockets Thanks for the suggestion! Do you know what the advantage of allowing the legacy formatter is? That is, what is a date format that the legacy formatter can handle but the current formatter cannot?

I'm wondering whether there will be a sufficient population of users who want to infer dates in the schema and also use legacy date formats.

cc: @Yaohua628

Contributor: > Do you know what the advantage of allowing the legacy formatter is?

One benefit of the legacy formatter is that it recognizes some pre-Gregorian leap years (like 1500-02-29) that exist only in the hybrid Julian calendar. Note how schema inference chooses string until you set the legacy parser:

scala> val csvInput = Seq("1425-03-22T00:00:00", "2022-01-01T00:00:00", "1500-02-29T00:00:00").toDS()
csvInput: org.apache.spark.sql.Dataset[String] = [value: string]

scala> spark.read.options(Map("inferSchema" -> "true", "timestampFormat" -> "yyyy-MM-dd'T'HH:mm:ss")).csv(csvInput).printSchema
root
 |-- _c0: string (nullable = true)

scala> sql("set spark.sql.legacy.timeParserPolicy=legacy")
res1: org.apache.spark.sql.DataFrame = [key: string, value: string]

scala> spark.read.options(Map("inferSchema" -> "true", "timestampFormat" -> "yyyy-MM-dd'T'HH:mm:ss")).csv(csvInput).printSchema
root
 |-- _c0: timestamp (nullable = true)

scala> 

That, of course, matters only if the application's input comes from legacy systems that still use hybrid Julian, and the input contains pre-Gregorian dates (e.g., for date encoding, which is the only real-world use case I have come across). I would imagine that audience is small and probably getting smaller.

Contributor: > I think you could still make it work, but you would need a new extension of LegacySimpleDateFormatter

By the way, to avoid confusion, I meant the above in the context of inferring dates when using the legacy parser (I realize now that this discussion is happening in reference to code changes in UnivocityParser).

Contributor Author: Thanks Bruce! This is great context, and it will definitely be necessary if we want to support inference along with legacy date formats. Users on legacy dates are unaffected by this change; how about we open another ticket for date inference with legacy formats if the demand exists, and merge this PR without legacy date inference support?

} else {
throw(e)
}
}
}
}

case _: TimestampNTZType => (d: String) =>
nullSafeDatum(d, name, nullable, options) { datum =>
try {
timestampNTZFormatter.parseWithoutTimeZone(datum, false)
} catch {
case NonFatal(e) if (options.inferDate) =>
daysToMicros(dateFormatter.parse(datum), TimeZoneUTC.toZoneId)
}
}
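To make the fallback concrete: in Spark, DateFormatter.parse returns the day count since the Unix epoch, and daysToMicros converts that day count to microseconds at the given zone, which at UTC reduces to a multiplication. A rough worked check of the "08_09_2001" (dd_MM_yyyy) case used in the test suite below, under those assumptions:

import java.time.LocalDate

// 2001-09-08 as days since 1970-01-01 (what dateFormatter.parse returns)
val days = LocalDate.of(2001, 9, 8).toEpochDay      // 11573
// daysToMicros(days, UTC) is then days * microseconds-per-day
val micros = days * 24L * 60 * 60 * 1000 * 1000     // 999907200000000L, i.e. 2001-09-08T00:00:00Z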

QueryExecutionErrors.scala
@@ -34,7 +34,7 @@ import org.apache.hadoop.fs.permission.FsPermission
import org.codehaus.commons.compiler.CompileException
import org.codehaus.janino.InternalCompilerException

import org.apache.spark.{Partition, SparkArithmeticException, SparkArrayIndexOutOfBoundsException, SparkClassNotFoundException, SparkConcurrentModificationException, SparkDateTimeException, SparkException, SparkFileAlreadyExistsException, SparkFileNotFoundException, SparkIllegalArgumentException, SparkIndexOutOfBoundsException, SparkNoSuchElementException, SparkNoSuchMethodException, SparkNumberFormatException, SparkRuntimeException, SparkSecurityException, SparkSQLException, SparkSQLFeatureNotSupportedException, SparkUnsupportedOperationException, SparkUpgradeException}
import org.apache.spark.{Partition, SparkArithmeticException, SparkArrayIndexOutOfBoundsException, SparkClassNotFoundException, SparkConcurrentModificationException, SparkDateTimeException, SparkException, SparkFileAlreadyExistsException, SparkFileNotFoundException, SparkIllegalArgumentException, SparkIndexOutOfBoundsException, SparkNoSuchElementException, SparkNoSuchMethodException, SparkNumberFormatException, SparkRuntimeException, SparkSecurityException, SparkSQLException, SparkSQLFeatureNotSupportedException, SparkThrowable, SparkUnsupportedOperationException, SparkUpgradeException}
import org.apache.spark.executor.CommitDeniedException
import org.apache.spark.launcher.SparkLauncher
import org.apache.spark.memory.SparkOutOfMemoryError
@@ -529,6 +529,12 @@ private[sql] object QueryExecutionErrors extends QueryErrorsBase {
""".stripMargin)
}

def inferDateWithLegacyTimeParserError(): Throwable with SparkThrowable = {
new SparkIllegalArgumentException(errorClass = "CANNOT_INFER_DATE",
messageParameters = Array()
)
}
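A sketch of how this error surfaces, mirroring the CSVSuite test added below:

// With spark.sql.legacy.timeParserPolicy=LEGACY, enabling the option fails fast:
//   spark.read.options(Map("inferSchema" -> "true", "inferDate" -> "true")).csv(path)
// throws SparkIllegalArgumentException with error class CANNOT_INFER_DATE (SQLSTATE 22007)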

def streamedOperatorUnsupportedByDataSourceError(
className: String, operator: String): Throwable = {
new UnsupportedOperationException(
CSVInferSchemaSuite.scala
@@ -109,6 +109,12 @@ class CSVInferSchemaSuite extends SparkFunSuite with SQLHelper {
assert(
inferSchema.mergeRowTypes(Array(DoubleType),
Array(LongType)).sameElements(Array(DoubleType)))
assert(
inferSchema.mergeRowTypes(Array(DateType),
Array(TimestampNTZType)).sameElements(Array(TimestampNTZType)))
assert(
inferSchema.mergeRowTypes(Array(DateType),
Array(TimestampType)).sameElements(Array(TimestampType)))
}

test("Null fields are handled properly when a nullValue is specified") {
@@ -192,4 +198,53 @@
Seq("en-US").foreach(checkDecimalInfer(_, StringType))
Seq("ko-KR", "ru-RU", "de-DE").foreach(checkDecimalInfer(_, DecimalType(7, 0)))
}

test("SPARK-39469: inferring date type") {
// "yyyy/MM/dd" format
var options = new CSVOptions(Map("dateFormat" -> "yyyy/MM/dd", "inferDate" -> "true"),
false, "UTC")
var inferSchema = new CSVInferSchema(options)
assert(inferSchema.inferField(NullType, "2018/12/02") == DateType)
// "MMM yyyy" format
options = new CSVOptions(Map("dateFormat" -> "MMM yyyy", "inferDate" -> "true"),
false, "GMT")
inferSchema = new CSVInferSchema(options)
assert(inferSchema.inferField(NullType, "Dec 2018") == DateType)
// Field should strictly match date format to infer as date
options = new CSVOptions(
Map("dateFormat" -> "yyyy-MM-dd", "timestampFormat" -> "yyyy-MM-dd'T'HH:mm:ss",
"inferDate" -> "true"),
columnPruning = false,
defaultTimeZoneId = "GMT")
inferSchema = new CSVInferSchema(options)
assert(inferSchema.inferField(NullType, "2018-12-03T11:00:00") == TimestampType)
assert(inferSchema.inferField(NullType, "2018-12-03") == DateType)
}

test("SPARK-39469: inferring date and timestamp types in a mixed column with inferDate=true") {
var options = new CSVOptions(
Map("dateFormat" -> "yyyy_MM_dd", "timestampFormat" -> "yyyy|MM|dd",
"timestampNTZFormat" -> "yyyy/MM/dd", "inferDate" -> "true"),
columnPruning = false,
defaultTimeZoneId = "UTC")
var inferSchema = new CSVInferSchema(options)
assert(inferSchema.inferField(DateType, "2012_12_12") == DateType)
assert(inferSchema.inferField(DateType, "2003|01|01") == TimestampType)
// SQL configuration must be set to default to TimestampNTZ
withSQLConf(SQLConf.TIMESTAMP_TYPE.key -> "TIMESTAMP_NTZ") {
assert(inferSchema.inferField(DateType, "2003/02/05") == TimestampNTZType)
}

// inferField should upgrade a date field to timestamp if the typeSoFar is a timestamp
assert(inferSchema.inferField(TimestampNTZType, "2012_12_12") == TimestampNTZType)
assert(inferSchema.inferField(TimestampType, "2018_12_03") == TimestampType)

// No errors when Date and Timestamp have the same format. Inference defaults to date
options = new CSVOptions(
Map("dateFormat" -> "yyyy_MM_dd", "timestampFormat" -> "yyyy_MM_dd"),
columnPruning = false,
defaultTimeZoneId = "UTC")
inferSchema = new CSVInferSchema(options)
assert(inferSchema.inferField(DateType, "2012_12_12") == DateType)
}
}
UnivocityParserSuite.scala
@@ -19,6 +19,7 @@ package org.apache.spark.sql.catalyst.csv

import java.math.BigDecimal
import java.text.{DecimalFormat, DecimalFormatSymbols}
import java.time.{ZoneOffset}
import java.util.{Locale, TimeZone}

import org.apache.commons.lang3.time.FastDateFormat
@@ -358,4 +359,26 @@ class UnivocityParserSuite extends SparkFunSuite with SQLHelper {
Map("timestampFormat" -> "invalid", "dateFormat" -> "invalid"), false, "UTC")
check(new UnivocityParser(StructType(Seq.empty), optionsWithPattern))
}

test("SPARK-39469: dates should be parsed correctly in a timestamp column when inferDate=true") {
def checkDate(dataType: DataType): Unit = {
val timestampsOptions =
new CSVOptions(Map("inferDate" -> "true", "timestampFormat" -> "dd/MM/yyyy HH:mm",
"timestampNTZFormat" -> "dd-MM-yyyy HH:mm", "dateFormat" -> "dd_MM_yyyy"),
Member: One test we might need would be: timestampFormat -> "dd/MM/yyyy HH:mm" and dateFormat -> "dd/MM/yyyy", to make sure timestamps are not parsed as date types without conflicting.

@bersprockets (Contributor), Jun 16, 2022: > to make sure timestamps are not parsed as date types without conflicting.

That's actually what happens:

Before this PR:

scala> val csvInput = Seq("0,2012-01-01 12:00:00", "1,2021-07-01 15:00:00").toDS()
csvInput: org.apache.spark.sql.Dataset[String] = [value: string]

scala> val df = spark.read.option("inferSchema", "true").csv(csvInput)
df: org.apache.spark.sql.DataFrame = [_c0: int, _c1: timestamp]

scala> df.printSchema
root
 |-- _c0: integer (nullable = true)
 |-- _c1: timestamp (nullable = true)

scala> 

After this PR:

scala> val csvInput = Seq("0,2012-01-01 12:00:00", "1,2021-07-01 15:00:00").toDS()
csvInput: org.apache.spark.sql.Dataset[String] = [value: string]

scala> val df = spark.read.option("inferSchema", "true").csv(csvInput)
df: org.apache.spark.sql.DataFrame = [_c0: int, _c1: date]

scala> df.printSchema
root
 |-- _c0: integer (nullable = true)
 |-- _c1: date (nullable = true)

scala>

It looks like some tests fail too, like CSVInferSchemaSuite and CSVv1Suite, possibly others. (I ran these two suites on my laptop. For some reason, the GitHub Actions didn't run tests for this PR. Maybe @Jonathancui123 needs to turn them on in his fork?)

> We should probably 1. add either SQL configuration or an option e.g., infersDate

I think you would need something like that: when set, the date formatter could use the slower, more strict method of parsing (so "2012-01-01 12:00:00" wouldn't parse as a date).

Edit: To do strict parsing, one might need to use ParsePosition and check that the whole date/time value string was consumed. Even after setting lenient=false, SimpleDateFormat.parse didn't complain about extra characters that weren't consumed.

Contributor Author: I addressed the inference mistakes in the following code snippet and comment.

false, DateTimeUtils.getZoneId("-08:00").toString)
// Use CSVOption ZoneId="-08:00" (PST) to test that Dates in TimestampNTZ column are always
// converted to their equivalent UTC timestamp
val dateString = "08_09_2001"
val expected = dataType match {
case TimestampType => date(2001, 9, 8, 0, 0, 0, 0, ZoneOffset.of("-08:00"))
case TimestampNTZType => date(2001, 9, 8, 0, 0, 0, 0, ZoneOffset.UTC)
Comment on lines +368 to +374:

@Jonathancui123 (Contributor, Author), Jun 28, 2022: > I think zoneId should probably be UTC for timestamp_ntz. Otherwise, you end up with oddities like this...

@bersprockets I've modified this test to have the user in PST and to check that the parsed date is converted to a timestamp in UTC. This checks for the error you caught in your previous comment. Thanks!

I think the PR should be ready to merge. Please let me know if there's anything else we need to fix. I'll keep an eye on the GitHub Actions results.

case DateType => days(2001, 9, 8)
}
val parser = new UnivocityParser(new StructType(), timestampsOptions)
assert(parser.makeConverter("d", dataType).apply(dateString) == expected)
}
checkDate(TimestampType)
checkDate(TimestampNTZType)
checkDate(DateType)
}
}
4 changes: 4 additions & 0 deletions sql/core/src/test/resources/test-data/date-infer-schema.csv
@@ -0,0 +1,4 @@
date,timestamp-date,date-timestamp
2001-09-08,2014-10-27T18:30:00,1765-03-28
1941-01-02,2000-09-14T01:01:00,1423-11-12T23:41:00
0293-11-07,1995-06-25,2016-01-28T20:00:00
CSVSuite.scala
@@ -41,6 +41,7 @@ import org.apache.spark.sql.{AnalysisException, Column, DataFrame, Encoders, Que
import org.apache.spark.sql.catalyst.util.{DateTimeTestUtils, DateTimeUtils}
import org.apache.spark.sql.execution.datasources.CommonFileDataSourceSuite
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.internal.SQLConf.LegacyBehaviorPolicy
import org.apache.spark.sql.test.SharedSparkSession
import org.apache.spark.sql.types._

@@ -74,6 +75,7 @@ abstract class CSVSuite
private val simpleSparseFile = "test-data/simple_sparse.csv"
private val numbersFile = "test-data/numbers.csv"
private val datesFile = "test-data/dates.csv"
private val dateInferSchemaFile = "test-data/date-infer-schema.csv"
private val unescapedQuotesFile = "test-data/unescaped-quotes.csv"
private val valueMalformedFile = "test-data/value-malformed.csv"
private val badAfterGoodFile = "test-data/bad_after_good.csv"
@@ -2788,6 +2790,56 @@
}
}
}

test("SPARK-39469: Infer schema for date type") {
val options1 = Map(
"header" -> "true",
"inferSchema" -> "true",
"timestampFormat" -> "yyyy-MM-dd'T'HH:mm:ss",
"dateFormat" -> "yyyy-MM-dd",
"inferDate" -> "true")
val options2 = Map(
"header" -> "true",
"inferSchema" -> "true",
"inferDate" -> "true")

// Error should be thrown when attempting to inferDate with Legacy parser
if (SQLConf.get.legacyTimeParserPolicy == LegacyBehaviorPolicy.LEGACY) {
val msg = intercept[IllegalArgumentException] {
spark.read
.format("csv")
.options(options1)
.load(testFile(dateInferSchemaFile))
}.getMessage
assert(msg.contains("CANNOT_INFER_DATE"))
} else {
// 1. Specify date format and timestamp format
// 2. Date inference should work with default date format when dateFormat is not provided
Seq(options1, options2).foreach {options =>
val results = spark.read
.format("csv")
.options(options)
.load(testFile(dateInferSchemaFile))

val expectedSchema = StructType(List(StructField("date", DateType),
StructField("timestamp-date", TimestampType),
StructField("date-timestamp", TimestampType)))
assert(results.schema == expectedSchema)

val expected =
Seq(
Seq(Date.valueOf("2001-9-8"), Timestamp.valueOf("2014-10-27 18:30:0.0"),
Timestamp.valueOf("1765-03-28 00:00:0.0")),
Seq(Date.valueOf("1941-1-2"), Timestamp.valueOf("2000-09-14 01:01:0.0"),
Timestamp.valueOf("1423-11-12 23:41:0.0")),
Seq(Date.valueOf("0293-11-7"), Timestamp.valueOf("1995-06-25 00:00:00.0"),
Timestamp.valueOf("2016-01-28 20:00:00.0"))
)
assert(results.collect().toSeq.map(_.toSeq) == expected)
}

}
}
}

class CSVv1Suite extends CSVSuite {