Skip to content

Commit 7514db1

Browse files
HyukjinKwon authored and srowen committed
[SPARK-21263][SQL] Do not allow partially parsing double and floats via NumberFormat in CSV
## What changes were proposed in this pull request? This PR proposes to remove `NumberFormat.parse` use to disallow a case of partially parsed data. For example, ``` scala> spark.read.schema("a DOUBLE").option("mode", "FAILFAST").csv(Seq("10u12").toDS).show() +----+ | a| +----+ |10.0| +----+ ``` ## How was this patch tested? Unit tests added in `UnivocityParserSuite` and `CSVSuite`. Author: hyukjinkwon <[email protected]> Closes #18532 from HyukjinKwon/SPARK-21263.
1 parent a4baa8f commit 7514db1

File tree

3 files changed

+34
-16
lines changed

3 files changed

+34
-16
lines changed

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -111,19 +111,15 @@ class UnivocityParser(
111111
case options.nanValue => Float.NaN
112112
case options.negativeInf => Float.NegativeInfinity
113113
case options.positiveInf => Float.PositiveInfinity
114-
case datum =>
115-
Try(datum.toFloat)
116-
.getOrElse(NumberFormat.getInstance(Locale.US).parse(datum).floatValue())
114+
case datum => datum.toFloat
117115
}
118116

119117
case _: DoubleType => (d: String) =>
120118
nullSafeDatum(d, name, nullable, options) {
121119
case options.nanValue => Double.NaN
122120
case options.negativeInf => Double.NegativeInfinity
123121
case options.positiveInf => Double.PositiveInfinity
124-
case datum =>
125-
Try(datum.toDouble)
126-
.getOrElse(NumberFormat.getInstance(Locale.US).parse(datum).doubleValue())
122+
case datum => datum.toDouble
127123
}
128124

129125
case _: BooleanType => (d: String) =>

sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1174,4 +1174,25 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils {
11741174
}
11751175
}
11761176
}
1177+
1178+
test("SPARK-21263: Invalid float and double are handled correctly in different modes") {
1179+
val exception = intercept[SparkException] {
1180+
spark.read.schema("a DOUBLE")
1181+
.option("mode", "FAILFAST")
1182+
.csv(Seq("10u12").toDS())
1183+
.collect()
1184+
}
1185+
assert(exception.getMessage.contains("""input string: "10u12""""))
1186+
1187+
val count = spark.read.schema("a FLOAT")
1188+
.option("mode", "DROPMALFORMED")
1189+
.csv(Seq("10u12").toDS())
1190+
.count()
1191+
assert(count == 0)
1192+
1193+
val results = spark.read.schema("a FLOAT")
1194+
.option("mode", "PERMISSIVE")
1195+
.csv(Seq("10u12").toDS())
1196+
checkAnswer(results, Row(null))
1197+
}
11771198
}

sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParserSuite.scala

Lines changed: 11 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -130,16 +130,17 @@ class UnivocityParserSuite extends SparkFunSuite {
130130
DateTimeUtils.millisToDays(DateTimeUtils.stringToTime("2015-01-01").getTime))
131131
}
132132

133-
test("Float and Double Types are cast without respect to platform default Locale") {
134-
val originalLocale = Locale.getDefault
135-
try {
136-
Locale.setDefault(new Locale("fr", "FR"))
137-
// Would parse as 1.0 in fr-FR
138-
val options = new CSVOptions(Map.empty[String, String], "GMT")
139-
assert(parser.makeConverter("_1", FloatType, options = options).apply("1,00") == 100.0)
140-
assert(parser.makeConverter("_1", DoubleType, options = options).apply("1,00") == 100.0)
141-
} finally {
142-
Locale.setDefault(originalLocale)
133+
test("Throws exception for casting an invalid string to Float and Double Types") {
134+
val options = new CSVOptions(Map.empty[String, String], "GMT")
135+
val types = Seq(DoubleType, FloatType)
136+
val input = Seq("10u000", "abc", "1 2/3")
137+
types.foreach { dt =>
138+
input.foreach { v =>
139+
val message = intercept[NumberFormatException] {
140+
parser.makeConverter("_1", dt, options = options).apply(v)
141+
}.getMessage
142+
assert(message.contains(v))
143+
}
143144
}
144145
}
145146

0 commit comments

Comments (0)