Skip to content

Commit 19d12b2

Browse files
liujiayi771 authored and cloud-fan committed
[SPARK-48241][SQL][3.5] CSV parsing failure with char/varchar type columns
### What changes were proposed in this pull request?

Selecting from a CSV table that contains char or varchar columns results in an error. For example, given the following table:

```
spark-sql (default)> show create table test_csv;
CREATE TABLE default.test_csv (
  id INT,
  name CHAR(10))
USING csv
```

selecting from it fails with:

```
java.lang.IllegalArgumentException: requirement failed: requiredSchema (struct<id:int,name:string>) should be the subset of dataSchema (struct<id:int,name:string>).
    at scala.Predef$.require(Predef.scala:281)
    at org.apache.spark.sql.catalyst.csv.UnivocityParser.<init>(UnivocityParser.scala:56)
    at org.apache.spark.sql.execution.datasources.csv.CSVFileFormat.$anonfun$buildReader$2(CSVFileFormat.scala:127)
    at org.apache.spark.sql.execution.datasources.FileFormat$$anon$1.apply(FileFormat.scala:155)
    at org.apache.spark.sql.execution.datasources.FileFormat$$anon$1.apply(FileFormat.scala:140)
    at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.org$apache$spark$sql$execution$datasources$FileScanRDD$$anon$$readCurrentFile(FileScanRDD.scala:231)
    at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:293)
    at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:125)
```

### Why are the changes needed?

For char and varchar types, Spark converts them to `StringType` in `CharVarcharUtils.replaceCharVarcharWithStringInSchema` and records `__CHAR_VARCHAR_TYPE_STRING` in the metadata. The above error occurs because the `StringType` columns in the `dataSchema` and the `requiredSchema` of `UnivocityParser` are not consistent: the `StringType` in the `dataSchema` has metadata, while the metadata in the `requiredSchema` is empty. We need to retain the metadata when resolving the schema.

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

Added a new test case in `CSVSuite`.

### Was this patch authored or co-authored using generative AI tooling?

No.
Closes #46565 from liujiayi771/branch-3.5-SPARK-48241.

Authored-by: joey.ljy <[email protected]>
Signed-off-by: Wenchen Fan <[email protected]>
1 parent ab511a7 commit 19d12b2

File tree

3 files changed

+31
-1
lines changed

3 files changed

+31
-1
lines changed

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -116,7 +116,9 @@ abstract class LogicalPlan
116116
def resolve(schema: StructType, resolver: Resolver): Seq[Attribute] = {
117117
schema.map { field =>
118118
resolve(field.name :: Nil, resolver).map {
119-
case a: AttributeReference => a
119+
case a: AttributeReference =>
120+
// Keep the metadata in given schema.
121+
a.withMetadata(field.metadata)
120122
case _ => throw QueryExecutionErrors.resolveCannotHandleNestedSchema(this)
121123
}.getOrElse {
122124
throw QueryCompilationErrors.cannotResolveAttributeError(

test-data/char.csv (new test resource; full repository path not shown in this view)

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
color,name
2+
pink,Bob
3+
blue,Mike
4+
grey,Tom

sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,7 @@ abstract class CSVSuite
8080
private val valueMalformedFile = "test-data/value-malformed.csv"
8181
private val badAfterGoodFile = "test-data/bad_after_good.csv"
8282
private val malformedRowFile = "test-data/malformedRow.csv"
83+
private val charFile = "test-data/char.csv"
8384

8485
/** Verifies data and schema. */
8586
private def verifyCars(
@@ -3226,6 +3227,29 @@ abstract class CSVSuite
32263227
}
32273228
}
32283229
}
3230+
3231+
test("SPARK-48241: CSV parsing failure with char/varchar type columns") {
3232+
withTable("charVarcharTable") {
3233+
spark.sql(
3234+
s"""
3235+
|CREATE TABLE charVarcharTable(
3236+
| color char(4),
3237+
| name varchar(10))
3238+
|USING csv
3239+
|OPTIONS (
3240+
| header "true",
3241+
| path "${testFile(charFile)}"
3242+
|)
3243+
""".stripMargin)
3244+
val expected = Seq(
3245+
Row("pink", "Bob"),
3246+
Row("blue", "Mike"),
3247+
Row("grey", "Tom"))
3248+
checkAnswer(
3249+
sql("SELECT * FROM charVarcharTable"),
3250+
expected)
3251+
}
3252+
}
32293253
}
32303254

32313255
class CSVv1Suite extends CSVSuite {

0 commit comments

Comments
 (0)