Skip to content

Commit b14abb3

Browse files
liujiayi771 authored and cloud-fan committed
[SPARK-48241][SQL] CSV parsing failure with char/varchar type columns
### What changes were proposed in this pull request?

A CSV table containing char and varchar columns results in the following error when selecting from the table:

```
spark-sql (default)> show create table test_csv;
CREATE TABLE default.test_csv (
  id INT,
  name CHAR(10))
USING csv
```

```
java.lang.IllegalArgumentException: requirement failed: requiredSchema (struct<id:int,name:string>) should be the subset of dataSchema (struct<id:int,name:string>).
    at scala.Predef$.require(Predef.scala:281)
    at org.apache.spark.sql.catalyst.csv.UnivocityParser.<init>(UnivocityParser.scala:56)
    at org.apache.spark.sql.execution.datasources.csv.CSVFileFormat.$anonfun$buildReader$2(CSVFileFormat.scala:127)
    at org.apache.spark.sql.execution.datasources.FileFormat$$anon$1.apply(FileFormat.scala:155)
    at org.apache.spark.sql.execution.datasources.FileFormat$$anon$1.apply(FileFormat.scala:140)
    at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.org$apache$spark$sql$execution$datasources$FileScanRDD$$anon$$readCurrentFile(FileScanRDD.scala:231)
    at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:293)
    at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:125)
```

### Why are the changes needed?

For char and varchar types, Spark converts them to `StringType` in `CharVarcharUtils.replaceCharVarcharWithStringInSchema` and records `__CHAR_VARCHAR_TYPE_STRING` in the metadata. The error above occurs because the `StringType` columns in the `dataSchema` and the `requiredSchema` of `UnivocityParser` are not consistent: the `StringType` in the `dataSchema` has metadata, while the metadata in the `requiredSchema` is empty. That is why the two schemas print identically in the error message yet fail the subset check. We need to retain the metadata when resolving the schema.

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

Added a new test case in `CSVSuite`.

### Was this patch authored or co-authored using generative AI tooling?

No.
Closes #46537 from liujiayi771/csv-char.

Authored-by: joey.ljy <[email protected]>
Signed-off-by: Wenchen Fan <[email protected]>
1 parent 3456d4f commit b14abb3

File tree

3 files changed

+31
-1
lines changed

3 files changed

+31
-1
lines changed

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -118,7 +118,9 @@ abstract class LogicalPlan
118118
def resolve(schema: StructType, resolver: Resolver): Seq[Attribute] = {
119119
schema.map { field =>
120120
resolve(field.name :: Nil, resolver).map {
121-
case a: AttributeReference => a
121+
case a: AttributeReference =>
122+
// Keep the metadata in given schema.
123+
a.withMetadata(field.metadata)
122124
case _ => throw QueryExecutionErrors.resolveCannotHandleNestedSchema(this)
123125
}.getOrElse {
124126
throw QueryCompilationErrors.cannotResolveAttributeError(
test-data/char.csv (new test resource; path as referenced by CSVSuite)

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
color,name
2+
pink,Bob
3+
blue,Mike
4+
grey,Tom

sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,7 @@ abstract class CSVSuite
8080
private val valueMalformedFile = "test-data/value-malformed.csv"
8181
private val badAfterGoodFile = "test-data/bad_after_good.csv"
8282
private val malformedRowFile = "test-data/malformedRow.csv"
83+
private val charFile = "test-data/char.csv"
8384

8485
/** Verifies data and schema. */
8586
private def verifyCars(
@@ -3342,6 +3343,29 @@ abstract class CSVSuite
33423343
expected)
33433344
}
33443345
}
3346+
3347+
test("SPARK-48241: CSV parsing failure with char/varchar type columns") {
3348+
withTable("charVarcharTable") {
3349+
spark.sql(
3350+
s"""
3351+
|CREATE TABLE charVarcharTable(
3352+
| color char(4),
3353+
| name varchar(10))
3354+
|USING csv
3355+
|OPTIONS (
3356+
| header "true",
3357+
| path "${testFile(charFile)}"
3358+
|)
3359+
""".stripMargin)
3360+
val expected = Seq(
3361+
Row("pink", "Bob"),
3362+
Row("blue", "Mike"),
3363+
Row("grey", "Tom"))
3364+
checkAnswer(
3365+
sql("SELECT * FROM charVarcharTable"),
3366+
expected)
3367+
}
3368+
}
33453369
}
33463370

33473371
class CSVv1Suite extends CSVSuite {

0 commit comments

Comments
 (0)