
Commit c6a5bf6

Add Hive datatype (char/varchar) to struct field metadata. This fixes issues with char/varchar columns in ORC.
Parent: 2f3c20b
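
In effect, a CHAR or VARCHAR column keeps StringType as its Spark SQL type, while the original Hive type travels with the field. A minimal stand-alone sketch against the public org.apache.spark.sql.types API, mirroring the expectations added to TableScanSuite below (the field name and the varchar(12) size are illustrative, not part of the commit):

import org.apache.spark.sql.types._

// Illustrative only: how a VARCHAR(12) column looks after this change. The
// Spark-facing type stays StringType; the original Hive type is kept in the
// field metadata under the HIVE_TYPE_STRING key defined by this commit.
val varcharMeta = new MetadataBuilder()
  .putString(HIVE_TYPE_STRING, "varchar(12)")
  .build()

val varcharField = StructField("varcharField", StringType, true, varcharMeta)

assert(varcharField.dataType == StringType)
assert(varcharField.metadata.getString(HIVE_TYPE_STRING) == "varchar(12)")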

5 files changed: 59 additions, 5 deletions


sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala

Lines changed: 25 additions & 2 deletions
@@ -1457,8 +1457,31 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with Logging {
    */
   override def visitColType(ctx: ColTypeContext): StructField = withOrigin(ctx) {
     import ctx._
-    val structField = StructField(identifier.getText, typedVisit(dataType), nullable = true)
-    if (STRING == null) structField else structField.withComment(string(STRING))
+
+    val builder = new MetadataBuilder
+    // Add comment to metadata
+    if (STRING != null) {
+      builder.putString("comment", string(STRING))
+    }
+    // Add Hive type string to metadata.
+    dataType match {
+      case p: PrimitiveDataTypeContext =>
+        val dt = p.identifier.getText.toLowerCase
+        (dt, p.INTEGER_VALUE().asScala.toList) match {
+          case ("varchar" | "char", Nil) =>
+            builder.putString(HIVE_TYPE_STRING, dt)
+          case ("varchar" | "char", size :: Nil) =>
+            builder.putString(HIVE_TYPE_STRING, dt + "(" + size.getText + ")")
+          case _ =>
+        }
+      case _ =>
+    }
+
+    StructField(
+      identifier.getText,
+      typedVisit(dataType),
+      nullable = true,
+      builder.build())
   }
 
   /**
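
The (type name, size arguments) match above carries the whole parser-side change. A detached sketch of that mapping, pulled out of the ANTLR ColTypeContext so it can be run on its own (the helper name and its plain String/List parameters are illustrative, not part of the commit):

// Same decision as visitColType: only char/varchar get a Hive type string,
// with or without an explicit length argument.
def hiveTypeString(typeName: String, sizeArgs: List[String]): Option[String] =
  (typeName.toLowerCase, sizeArgs) match {
    case ("varchar" | "char", Nil)         => Some(typeName.toLowerCase)
    case ("varchar" | "char", size :: Nil) => Some(s"${typeName.toLowerCase}($size)")
    case _                                 => None  // other types carry no Hive type string
  }

assert(hiveTypeString("CHAR", List("18")) == Some("char(18)"))
assert(hiveTypeString("varchar", Nil) == Some("varchar"))
assert(hiveTypeString("int", Nil) == None)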

sql/catalyst/src/main/scala/org/apache/spark/sql/types/package.scala

Lines changed: 7 additions & 1 deletion
@@ -21,4 +21,10 @@ package org.apache.spark.sql
  * Contains a type system for attributes produced by relations, including complex types like
  * structs, arrays and maps.
  */
-package object types
+package object types {
+  /**
+   * Metadata key used to store the Hive type name. This is relevant for datatypes that do not
+   * have a direct Spark SQL counterpart, such as CHAR and VARCHAR.
+   */
+  val HIVE_TYPE_STRING = "HIVE_TYPE_STRING"
+}
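
With the key published from the types package object, downstream code that needs the Hive-facing type name can look it up in the field metadata and fall back to the Spark type otherwise. A hedged sketch (the helper below is illustrative, not an API added by this commit):

import org.apache.spark.sql.types._

// Prefer the preserved Hive type (e.g. "char(18)", "varchar(12)"); otherwise fall
// back to the Spark SQL type's simple string (e.g. "string", "int").
def hiveTypeName(field: StructField): String =
  if (field.metadata.contains(HIVE_TYPE_STRING)) field.metadata.getString(HIVE_TYPE_STRING)
  else field.dataType.simpleString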

sql/core/src/test/scala/org/apache/spark/sql/sources/TableScanSuite.scala

Lines changed: 6 additions & 2 deletions
@@ -203,6 +203,10 @@ class TableScanSuite extends DataSourceTest with SharedSQLContext {
     (2 to 10).map(i => Row(i, i - 1)).toSeq)
 
   test("Schema and all fields") {
+    def hiveMetadata(dt: String): Metadata = {
+      new MetadataBuilder().putString("HIVE_TYPE_STRING", dt).build()
+    }
+
     val expectedSchema = StructType(
       StructField("string$%Field", StringType, true) ::
       StructField("binaryField", BinaryType, true) ::
@@ -217,8 +221,8 @@
       StructField("decimalField2", DecimalType(9, 2), true) ::
       StructField("dateField", DateType, true) ::
       StructField("timestampField", TimestampType, true) ::
-      StructField("varcharField", StringType, true) ::
-      StructField("charField", StringType, true) ::
+      StructField("varcharField", StringType, true, hiveMetadata("varchar(12)")) ::
+      StructField("charField", StringType, true, hiveMetadata("char(18)")) ::
       StructField("arrayFieldSimple", ArrayType(IntegerType), true) ::
       StructField("arrayFieldComplex",
         ArrayType(
Binary file (395 Bytes) not shown.

sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcSourceSuite.scala

Lines changed: 21 additions & 0 deletions
@@ -162,6 +162,27 @@ abstract class OrcSuite extends QueryTest with TestHiveSingleton with BeforeAndAfterAll {
       hiveClient.runSqlHive("DROP TABLE IF EXISTS orc_varchar")
     }
   }
+
+  test("read varchar column from orc tables created by hive") {
+    try {
+      // This is an ORC file with a single VARCHAR(10) column that's created using Hive 1.2.1
+      val hiveOrc = new File(Thread.currentThread().getContextClassLoader
+        .getResource(s"data/files/orc/").getFile).toURI
+      sql(
+        s"""
+           |CREATE EXTERNAL TABLE test_hive_orc(
+           |  a STRING,
+           |  b CHAR(10),
+           |  c VARCHAR(10)
+           |)
+           |STORED AS ORC
+           |LOCATION '$hiveOrc'
+         """.stripMargin)
+      checkAnswer(spark.table("test_hive_orc"), Row("a", "b         ", "c"))
+    } finally {
+      sql("DROP TABLE IF EXISTS test_hive_orc")
+    }
+  }
 }
 
 class OrcSourceSuite extends OrcSuite {
