Skip to content

Commit 69626ad

Browse files
committed
[SPARK-16632][SQL] Revert PR #14272: Respect Hive schema when merging parquet schema
## What changes were proposed in this pull request?

PR #14278 is a more general and simpler fix for SPARK-16632 than PR #14272. After merging #14278, we no longer need the changes made in #14272. So here I revert them.

This PR targets both master and branch-2.0.

## How was this patch tested?

Existing tests.

Author: Cheng Lian <[email protected]>

Closes #14300 from liancheng/revert-pr-14272.
1 parent 6203668 commit 69626ad

File tree

2 files changed

+0
-57
lines changed

2 files changed

+0
-57
lines changed

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetReadSupport.scala

Lines changed: 0 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -26,8 +26,6 @@ import org.apache.parquet.hadoop.api.{InitContext, ReadSupport}
2626
import org.apache.parquet.hadoop.api.ReadSupport.ReadContext
2727
import org.apache.parquet.io.api.RecordMaterializer
2828
import org.apache.parquet.schema._
29-
import org.apache.parquet.schema.OriginalType._
30-
import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName._
3129
import org.apache.parquet.schema.Type.Repetition
3230

3331
import org.apache.spark.internal.Logging
@@ -123,12 +121,6 @@ private[parquet] object ParquetReadSupport {
123121
}
124122

125123
private def clipParquetType(parquetType: Type, catalystType: DataType): Type = {
126-
val primName = if (parquetType.isPrimitive()) {
127-
parquetType.asPrimitiveType().getPrimitiveTypeName()
128-
} else {
129-
null
130-
}
131-
132124
catalystType match {
133125
case t: ArrayType if !isPrimitiveCatalystType(t.elementType) =>
134126
// Only clips array types with nested type as element type.
@@ -143,16 +135,6 @@ private[parquet] object ParquetReadSupport {
143135
case t: StructType =>
144136
clipParquetGroup(parquetType.asGroupType(), t)
145137

146-
case _: ByteType if primName == INT32 =>
147-
// SPARK-16632: Handle case where Hive stores bytes in a int32 field without specifying
148-
// the original type.
149-
Types.primitive(INT32, parquetType.getRepetition()).as(INT_8).named(parquetType.getName())
150-
151-
case _: ShortType if primName == INT32 =>
152-
// SPARK-16632: Handle case where Hive stores shorts in a int32 field without specifying
153-
// the original type.
154-
Types.primitive(INT32, parquetType.getRepetition()).as(INT_16).named(parquetType.getName())
155-
156138
case _ =>
157139
// UDTs and primitive types are not clipped. For UDTs, a clipped version might not be able
158140
// to be mapped to desired user-space types. So UDTs shouldn't participate schema merging.

sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaSuite.scala

Lines changed: 0 additions & 39 deletions
Original file line number | Diff line number | Diff line change
@@ -1581,43 +1581,4 @@ class ParquetSchemaSuite extends ParquetSchemaTest {
15811581
| }
15821582
|}
15831583
""".stripMargin)
1584-
1585-
testSchemaClipping(
1586-
"int32 parquet field with byte schema field",
1587-
1588-
parquetSchema =
1589-
"""message root {
1590-
| optional int32 value;
1591-
|}
1592-
""".stripMargin,
1593-
1594-
catalystSchema =
1595-
new StructType()
1596-
.add("value", ByteType, nullable = true),
1597-
1598-
expectedSchema =
1599-
"""message root {
1600-
| optional int32 value (INT_8);
1601-
|}
1602-
""".stripMargin)
1603-
1604-
testSchemaClipping(
1605-
"int32 parquet field with short schema field",
1606-
1607-
parquetSchema =
1608-
"""message root {
1609-
| optional int32 value;
1610-
|}
1611-
""".stripMargin,
1612-
1613-
catalystSchema =
1614-
new StructType()
1615-
.add("value", ShortType, nullable = true),
1616-
1617-
expectedSchema =
1618-
"""message root {
1619-
| optional int32 value (INT_16);
1620-
|}
1621-
""".stripMargin)
1622-
16231584
}

0 commit comments

Comments
 (0)