7 changes: 4 additions & 3 deletions dev/deps/spark-deps-hadoop-2.6
@@ -2,7 +2,7 @@ JavaEWAH-0.3.2.jar
 RoaringBitmap-0.5.11.jar
 ST4-4.0.4.jar
 activation-1.1.1.jar
-aircompressor-0.8.jar
+aircompressor-0.10.jar
 antlr-2.7.7.jar
 antlr-runtime-3.4.jar
 antlr4-runtime-4.7.jar
@@ -157,8 +157,9 @@ objenesis-2.1.jar
 okhttp-3.8.1.jar
 okio-1.13.0.jar
 opencsv-2.3.jar
-orc-core-1.4.4-nohive.jar
-orc-mapreduce-1.4.4-nohive.jar
+orc-core-1.5.2-nohive.jar
+orc-mapreduce-1.5.2-nohive.jar
+orc-shims-1.5.2.jar
 oro-2.0.8.jar
 osgi-resource-locator-1.0.1.jar
 paranamer-2.8.jar
7 changes: 4 additions & 3 deletions dev/deps/spark-deps-hadoop-2.7
@@ -2,7 +2,7 @@ JavaEWAH-0.3.2.jar
 RoaringBitmap-0.5.11.jar
 ST4-4.0.4.jar
 activation-1.1.1.jar
-aircompressor-0.8.jar
+aircompressor-0.10.jar
 antlr-2.7.7.jar
 antlr-runtime-3.4.jar
 antlr4-runtime-4.7.jar
@@ -158,8 +158,9 @@ objenesis-2.1.jar
 okhttp-3.8.1.jar
 okio-1.13.0.jar
 opencsv-2.3.jar
-orc-core-1.4.4-nohive.jar
-orc-mapreduce-1.4.4-nohive.jar
+orc-core-1.5.2-nohive.jar
+orc-mapreduce-1.5.2-nohive.jar
+orc-shims-1.5.2.jar
 oro-2.0.8.jar
 osgi-resource-locator-1.0.1.jar
 paranamer-2.8.jar
7 changes: 4 additions & 3 deletions dev/deps/spark-deps-hadoop-3.1
@@ -4,7 +4,7 @@ RoaringBitmap-0.5.11.jar
 ST4-4.0.4.jar
 accessors-smart-1.2.jar
 activation-1.1.1.jar
-aircompressor-0.8.jar
+aircompressor-0.10.jar
 antlr-2.7.7.jar
 antlr-runtime-3.4.jar
 antlr4-runtime-4.7.jar
@@ -176,8 +176,9 @@ okhttp-2.7.5.jar
 okhttp-3.8.1.jar
 okio-1.13.0.jar
 opencsv-2.3.jar
-orc-core-1.4.4-nohive.jar
-orc-mapreduce-1.4.4-nohive.jar
+orc-core-1.5.2-nohive.jar
+orc-mapreduce-1.5.2-nohive.jar
+orc-shims-1.5.2.jar
 oro-2.0.8.jar
 osgi-resource-locator-1.0.1.jar
 paranamer-2.8.jar
2 changes: 1 addition & 1 deletion pom.xml
@@ -130,7 +130,7 @@
     <hive.version.short>1.2.1</hive.version.short>
     <derby.version>10.12.1.1</derby.version>
     <parquet.version>1.10.0</parquet.version>
-    <orc.version>1.4.4</orc.version>
+    <orc.version>1.5.2</orc.version>
     <orc.classifier>nohive</orc.classifier>
     <hive.parquet.version>1.6.0</hive.parquet.version>
     <jetty.version>9.3.20.v20170531</jetty.version>
28 changes: 28 additions & 0 deletions sql/core/pom.xml
@@ -90,11 +90,39 @@
       <groupId>org.apache.orc</groupId>
       <artifactId>orc-core</artifactId>
       <classifier>${orc.classifier}</classifier>
+      <exclusions>
+        <exclusion>
+          <groupId>org.apache.hadoop</groupId>
+          <artifactId>hadoop-hdfs</artifactId>
+        </exclusion>
+        <!--
+          orc-core:nohive doesn't have this dependency, but we add this to prevent
+          sbt from getting confused.
+        -->
+        <exclusion>
+          <groupId>org.apache.hive</groupId>
+          <artifactId>hive-storage-api</artifactId>
+        </exclusion>

Comment (Member Author): I added the above eight lines to be consistent for both mvn and sbt.

+      </exclusions>
     </dependency>
     <dependency>
       <groupId>org.apache.orc</groupId>
       <artifactId>orc-mapreduce</artifactId>
       <classifier>${orc.classifier}</classifier>
+      <exclusions>
+        <exclusion>
+          <groupId>org.apache.hadoop</groupId>
+          <artifactId>hadoop-hdfs</artifactId>
+        </exclusion>
+        <!--
+          orc-core:nohive doesn't have this dependency, but we add this to prevent
+          sbt from getting confused.
+        -->
+        <exclusion>
+          <groupId>org.apache.hive</groupId>
+          <artifactId>hive-storage-api</artifactId>
+        </exclusion>
+      </exclusions>
     </dependency>
     <dependency>
       <groupId>org.apache.parquet</groupId>
sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFileFormat.scala
@@ -59,6 +59,19 @@ private[sql] object OrcFileFormat {
   def checkFieldNames(names: Seq[String]): Unit = {
     names.foreach(checkFieldName)
   }
+
+  def getQuotedSchemaString(dataType: DataType): String = dataType match {
+    case _: AtomicType => dataType.catalogString
+    case StructType(fields) =>
+      fields.map(f => s"`${f.name}`:${getQuotedSchemaString(f.dataType)}")
+        .mkString("struct<", ",", ">")
+    case ArrayType(elementType, _) =>
+      s"array<${getQuotedSchemaString(elementType)}>"
+    case MapType(keyType, valueType, _) =>
+      s"map<${getQuotedSchemaString(keyType)},${getQuotedSchemaString(valueType)}>"
+    case _ => // UDT and others
+      dataType.catalogString
+  }

Comment (Member): nit: Seems the first `_: AtomicType` case could be dropped, since this fallback case covers all the other cases?

Comment (Member): We don't need to recursively quote udt.sqlType?

Comment (Member Author): Thank you for the review, @maropu. Yes, that is not handled here because the goal is to support user column names like `col1.x`, which usually appear at the top level.

 }

 /**
@@ -95,7 +108,7 @@ class OrcFileFormat

     val conf = job.getConfiguration

-    conf.set(MAPRED_OUTPUT_SCHEMA.getAttribute, dataSchema.catalogString)
+    conf.set(MAPRED_OUTPUT_SCHEMA.getAttribute, OrcFileFormat.getQuotedSchemaString(dataSchema))

     conf.set(COMPRESS.getAttribute, orcOptions.compressionCodec)
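To see what the new quoting produces, here is a minimal sketch (not part of the PR; the schema is an illustrative assumption, and the call assumes code living under Spark's own `org.apache.spark.sql` packages, since `OrcFileFormat` is `private[sql]`):

```scala
import org.apache.spark.sql.types._
// private[sql]: reachable only from inside Spark's sql packages
import org.apache.spark.sql.execution.datasources.orc.OrcFileFormat

// Illustrative schema: the top-level column name contains a dot, which is
// legal in Spark but trips ORC 1.5's stricter type-string parser when unquoted.
val schema = StructType(Seq(
  StructField("col1.x", IntegerType),
  StructField("values", ArrayType(StringType))))

// schema.catalogString (the old code path) produces:
//   struct<col1.x:int,values:array<string>>
// getQuotedSchemaString backquotes each struct field name instead:
//   struct<`col1.x`:int,`values`:array<string>>
println(OrcFileFormat.getQuotedSchemaString(schema))
```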
sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcSerializer.scala
@@ -223,6 +223,6 @@ class OrcSerializer(dataSchema: StructType) {
   * Return an ORC value object for the given Spark schema.
   */
  private def createOrcValue(dataType: DataType) = {
-    OrcStruct.createValue(TypeDescription.fromString(dataType.catalogString))
+    OrcStruct.createValue(TypeDescription.fromString(OrcFileFormat.getQuotedSchemaString(dataType)))
  }
 }

Comment (Member): Why this change?

Comment (Member Author): Thank you for the review, @viirya. ORC 1.5 checks field-name syntax more strictly; for example, it rejects a field name containing a dot.

Comment (Member): @dongjoon-hyun Thanks for explaining it.
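A hedged sketch of the failure mode the reviewers are discussing (ORC's `TypeDescription.fromString` is a real API; the dotted field name and the parse failure follow the author's explanation above rather than a test from this PR):

```scala
import org.apache.orc.TypeDescription

// ORC 1.5 accepts a dotted field name when it is backquoted, which is
// exactly the form OrcFileFormat.getQuotedSchemaString now emits.
val ok = TypeDescription.fromString("struct<`col1.x`:int>")

// The unquoted form, which dataType.catalogString used to produce, fails
// under ORC 1.5's stricter parser (ORC 1.4 was more lenient), which is
// why createOrcValue now routes through getQuotedSchemaString:
// TypeDescription.fromString("struct<col1.x:int>")  // parse error at the dot
```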