Skip to content

Commit 13a012f

Browse files
committed
[SPARK-23342][SQL][TEST] Add ORC configuration tests for ORC data source
1 parent 8141c3e commit 13a012f

File tree

1 file changed

+73
-1
lines changed

1 file changed

+73
-1
lines changed

sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcSourceSuite.scala

Lines changed: 73 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,8 @@ package org.apache.spark.sql.execution.datasources.orc
2020
import java.io.File
2121
import java.util.Locale
2222

23-
import org.apache.orc.OrcConf.COMPRESS
23+
import org.apache.orc.{OrcFile, Reader}
24+
import org.apache.orc.OrcConf.{BUFFER_SIZE, COMPRESS, ROW_INDEX_STRIDE, STRIPE_SIZE}
2425
import org.scalatest.BeforeAndAfterAll
2526

2627
import org.apache.spark.sql.Row
@@ -160,6 +161,77 @@ abstract class OrcSuite extends OrcTest with BeforeAndAfterAll {
160161
}
161162
}
162163
}
164+
165+
/**
 * Opens an ORC [[Reader]] for the single ORC file written under `path`.
 *
 * The tests in this suite repartition to 1 before writing, so exactly one
 * ORC data file is expected; this is asserted to fail fast if a test writes
 * more than one file.
 *
 * @param path directory that a preceding `df.write.format("orc").save(...)` wrote to
 * @return an ORC file reader exposing file-level metadata (stripes, row index
 *         stride, compression buffer size) used by the assertions below
 */
private def getReader(path: String): Reader = {
  val conf = spark.sessionState.newHadoopConf()
  val files = OrcUtils.listOrcFiles(path, conf)
  assert(files.length == 1)
  val file = files.head
  val fs = file.getFileSystem(conf)
  // `OrcFile` is already imported; no need for the fully-qualified name.
  val readerOptions = OrcFile.readerOptions(conf).filesystem(fs)
  OrcFile.createReader(file, readerOptions)
}
174+
175+
test("SPARK-23342 Support orc.stripe.size and hive.exec.orc.default.stripe.size") {
  val df = spark.range(1000000).map(_ => scala.util.Random.nextLong).repartition(1)

  // `STRIPE_SIZE` is imported from OrcConf; use it directly for consistency
  // with the ROW_INDEX_STRIDE and BUFFER_SIZE tests below.
  Seq(STRIPE_SIZE).foreach { conf =>
    // Each ORC configuration is exposed under two names: the native ORC
    // attribute and the legacy Hive configuration name. Both must work.
    Seq(conf.getAttribute, conf.getHiveConfName).foreach { name =>
      // Since the default value of orc.stripe.size is 64MB, there exists only 1 stripe.
      withTempPath { path =>
        val dir = path.getCanonicalPath
        df.write.format("orc").save(dir)
        assert(getReader(dir).getStripes().size === 1)
      }

      // A tiny stripe size forces many stripes for the same data.
      withTempPath { path =>
        val dir = path.getCanonicalPath
        df.write.format("orc").option(name, "10000").save(dir)
        assert(getReader(dir).getStripes().size > 100)
      }
    }
  }
}
195+
196+
test("SPARK-23342 Support orc.row.index.stride and hive.exec.orc.default.row.index.stride") {
  val df = spark.range(1000000).map(_ => scala.util.Random.nextLong).repartition(1)

  Seq(ROW_INDEX_STRIDE).foreach { conf =>
    // Each ORC configuration is exposed under two names: the native ORC
    // attribute and the legacy Hive configuration name. Both must work.
    Seq(conf.getAttribute, conf.getHiveConfName).foreach { name =>
      withTempPath { path =>
        val dir = path.getCanonicalPath
        df.write.format("orc").save(dir)
        // Previously this branch wrote the file but asserted nothing; verify
        // the writer actually uses ORC's default row index stride, mirroring
        // the default-value check in the buffer-size test below.
        assert(getReader(dir).getRowIndexStride === ROW_INDEX_STRIDE.getDefaultValue)
      }

      withTempPath { path =>
        val dir = path.getCanonicalPath
        df.write.format("orc").option(name, "1024").save(dir)
        assert(getReader(dir).getRowIndexStride === 1024)
      }
    }
  }
}
214+
215+
test("SPARK-23342 Support orc.compress.size and hive.exec.orc.default.buffer.size") {
  val df = spark.range(1000000).map(_ => scala.util.Random.nextLong).repartition(1)

  Seq(BUFFER_SIZE).foreach { orcConf =>
    // The same setting is reachable via the native ORC attribute and the
    // legacy Hive configuration name; exercise both spellings.
    val confNames = Seq(orcConf.getAttribute, orcConf.getHiveConfName)
    confNames.foreach { confName =>
      // Without an explicit option, the writer must fall back to ORC's
      // built-in default compression buffer size.
      withTempPath { path =>
        val outputDir = path.getCanonicalPath
        df.write.format("orc").save(outputDir)
        val actualDefault = getReader(outputDir).getCompressionSize
        assert(actualDefault === BUFFER_SIZE.getDefaultValue)
      }

      // An explicitly configured buffer size must be honored by the writer.
      withTempPath { path =>
        val outputDir = path.getCanonicalPath

        df.write.format("orc").option(confName, "1024").save(outputDir)
        assert(getReader(outputDir).getCompressionSize === 1024)
      }
    }
  }
}
163235
}
164236

165237
class OrcSourceSuite extends OrcSuite with SharedSQLContext {

0 commit comments

Comments
 (0)