Commit cabe1df

zjffdu authored and holdenk committed
[SPARK-12334][SQL][PYSPARK] Support read from multiple input paths for orc file in DataFrameReader.orc
Besides the issue in the Spark API, this also fixes two minor issues in PySpark:
- support reading from multiple input paths for ORC
- support reading from multiple input paths for text

Author: Jeff Zhang <[email protected]>

Closes #10307 from zjffdu/SPARK-12334.
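In PySpark, the patch enables the calls sketched below (a minimal usage example; the SparkSession setup and paths are hypothetical placeholders, not part of the patch):

    from pyspark.sql import SparkSession

    spark = SparkSession.builder.getOrCreate()

    # A single path string keeps working as before.
    df_one = spark.read.orc("/data/events/2016-01")

    # New in this patch: a list of paths is read into a single DataFrame.
    df_many = spark.read.orc(["/data/events/2016-01",
                              "/data/events/2016-02"])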
1 parent 30b18e6 commit cabe1df

File tree

4 files changed: 25 additions, 9 deletions

python/pyspark/sql/readwriter.py (8 additions, 6 deletions)

@@ -161,15 +161,15 @@ def json(self, path, schema=None, primitivesAsString=None, prefersDecimal=None,
              mode=None, columnNameOfCorruptRecord=None, dateFormat=None, timestampFormat=None,
              timeZone=None, wholeFile=None):
         """
-        Loads a JSON file and returns the results as a :class:`DataFrame`.
+        Loads JSON files and returns the results as a :class:`DataFrame`.
 
         `JSON Lines <http://jsonlines.org/>`_ (newline-delimited JSON) is supported by default.
         For JSON (one record per file), set the ``wholeFile`` parameter to ``true``.
 
         If the ``schema`` parameter is not specified, this function goes
         through the input once to determine the input schema.
 
-        :param path: string represents path to the JSON dataset,
+        :param path: string represents path to the JSON dataset, or a list of paths,
             or RDD of Strings storing JSON objects.
         :param schema: an optional :class:`pyspark.sql.types.StructType` for the input schema.
         :param primitivesAsString: infers all primitive values as a string type. If None is set,
@@ -252,7 +252,7 @@ def func(iterator):
             jrdd = keyed._jrdd.map(self._spark._jvm.BytesToString())
             return self._df(self._jreader.json(jrdd))
         else:
-            raise TypeError("path can be only string or RDD")
+            raise TypeError("path can be only string, list or RDD")
 
     @since(1.4)
     def table(self, tableName):
@@ -269,7 +269,7 @@ def table(self, tableName):
 
     @since(1.4)
     def parquet(self, *paths):
-        """Loads a Parquet file, returning the result as a :class:`DataFrame`.
+        """Loads Parquet files, returning the result as a :class:`DataFrame`.
 
         You can set the following Parquet-specific option(s) for reading Parquet files:
             * ``mergeSchema``: sets whether we should merge schemas collected from all \
@@ -407,15 +407,17 @@ def csv(self, path, schema=None, sep=None, encoding=None, quote=None, escape=None,
 
     @since(1.5)
     def orc(self, path):
-        """Loads an ORC file, returning the result as a :class:`DataFrame`.
+        """Loads ORC files, returning the result as a :class:`DataFrame`.
 
         .. note:: Currently ORC support is only available together with Hive support.
 
         >>> df = spark.read.orc('python/test_support/sql/orc_partitioned')
         >>> df.dtypes
         [('a', 'bigint'), ('b', 'int'), ('c', 'int')]
         """
-        return self._df(self._jreader.orc(path))
+        if isinstance(path, basestring):
+            path = [path]
+        return self._df(self._jreader.orc(_to_seq(self._spark._sc, path)))
 
     @since(1.4)
     def jdbc(self, url, table, column=None, lowerBound=None, upperBound=None, numPartitions=None,
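The Scala-side reader accepts multiple paths (note the plural `@param paths` in the scaladoc further down), so the Python wrapper normalizes its argument first: a single string is wrapped in a one-element list, and the list is converted to a JVM sequence with `_to_seq` before crossing the Py4J boundary. The normalization idiom in isolation (a standalone sketch, not Spark internals; `jvm_reader` and `to_jvm_seq` are hypothetical stand-ins):

    def orc(path, jvm_reader, to_jvm_seq):
        # Normalize: a lone path string becomes a one-element list.
        if isinstance(path, str):  # the patch uses Python 2's basestring
            path = [path]
        # Hand the whole list to the sequence-based JVM API in one call.
        return jvm_reader(to_jvm_seq(path))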

python/pyspark/sql/tests.py (5 additions, 0 deletions)

@@ -450,6 +450,11 @@ def test_wholefile_csv(self):
                     Row(_c0=u'Hyukjin', _c1=u'25', _c2=u'I am Hyukjin\n\nI love Spark!')]
         self.assertEqual(ages_newlines.collect(), expected)
 
+    def test_read_multiple_orc_file(self):
+        df = self.spark.read.orc(["python/test_support/sql/orc_partitioned/b=0/c=0",
+                                  "python/test_support/sql/orc_partitioned/b=1/c=1"])
+        self.assertEqual(2, df.count())
+
     def test_udf_with_input_file_name(self):
         from pyspark.sql.functions import udf, input_file_name
         from pyspark.sql.types import StringType
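The new test reads two concrete partition directories of the ORC dataset that ships with Spark's Python test support files. For comparison, both styles of call side by side (a sketch assuming a live `spark` session and that bundled data layout):

    # Reading the dataset root relies on partition discovery for columns b and c.
    whole = spark.read.orc("python/test_support/sql/orc_partitioned")

    # Reading explicit partition directories scans only the listed paths;
    # the new test expects exactly two rows from these two directories.
    subset = spark.read.orc(["python/test_support/sql/orc_partitioned/b=0/c=0",
                             "python/test_support/sql/orc_partitioned/b=1/c=1"])
    assert subset.count() == 2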

sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala (3 additions, 3 deletions)

@@ -262,7 +262,7 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging {
   }
 
   /**
-   * Loads a JSON file and returns the results as a `DataFrame`.
+   * Loads JSON files and returns the results as a `DataFrame`.
    *
    * <a href="http://jsonlines.org/">JSON Lines</a> (newline-delimited JSON) is supported by
    * default. For JSON (one record per file), set the `wholeFile` option to true.
@@ -438,7 +438,7 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging {
   }
 
   /**
-   * Loads a CSV file and returns the result as a `DataFrame`.
+   * Loads CSV files and returns the result as a `DataFrame`.
    *
    * This function will go through the input once to determine the input schema if `inferSchema`
    * is enabled. To avoid going through the entire data once, disable `inferSchema` option or
@@ -549,7 +549,7 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging {
   }
 
   /**
-   * Loads an ORC file and returns the result as a `DataFrame`.
+   * Loads ORC files and returns the result as a `DataFrame`.
    *
    * @param paths input paths
    * @since 2.0.0

sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcQuerySuite.scala (9 additions, 0 deletions)

@@ -33,6 +33,7 @@ import org.apache.spark.sql.hive.test.TestHive._
 import org.apache.spark.sql.hive.test.TestHive.implicits._
 import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.types.{IntegerType, StructType}
+import org.apache.spark.util.Utils
 
 case class AllDataTypesWithNonPrimitiveType(
     stringField: String,
@@ -611,4 +612,12 @@ class OrcQuerySuite extends QueryTest with BeforeAndAfterAll with OrcTest {
       }
     }
   }
+
+  test("read from multiple orc input paths") {
+    val path1 = Utils.createTempDir()
+    val path2 = Utils.createTempDir()
+    makeOrcFile((1 to 10).map(Tuple1.apply), path1)
+    makeOrcFile((1 to 10).map(Tuple1.apply), path2)
+    assertResult(20)(read.orc(path1.getCanonicalPath, path2.getCanonicalPath).count())
+  }
 }
