From 48a598efe78ef1c9560a6af2d74e98b3bfaa8819 Mon Sep 17 00:00:00 2001
From: Kashish Jain
Date: Wed, 13 Apr 2016 16:10:34 +0530
Subject: [PATCH 1/2] [SPARK-14557][SQL] Reading a textfile (created through
 CTAS) doesn't work when a pathFilter is enabled.

1) Fixes a bug in HadoopTableReader. Resolved by passing the directory
   itself, rather than a pre-filtered list of files, even when a
   pathFilter is set, since FileInputFormat applies the filter anyway
   while computing splits. This also saves filtering the same paths
   multiple times in the code path.
2) Stops calling applyFilterIfNeeded; the now-unused method is removed
   in the follow-up patch.
---
 .../main/scala/org/apache/spark/sql/hive/TableReader.scala | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/TableReader.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/TableReader.scala
index 54afe9c2a3550..9dc9db89ec440 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/TableReader.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/TableReader.scala
@@ -108,7 +108,7 @@ class HadoopTableReader(
     val broadcastedHiveConf = _broadcastedHiveConf
 
     val tablePath = hiveTable.getPath
-    val inputPathStr = applyFilterIfNeeded(tablePath, filterOpt)
+    val inputPathStr = tablePath.toString
 
     // logDebug("Table input: %s".format(tablePath))
     val ifc = hiveTable.getInputFormatClass
@@ -190,7 +190,7 @@ class HadoopTableReader(
       .map { case (partition, partDeserializer) =>
         val partDesc = Utilities.getPartitionDesc(partition)
         val partPath = partition.getDataLocation
-        val inputPathStr = applyFilterIfNeeded(partPath, filterOpt)
+        val inputPathStr = partPath.toString
         val ifc = partDesc.getInputFileFormatClass
           .asInstanceOf[java.lang.Class[InputFormat[Writable, Writable]]]
         // Get partition field info

From 6bd529cb8dfc6a2322a45eb7f6f606fcfc764202 Mon Sep 17 00:00:00 2001
From: Kashish Jain
Date: Thu, 14 Apr 2016 11:02:03 +0530
Subject: [PATCH 2/2] [SPARK-14557][SQL] Removing unused method
 applyFilterIfNeeded

---
 .../org/apache/spark/sql/hive/TableReader.scala | 14 --------------
 1 file changed, 14 deletions(-)

diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/TableReader.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/TableReader.scala
index 9dc9db89ec440..5af63d629470b 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/TableReader.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/TableReader.scala
@@ -252,20 +252,6 @@ class HadoopTableReader(
     }
   }
 
-  /**
-   * If `filterOpt` is defined, then it will be used to filter files from `path`. These files are
-   * returned in a single, comma-separated string.
-   */
-  private def applyFilterIfNeeded(path: Path, filterOpt: Option[PathFilter]): String = {
-    filterOpt match {
-      case Some(filter) =>
-        val fs = path.getFileSystem(sc.hiveconf)
-        val filteredFiles = fs.listStatus(path, filter).map(_.getPath.toString)
-        filteredFiles.mkString(",")
-      case None => path.toString
-    }
-  }
-
   /**
    * Creates a HadoopRDD based on the broadcasted HiveConf and other job properties that will be
    * applied locally on each slave.
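
For reference, a minimal sketch of the FileInputFormat behavior the fix
relies on: once a PathFilter is registered on the job conf,
org.apache.hadoop.mapred.FileInputFormat applies it while listing the
input directories to compute splits, so handing it the bare directory is
enough. The NoTmpFilter class and the warehouse path below are made-up
placeholders, not code from the patches.

    import org.apache.hadoop.fs.{Path, PathFilter}
    import org.apache.hadoop.mapred.{FileInputFormat, JobConf}

    // Hypothetical filter: skip temporary files. Any PathFilter
    // implementation is picked up the same way.
    class NoTmpFilter extends PathFilter {
      override def accept(path: Path): Boolean = !path.getName.endsWith(".tmp")
    }

    object PathFilterSketch {
      def main(args: Array[String]): Unit = {
        val conf = new JobConf()
        // Register the filter; FileInputFormat applies it (on top of its
        // built-in hidden-file filter) when it lists the input
        // directories to compute splits.
        FileInputFormat.setInputPathFilter(conf, classOf[NoTmpFilter])
        // Passing the directory alone is sufficient -- pre-enumerating
        // the files into a comma-separated list, as applyFilterIfNeeded
        // did, just filters the same paths a second time.
        FileInputFormat.setInputPaths(conf, new Path("/user/hive/warehouse/my_table"))
      }
    }

Since HadoopTableReader creates its HadoopRDD from the broadcasted
HiveConf, a filter registered there reaches FileInputFormat unchanged,
which is why tablePath.toString (and partPath.toString) is all the
reader needs to pass.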