
Commit 07e54d3

Michael Allman authored and Robert Kruszewski committed
[SPARK-16980][SQL] Load only catalog table partition metadata required to answer a query
(This PR addresses https://issues.apache.org/jira/browse/SPARK-16980.)

## What changes were proposed in this pull request?

In a new Spark session, when a partitioned Hive table is converted to use Spark's `HadoopFsRelation` in `HiveMetastoreCatalog`, metadata for every partition of that table are retrieved from the metastore and loaded into driver memory. In addition, every partition's metadata files are read from the filesystem to perform schema inference.

If a user queries such a table with predicates which prune that table's partitions, we would like to be able to answer that query without consulting partition metadata which are not involved in the query. When querying a table with a large number of partitions for some data from a small number of partitions (maybe even a single partition), the current conversion strategy is highly inefficient. I suspect this scenario is not uncommon in the wild.

In addition to being inefficient in running time, the current strategy is inefficient in its use of driver memory. When the sum of the number of partitions of all tables loaded in a driver reaches a certain level (somewhere in the tens of thousands), their cached data exhaust all driver heap memory in the default configuration. I suspect this scenario is less common (in that not too many deployments work with tables with tens of thousands of partitions), however it does illustrate how large the memory footprint of this metadata can be. With tables with hundreds or thousands of partitions, I would expect the `HiveMetastoreCatalog` table cache to represent a significant portion of the driver's heap space.

This PR proposes an alternative approach. Basically, it makes four changes:

1. It adds a new method, `listPartitionsByFilter`, to the Catalyst `ExternalCatalog` trait which returns the partition metadata for a given sequence of partition-pruning predicates.
1. It refactors the `FileCatalog` type hierarchy to include a new `TableFileCatalog` to efficiently return files only for partitions matching a sequence of partition-pruning predicates.
1. It removes partition loading and caching from `HiveMetastoreCatalog`.
1. It adds a new Catalyst optimizer rule, `PruneFileSourcePartitions`, which applies a plan's partition-pruning predicates to prune out unnecessary partition files from a `HadoopFsRelation`'s underlying file catalog.

The net effect is that when a query over a partitioned Hive table is planned, the analyzer retrieves the table metadata from `HiveMetastoreCatalog`. As part of this operation, the `HiveMetastoreCatalog` builds a `HadoopFsRelation` with a `TableFileCatalog`. It does not load any partition metadata or scan any files. The optimizer prunes away unnecessary table partitions by sending the partition-pruning predicates to the relation's `TableFileCatalog`. The `TableFileCatalog` in turn calls the `listPartitionsByFilter` method on its external catalog, which queries the Hive metastore, passing along those filters.

As a bonus, performing partition pruning during optimization leads to a more accurate relation size estimate. This, along with c481bdf, can lead to automatic, safe application of the broadcast optimization in a join where it might previously have been omitted.

## Open Issues

1. This PR omits partition metadata caching. I can add this once the overall strategy for the cold path is established, perhaps in a future PR.
1. This PR removes and omits partitioned Hive table schema reconciliation. As a result, it fails to find Parquet schema columns with upper case letters because of the Hive metastore's case-insensitivity. This issue may be fixed by apache#14750, but that PR appears to have stalled. ericl has contributed to this PR a workaround for Parquet wherein schema reconciliation occurs at query execution time instead of during planning. Whether ORC requires a similar patch is an open issue.
1. This PR omits an implementation of `listPartitionsByFilter` for the `InMemoryCatalog`.
1. This PR breaks Parquet log output redirection during query execution. I can work around this by running `Class.forName("org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$")` first thing in a Spark shell session, but I haven't figured out how to fix this properly.

## How was this patch tested?

The current Spark unit tests were run, and some ad-hoc tests were performed to validate that only the necessary partition metadata is loaded.

Author: Michael Allman <[email protected]>
Author: Eric Liang <[email protected]>
Author: Eric Liang <[email protected]>

Closes apache#14690 from mallman/spark-16980-lazy_partition_fetching.
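For illustration only (not part of the commit message above): a minimal sketch of the access pattern this change targets, assuming a `spark` session (e.g. in `spark-shell`) and a hypothetical Hive table `events` partitioned by a string column `ds`:

```scala
// Hypothetical partitioned Hive table `events`, partitioned by string column `ds`.
// With this change, planning the query below should cause the Hive metastore to be
// asked only for the partitions satisfying ds = '2016-08-10', rather than loading
// metadata for every partition of `events` into driver memory.
val pruned = spark.sql("SELECT count(*) FROM events WHERE ds = '2016-08-10'")
pruned.show()
```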
1 parent d5429cd commit 07e54d3

37 files changed (+914 −368 lines)

core/src/main/scala/org/apache/spark/metrics/source/StaticSources.scala

Lines changed: 33 additions & 1 deletion
@@ -26,7 +26,7 @@ private[spark] object StaticSources {
   * The set of all static sources. These sources may be reported to from any class, including
   * static classes, without requiring reference to a SparkEnv.
   */
-  val allSources = Seq(CodegenMetrics)
+  val allSources = Seq(CodegenMetrics, HiveCatalogMetrics)
 }

 /**
@@ -60,3 +60,35 @@ object CodegenMetrics extends Source {
   val METRIC_GENERATED_METHOD_BYTECODE_SIZE =
     metricRegistry.histogram(MetricRegistry.name("generatedMethodSize"))
 }
+
+/**
+ * :: Experimental ::
+ * Metrics for access to the hive external catalog.
+ */
+@Experimental
+object HiveCatalogMetrics extends Source {
+  override val sourceName: String = "HiveExternalCatalog"
+  override val metricRegistry: MetricRegistry = new MetricRegistry()
+
+  /**
+   * Tracks the total number of partition metadata entries fetched via the client api.
+   */
+  val METRIC_PARTITIONS_FETCHED = metricRegistry.counter(MetricRegistry.name("partitionsFetched"))
+
+  /**
+   * Tracks the total number of files discovered off of the filesystem by ListingFileCatalog.
+   */
+  val METRIC_FILES_DISCOVERED = metricRegistry.counter(MetricRegistry.name("filesDiscovered"))
+
+  /**
+   * Resets the values of all metrics to zero. This is useful in tests.
+   */
+  def reset(): Unit = {
+    METRIC_PARTITIONS_FETCHED.dec(METRIC_PARTITIONS_FETCHED.getCount())
+    METRIC_FILES_DISCOVERED.dec(METRIC_FILES_DISCOVERED.getCount())
+  }
+
+  // clients can use these to avoid classloader issues with the codahale classes
+  def incrementFetchedPartitions(n: Int): Unit = METRIC_PARTITIONS_FETCHED.inc(n)
+  def incrementFilesDiscovered(n: Int): Unit = METRIC_FILES_DISCOVERED.inc(n)
+}
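A rough sketch (not code from this commit) of how these counters might be exercised, assuming a `spark` session and a hypothetical partitioned table `events` with partition column `ds`:

```scala
import org.apache.spark.metrics.source.HiveCatalogMetrics

// Reset the experimental catalog metrics, run a partition-pruned query, then inspect
// how many partition metadata entries and files were touched along the way.
HiveCatalogMetrics.reset()
spark.sql("SELECT * FROM events WHERE ds = '2016-08-10'").collect()
val partitionsFetched = HiveCatalogMetrics.METRIC_PARTITIONS_FETCHED.getCount()
val filesDiscovered = HiveCatalogMetrics.METRIC_FILES_DISCOVERED.getCount()
println(s"partitions fetched: $partitionsFetched, files discovered: $filesDiscovered")
```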

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalog.scala

Lines changed: 3 additions & 2 deletions
@@ -198,11 +198,12 @@ abstract class ExternalCatalog {
       partialSpec: Option[TablePartitionSpec] = None): Seq[CatalogTablePartition]

   /**
-   * List the metadata of selected partitions according to the given partition predicates.
+   * List the metadata of partitions that belong to the specified table, assuming it exists, that
+   * satisfy the given partition-pruning predicate expressions.
    *
    * @param db database name
    * @param table table name
-   * @param predicates partition predicated
+   * @param predicates partition-pruning predicates
    */
   def listPartitionsByFilter(
       db: String,
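For orientation, a hedged sketch of a caller of the new method; the `ExternalCatalog` instance, the `default.events` table, and its partition column `ds` are assumptions for illustration, not part of this diff:

```scala
import org.apache.spark.sql.catalyst.catalog.{CatalogTablePartition, ExternalCatalog}
import org.apache.spark.sql.catalyst.expressions.{AttributeReference, EqualTo, Literal}
import org.apache.spark.sql.types.StringType

// Build a partition-pruning predicate over partition column `ds` and ask the catalog
// for only the matching partitions, instead of listing every partition of the table.
def matchingPartitions(catalog: ExternalCatalog): Seq[CatalogTablePartition] = {
  val ds = AttributeReference("ds", StringType)()
  catalog.listPartitionsByFilter("default", "events", Seq(EqualTo(ds, Literal("2016-08-10"))))
}
```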

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/InMemoryCatalog.scala

Lines changed: 3 additions & 1 deletion
@@ -482,7 +482,9 @@ class InMemoryCatalog(
       db: String,
       table: String,
       predicates: Seq[Expression]): Seq[CatalogTablePartition] = {
-    throw new UnsupportedOperationException("listPartitionsByFilter is not implemented.")
+    // TODO: Provide an implementation
+    throw new UnsupportedOperationException(
+      "listPartitionsByFilter is not implemented for InMemoryCatalog")
   }

   // --------------------------------------------------------------------------

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala

Lines changed: 12 additions & 3 deletions
@@ -20,11 +20,11 @@ package org.apache.spark.sql.catalyst.catalog
 import java.util.Date

 import org.apache.spark.sql.AnalysisException
-import org.apache.spark.sql.catalyst.{FunctionIdentifier, TableIdentifier}
-import org.apache.spark.sql.catalyst.expressions.Attribute
+import org.apache.spark.sql.catalyst.{FunctionIdentifier, InternalRow, TableIdentifier}
+import org.apache.spark.sql.catalyst.expressions.{Attribute, Cast, Literal}
 import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, LogicalPlan, Statistics}
 import org.apache.spark.sql.catalyst.util.quoteIdentifier
-import org.apache.spark.sql.types.StructType
+import org.apache.spark.sql.types.{StructField, StructType}


 /**
@@ -97,6 +97,15 @@ case class CatalogTablePartition(

     output.filter(_.nonEmpty).mkString("CatalogPartition(\n\t", "\n\t", ")")
   }
+
+  /**
+   * Given the partition schema, returns a row with that schema holding the partition values.
+   */
+  def toRow(partitionSchema: StructType): InternalRow = {
+    InternalRow.fromSeq(partitionSchema.map { case StructField(name, dataType, _, _) =>
+      Cast(Literal(spec(name)), dataType).eval()
+    })
+  }
 }

sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala

Lines changed: 2 additions & 2 deletions
@@ -43,7 +43,7 @@ import org.apache.spark.sql.catalyst.plans.logical._
 import org.apache.spark.sql.catalyst.util.usePrettyExpression
 import org.apache.spark.sql.execution.{FileRelation, LogicalRDD, QueryExecution, SQLExecution}
 import org.apache.spark.sql.execution.command.{CreateViewCommand, ExplainCommand, GlobalTempView, LocalTempView}
-import org.apache.spark.sql.execution.datasources.LogicalRelation
+import org.apache.spark.sql.execution.datasources.{FileCatalog, HadoopFsRelation, LogicalRelation}
 import org.apache.spark.sql.execution.datasources.json.JacksonGenerator
 import org.apache.spark.sql.execution.python.EvaluatePython
 import org.apache.spark.sql.streaming.{DataStreamWriter, StreamingQuery}
@@ -2614,7 +2614,7 @@ class Dataset[T] private[sql](
    * @since 2.0.0
    */
   def inputFiles: Array[String] = {
-    val files: Seq[String] = logicalPlan.collect {
+    val files: Seq[String] = queryExecution.optimizedPlan.collect {
       case LogicalRelation(fsBasedRelation: FileRelation, _, _) =>
         fsBasedRelation.inputFiles
       case fr: FileRelation =>
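A hedged reading of this change: since `inputFiles` now collects from the optimized plan, partition pruning has already been applied, so a filtered Dataset over a partitioned file source should report only the files of the surviving partitions. Sketch with a hypothetical table `events` and partition column `ds`:

```scala
// Files reported for a partition-filtered Dataset; after this change they are collected
// from the optimized plan, so files of pruned-out partitions should not appear.
val files: Array[String] = spark.table("events").where("ds = '2016-08-10'").inputFiles
files.foreach(println)
```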

sql/core/src/main/scala/org/apache/spark/sql/execution/CacheManager.scala

Lines changed: 1 addition & 1 deletion
@@ -185,7 +185,7 @@ class CacheManager extends Logging {
     plan match {
       case lr: LogicalRelation => lr.relation match {
         case hr: HadoopFsRelation =>
-          val invalidate = hr.location.paths
+          val invalidate = hr.location.rootPaths
             .map(_.makeQualified(fs.getUri, fs.getWorkingDirectory))
             .contains(qualifiedPath)
           if (invalidate) hr.location.refresh()

sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala

Lines changed: 21 additions & 7 deletions
@@ -225,13 +225,27 @@ case class FileSourceScanExec(
   }

   // These metadata values make scan plans uniquely identifiable for equality checking.
-  override val metadata: Map[String, String] = Map(
-    "Format" -> relation.fileFormat.toString,
-    "ReadSchema" -> outputSchema.catalogString,
-    "Batched" -> supportsBatch.toString,
-    "PartitionFilters" -> partitionFilters.mkString("[", ", ", "]"),
-    "PushedFilters" -> dataFilters.mkString("[", ", ", "]"),
-    "InputPaths" -> relation.location.paths.mkString(", "))
+  override val metadata: Map[String, String] = {
+    def seqToString(seq: Seq[Any]) = seq.mkString("[", ", ", "]")
+    val location = relation.location
+    val locationDesc =
+      location.getClass.getSimpleName + seqToString(location.rootPaths)
+    val metadata =
+      Map(
+        "Format" -> relation.fileFormat.toString,
+        "ReadSchema" -> outputSchema.catalogString,
+        "Batched" -> supportsBatch.toString,
+        "PartitionFilters" -> seqToString(partitionFilters),
+        "PushedFilters" -> seqToString(dataFilters),
+        "Location" -> locationDesc)
+    val withOptPartitionCount =
+      relation.partitionSchemaOption.map { _ =>
+        metadata + ("PartitionCount" -> selectedPartitions.size.toString)
+      } getOrElse {
+        metadata
+      }
+    withOptPartitionCount
+  }

   private lazy val inputRDD: RDD[InternalRow] = {
     val originalPartitions = relation.location.listFiles(partitionFilters)

sql/core/src/main/scala/org/apache/spark/sql/execution/SparkOptimizer.scala

Lines changed: 2 additions & 0 deletions
@@ -20,6 +20,7 @@ package org.apache.spark.sql.execution
 import org.apache.spark.sql.ExperimentalMethods
 import org.apache.spark.sql.catalyst.catalog.SessionCatalog
 import org.apache.spark.sql.catalyst.optimizer.Optimizer
+import org.apache.spark.sql.execution.datasources.PruneFileSourcePartitions
 import org.apache.spark.sql.execution.python.ExtractPythonUDFFromAggregate
 import org.apache.spark.sql.internal.SQLConf

@@ -32,5 +33,6 @@ class SparkOptimizer(
   override def batches: Seq[Batch] = super.batches :+
     Batch("Optimize Metadata Only Query", Once, OptimizeMetadataOnlyQuery(catalog, conf)) :+
     Batch("Extract Python UDF from Aggregate", Once, ExtractPythonUDFFromAggregate) :+
+    Batch("Prune File Source Table Partitions", Once, PruneFileSourcePartitions) :+
     Batch("User Provided Optimizers", fixedPoint, experimentalMethods.extraOptimizations: _*)
 }

sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala

Lines changed: 1 addition & 1 deletion
@@ -67,7 +67,7 @@ case class CreateDataSourceTableCommand(table: CatalogTable, ignoreIfExists: Boo

     dataSource match {
       case fs: HadoopFsRelation =>
-        if (table.tableType == CatalogTableType.EXTERNAL && fs.location.paths.isEmpty) {
+        if (table.tableType == CatalogTableType.EXTERNAL && fs.location.rootPaths.isEmpty) {
           throw new AnalysisException(
             "Cannot create a file-based external data source table without path")
         }

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala

Lines changed: 1 addition & 3 deletions
@@ -471,9 +471,7 @@ case class DataSource(
     val existingPartitionColumns = Try {
       resolveRelation()
         .asInstanceOf[HadoopFsRelation]
-        .location
-        .partitionSpec()
-        .partitionColumns
+        .partitionSchema
         .fieldNames
         .toSeq
     }.getOrElse(Seq.empty[String])
