
package org.apache.spark.sql.execution.command

-import scala.collection.GenSeq
+import scala.collection.{GenMap, GenSeq}
import scala.collection.parallel.ForkJoinTaskSupport
import scala.concurrent.forkjoin.ForkJoinPool
import scala.util.control.NonFatal

-import org.apache.hadoop.fs.{FileStatus, FileSystem, Path, PathFilter}
+import org.apache.hadoop.conf.Configuration
+import org.apache.hadoop.fs._
import org.apache.hadoop.mapred.{FileInputFormat, JobConf}

import org.apache.spark.sql.{AnalysisException, Row, SparkSession}
@@ -32,6 +33,7 @@ import org.apache.spark.sql.catalyst.catalog.CatalogTypes.TablePartitionSpec
import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference}
import org.apache.spark.sql.execution.datasources.PartitioningUtils
import org.apache.spark.sql.types._
+import org.apache.spark.util.SerializableConfiguration

// Note: The definition of these commands are based on the ones described in
// https://cwiki.apache.org/confluence/display/Hive/LanguageManual+DDL
@@ -422,6 +424,9 @@ case class AlterTableDropPartitionCommand(

}

+
+case class PartitionStatistics(numFiles: Int, totalSize: Long)
+
/**
 * Recover Partitions in ALTER TABLE: recover all the partition in the directory of a table and
 * update the catalog.
@@ -435,6 +440,31 @@ case class AlterTableDropPartitionCommand(
case class AlterTableRecoverPartitionsCommand(
    tableName: TableIdentifier,
    cmd: String = "ALTER TABLE RECOVER PARTITIONS") extends RunnableCommand {
+
+  // These are statistics that can be collected quickly without requiring a scan of the data;
+  // see https://github.com/apache/hive/blob/master/
+  // common/src/java/org/apache/hadoop/hive/common/StatsSetupConst.java
+  val NUM_FILES = "numFiles"
+  val TOTAL_SIZE = "totalSize"
+  val DDL_TIME = "transient_lastDdlTime"
+
+  private def getPathFilter(hadoopConf: Configuration): PathFilter = {
+    // Dummy JobConf used to get the path filter defined in the configuration.
+    // It's very expensive to create a JobConf (ClassUtil.findContainingJar() is slow).
+    val jobConf = new JobConf(hadoopConf, this.getClass)
+    val pathFilter = FileInputFormat.getInputPathFilter(jobConf)
+    new PathFilter {
+      override def accept(path: Path): Boolean = {
+        val name = path.getName
+        if (name != "_SUCCESS" && name != "_temporary" && !name.startsWith(".")) {
+          pathFilter == null || pathFilter.accept(path)
+        } else {
+          false
+        }
+      }
+    }
+  }
+
  override def run(spark: SparkSession): Seq[Row] = {
    val catalog = spark.sessionState.catalog
    if (!catalog.tableExists(tableName)) {
@@ -449,10 +479,6 @@ case class AlterTableRecoverPartitionsCommand(
      throw new AnalysisException(
        s"Operation not allowed: $cmd on datasource tables: $tableName")
    }
-    if (table.tableType != CatalogTableType.EXTERNAL) {
-      throw new AnalysisException(
-        s"Operation not allowed: $cmd only works on external tables: $tableName")
-    }
    if (table.partitionColumnNames.isEmpty) {
      throw new AnalysisException(
        s"Operation not allowed: $cmd only works on partitioned tables: $tableName")
@@ -463,19 +489,26 @@ case class AlterTableRecoverPartitionsCommand(
    }

    val root = new Path(table.storage.locationUri.get)
+    logInfo(s"Recover all the partitions in $root")
    val fs = root.getFileSystem(spark.sparkContext.hadoopConfiguration)
-    // Dummy jobconf to get to the pathFilter defined in configuration
-    // It's very expensive to create a JobConf(ClassUtil.findContainingJar() is slow)
-    val jobConf = new JobConf(spark.sparkContext.hadoopConfiguration, this.getClass)
-    val pathFilter = FileInputFormat.getInputPathFilter(jobConf)
+
+    val threshold = spark.conf.get("spark.rdd.parallelListingThreshold", "10").toInt
+    val hadoopConf = spark.sparkContext.hadoopConfiguration
+    val pathFilter = getPathFilter(hadoopConf)
    val partitionSpecsAndLocs = scanPartitions(
-      spark, fs, pathFilter, root, Map(), table.partitionColumnNames.map(_.toLowerCase))
-    val parts = partitionSpecsAndLocs.map { case (spec, location) =>
-      // inherit table storage format (possibly except for location)
-      CatalogTablePartition(spec, table.storage.copy(locationUri = Some(location.toUri.toString)))
+      spark, fs, pathFilter, root, Map(), table.partitionColumnNames.map(_.toLowerCase), threshold)
+    val total = partitionSpecsAndLocs.length
+    logInfo(s"Found $total partitions in $root")
+
+    val partitionStats = if (spark.sqlContext.conf.gatherFastStats) {
+      gatherPartitionStats(spark, partitionSpecsAndLocs, fs, pathFilter, threshold)
+    } else {
+      GenMap.empty[String, PartitionStatistics]
    }
-    spark.sessionState.catalog.createPartitions(tableName,
-      parts.toArray[CatalogTablePartition], ignoreIfExists = true)
+    logInfo(s"Finished gathering the fast stats for all $total partitions.")
+
+    addPartitions(spark, table, partitionSpecsAndLocs, partitionStats)
+    logInfo(s"Recovered all partitions ($total).")
    Seq.empty[Row]
  }

@@ -487,15 +520,16 @@ case class AlterTableRecoverPartitionsCommand(
      filter: PathFilter,
      path: Path,
      spec: TablePartitionSpec,
-      partitionNames: Seq[String]): GenSeq[(TablePartitionSpec, Path)] = {
-    if (partitionNames.length == 0) {
+      partitionNames: Seq[String],
+      threshold: Int): GenSeq[(TablePartitionSpec, Path)] = {
+    if (partitionNames.isEmpty) {
      return Seq(spec -> path)
    }

-    val statuses = fs.listStatus(path)
-    val threshold = spark.conf.get("spark.rdd.parallelListingThreshold", "10").toInt
+    val statuses = fs.listStatus(path, filter)
    val statusPar: GenSeq[FileStatus] =
      if (partitionNames.length > 1 && statuses.length > threshold || partitionNames.length > 2) {
+        // parallelize the listing of partitions here, so that we can have better parallelism later.
        val parArray = statuses.par
        parArray.tasksupport = evalTaskSupport
        parArray
@@ -510,21 +544,89 @@ case class AlterTableRecoverPartitionsCommand(
        // TODO: Validate the value
        val value = PartitioningUtils.unescapePathName(ps(1))
        // comparing with case-insensitive, but preserve the case
-        if (columnName == partitionNames(0)) {
-          scanPartitions(
-            spark, fs, filter, st.getPath, spec ++ Map(columnName -> value), partitionNames.drop(1))
+        if (columnName == partitionNames.head) {
+          scanPartitions(spark, fs, filter, st.getPath, spec ++ Map(columnName -> value),
+            partitionNames.drop(1), threshold)
        } else {
-          logWarning(s"expect partition column ${partitionNames(0)}, but got ${ps(0)}, ignore it")
+          logWarning(s"expect partition column ${partitionNames.head}, but got ${ps(0)}, ignore it")
          Seq()
        }
      } else {
-        if (name != "_SUCCESS" && name != "_temporary" && !name.startsWith(".")) {
-          logWarning(s"ignore ${new Path(path, name)}")
-        }
+        logWarning(s"ignore ${new Path(path, name)}")
        Seq()
      }
    }
  }
+
+  private def gatherPartitionStats(
+      spark: SparkSession,
+      partitionSpecsAndLocs: GenSeq[(TablePartitionSpec, Path)],
+      fs: FileSystem,
+      pathFilter: PathFilter,
+      threshold: Int): GenMap[String, PartitionStatistics] = {
+    if (partitionSpecsAndLocs.length > threshold) {
+      val hadoopConf = spark.sparkContext.hadoopConfiguration
+      val serializableConfiguration = new SerializableConfiguration(hadoopConf)
+      val serializedPaths = partitionSpecsAndLocs.map(_._2.toString).toArray
+
+      // Cap the parallelism so that the file listing below does not generate too many tasks
+      // when defaultParallelism is large.
+      val numParallelism = Math.min(serializedPaths.length,
+        Math.min(spark.sparkContext.defaultParallelism, 10000))
+      // Gather the fast stats for all the partitions, otherwise the Hive metastore will list
+      // all the files for all the new partitions sequentially, which is very slow.
+      logInfo(s"Gather the fast stats in parallel using $numParallelism tasks.")
+      spark.sparkContext.parallelize(serializedPaths, numParallelism)
+        .mapPartitions { paths =>
+          val pathFilter = getPathFilter(serializableConfiguration.value)
+          paths.map(new Path(_)).map { path =>
+            val fs = path.getFileSystem(serializableConfiguration.value)
+            val statuses = fs.listStatus(path, pathFilter)
+            (path.toString, PartitionStatistics(statuses.length, statuses.map(_.getLen).sum))
+          }
+        }.collectAsMap()
+    } else {
+      partitionSpecsAndLocs.map { case (_, location) =>
+        val statuses = fs.listStatus(location, pathFilter)
+        (location.toString, PartitionStatistics(statuses.length, statuses.map(_.getLen).sum))
+      }.toMap
+    }
+  }
+
+  private def addPartitions(
+      spark: SparkSession,
+      table: CatalogTable,
+      partitionSpecsAndLocs: GenSeq[(TablePartitionSpec, Path)],
+      partitionStats: GenMap[String, PartitionStatistics]): Unit = {
+    val total = partitionSpecsAndLocs.length
+    var done = 0L
+    // The Hive metastore may not have enough memory to handle millions of partitions in a single
+    // RPC, so we split them into smaller batches. Since the Hive client is not thread safe, we
+    // cannot do this in parallel.
+    val batchSize = 100
+    partitionSpecsAndLocs.toIterator.grouped(batchSize).foreach { batch =>
+      val now = System.currentTimeMillis() / 1000
+      val parts = batch.map { case (spec, location) =>
+        val params = partitionStats.get(location.toString).map {
+          case PartitionStatistics(numFiles, totalSize) =>
+            // These two fast stats prevent the Hive metastore from listing the files again.
+            Map(NUM_FILES -> numFiles.toString,
+              TOTAL_SIZE -> totalSize.toString,
+              // Work around a bug in the Hive metastore that tries to mutate read-only parameters.
+              // see metastore/src/java/org/apache/hadoop/hive/metastore/HiveMetaStore.java
+              DDL_TIME -> now.toString)
+        }.getOrElse(Map.empty)
+        // inherit table storage format (possibly except for location)
+        CatalogTablePartition(
+          spec,
+          table.storage.copy(locationUri = Some(location.toUri.toString)),
+          params)
+      }
+      spark.sessionState.catalog.createPartitions(tableName, parts, ignoreIfExists = true)
+      done += parts.length
+      logDebug(s"Recovered ${parts.length} partitions ($done/$total so far)")
+    }
+  }
}
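
For context, here is a minimal usage sketch of the command this patch modifies. It assumes a Hive-enabled SparkSession and a hypothetical partitioned table `logs` whose partition directories already exist under the table location but are missing from the metastore; the table name and application name are illustrative, not part of the patch.

// Usage sketch (not part of the patch): recover partitions of a hypothetical table `logs`.
import org.apache.spark.sql.SparkSession

object RecoverPartitionsExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("recover-partitions-example")
      .enableHiveSupport()
      .getOrCreate()

    // Triggers AlterTableRecoverPartitionsCommand: scans the table location for partition
    // directories (e.g. dt=2016-08-01/) and registers them in the catalog.
    spark.sql("ALTER TABLE logs RECOVER PARTITIONS")

    // The recovered partitions are now visible; with fast-stats gathering enabled, each
    // partition also carries numFiles/totalSize parameters so the metastore need not re-list files.
    spark.sql("SHOW PARTITIONS logs").show(false)

    spark.stop()
  }
}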
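The core of gatherPartitionStats above is the distributed directory listing. Below is a minimal standalone sketch of that pattern under simplifying assumptions: it rebuilds a default Hadoop Configuration on the executors (the patch itself ships the driver's configuration through the Spark-internal SerializableConfiguration), and the object and method names are purely illustrative.

// Standalone sketch of the distributed listing pattern; names are illustrative.
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.spark.sql.SparkSession

object FastStatsSketch {
  /** Lists each directory on the executors and returns (numFiles, totalSize) per directory. */
  def listDirs(spark: SparkSession, dirs: Seq[String]): Map[String, (Int, Long)] = {
    if (dirs.isEmpty) {
      Map.empty
    } else {
      // Cap the number of tasks, analogous to numParallelism in the patch.
      val numTasks = math.max(1, math.min(dirs.length, spark.sparkContext.defaultParallelism))
      spark.sparkContext
        .parallelize(dirs, numTasks)
        .map { dir =>
          val path = new Path(dir)
          // Assumes a default Configuration on the executors is enough to reach the file system.
          val fs = path.getFileSystem(new Configuration())
          val statuses = fs.listStatus(path)
          (dir, (statuses.length, statuses.map(_.getLen).sum))
        }
        .collectAsMap()
        .toMap
    }
  }
}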