
package org.apache.spark.sql.execution.command

-import scala.collection.GenSeq
+import scala.collection.{GenMap, GenSeq}
import scala.collection.parallel.ForkJoinTaskSupport
import scala.concurrent.forkjoin.ForkJoinPool
import scala.util.control.NonFatal

-import org.apache.hadoop.fs.{FileStatus, FileSystem, Path, PathFilter}
+import org.apache.hadoop.conf.Configuration
+import org.apache.hadoop.fs._
import org.apache.hadoop.mapred.{FileInputFormat, JobConf}

import org.apache.spark.sql.{AnalysisException, Row, SparkSession}
@@ -34,6 +35,7 @@ import org.apache.spark.sql.execution.command.CreateDataSourceTableUtils._
import org.apache.spark.sql.execution.datasources.BucketSpec
import org.apache.spark.sql.execution.datasources.PartitioningUtils
import org.apache.spark.sql.types._
+import org.apache.spark.util.SerializableConfiguration

// Note: The definitions of these commands are based on the ones described in
// https://cwiki.apache.org/confluence/display/Hive/LanguageManual+DDL
@@ -429,6 +431,9 @@ case class AlterTableDropPartitionCommand(

}

+
+case class PartitionStatistics(numFiles: Int, totalSize: Long)
+
/**
 * Recover Partitions in ALTER TABLE: recover all the partitions in the directory of a table and
 * update the catalog.
@@ -442,6 +447,31 @@ case class AlterTableDropPartitionCommand(
case class AlterTableRecoverPartitionsCommand(
    tableName: TableIdentifier,
    cmd: String = "ALTER TABLE RECOVER PARTITIONS") extends RunnableCommand {
+
+  // These are the statistics that can be collected quickly without requiring a scan of the data;
+  // see https://github.com/apache/hive/blob/master/
+  //   common/src/java/org/apache/hadoop/hive/common/StatsSetupConst.java
+  val NUM_FILES = "numFiles"
+  val TOTAL_SIZE = "totalSize"
+  val DDL_TIME = "transient_lastDdlTime"
+
+  private def getPathFilter(hadoopConf: Configuration): PathFilter = {
+    // Dummy jobconf to get to the pathFilter defined in configuration
+    // It's very expensive to create a JobConf (ClassUtil.findContainingJar() is slow)
+    val jobConf = new JobConf(hadoopConf, this.getClass)
+    val pathFilter = FileInputFormat.getInputPathFilter(jobConf)
+    new PathFilter {
+      override def accept(path: Path): Boolean = {
+        val name = path.getName
+        if (name != "_SUCCESS" && name != "_temporary" && !name.startsWith(".")) {
+          pathFilter == null || pathFilter.accept(path)
+        } else {
+          false
+        }
+      }
+    }
+  }
+
  override def run(spark: SparkSession): Seq[Row] = {
    val catalog = spark.sessionState.catalog
    if (!catalog.tableExists(tableName)) {
@@ -456,10 +486,6 @@ case class AlterTableRecoverPartitionsCommand(
      throw new AnalysisException(
        s"Operation not allowed: $cmd on datasource tables: $tableName")
    }
-    if (table.tableType != CatalogTableType.EXTERNAL) {
-      throw new AnalysisException(
-        s"Operation not allowed: $cmd only works on external tables: $tableName")
-    }
    if (!DDLUtils.isTablePartitioned(table)) {
      throw new AnalysisException(
        s"Operation not allowed: $cmd only works on partitioned tables: $tableName")
@@ -470,19 +496,26 @@ case class AlterTableRecoverPartitionsCommand(
    }

    val root = new Path(table.storage.locationUri.get)
+    logInfo(s"Recover all the partitions in $root")
    val fs = root.getFileSystem(spark.sparkContext.hadoopConfiguration)
-    // Dummy jobconf to get to the pathFilter defined in configuration
-    // It's very expensive to create a JobConf (ClassUtil.findContainingJar() is slow)
-    val jobConf = new JobConf(spark.sparkContext.hadoopConfiguration, this.getClass)
-    val pathFilter = FileInputFormat.getInputPathFilter(jobConf)
+
+    val threshold = spark.conf.get("spark.rdd.parallelListingThreshold", "10").toInt
+    val hadoopConf = spark.sparkContext.hadoopConfiguration
+    val pathFilter = getPathFilter(hadoopConf)
    val partitionSpecsAndLocs = scanPartitions(
-      spark, fs, pathFilter, root, Map(), table.partitionColumnNames.map(_.toLowerCase))
-    val parts = partitionSpecsAndLocs.map { case (spec, location) =>
-      // inherit table storage format (possibly except for location)
-      CatalogTablePartition(spec, table.storage.copy(locationUri = Some(location.toUri.toString)))
+      spark, fs, pathFilter, root, Map(), table.partitionColumnNames.map(_.toLowerCase), threshold)
+    val total = partitionSpecsAndLocs.length
+    logInfo(s"Found $total partitions in $root")
+
+    val partitionStats = if (spark.sqlContext.conf.gatherFastStats) {
+      gatherPartitionStats(spark, partitionSpecsAndLocs, fs, pathFilter, threshold)
+    } else {
+      GenMap.empty[String, PartitionStatistics]
    }
-    spark.sessionState.catalog.createPartitions(tableName,
-      parts.toArray[CatalogTablePartition], ignoreIfExists = true)
+    logInfo(s"Finished to gather the fast stats for all $total partitions.")
+
+    addPartitions(spark, table, partitionSpecsAndLocs, partitionStats)
+    logInfo(s"Recovered all partitions ($total).")
    Seq.empty[Row]
  }

@@ -494,15 +527,16 @@ case class AlterTableRecoverPartitionsCommand(
      filter: PathFilter,
      path: Path,
      spec: TablePartitionSpec,
-      partitionNames: Seq[String]): GenSeq[(TablePartitionSpec, Path)] = {
-    if (partitionNames.length == 0) {
+      partitionNames: Seq[String],
+      threshold: Int): GenSeq[(TablePartitionSpec, Path)] = {
+    if (partitionNames.isEmpty) {
      return Seq(spec -> path)
    }

-    val statuses = fs.listStatus(path)
-    val threshold = spark.conf.get("spark.rdd.parallelListingThreshold", "10").toInt
+    val statuses = fs.listStatus(path, filter)
    val statusPar: GenSeq[FileStatus] =
      if (partitionNames.length > 1 && statuses.length > threshold || partitionNames.length > 2) {
+        // parallelize the list of partitions here, then we can have better parallelism later.
        val parArray = statuses.par
        parArray.tasksupport = evalTaskSupport
        parArray
@@ -517,21 +551,89 @@ case class AlterTableRecoverPartitionsCommand(
        // TODO: Validate the value
        val value = PartitioningUtils.unescapePathName(ps(1))
        // comparing with case-insensitive, but preserve the case
-        if (columnName == partitionNames(0)) {
-          scanPartitions(
-            spark, fs, filter, st.getPath, spec ++ Map(columnName -> value), partitionNames.drop(1))
+        if (columnName == partitionNames.head) {
+          scanPartitions(spark, fs, filter, st.getPath, spec ++ Map(columnName -> value),
+            partitionNames.drop(1), threshold)
        } else {
-          logWarning(s"expect partition column ${partitionNames(0)}, but got ${ps(0)}, ignore it")
+          logWarning(s"expect partition column ${partitionNames.head}, but got ${ps(0)}, ignore it")
          Seq()
        }
      } else {
-        if (name != "_SUCCESS" && name != "_temporary" && !name.startsWith(".")) {
-          logWarning(s"ignore ${new Path(path, name)}")
-        }
+        logWarning(s"ignore ${new Path(path, name)}")
        Seq()
      }
    }
  }
+
+  private def gatherPartitionStats(
+      spark: SparkSession,
+      partitionSpecsAndLocs: GenSeq[(TablePartitionSpec, Path)],
+      fs: FileSystem,
+      pathFilter: PathFilter,
+      threshold: Int): GenMap[String, PartitionStatistics] = {
+    if (partitionSpecsAndLocs.length > threshold) {
+      val hadoopConf = spark.sparkContext.hadoopConfiguration
+      val serializableConfiguration = new SerializableConfiguration(hadoopConf)
+      val serializedPaths = partitionSpecsAndLocs.map(_._2.toString).toArray
+
+      // Set the number of parallel tasks to prevent the following file listing from generating
+      // too many tasks when defaultParallelism is large.
+      val numParallelism = Math.min(serializedPaths.length,
+        Math.min(spark.sparkContext.defaultParallelism, 10000))
+      // Gather the fast stats for all the partitions, otherwise the Hive metastore will list
+      // all the files for each new partition sequentially, which is very slow.
+      logInfo(s"Gather the fast stats in parallel using $numParallelism tasks.")
+      spark.sparkContext.parallelize(serializedPaths, numParallelism)
+        .mapPartitions { paths =>
+          val pathFilter = getPathFilter(serializableConfiguration.value)
+          paths.map(new Path(_)).map { path =>
+            val fs = path.getFileSystem(serializableConfiguration.value)
+            val statuses = fs.listStatus(path, pathFilter)
+            (path.toString, PartitionStatistics(statuses.length, statuses.map(_.getLen).sum))
+          }
+        }.collectAsMap()
+    } else {
+      partitionSpecsAndLocs.map { case (_, location) =>
+        val statuses = fs.listStatus(location, pathFilter)
+        (location.toString, PartitionStatistics(statuses.length, statuses.map(_.getLen).sum))
+      }.toMap
+    }
+  }
+
+  private def addPartitions(
+      spark: SparkSession,
+      table: CatalogTable,
+      partitionSpecsAndLocs: GenSeq[(TablePartitionSpec, Path)],
+      partitionStats: GenMap[String, PartitionStatistics]): Unit = {
+    val total = partitionSpecsAndLocs.length
+    var done = 0L
+    // The Hive metastore may not have enough memory to handle millions of partitions in a single
+    // RPC, so we split them into smaller batches. Since the Hive client is not thread safe, we
+    // cannot do this in parallel.
+    val batchSize = 100
+    partitionSpecsAndLocs.toIterator.grouped(batchSize).foreach { batch =>
+      val now = System.currentTimeMillis() / 1000
+      val parts = batch.map { case (spec, location) =>
+        val params = partitionStats.get(location.toString).map {
+          case PartitionStatistics(numFiles, totalSize) =>
+            // These two fast stats prevent the Hive metastore from listing the files again.
+            Map(NUM_FILES -> numFiles.toString,
+              TOTAL_SIZE -> totalSize.toString,
+              // Work around a bug in the Hive metastore that tries to mutate read-only parameters.
+              // See metastore/src/java/org/apache/hadoop/hive/metastore/HiveMetaStore.java
+              DDL_TIME -> now.toString)
+        }.getOrElse(Map.empty)
+        // inherit table storage format (possibly except for location)
+        CatalogTablePartition(
+          spec,
+          table.storage.copy(locationUri = Some(location.toUri.toString)),
+          params)
+      }
+      spark.sessionState.catalog.createPartitions(tableName, parts, ignoreIfExists = true)
+      done += parts.length
+      logDebug(s"Recovered ${parts.length} partitions ($done/$total so far)")
+    }
+  }
}
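
For readers trying the patch out, here is a hedged, end-to-end sketch of how the updated command is exercised from SQL. The table name, partition column, and the step of copying partition directories in by hand are hypothetical and only for illustration; they are not part of this change, and the session is assumed to have Hive support enabled.

// Hypothetical walkthrough; assumes a SparkSession built with .enableHiveSupport().
spark.sql("CREATE TABLE logs (line STRING) PARTITIONED BY (ds STRING) STORED AS TEXTFILE")

// Partition directories such as <table location>/ds=2016-08-01/ are created outside of
// Spark (e.g. copied in with hdfs dfs -put), so the metastore does not know about them yet.

// Recover them. When the gatherFastStats flag is enabled, each new partition is registered
// together with its numFiles/totalSize parameters, so the Hive metastore does not have to
// re-list the files itself.
spark.sql("ALTER TABLE logs RECOVER PARTITIONS")

// The recovered partitions are now visible to the catalog and to queries.
spark.sql("SHOW PARTITIONS logs").show()
spark.sql("SELECT ds, count(*) FROM logs GROUP BY ds").show()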
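
And a minimal, standalone sketch of the listing pattern that gatherPartitionStats relies on: ship the Hadoop configuration to the executors, list each partition directory there, and collect (path, numFiles, totalSize) back on the driver. Spark's own SerializableConfiguration is private[spark], so the sketch carries a small stand-in wrapper; the paths and object names below are made up.

import java.io.{ObjectInputStream, ObjectOutputStream}

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path

import org.apache.spark.sql.SparkSession

// Stand-in for org.apache.spark.util.SerializableConfiguration, which is private[spark].
class SerializableHadoopConf(@transient var value: Configuration) extends Serializable {
  private def writeObject(out: ObjectOutputStream): Unit = {
    out.defaultWriteObject()
    value.write(out)
  }
  private def readObject(in: ObjectInputStream): Unit = {
    in.defaultReadObject()
    value = new Configuration(false)
    value.readFields(in)
  }
}

object FastStatsSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("fast-stats-sketch").getOrCreate()

    // Hypothetical partition locations; in the command they come from scanPartitions().
    val partitionPaths = Seq(
      "hdfs:///warehouse/logs/ds=2016-08-01",
      "hdfs:///warehouse/logs/ds=2016-08-02")

    val serializableConf = new SerializableHadoopConf(spark.sparkContext.hadoopConfiguration)
    // Cap the number of tasks, mirroring the numParallelism bound in the patch.
    val numTasks = math.min(partitionPaths.length,
      math.min(spark.sparkContext.defaultParallelism, 10000))

    val stats = spark.sparkContext
      .parallelize(partitionPaths, numTasks)
      .map { location =>
        val path = new Path(location)
        val fs = path.getFileSystem(serializableConf.value)  // resolved on the executor
        val statuses = fs.listStatus(path)
        (location, (statuses.length, statuses.map(_.getLen).sum))
      }
      .collectAsMap()

    stats.foreach { case (location, (numFiles, totalSize)) =>
      println(s"$location: numFiles=$numFiles totalSize=$totalSize")
    }
    spark.stop()
  }
}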