[SPARK-16905] SQL DDL: MSCK REPAIR TABLE #14500
Changes from all commits: c5edbdf, 89d22f4, f338516, 7f4f38d, e478c3a, e5906cf
@@ -17,18 +17,23 @@
package org.apache.spark.sql.execution.command

import scala.collection.GenSeq
import scala.collection.parallel.ForkJoinTaskSupport
import scala.concurrent.forkjoin.ForkJoinPool
import scala.util.control.NonFatal

import org.apache.hadoop.fs.{FileStatus, FileSystem, Path, PathFilter}
import org.apache.hadoop.mapred.{FileInputFormat, JobConf}

import org.apache.spark.sql.{AnalysisException, Row, SparkSession}
import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.catalyst.catalog.{BucketSpec, CatalogDatabase, CatalogTable}
import org.apache.spark.sql.catalyst.catalog.{CatalogTablePartition, CatalogTableType, SessionCatalog}
import org.apache.spark.sql.catalyst.catalog.CatalogTypes.TablePartitionSpec
import org.apache.spark.sql.catalyst.catalog.{BucketSpec, CatalogDatabase, CatalogTable, CatalogTablePartition, CatalogTableType, SessionCatalog}
import org.apache.spark.sql.catalyst.catalog.CatalogTypes._
import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference}
import org.apache.spark.sql.execution.command.CreateDataSourceTableUtils._
import org.apache.spark.sql.execution.datasources.PartitioningUtils
import org.apache.spark.sql.types._

// Note: The definition of these commands are based on the ones described in
// https://cwiki.apache.org/confluence/display/Hive/LanguageManual+DDL
@@ -425,6 +430,111 @@ case class AlterTableDropPartitionCommand(
}
/**
 * Recover Partitions in ALTER TABLE: recover all the partition in the directory of a table and
 * update the catalog.
 *
 * The syntax of this command is:
 * {{{
 *   ALTER TABLE table RECOVER PARTITIONS;
 *   MSCK REPAIR TABLE table;
 * }}}
 */
case class AlterTableRecoverPartitionsCommand(
    tableName: TableIdentifier,
    cmd: String = "ALTER TABLE RECOVER PARTITIONS") extends RunnableCommand {
  override def run(spark: SparkSession): Seq[Row] = {
    val catalog = spark.sessionState.catalog
    if (!catalog.tableExists(tableName)) {
      throw new AnalysisException(s"Table $tableName in $cmd does not exist.")
    }
    val table = catalog.getTableMetadata(tableName)
    if (catalog.isTemporaryTable(tableName)) {
      throw new AnalysisException(
        s"Operation not allowed: $cmd on temporary tables: $tableName")
    }
    if (DDLUtils.isDatasourceTable(table)) {
      throw new AnalysisException(
        s"Operation not allowed: $cmd on datasource tables: $tableName")
    }
    if (table.tableType != CatalogTableType.EXTERNAL) {
      throw new AnalysisException(
        s"Operation not allowed: $cmd only works on external tables: $tableName")
    }
    if (!DDLUtils.isTablePartitioned(table)) {
      throw new AnalysisException(
        s"Operation not allowed: $cmd only works on partitioned tables: $tableName")
    }
    if (table.storage.locationUri.isEmpty) {
      throw new AnalysisException(
        s"Operation not allowed: $cmd only works on table with location provided: $tableName")
    }

    val root = new Path(table.storage.locationUri.get)
    val fs = root.getFileSystem(spark.sparkContext.hadoopConfiguration)
    // Dummy jobconf to get to the pathFilter defined in configuration
    // It's very expensive to create a JobConf(ClassUtil.findContainingJar() is slow)
    val jobConf = new JobConf(spark.sparkContext.hadoopConfiguration, this.getClass)
    val pathFilter = FileInputFormat.getInputPathFilter(jobConf)
    val partitionSpecsAndLocs = scanPartitions(
      spark, fs, pathFilter, root, Map(), table.partitionColumnNames.map(_.toLowerCase))
    val parts = partitionSpecsAndLocs.map { case (spec, location) =>
      // inherit table storage format (possibly except for location)
      CatalogTablePartition(spec, table.storage.copy(locationUri = Some(location.toUri.toString)))
    }
    spark.sessionState.catalog.createPartitions(tableName,
      parts.toArray[CatalogTablePartition], ignoreIfExists = true)
    Seq.empty[Row]
  }

  @transient private lazy val evalTaskSupport = new ForkJoinTaskSupport(new ForkJoinPool(8))

  private def scanPartitions(
      spark: SparkSession,
      fs: FileSystem,
      filter: PathFilter,
      path: Path,
      spec: TablePartitionSpec,
      partitionNames: Seq[String]): GenSeq[(TablePartitionSpec, Path)] = {
    if (partitionNames.length == 0) {
      return Seq(spec -> path)
    }

    val statuses = fs.listStatus(path)
    val threshold = spark.conf.get("spark.rdd.parallelListingThreshold", "10").toInt
    val statusPar: GenSeq[FileStatus] =
      if (partitionNames.length > 1 && statuses.length > threshold || partitionNames.length > 2) {
        val parArray = statuses.par
Contributor: I didn't look carefully - but if you are using the default exec context, please create a new one; otherwise it'd block.

Contributor: Cool. Can we make it explicit, e.g.

Contributor (author): This is copied from UnionRDD.

Contributor (author): I did not figure out how it works, at least
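For reference, here is a minimal, self-contained sketch (not part of this patch; the pool size of 8 simply mirrors the `evalTaskSupport` used below) of what attaching an explicit task support to a Scala parallel collection looks like, so the work does not run on the shared default pool:

```scala
import scala.collection.parallel.ForkJoinTaskSupport
import scala.concurrent.forkjoin.ForkJoinPool

object ParallelListingSketch {
  def main(args: Array[String]): Unit = {
    // A dedicated pool, so this work does not block tasks running on the
    // default (global) execution context.
    val pool = new ForkJoinPool(8)

    val items = (1 to 100).toArray.par
    // Without this assignment the parallel collection uses the shared default
    // ForkJoinPool; an explicit ForkJoinTaskSupport keeps the parallelism
    // bounded and isolated.
    items.tasksupport = new ForkJoinTaskSupport(pool)

    val doubled = items.map(_ * 2)
    println(doubled.sum)
    pool.shutdown()
  }
}
```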
        parArray.tasksupport = evalTaskSupport
        parArray
      } else {
        statuses
      }
    statusPar.flatMap { st =>
      val name = st.getPath.getName
      if (st.isDirectory && name.contains("=")) {
        val ps = name.split("=", 2)
        val columnName = PartitioningUtils.unescapePathName(ps(0)).toLowerCase
        // TODO: Validate the value
        val value = PartitioningUtils.unescapePathName(ps(1))
Member: Do we need to check if the value is valid? E.g., for a partition column "a" of IntegerType, "a=abc" is invalid.

Contributor (author): We could have a TODO here.

Member: Can this escaping cause problems in (say) S3 for objects of the form "foo%20bar"?

Contributor (author): If the partitions are generated by Spark, they can be unescaped back correctly. For others, there could be compatibility issues. For example, Spark does not escape

Member: Yes, that makes sense.
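On the validation TODO discussed above, a rough sketch of what such a check could look like; `isValid` is a hypothetical helper that is not part of this patch, and real validation would presumably go through Spark's cast machinery rather than ad-hoc parsing:

```scala
import scala.util.Try

import org.apache.spark.sql.types.{DataType, DateType, IntegerType, LongType, StringType}

object PartitionValueCheck {
  // Hypothetical helper: returns true if the raw directory value can be
  // interpreted as the partition column's type, e.g. "abc" is rejected for
  // IntegerType so a directory "a=abc" could be flagged or skipped.
  def isValid(raw: String, dataType: DataType): Boolean = dataType match {
    case IntegerType => Try(raw.toInt).isSuccess
    case LongType    => Try(raw.toLong).isSuccess
    case DateType    => Try(java.sql.Date.valueOf(raw)).isSuccess
    case StringType  => true
    case _           => true // fall back to accepting the value unchecked
  }

  def main(args: Array[String]): Unit = {
    println(isValid("abc", IntegerType)) // false
    println(isValid("123", IntegerType)) // true
  }
}
```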
        // comparing with case-insensitive, but preserve the case
        if (columnName == partitionNames(0)) {
Member: A directory name like "a=" will pass this condition and get an empty partition value. Is that allowed?

Contributor (author): I think it's valid.
          scanPartitions(
            spark, fs, filter, st.getPath, spec ++ Map(columnName -> value), partitionNames.drop(1))
        } else {
          logWarning(s"expect partition column ${partitionNames(0)}, but got ${ps(0)}, ignore it")
          Seq()
Member: Like Hive, we may consider throwing an exception here (that could be turned off via a config).

Contributor (author): Hive only throws an exception when there are disallowed characters in the value, not in other cases. I'd like to avoid adding any configs if there is no serious problem here.
        }
      } else {
        if (name != "_SUCCESS" && name != "_temporary" && !name.startsWith(".")) {
          logWarning(s"ignore ${new Path(path, name)}")
        }
        Seq()
      }
    }
  }
}
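For context, a usage sketch of the new command. The table name, columns, and location below are made up for illustration; per the checks in `run` above, the target has to be an external, partitioned, non-datasource table with a location:

```scala
import org.apache.spark.sql.SparkSession

object RecoverPartitionsExample {
  def main(args: Array[String]): Unit = {
    // Hive support is needed for CREATE EXTERNAL TABLE in this sketch.
    val spark = SparkSession.builder()
      .appName("recover-partitions-example")
      .enableHiveSupport()
      .getOrCreate()

    // Hypothetical external table whose partition directories
    // (e.g. /tmp/logs/ds=2016-08-01/hour=00/) were written outside of Spark.
    spark.sql(
      """CREATE EXTERNAL TABLE logs (msg STRING)
        |PARTITIONED BY (ds STRING, hour STRING)
        |LOCATION '/tmp/logs'""".stripMargin)

    // Both forms scan the table location and register the discovered partitions.
    spark.sql("ALTER TABLE logs RECOVER PARTITIONS")
    spark.sql("MSCK REPAIR TABLE logs")

    spark.sql("SHOW PARTITIONS logs").show()
    spark.stop()
  }
}
```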
/**
 * A command that sets the location of a table or a partition.
@@ -864,6 +864,55 @@ class DDLSuite extends QueryTest with SharedSQLContext with BeforeAndAfterEach {
    testAddPartitions(isDatasourceTable = true)
  }

  test("alter table: recover partitions (sequential)") {
    withSQLConf("spark.rdd.parallelListingThreshold" -> "1") {
      testRecoverPartitions()
    }
  }

  test("alter table: recover partition (parallel)") {
    withSQLConf("spark.rdd.parallelListingThreshold" -> "10") {
      testRecoverPartitions()
    }
  }

  private def testRecoverPartitions() {
    val catalog = spark.sessionState.catalog
    // table to alter does not exist
    intercept[AnalysisException] {
      sql("ALTER TABLE does_not_exist RECOVER PARTITIONS")
    }

    val tableIdent = TableIdentifier("tab1")
    createTable(catalog, tableIdent)
    val part1 = Map("a" -> "1", "b" -> "5")
    createTablePartition(catalog, part1, tableIdent)
    assert(catalog.listPartitions(tableIdent).map(_.spec).toSet == Set(part1))

    val part2 = Map("a" -> "2", "b" -> "6")
    val root = new Path(catalog.getTableMetadata(tableIdent).storage.locationUri.get)
    val fs = root.getFileSystem(spark.sparkContext.hadoopConfiguration)
    // valid
    fs.mkdirs(new Path(new Path(root, "a=1"), "b=5"))
    fs.mkdirs(new Path(new Path(root, "A=2"), "B=6"))
    // invalid
    fs.mkdirs(new Path(new Path(root, "a"), "b"))  // bad name
    fs.mkdirs(new Path(new Path(root, "b=1"), "a=1"))  // wrong order
    fs.mkdirs(new Path(root, "a=4"))  // not enough columns
    fs.createNewFile(new Path(new Path(root, "a=1"), "b=4"))  // file
    fs.createNewFile(new Path(new Path(root, "a=1"), "_SUCCESS"))  // _SUCCESS
    fs.mkdirs(new Path(new Path(root, "a=1"), "_temporary"))  // _temporary
    fs.mkdirs(new Path(new Path(root, "a=1"), ".b=4"))  // start with .

    try {
      sql("ALTER TABLE tab1 RECOVER PARTITIONS")
      assert(catalog.listPartitions(tableIdent).map(_.spec).toSet ==
        Set(part1, part2))
    } finally {
      fs.delete(root, true)
    }
  }
Contributor: Let's add tests to exercise the command more. Here are three examples.

Contributor (author): Done.
  test("alter table: add partition is not supported for views") {
    assertUnsupported("ALTER VIEW dbx.tab1 ADD IF NOT EXISTS PARTITION (b='2')")
  }
What will happen if we get thousands of new partitions, or tens of thousands of new partitions?

Good question, see the implementation in HiveShim: all these partitions will be inserted into Hive sequentially, so grouping them into batches will not help here.

No, that is only true for Hive <= 0.12; for Hive 0.13+ they are sent in a single RPC, so we should verify what the limit is for a single RPC.

It seems that the Hive Metastore can't handle an RPC with millions of partitions; I will send a patch to do it in batches.
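As a rough illustration of the batching idea mentioned above (the helper name and batch size are hypothetical; the actual follow-up patch may look different), the discovered partitions could be sent to the metastore in fixed-size groups so that no single RPC carries an unbounded number of partitions:

```scala
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.catalyst.catalog.CatalogTablePartition

object BatchedPartitionCreation {
  // Hypothetical batch size; a safe value depends on the metastore's RPC limits.
  val batchSize = 100

  def createInBatches(
      spark: SparkSession,
      table: TableIdentifier,
      parts: Seq[CatalogTablePartition]): Unit = {
    // Send the discovered partitions to the metastore in fixed-size groups
    // instead of one huge createPartitions call.
    parts.grouped(batchSize).foreach { batch =>
      spark.sessionState.catalog.createPartitions(table, batch, ignoreIfExists = true)
    }
  }
}
```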