Commit 78b9ffc

analyze all tables in a specific database

1 parent 73412ff commit 78b9ffc

7 files changed, +129 −0 lines changed

sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4

Lines changed: 2 additions & 0 deletions

@@ -134,6 +134,8 @@ statement
         (AS? query)? #replaceTable
     | ANALYZE TABLE multipartIdentifier partitionSpec? COMPUTE STATISTICS
         (identifier | FOR COLUMNS identifierSeq | FOR ALL COLUMNS)? #analyze
+    | ANALYZE TABLES ((FROM | IN) multipartIdentifier)? COMPUTE STATISTICS
+        (identifier)? #analyzeTables
     | ALTER TABLE multipartIdentifier
         ADD (COLUMN | COLUMNS)
         columns=qualifiedColTypeWithPositionList #addTableColumns
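
The new #analyzeTables alternative makes the FROM/IN namespace clause optional and, like #analyze, allows one optional trailing identifier, which the visitor below validates as NOSCAN. A minimal sketch of the accepted forms, assuming a SparkSession named `spark`; the database name `mydb` is illustrative:

// Statements the new rule admits.
spark.sql("ANALYZE TABLES COMPUTE STATISTICS")                   // current database, full scan
spark.sql("ANALYZE TABLES IN mydb COMPUTE STATISTICS")           // explicit database
spark.sql("ANALYZE TABLES FROM mydb COMPUTE STATISTICS NOSCAN")  // sizes only, no row counts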

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala

Lines changed: 19 additions & 0 deletions

@@ -3547,6 +3547,25 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with SQLConfHelper with Logging {
     }
   }

+  /**
+   * Create an [[AnalyzeTables]].
+   * Example SQL for analyzing all tables in default database:
+   * {{{
+   *   ANALYZE TABLES IN default COMPUTE STATISTICS;
+   * }}}
+   */
+  override def visitAnalyzeTables(ctx: AnalyzeTablesContext): LogicalPlan = withOrigin(ctx) {
+    if (ctx.identifier != null &&
+        ctx.identifier.getText.toLowerCase(Locale.ROOT) != "noscan") {
+      throw new ParseException(s"Expected `NOSCAN` instead of `${ctx.identifier.getText}`",
+        ctx.identifier())
+    }
+    val multiPart = Option(ctx.multipartIdentifier).map(visitMultipartIdentifier)
+    AnalyzeTables(
+      UnresolvedNamespace(multiPart.getOrElse(Seq.empty[String])),
+      noScan = ctx.identifier != null)
+  }
+
   /**
    * Create a [[RepairTableStatement]].
    *
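
Because the grammar captures the trailing word as a generic identifier, the visitor rejects anything other than NOSCAN at parse time. A minimal round-trip sketch, assuming a SparkSession named `spark`:

// Parsing the statement yields the unresolved AnalyzeTables node built above.
val plan = spark.sessionState.sqlParser.parsePlan(
  "ANALYZE TABLES IN default COMPUTE STATISTICS NOSCAN")
// plan: AnalyzeTables(UnresolvedNamespace(Seq("default")), noScan = true)
println(plan)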

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/v2Commands.scala

Lines changed: 9 additions & 0 deletions

@@ -602,6 +602,15 @@ case class AnalyzeTable(
   override def children: Seq[LogicalPlan] = child :: Nil
 }

+/**
+ * The logical plan of the ANALYZE TABLES command.
+ */
+case class AnalyzeTables(
+    namespace: LogicalPlan,
+    noScan: Boolean) extends Command {
+  override def children: Seq[LogicalPlan] = Seq(namespace)
+}
+
 /**
  * The logical plan of the ANALYZE TABLE FOR COLUMNS command.
  */
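
Keeping the namespace as a child LogicalPlan lets the analyzer resolve the UnresolvedNamespace placeholder with the ordinary resolution rules before the command runs. A small sketch of constructing the node the way the parser does:

import org.apache.spark.sql.catalyst.analysis.UnresolvedNamespace

// The namespace is a child, so the analyzer visits and resolves it.
val node = AnalyzeTables(UnresolvedNamespace(Seq("default")), noScan = false)
assert(node.children.head == node.namespace)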

sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/DDLParserSuite.scala

Lines changed: 9 additions & 0 deletions

@@ -1898,6 +1898,15 @@ class DDLParserSuite extends AnalysisTest {
       "Expected `NOSCAN` instead of `xxxx`")
   }

+  test("analyze tables statistics") {
+    comparePlans(parsePlan("analyze tables in a.b.c compute statistics"),
+      AnalyzeTables(UnresolvedNamespace(Seq("a", "b", "c")), noScan = false))
+    comparePlans(parsePlan("analyze tables in a compute statistics noscan"),
+      AnalyzeTables(UnresolvedNamespace(Seq("a")), noScan = true))
+    intercept("analyze tables in a.b.c compute statistics xxxx",
+      "Expected `NOSCAN` instead of `xxxx`")
+  }
+
   test("analyze table column statistics") {
     intercept("ANALYZE TABLE a.b.c COMPUTE STATISTICS FOR COLUMNS", "")
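
Outside the test harness the same failure surfaces as a ParseException. A hedged sketch, assuming a SparkSession named `spark`:

import org.apache.spark.sql.catalyst.parser.ParseException

// Any trailing identifier other than NOSCAN fails at parse time.
try {
  spark.sessionState.sqlParser.parsePlan(
    "analyze tables in a compute statistics xxxx")
} catch {
  case e: ParseException =>
    assert(e.getMessage.contains("Expected `NOSCAN` instead of `xxxx`"))
}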

sql/core/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveSessionCatalog.scala

Lines changed: 7 additions & 0 deletions

@@ -401,6 +401,13 @@ class ResolveSessionCatalog(
       AnalyzePartitionCommand(ident.asTableIdentifier, partitionSpec, noScan)
     }

+    case AnalyzeTables(SessionCatalogAndNamespace(_, ns), noScan) =>
+      if (ns.length > 1) {
+        throw new AnalysisException(
+          s"The database name is not valid: ${ns.quoted}")
+      }
+      AnalyzeTablesCommand(ns.headOption, noScan)
+
    case AnalyzeColumn(ResolvedV1TableOrViewIdentifier(ident), columnNames, allColumns) =>
      AnalyzeColumnCommand(ident.asTableIdentifier, columnNames, allColumns)
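
The v1 session catalog only understands single-part database names, so a nested namespace is rejected during analysis rather than silently truncated. A sketch of the failure mode, assuming a SparkSession named `spark`:

import org.apache.spark.sql.AnalysisException

// A multi-part namespace cannot map onto a v1 database.
try {
  spark.sql("ANALYZE TABLES IN a.b.c COMPUTE STATISTICS")
} catch {
  case e: AnalysisException =>
    assert(e.getMessage.contains("The database name is not valid: a.b.c"))
}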

sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeTablesCommand.scala

Lines changed: 63 additions & 0 deletions

@@ -0,0 +1,63 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution.command
+
+import org.apache.spark.sql.{Row, SparkSession}
+import org.apache.spark.sql.catalyst.TableIdentifier
+import org.apache.spark.sql.catalyst.catalog.CatalogTableType
+
+
+/**
+ * Analyzes all tables in the given database to generate statistics.
+ */
+case class AnalyzeTablesCommand(
+    databaseName: Option[String],
+    noScan: Boolean) extends RunnableCommand {
+
+  override def run(sparkSession: SparkSession): Seq[Row] = {
+
+    val catalog = sparkSession.sessionState.catalog
+    val db = databaseName.getOrElse(catalog.getCurrentDatabase)
+    catalog.listTables(db).foreach { tbl =>
+      try {
+        val tableMeta = catalog.getTableMetadata(tbl)
+        if (tableMeta.tableType == CatalogTableType.MANAGED ||
+            tableMeta.tableType == CatalogTableType.EXTERNAL) {
+          // Compute stats for the whole table
+          val newTotalSize = CommandUtils.calculateTotalSize(sparkSession, tableMeta)
+          val tableIdentWithDB = TableIdentifier(tbl.table, Some(db))
+          val newRowCount =
+            if (noScan) None else Some(BigInt(sparkSession.table(tableIdentWithDB).count()))
+
+          // Update the metastore if the above statistics of the table are different from those
+          // recorded in the metastore.
+          val newStats =
+            CommandUtils.compareAndGetNewStats(tableMeta.stats, newTotalSize, newRowCount)
+          if (newStats.isDefined) {
+            catalog.alterTableStats(tableIdentWithDB, newStats)
+          }
+        }
+      } catch {
+        case e: Exception =>
+          logError(s"Failed to analyze table: ${tbl.identifier}.", e)
+      }
+    }
+
+    Seq.empty[Row]
+  }
+}
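
run iterates every table in the database, skips anything that is not MANAGED or EXTERNAL (views, for instance), and only writes to the metastore when the freshly computed statistics differ from the recorded ones; a failure on one table is logged and the loop continues. A sketch of the observable effect, assuming a SparkSession named `spark` and an illustrative table `t1`:

// After the command, the catalog entry for each table carries statistics.
spark.range(10).write.saveAsTable("t1")
spark.sql("ANALYZE TABLES IN default COMPUTE STATISTICS")
spark.sql("DESC EXTENDED t1")
  .filter("col_name = 'Statistics'")
  .show(truncate = false)   // e.g. size in bytes and "10 rows"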

sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionSuite.scala

Lines changed: 20 additions & 0 deletions

@@ -671,4 +671,24 @@ class StatisticsCollectionSuite extends StatisticsCollectionTestBase with SharedSparkSession {
       }
     }
   }
+
+  test("analyze all tables in a specific database") {
+    withTempDir { dir =>
+      withTable("t1", "t2") {
+        spark.range(10).write.saveAsTable("t1")
+        sql(s"CREATE EXTERNAL TABLE t2 USING parquet LOCATION '${dir.toURI}' " +
+          "AS SELECT * FROM range(20)")
+        withView("v1") {
+          sql(s"CREATE VIEW v1 AS SELECT * FROM t1")
+          sql(s"ANALYZE TABLES IN default COMPUTE STATISTICS NOSCAN")
+          checkTableStats("t1", hasSizeInBytes = true, expectedRowCounts = None)
+          checkTableStats("t2", hasSizeInBytes = true, expectedRowCounts = None)
+
+          sql(s"ANALYZE TABLES COMPUTE STATISTICS")
+          checkTableStats("t1", hasSizeInBytes = true, expectedRowCounts = Some(10))
+          checkTableStats("t2", hasSizeInBytes = true, expectedRowCounts = Some(20))
+        }
+      }
+    }
+  }
 }
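
The view v1 exists to pin down the skip path: a view is neither MANAGED nor EXTERNAL, so the command leaves it alone while t1 and t2 gain stats. A sketch of checking that directly, assuming the same session:

import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.catalyst.catalog.CatalogTableType

// A view's table type keeps it out of the command's MANAGED/EXTERNAL filter.
val meta = spark.sessionState.catalog
  .getTableMetadata(TableIdentifier("v1", Some("default")))
assert(meta.tableType == CatalogTableType.VIEW)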
