Skip to content

Commit 5b8e848

Browse files
dongjoon-hyunliancheng
authored andcommitted
[SPARK-16621][SQL] Generate stable SQLs in SQLBuilder
## What changes were proposed in this pull request? Currently, the generated SQLs have not-stable IDs for generated attributes. The stable generated SQL will give more benefit for understanding or testing the queries. This PR provides stable SQL generation by the followings. - Provide unique ids for generated subqueries, `gen_subquery_xxx`. - Provide unique and stable ids for generated attributes, `gen_attr_xxx`. **Before** ```scala scala> new org.apache.spark.sql.catalyst.SQLBuilder(sql("select 1")).toSQL res0: String = SELECT `gen_attr_0` AS `1` FROM (SELECT 1 AS `gen_attr_0`) AS gen_subquery_0 scala> new org.apache.spark.sql.catalyst.SQLBuilder(sql("select 1")).toSQL res1: String = SELECT `gen_attr_4` AS `1` FROM (SELECT 1 AS `gen_attr_4`) AS gen_subquery_0 ``` **After** ```scala scala> new org.apache.spark.sql.catalyst.SQLBuilder(sql("select 1")).toSQL res1: String = SELECT `gen_attr_0` AS `1` FROM (SELECT 1 AS `gen_attr_0`) AS gen_subquery_0 scala> new org.apache.spark.sql.catalyst.SQLBuilder(sql("select 1")).toSQL res2: String = SELECT `gen_attr_0` AS `1` FROM (SELECT 1 AS `gen_attr_0`) AS gen_subquery_0 ``` ## How was this patch tested? Pass the existing Jenkins tests. Author: Dongjoon Hyun <[email protected]> Closes #14257 from dongjoon-hyun/SPARK-16621.
1 parent 738b4cc commit 5b8e848

File tree

105 files changed

+124
-114
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

105 files changed

+124
-114
lines changed

sql/core/src/main/scala/org/apache/spark/sql/catalyst/SQLBuilder.scala

Lines changed: 18 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ package org.apache.spark.sql.catalyst
1919

2020
import java.util.concurrent.atomic.AtomicLong
2121

22+
import scala.collection.mutable.Map
2223
import scala.util.control.NonFatal
2324

2425
import org.apache.spark.internal.Logging
@@ -38,14 +39,23 @@ import org.apache.spark.sql.types.{ByteType, DataType, IntegerType, NullType}
3839
* representations (e.g. logical plans that operate on local Scala collections), or are simply not
3940
* supported by this builder (yet).
4041
*/
41-
class SQLBuilder(logicalPlan: LogicalPlan) extends Logging {
42+
class SQLBuilder private (
43+
logicalPlan: LogicalPlan,
44+
nextSubqueryId: AtomicLong,
45+
nextGenAttrId: AtomicLong,
46+
exprIdMap: Map[Long, Long]) extends Logging {
4247
require(logicalPlan.resolved,
4348
"SQLBuilder only supports resolved logical query plans. Current plan:\n" + logicalPlan)
4449

50+
def this(logicalPlan: LogicalPlan) =
51+
this(logicalPlan, new AtomicLong(0), new AtomicLong(0), Map.empty[Long, Long])
52+
4553
def this(df: Dataset[_]) = this(df.queryExecution.analyzed)
4654

47-
private val nextSubqueryId = new AtomicLong(0)
4855
private def newSubqueryName(): String = s"gen_subquery_${nextSubqueryId.getAndIncrement()}"
56+
private def normalizedName(n: NamedExpression): String = synchronized {
57+
"gen_attr_" + exprIdMap.getOrElseUpdate(n.exprId.id, nextGenAttrId.getAndIncrement())
58+
}
4959

5060
def toSQL: String = {
5161
val canonicalizedPlan = Canonicalizer.execute(logicalPlan)
@@ -70,7 +80,7 @@ class SQLBuilder(logicalPlan: LogicalPlan) extends Logging {
7080
try {
7181
val replaced = finalPlan.transformAllExpressions {
7282
case s: SubqueryExpression =>
73-
val query = new SQLBuilder(s.query).toSQL
83+
val query = new SQLBuilder(s.query, nextSubqueryId, nextGenAttrId, exprIdMap).toSQL
7484
val sql = s match {
7585
case _: ListQuery => query
7686
case _: Exists => s"EXISTS($query)"
@@ -169,6 +179,11 @@ class SQLBuilder(logicalPlan: LogicalPlan) extends Logging {
169179
qualifiedName + " TABLESAMPLE(" + fraction + " PERCENT)"
170180
}.getOrElse(qualifiedName)
171181

182+
case relation: CatalogRelation =>
183+
val m = relation.catalogTable
184+
val qualifiedName = s"${quoteIdentifier(m.database)}.${quoteIdentifier(m.identifier.table)}"
185+
qualifiedName
186+
172187
case Sort(orders, _, RepartitionByExpression(partitionExprs, child, _))
173188
if orders.map(_.child) == partitionExprs =>
174189
build(toSQL(child), "CLUSTER BY", partitionExprs.map(_.sql).mkString(", "))
@@ -376,8 +391,6 @@ class SQLBuilder(logicalPlan: LogicalPlan) extends Logging {
376391
)
377392
}
378393

379-
private def normalizedName(n: NamedExpression): String = "gen_attr_" + n.exprId.id
380-
381394
object Canonicalizer extends RuleExecutor[LogicalPlan] {
382395
override protected def batches: Seq[Batch] = Seq(
383396
Batch("Prepare", FixedPoint(100),
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
-- This file is automatically generated by LogicalPlanToSQLSuite.
22
SELECT COUNT(value) FROM parquet_t1 GROUP BY key HAVING MAX(key) > 0
33
--------------------------------------------------------------------------------
4-
SELECT `gen_attr` AS `count(value)` FROM (SELECT `gen_attr` FROM (SELECT count(`gen_attr`) AS `gen_attr`, max(`gen_attr`) AS `gen_attr` FROM (SELECT `key` AS `gen_attr`, `value` AS `gen_attr` FROM `default`.`parquet_t1`) AS gen_subquery_0 GROUP BY `gen_attr` HAVING (`gen_attr` > CAST(0 AS BIGINT))) AS gen_subquery_1) AS gen_subquery_2
4+
SELECT `gen_attr_0` AS `count(value)` FROM (SELECT `gen_attr_0` FROM (SELECT count(`gen_attr_3`) AS `gen_attr_0`, max(`gen_attr_2`) AS `gen_attr_1` FROM (SELECT `key` AS `gen_attr_2`, `value` AS `gen_attr_3` FROM `default`.`parquet_t1`) AS gen_subquery_0 GROUP BY `gen_attr_2` HAVING (`gen_attr_1` > CAST(0 AS BIGINT))) AS gen_subquery_1) AS gen_subquery_2
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
-- This file is automatically generated by LogicalPlanToSQLSuite.
22
SELECT COUNT(value) FROM parquet_t1 GROUP BY key ORDER BY MAX(key)
33
--------------------------------------------------------------------------------
4-
SELECT `gen_attr` AS `count(value)` FROM (SELECT `gen_attr` FROM (SELECT count(`gen_attr`) AS `gen_attr`, max(`gen_attr`) AS `gen_attr` FROM (SELECT `key` AS `gen_attr`, `value` AS `gen_attr` FROM `default`.`parquet_t1`) AS gen_subquery_0 GROUP BY `gen_attr` ORDER BY `gen_attr` ASC) AS gen_subquery_1) AS gen_subquery_2
4+
SELECT `gen_attr_0` AS `count(value)` FROM (SELECT `gen_attr_0` FROM (SELECT count(`gen_attr_3`) AS `gen_attr_0`, max(`gen_attr_2`) AS `gen_attr_1` FROM (SELECT `key` AS `gen_attr_2`, `value` AS `gen_attr_3` FROM `default`.`parquet_t1`) AS gen_subquery_0 GROUP BY `gen_attr_2` ORDER BY `gen_attr_1` ASC) AS gen_subquery_1) AS gen_subquery_2
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
-- This file is automatically generated by LogicalPlanToSQLSuite.
22
SELECT COUNT(value) FROM parquet_t1 GROUP BY key ORDER BY key, MAX(key)
33
--------------------------------------------------------------------------------
4-
SELECT `gen_attr` AS `count(value)` FROM (SELECT `gen_attr` FROM (SELECT count(`gen_attr`) AS `gen_attr`, `gen_attr` AS `gen_attr`, max(`gen_attr`) AS `gen_attr` FROM (SELECT `key` AS `gen_attr`, `value` AS `gen_attr` FROM `default`.`parquet_t1`) AS gen_subquery_0 GROUP BY `gen_attr` ORDER BY `gen_attr` ASC, `gen_attr` ASC) AS gen_subquery_1) AS gen_subquery_2
4+
SELECT `gen_attr_0` AS `count(value)` FROM (SELECT `gen_attr_0` FROM (SELECT count(`gen_attr_4`) AS `gen_attr_0`, `gen_attr_3` AS `gen_attr_1`, max(`gen_attr_3`) AS `gen_attr_2` FROM (SELECT `key` AS `gen_attr_3`, `value` AS `gen_attr_4` FROM `default`.`parquet_t1`) AS gen_subquery_0 GROUP BY `gen_attr_3` ORDER BY `gen_attr_1` ASC, `gen_attr_2` ASC) AS gen_subquery_1) AS gen_subquery_2
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
-- This file is automatically generated by LogicalPlanToSQLSuite.
22
SELECT MAX(c) + COUNT(a) OVER () FROM parquet_t2 GROUP BY a, b
33
--------------------------------------------------------------------------------
4-
SELECT `gen_attr` AS `(max(c) + count(a) OVER (ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING))` FROM (SELECT (`gen_attr` + `gen_attr`) AS `gen_attr` FROM (SELECT gen_subquery_1.`gen_attr`, gen_subquery_1.`gen_attr`, count(`gen_attr`) OVER (ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS `gen_attr` FROM (SELECT max(`gen_attr`) AS `gen_attr`, `gen_attr` FROM (SELECT `a` AS `gen_attr`, `b` AS `gen_attr`, `c` AS `gen_attr`, `d` AS `gen_attr` FROM `default`.`parquet_t2`) AS gen_subquery_0 GROUP BY `gen_attr`, `gen_attr`) AS gen_subquery_1) AS gen_subquery_2) AS gen_subquery_3
4+
SELECT `gen_attr_0` AS `(max(c) + count(a) OVER (ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING))` FROM (SELECT (`gen_attr_1` + `gen_attr_2`) AS `gen_attr_0` FROM (SELECT gen_subquery_1.`gen_attr_1`, gen_subquery_1.`gen_attr_3`, count(`gen_attr_3`) OVER (ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS `gen_attr_2` FROM (SELECT max(`gen_attr_5`) AS `gen_attr_1`, `gen_attr_3` FROM (SELECT `a` AS `gen_attr_3`, `b` AS `gen_attr_4`, `c` AS `gen_attr_5`, `d` AS `gen_attr_6` FROM `default`.`parquet_t2`) AS gen_subquery_0 GROUP BY `gen_attr_3`, `gen_attr_4`) AS gen_subquery_1) AS gen_subquery_2) AS gen_subquery_3
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
-- This file is automatically generated by LogicalPlanToSQLSuite.
22
SELECT CASE WHEN id % 2 > 0 THEN 0 WHEN id % 2 = 0 THEN 1 END FROM parquet_t0
33
--------------------------------------------------------------------------------
4-
SELECT `gen_attr` AS `CASE WHEN ((id % CAST(2 AS BIGINT)) > CAST(0 AS BIGINT)) THEN 0 WHEN ((id % CAST(2 AS BIGINT)) = CAST(0 AS BIGINT)) THEN 1 END` FROM (SELECT CASE WHEN ((`gen_attr` % CAST(2 AS BIGINT)) > CAST(0 AS BIGINT)) THEN 0 WHEN ((`gen_attr` % CAST(2 AS BIGINT)) = CAST(0 AS BIGINT)) THEN 1 END AS `gen_attr` FROM (SELECT `id` AS `gen_attr` FROM `default`.`parquet_t0`) AS gen_subquery_0) AS gen_subquery_1
4+
SELECT `gen_attr_0` AS `CASE WHEN ((id % CAST(2 AS BIGINT)) > CAST(0 AS BIGINT)) THEN 0 WHEN ((id % CAST(2 AS BIGINT)) = CAST(0 AS BIGINT)) THEN 1 END` FROM (SELECT CASE WHEN ((`gen_attr_1` % CAST(2 AS BIGINT)) > CAST(0 AS BIGINT)) THEN 0 WHEN ((`gen_attr_1` % CAST(2 AS BIGINT)) = CAST(0 AS BIGINT)) THEN 1 END AS `gen_attr_0` FROM (SELECT `id` AS `gen_attr_1` FROM `default`.`parquet_t0`) AS gen_subquery_0) AS gen_subquery_1
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
-- This file is automatically generated by LogicalPlanToSQLSuite.
22
SELECT CASE WHEN id % 2 > 0 THEN 0 ELSE 1 END FROM parquet_t0
33
--------------------------------------------------------------------------------
4-
SELECT `gen_attr` AS `CASE WHEN ((id % CAST(2 AS BIGINT)) > CAST(0 AS BIGINT)) THEN 0 ELSE 1 END` FROM (SELECT CASE WHEN ((`gen_attr` % CAST(2 AS BIGINT)) > CAST(0 AS BIGINT)) THEN 0 ELSE 1 END AS `gen_attr` FROM (SELECT `id` AS `gen_attr` FROM `default`.`parquet_t0`) AS gen_subquery_0) AS gen_subquery_1
4+
SELECT `gen_attr_0` AS `CASE WHEN ((id % CAST(2 AS BIGINT)) > CAST(0 AS BIGINT)) THEN 0 ELSE 1 END` FROM (SELECT CASE WHEN ((`gen_attr_1` % CAST(2 AS BIGINT)) > CAST(0 AS BIGINT)) THEN 0 ELSE 1 END AS `gen_attr_0` FROM (SELECT `id` AS `gen_attr_1` FROM `default`.`parquet_t0`) AS gen_subquery_0) AS gen_subquery_1
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
-- This file is automatically generated by LogicalPlanToSQLSuite.
22
SELECT CASE id WHEN 0 THEN 'foo' WHEN 1 THEN 'bar' END FROM parquet_t0
33
--------------------------------------------------------------------------------
4-
SELECT `gen_attr` AS `CASE WHEN (id = CAST(0 AS BIGINT)) THEN foo WHEN (id = CAST(1 AS BIGINT)) THEN bar END` FROM (SELECT CASE WHEN (`gen_attr` = CAST(0 AS BIGINT)) THEN "foo" WHEN (`gen_attr` = CAST(1 AS BIGINT)) THEN "bar" END AS `gen_attr` FROM (SELECT `id` AS `gen_attr` FROM `default`.`parquet_t0`) AS gen_subquery_0) AS gen_subquery_1
4+
SELECT `gen_attr_0` AS `CASE WHEN (id = CAST(0 AS BIGINT)) THEN foo WHEN (id = CAST(1 AS BIGINT)) THEN bar END` FROM (SELECT CASE WHEN (`gen_attr_1` = CAST(0 AS BIGINT)) THEN "foo" WHEN (`gen_attr_1` = CAST(1 AS BIGINT)) THEN "bar" END AS `gen_attr_0` FROM (SELECT `id` AS `gen_attr_1` FROM `default`.`parquet_t0`) AS gen_subquery_0) AS gen_subquery_1
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
-- This file is automatically generated by LogicalPlanToSQLSuite.
22
SELECT CASE id WHEN 0 THEN 'foo' WHEN 1 THEN 'bar' ELSE 'baz' END FROM parquet_t0
33
--------------------------------------------------------------------------------
4-
SELECT `gen_attr` AS `CASE WHEN (id = CAST(0 AS BIGINT)) THEN foo WHEN (id = CAST(1 AS BIGINT)) THEN bar ELSE baz END` FROM (SELECT CASE WHEN (`gen_attr` = CAST(0 AS BIGINT)) THEN "foo" WHEN (`gen_attr` = CAST(1 AS BIGINT)) THEN "bar" ELSE "baz" END AS `gen_attr` FROM (SELECT `id` AS `gen_attr` FROM `default`.`parquet_t0`) AS gen_subquery_0) AS gen_subquery_1
4+
SELECT `gen_attr_0` AS `CASE WHEN (id = CAST(0 AS BIGINT)) THEN foo WHEN (id = CAST(1 AS BIGINT)) THEN bar ELSE baz END` FROM (SELECT CASE WHEN (`gen_attr_1` = CAST(0 AS BIGINT)) THEN "foo" WHEN (`gen_attr_1` = CAST(1 AS BIGINT)) THEN "bar" ELSE "baz" END AS `gen_attr_0` FROM (SELECT `id` AS `gen_attr_1` FROM `default`.`parquet_t0`) AS gen_subquery_0) AS gen_subquery_1
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
-- This file is automatically generated by LogicalPlanToSQLSuite.
22
SELECT id FROM parquet_t0 CLUSTER BY id
33
--------------------------------------------------------------------------------
4-
SELECT `gen_attr` AS `id` FROM (SELECT `gen_attr` FROM (SELECT `id` AS `gen_attr` FROM `default`.`parquet_t0`) AS gen_subquery_0 CLUSTER BY `gen_attr`) AS parquet_t0
4+
SELECT `gen_attr_0` AS `id` FROM (SELECT `gen_attr_0` FROM (SELECT `id` AS `gen_attr_0` FROM `default`.`parquet_t0`) AS gen_subquery_0 CLUSTER BY `gen_attr_0`) AS parquet_t0

0 commit comments

Comments
 (0)