apache · cloud-fan · Mar 3, 2020 · HyukjinKwon · Mar 4, 2020 · HyukjinKwon
diff --git a/docs/sql-migration-guide.md b/docs/sql-migration-guide.md
@@ -341,6 +341,8 @@ license: |
 
     - The decimal string representation can be different between Hive 1.2 and Hive 2.3 when using `TRANSFORM` operator in SQL for script transformation, which depends on hive's behavior. In Hive 1.2, the string representation omits trailing zeroes. But in Hive 2.3, it is always padded to 18 digits with trailing zeroes if necessary.
 
+  - Since Spark 3.0, group by alias fails if there are name conflicts like `SELECT col + 1 as col FROM t GROUP BY col`. In Spark version 2.4 and earlier, it works and the column will be resolved using child output. To restore the previous behaviour, set `spark.sql.legacy.allowAmbiguousGroupByAlias` to `true`.
+
 ## Upgrading from Spark SQL 2.4.4 to 2.4.5
 
   - Since Spark 2.4.5, `TRUNCATE TABLE` command tries to set back original permission and ACLs during re-creating the table/partition paths. To restore the behaviour of earlier versions, set `spark.sql.truncateTable.ignorePermissionAcl.enabled` to `true`.

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
@@ -1636,9 +1636,33 @@ class Analyzer(
 
     override def apply(plan: LogicalPlan): LogicalPlan = plan.resolveOperatorsUp {
       case agg @ Aggregate(groups, aggs, child)
-          if conf.groupByAliases && child.resolved && aggs.forall(_.resolved) &&
-            groups.exists(!_.resolved) =>
-        agg.copy(groupingExpressions = mayResolveAttrByAggregateExprs(groups, aggs, child))
+          if conf.groupByAliases && child.resolved && aggs.forall(_.resolved) =>
+        if (groups.exists(!_.resolved)) {
+          agg.copy(groupingExpressions = mayResolveAttrByAggregateExprs(groups, aggs, child))
+        } else {
+          // If we can resolve the GROUP BY column with both child output or aliased column in
+          // the SELECT clause, we should fail as it's ambiguous.
+          // Note: this can't be done in `CheckAnalysis`, as the alias in the grouping expressions
+          //       will be removed at that time.
+          if (!SQLConf.get.getConf(SQLConf.LEGACY_ALLOW_AMBIGUOUS_GROUP_BY_ALIAS)) {
+            groups.foreach {
+              case attr: Attribute =>
+                val resolvedWithAlias = aggs.find {
+                  case a: Alias => resolver(a.name, attr.name)
+                  case _ => false
+                }
+                if (resolvedWithAlias.isDefined) {
+                  throw new AnalysisException(s"GROUP BY column ${attr.name} is ambiguous. It " +
+                    "can refer to a column in the child plan, or an aliased column in the " +
+                    "SELECT clause. Please fix the name conflicts, or set " +
+                    s"${SQLConf.LEGACY_ALLOW_AMBIGUOUS_GROUP_BY_ALIAS.key} to true to resolve " +
+                    "the column with the child plan.")
+                }
+              case _ =>
+            }
+          }
+          agg
+        }
 
       case gs @ GroupingSets(selectedGroups, groups, child, aggs)
           if conf.groupByAliases && child.resolved && aggs.forall(_.resolved) &&

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
@@ -2453,6 +2453,15 @@ object SQLConf {
     .booleanConf
     .createWithDefault(false)
 
+  val LEGACY_ALLOW_AMBIGUOUS_GROUP_BY_ALIAS =
+    buildConf("spark.sql.legacy.allowAmbiguousGroupByAlias")
+      .doc(s"When ${GROUP_BY_ALIASES.key} is enabled and this conf is true, Spark will resolve " +
+        "the GROUP BY column using child's output, even though there is an ambiguous alias in " +
+        "the SELECT clause. Id false, Spark fails the query.")
+      .version("3.0.0")
+      .booleanConf
+      .createWithDefault(false)
+
   /**
    * Holds information about keys that have been deprecated.
    *

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
@@ -3454,6 +3454,24 @@ class SQLQuerySuite extends QueryTest with SharedSparkSession with AdaptiveSpark
       """.stripMargin)
     checkAnswer(df2, Row(1) :: Nil)
   }
+
+  test("SPARK-31022: group by alias should fail if there are name conflicts") {
+    withTempView("v") {
+      spark.range(10).toDF("col").createTempView("v")
+      checkAnswer(sql("SELECT div(col, 10) AS new_col FROM v GROUP BY new_col"), Row(0))
+
+      val e = intercept[AnalysisException] {
+        sql("SELECT div(col, 10) AS col FROM v GROUP BY col")
+      }
+      assert(e.message.contains("ambiguous"))
+
+      withSQLConf(SQLConf.LEGACY_ALLOW_AMBIGUOUS_GROUP_BY_ALIAS.key -> "true") {
+        checkAnswer(
+          sql("SELECT div(col, 10) AS col FROM v GROUP BY col"),
+          spark.range(10).select(lit(0).as("col")))
+      }
+    }
+  }
 }
 
 case class Foo(bar: Option[String])