diff --git a/docs/sql-migration-guide.md b/docs/sql-migration-guide.md index a9a39ceca877..dae1bf0f55ad 100644 --- a/docs/sql-migration-guide.md +++ b/docs/sql-migration-guide.md @@ -341,6 +341,8 @@ license: | - The decimal string representation can be different between Hive 1.2 and Hive 2.3 when using `TRANSFORM` operator in SQL for script transformation, which depends on hive's behavior. In Hive 1.2, the string representation omits trailing zeroes. But in Hive 2.3, it is always padded to 18 digits with trailing zeroes if necessary. + - Since Spark 3.0, group by alias fails if there are name conflicts like `SELECT col + 1 as col FROM t GROUP BY col`. In Spark version 2.4 and earlier, it works and the column will be resolved using child output. To restore the previous behaviour, set `spark.sql.legacy.allowAmbiguousGroupByAlias` to `true`. + ## Upgrading from Spark SQL 2.4.4 to 2.4.5 - Since Spark 2.4.5, `TRUNCATE TABLE` command tries to set back original permission and ACLs during re-creating the table/partition paths. To restore the behaviour of earlier versions, set `spark.sql.truncateTable.ignorePermissionAcl.enabled` to `true`. diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index 486b9525a0e6..efe590a700e4 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -1636,9 +1636,33 @@ class Analyzer( override def apply(plan: LogicalPlan): LogicalPlan = plan.resolveOperatorsUp { case agg @ Aggregate(groups, aggs, child) - if conf.groupByAliases && child.resolved && aggs.forall(_.resolved) && - groups.exists(!_.resolved) => - agg.copy(groupingExpressions = mayResolveAttrByAggregateExprs(groups, aggs, child)) + if conf.groupByAliases && child.resolved && aggs.forall(_.resolved) => + if (groups.exists(!_.resolved)) { + agg.copy(groupingExpressions = mayResolveAttrByAggregateExprs(groups, aggs, child)) + } else { + // If we can resolve the GROUP BY column with both child output or aliased column in + // the SELECT clause, we should fail as it's ambiguous. + // Note: this can't be done in `CheckAnalysis`, as the alias in the grouping expressions + // will be removed at that time. + if (!SQLConf.get.getConf(SQLConf.LEGACY_ALLOW_AMBIGUOUS_GROUP_BY_ALIAS)) { + groups.foreach { + case attr: Attribute => + val resolvedWithAlias = aggs.find { + case a: Alias => resolver(a.name, attr.name) + case _ => false + } + if (resolvedWithAlias.isDefined) { + throw new AnalysisException(s"GROUP BY column ${attr.name} is ambiguous. It " + + "can refer to a column in the child plan, or an aliased column in the " + + "SELECT clause. Please fix the name conflicts, or set " + + s"${SQLConf.LEGACY_ALLOW_AMBIGUOUS_GROUP_BY_ALIAS.key} to true to resolve " + + "the column with the child plan.") + } + case _ => + } + } + agg + } case gs @ GroupingSets(selectedGroups, groups, child, aggs) if conf.groupByAliases && child.resolved && aggs.forall(_.resolved) && diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 7f55f2272bfb..0816d8c1f56d 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -2453,6 +2453,15 @@ object SQLConf { .booleanConf .createWithDefault(false) + val LEGACY_ALLOW_AMBIGUOUS_GROUP_BY_ALIAS = + buildConf("spark.sql.legacy.allowAmbiguousGroupByAlias") + .doc(s"When ${GROUP_BY_ALIASES.key} is enabled and this conf is true, Spark will resolve " + + "the GROUP BY column using child's output, even though there is an ambiguous alias in " + + "the SELECT clause. Id false, Spark fails the query.") + .version("3.0.0") + .booleanConf + .createWithDefault(false) + /** * Holds information about keys that have been deprecated. * diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala index 563b4d142739..18b73c7f4b6e 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala @@ -3454,6 +3454,24 @@ class SQLQuerySuite extends QueryTest with SharedSparkSession with AdaptiveSpark """.stripMargin) checkAnswer(df2, Row(1) :: Nil) } + + test("SPARK-31022: group by alias should fail if there are name conflicts") { + withTempView("v") { + spark.range(10).toDF("col").createTempView("v") + checkAnswer(sql("SELECT div(col, 10) AS new_col FROM v GROUP BY new_col"), Row(0)) + + val e = intercept[AnalysisException] { + sql("SELECT div(col, 10) AS col FROM v GROUP BY col") + } + assert(e.message.contains("ambiguous")) + + withSQLConf(SQLConf.LEGACY_ALLOW_AMBIGUOUS_GROUP_BY_ALIAS.key -> "true") { + checkAnswer( + sql("SELECT div(col, 10) AS col FROM v GROUP BY col"), + spark.range(10).select(lit(0).as("col"))) + } + } + } } case class Foo(bar: Option[String])