[SPARK-40260][SQL] Use error classes in the compilation errors of GROUP BY a position

MaxGekk · MaxGekk · commit 296fe49ec855 · 2022-08-30T20:43:19.000+03:00
### What changes were proposed in this pull request? In the PR, I propose to add the following new error classes: - GROUP_BY_POS_OUT_OF_RANGE - GROUP_BY_POS_REFERS_AGG_EXPR and to migrate the 2 compilation exceptions related to GROUP BY a position onto them. ### Why are the changes needed? The migration onto error classes makes the errors searchable in the docs, and allows editing the error text messages without modifying the source code. ### Does this PR introduce _any_ user-facing change? Yes, in a sense, because it modifies user-facing error messages. ### How was this patch tested? By running the affected test suites: ``` $ build/sbt "sql/testOnly org.apache.spark.sql.SQLQueryTestSuite" $ build/sbt "core/testOnly *SparkThrowableSuite" ``` Closes #37712 from MaxGekk/group-ref-agg-error. Lead-authored-by: Max Gekk <max.gekk@gmail.com> Co-authored-by: Maxim Gekk <max.gekk@gmail.com> Signed-off-by: Max Gekk <max.gekk@gmail.com>
diff --git a/core/src/main/resources/error/error-classes.json b/core/src/main/resources/error/error-classes.json
@@ -136,6 +136,18 @@
       "Grouping sets size cannot be greater than <maxSize>"
     ]
   },
+  "GROUP_BY_POS_OUT_OF_RANGE" : {
+    "message" : [
+      "GROUP BY position <index> is not in select list (valid range is [1, <size>])."
+    ],
+    "sqlState" : "42000"
+  },
+  "GROUP_BY_POS_REFERS_AGG_EXPR" : {
+    "message" : [
+      "GROUP BY <index> refers to an expression <aggExpr> that contains an aggregate function. Aggregate functions are not allowed in GROUP BY."
+    ],
+    "sqlState" : "42000"
+  },
   "INCOMPARABLE_PIVOT_COLUMN" : {
     "message" : [
       "Invalid pivot column <columnName>. Pivot columns must be comparable."
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/AnalysisException.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/AnalysisException.scala
@@ -100,7 +100,7 @@ class AnalysisException protected[sql] (
       line = origin.line,
       startPosition = origin.startPosition,
       errorClass = Some(errorClass),
-      errorSubClass = Some(errorSubClass),
+      errorSubClass = Option(errorSubClass),
       messageParameters = messageParameters)
 
   def copy(
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala
@@ -366,14 +366,15 @@ private[sql] object QueryCompilationErrors extends QueryErrorsBase {
   def groupByPositionRefersToAggregateFunctionError(
       index: Int,
       expr: Expression): Throwable = {
-    new AnalysisException(s"GROUP BY $index refers to an expression that is or contains " +
-      "an aggregate function. Aggregate functions are not allowed in GROUP BY, " +
-      s"but got ${expr.sql}")
+    new AnalysisException(
+      errorClass = "GROUP_BY_POS_REFERS_AGG_EXPR",
+      messageParameters = Array(index.toString, expr.sql))
   }
 
   def groupByPositionRangeError(index: Int, size: Int): Throwable = {
-    new AnalysisException(s"GROUP BY position $index is not in select list " +
-      s"(valid range is [1, $size])")
+    new AnalysisException(
+      errorClass = "GROUP_BY_POS_OUT_OF_RANGE",
+      messageParameters = Array(index.toString, size.toString))
   }
 
   def generatorNotExpectedError(name: FunctionIdentifier, classCanonicalName: String): Throwable = {
diff --git a/sql/core/src/test/resources/sql-tests/results/group-by-ordinal.sql.out b/sql/core/src/test/resources/sql-tests/results/group-by-ordinal.sql.out
@@ -92,7 +92,14 @@ select a, b from data group by -1
 struct<>
 -- !query output
 org.apache.spark.sql.AnalysisException
-GROUP BY position -1 is not in select list (valid range is [1, 2]); line 1 pos 31
+{
+  "errorClass" : "GROUP_BY_POS_OUT_OF_RANGE",
+  "sqlState" : "42000",
+  "messageParameters" : {
+    "index" : "-1",
+    "size" : "2"
+  }
+}
 
 
 -- !query
@@ -101,7 +108,14 @@ select a, b from data group by 0
 struct<>
 -- !query output
 org.apache.spark.sql.AnalysisException
-GROUP BY position 0 is not in select list (valid range is [1, 2]); line 1 pos 31
+{
+  "errorClass" : "GROUP_BY_POS_OUT_OF_RANGE",
+  "sqlState" : "42000",
+  "messageParameters" : {
+    "index" : "0",
+    "size" : "2"
+  }
+}
 
 
 -- !query
@@ -110,7 +124,14 @@ select a, b from data group by 3
 struct<>
 -- !query output
 org.apache.spark.sql.AnalysisException
-GROUP BY position 3 is not in select list (valid range is [1, 2]); line 1 pos 31
+{
+  "errorClass" : "GROUP_BY_POS_OUT_OF_RANGE",
+  "sqlState" : "42000",
+  "messageParameters" : {
+    "index" : "3",
+    "size" : "2"
+  }
+}
 
 
 -- !query
@@ -119,7 +140,14 @@ select a, b, sum(b) from data group by 3
 struct<>
 -- !query output
 org.apache.spark.sql.AnalysisException
-GROUP BY 3 refers to an expression that is or contains an aggregate function. Aggregate functions are not allowed in GROUP BY, but got sum(data.b) AS `sum(b)`; line 1 pos 39
+{
+  "errorClass" : "GROUP_BY_POS_REFERS_AGG_EXPR",
+  "sqlState" : "42000",
+  "messageParameters" : {
+    "index" : "3",
+    "aggExpr" : "sum(data.b) AS `sum(b)`"
+  }
+}
 
 
 -- !query
@@ -128,7 +156,14 @@ select a, b, sum(b) + 2 from data group by 3
 struct<>
 -- !query output
 org.apache.spark.sql.AnalysisException
-GROUP BY 3 refers to an expression that is or contains an aggregate function. Aggregate functions are not allowed in GROUP BY, but got (sum(data.b) + CAST(2 AS BIGINT)) AS `(sum(b) + 2)`; line 1 pos 43
+{
+  "errorClass" : "GROUP_BY_POS_REFERS_AGG_EXPR",
+  "sqlState" : "42000",
+  "messageParameters" : {
+    "index" : "3",
+    "aggExpr" : "(sum(data.b) + CAST(2 AS BIGINT)) AS `(sum(b) + 2)`"
+  }
+}
 
 
 -- !query
@@ -349,7 +384,14 @@ select a, b, count(1) from data group by a, -1
 struct<>
 -- !query output
 org.apache.spark.sql.AnalysisException
-GROUP BY position -1 is not in select list (valid range is [1, 3]); line 1 pos 44
+{
+  "errorClass" : "GROUP_BY_POS_OUT_OF_RANGE",
+  "sqlState" : "42000",
+  "messageParameters" : {
+    "index" : "-1",
+    "size" : "3"
+  }
+}
 
 
 -- !query
@@ -358,7 +400,14 @@ select a, b, count(1) from data group by a, 3
 struct<>
 -- !query output
 org.apache.spark.sql.AnalysisException
-GROUP BY 3 refers to an expression that is or contains an aggregate function. Aggregate functions are not allowed in GROUP BY, but got count(1) AS `count(1)`; line 1 pos 44
+{
+  "errorClass" : "GROUP_BY_POS_REFERS_AGG_EXPR",
+  "sqlState" : "42000",
+  "messageParameters" : {
+    "index" : "3",
+    "aggExpr" : "count(1) AS `count(1)`"
+  }
+}
 
 
 -- !query
@@ -367,7 +416,14 @@ select a, b, count(1) from data group by cube(-1, 2)
 struct<>
 -- !query output
 org.apache.spark.sql.AnalysisException
-GROUP BY position -1 is not in select list (valid range is [1, 3]); line 1 pos 46
+{
+  "errorClass" : "GROUP_BY_POS_OUT_OF_RANGE",
+  "sqlState" : "42000",
+  "messageParameters" : {
+    "index" : "-1",
+    "size" : "3"
+  }
+}
 
 
 -- !query
@@ -376,7 +432,14 @@ select a, b, count(1) from data group by cube(1, 3)
 struct<>
 -- !query output
 org.apache.spark.sql.AnalysisException
-GROUP BY 3 refers to an expression that is or contains an aggregate function. Aggregate functions are not allowed in GROUP BY, but got count(1) AS `count(1)`; line 1 pos 49
+{
+  "errorClass" : "GROUP_BY_POS_REFERS_AGG_EXPR",
+  "sqlState" : "42000",
+  "messageParameters" : {
+    "index" : "3",
+    "aggExpr" : "count(1) AS `count(1)`"
+  }
+}
 
 
 -- !query
diff --git a/sql/core/src/test/resources/sql-tests/results/postgreSQL/select_implicit.sql.out b/sql/core/src/test/resources/sql-tests/results/postgreSQL/select_implicit.sql.out
@@ -207,7 +207,14 @@ SELECT c, count(*) FROM test_missing_target GROUP BY 3
 struct<>
 -- !query output
 org.apache.spark.sql.AnalysisException
-GROUP BY position 3 is not in select list (valid range is [1, 2]); line 1 pos 53
+{
+  "errorClass" : "GROUP_BY_POS_OUT_OF_RANGE",
+  "sqlState" : "42000",
+  "messageParameters" : {
+    "index" : "3",
+    "size" : "2"
+  }
+}
 
 
 -- !query
diff --git a/sql/core/src/test/resources/sql-tests/results/udf/postgreSQL/udf-select_implicit.sql.out b/sql/core/src/test/resources/sql-tests/results/udf/postgreSQL/udf-select_implicit.sql.out
@@ -210,7 +210,14 @@ SELECT udf(c), udf(count(*)) FROM test_missing_target GROUP BY 3
 struct<>
 -- !query output
 org.apache.spark.sql.AnalysisException
-GROUP BY position 3 is not in select list (valid range is [1, 2]); line 1 pos 63
+{
+  "errorClass" : "GROUP_BY_POS_OUT_OF_RANGE",
+  "sqlState" : "42000",
+  "messageParameters" : {
+    "index" : "3",
+    "size" : "2"
+  }
+}
 
 
 -- !query