apache · dilipbiswal · Jul 19, 2018 · Jul 19, 2018 · Jul 19, 2018 · Jul 19, 2018
diff --git a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4
@@ -406,6 +406,7 @@ aggregation
       WITH kind=ROLLUP
     | WITH kind=CUBE
     | kind=GROUPING SETS '(' groupingSet (',' groupingSet)* ')')?
+    | GROUP BY kind=GROUPING SETS '(' groupingSet (',' groupingSet)* ')'
     ;
 
 groupingSet

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
@@ -442,17 +442,35 @@ class Analyzer(
         child: LogicalPlan): LogicalPlan = {
       val gid = AttributeReference(VirtualColumn.groupingIdName, IntegerType, false)()
 
+      // In case of ANSI-SQL compliant syntax for GROUPING SETS, groupByExprs is optional and
+      // can be null. In such case, we derive the groupByExprs from the user supplied values for
+      // grouping sets.
+      val finalGroupByExpressions = if (groupByExprs == Nil) {
+        selectedGroupByExprs.flatten.foldLeft(Seq.empty[Expression]) { (result, currentExpr) =>
+          // Only unique expressions are included in the group by expressions and is determined
+          // based on their semantic equality. Example. grouping sets ((a * b), (b * a)) results
+          // in grouping expression (a * b)
+          if (result.find(_.semanticEquals(currentExpr)).isDefined) {
+            result
+          } else {
+            result :+ currentExpr
+          }
+        }
+      } else {
+        groupByExprs
+      }
+
       // Expand works by setting grouping expressions to null as determined by the
       // `selectedGroupByExprs`. To prevent these null values from being used in an aggregate
       // instead of the original value we need to create new aliases for all group by expressions
       // that will only be used for the intended purpose.
-      val groupByAliases = constructGroupByAlias(groupByExprs)
+      val groupByAliases = constructGroupByAlias(finalGroupByExpressions)
 
       val expand = constructExpand(selectedGroupByExprs, child, groupByAliases, gid)
       val groupingAttrs = expand.output.drop(child.output.length)
 
       val aggregations = constructAggregateExprs(
-        groupByExprs, aggregationExprs, groupByAliases, groupingAttrs, gid)
+        finalGroupByExpressions, aggregationExprs, groupByAliases, groupingAttrs, gid)
 
       Aggregate(groupingAttrs, aggregations, expand)
     }

diff --git a/...src/test/scala/org/apache/spark/sql/catalyst/analysis/ResolveGroupingAnalyticsSuite.scala b/...src/test/scala/org/apache/spark/sql/catalyst/analysis/ResolveGroupingAnalyticsSuite.scala
@@ -91,6 +91,34 @@ class ResolveGroupingAnalyticsSuite extends AnalysisTest {
     assertAnalysisError(originalPlan3, Seq("doesn't show up in the GROUP BY list"))
   }
 
+  test("grouping sets with no explicit group by expressions") {
+    val originalPlan = GroupingSets(Seq(Seq(), Seq(unresolved_a), Seq(unresolved_a, unresolved_b)),
+      Nil, r1,
+      Seq(unresolved_a, unresolved_b, UnresolvedAlias(count(unresolved_c))))
+    val expected = Aggregate(Seq(a, b, gid), Seq(a, b, count(c).as("count(c)")),
+      Expand(
+        Seq(Seq(a, b, c, nulInt, nulStr, 3), Seq(a, b, c, a, nulStr, 1), Seq(a, b, c, a, b, 0)),
+        Seq(a, b, c, a, b, gid),
+        Project(Seq(a, b, c, a.as("a"), b.as("b")), r1)))
+    checkAnalysis(originalPlan, expected)
+
+    // Computation of grouping expression should remove duplicate expression based on their
+    // semantics (semanticEqual).
+    val originalPlan2 = GroupingSets(Seq(Seq(Multiply(unresolved_a, Literal(2))),
+      Seq(Multiply(Literal(2), unresolved_a), unresolved_b)), Nil, r1,
+      Seq(UnresolvedAlias(Multiply(unresolved_a, Literal(2))),
+        unresolved_b, UnresolvedAlias(count(unresolved_c))))
+
+    val resultPlan = getAnalyzer(true).executeAndCheck(originalPlan2)
+    val gExpressions = resultPlan.asInstanceOf[Aggregate].groupingExpressions
+    assert(gExpressions.size == 3)
+    val firstGroupingExprAttrName =
+      gExpressions(0).asInstanceOf[AttributeReference].name.replaceAll("#[0-9]*", "#0")
+    assert(firstGroupingExprAttrName == "(a#0 * 2)")
+    assert(gExpressions(1).asInstanceOf[AttributeReference].name == "b")
+    assert(gExpressions(2).asInstanceOf[AttributeReference].name == VirtualColumn.groupingIdName)
+  }
+
   test("cube") {
     val originalPlan = Aggregate(Seq(Cube(Seq(unresolved_a, unresolved_b))),
       Seq(unresolved_a, unresolved_b, UnresolvedAlias(count(unresolved_c))), r1)

diff --git a/sql/core/src/test/resources/sql-tests/inputs/grouping_set.sql b/sql/core/src/test/resources/sql-tests/inputs/grouping_set.sql
@@ -13,5 +13,41 @@ SELECT a, b, c, count(d) FROM grouping GROUP BY a, b, c GROUPING SETS ((a));
 -- SPARK-17849: grouping set throws NPE #3
 SELECT a, b, c, count(d) FROM grouping GROUP BY a, b, c GROUPING SETS ((c));
 
+-- Group sets without explicit group by
+SELECT c1, sum(c2) FROM (VALUES ('x', 10, 0), ('y', 20, 0)) AS t (c1, c2, c3) GROUP BY GROUPING SETS (c1);
 
+-- Group sets without group by and with grouping
+SELECT c1, sum(c2), grouping(c1) FROM (VALUES ('x', 10, 0), ('y', 20, 0)) AS t (c1, c2, c3) GROUP BY GROUPING SETS (c1);
+
+-- Mutiple grouping within a grouping set
+SELECT c1, c2, Sum(c3), grouping__id
+FROM   (VALUES ('x', 'a', 10), ('y', 'b', 20) ) AS t (c1, c2, c3)
+GROUP  BY GROUPING SETS ( ( c1 ), ( c2 ) )
+HAVING GROUPING__ID > 1;
+
+-- Group sets without explicit group by
+SELECT grouping(c1) FROM (VALUES ('x', 'a', 10), ('y', 'b', 20)) AS t (c1, c2, c3) GROUP BY GROUPING SETS (c1,c2);
+
+-- Mutiple grouping within a grouping set
+SELECT -c1 AS c1 FROM (values (1,2), (3,2)) t(c1, c2) GROUP BY GROUPING SETS ((c1), (c1, c2));
+
+-- complex expression in grouping sets
+SELECT a + b, b, sum(c) FROM (VALUES (1,1,1),(2,2,2)) AS t(a,b,c) GROUP BY GROUPING SETS ( (a + b), (b));
+
+-- complex expression in grouping sets
+SELECT a + b, b, sum(c) FROM (VALUES (1,1,1),(2,2,2)) AS t(a,b,c) GROUP BY GROUPING SETS ( (a + b), (b + a), (b));
+
+-- more query constructs with grouping sets
+SELECT c1 AS col1, c2 AS col2
+FROM   (VALUES (1, 2), (3, 2)) t(c1, c2)
+GROUP  BY GROUPING SETS ( ( c1 ), ( c1, c2 ) )
+HAVING col2 IS NOT NULL
+ORDER  BY -col1;
+
+-- negative tests - must have at least one grouping expression
+SELECT a, b, c, count(d) FROM grouping GROUP BY WITH ROLLUP;
+
+SELECT a, b, c, count(d) FROM grouping GROUP BY WITH CUBE;
+
+SELECT c1 FROM (values (1,2), (3,2)) t(c1, c2) GROUP BY GROUPING SETS (());
 
diff --git a/sql/core/src/test/resources/sql-tests/results/grouping_set.sql.out b/sql/core/src/test/resources/sql-tests/results/grouping_set.sql.out
@@ -1,5 +1,5 @@
 -- Automatically generated by SQLQueryTestSuite
--- Number of queries: 4
+-- Number of queries: 15
 
 
 -- !query 0
@@ -40,3 +40,127 @@ struct<a:string,b:string,c:string,count(d):bigint>
 NULL	NULL	3	1
 NULL	NULL	6	1
 NULL	NULL	9	1
+
+
+-- !query 4
+SELECT c1, sum(c2) FROM (VALUES ('x', 10, 0), ('y', 20, 0)) AS t (c1, c2, c3) GROUP BY GROUPING SETS (c1)
+-- !query 4 schema
+struct<c1:string,sum(c2):bigint>
+-- !query 4 output
+x	10
+y	20
+
+
+-- !query 5
+SELECT c1, sum(c2), grouping(c1) FROM (VALUES ('x', 10, 0), ('y', 20, 0)) AS t (c1, c2, c3) GROUP BY GROUPING SETS (c1)
+-- !query 5 schema
+struct<c1:string,sum(c2):bigint,grouping(c1):tinyint>
+-- !query 5 output
+x	10	0
+y	20	0
+
+
+-- !query 6
+SELECT c1, c2, Sum(c3), grouping__id
+FROM   (VALUES ('x', 'a', 10), ('y', 'b', 20) ) AS t (c1, c2, c3)
+GROUP  BY GROUPING SETS ( ( c1 ), ( c2 ) )
+HAVING GROUPING__ID > 1
+-- !query 6 schema
+struct<c1:string,c2:string,sum(c3):bigint,grouping__id:int>
+-- !query 6 output
+NULL	a	10	2
+NULL	b	20	2
+
+
+-- !query 7
+SELECT grouping(c1) FROM (VALUES ('x', 'a', 10), ('y', 'b', 20)) AS t (c1, c2, c3) GROUP BY GROUPING SETS (c1,c2)
+-- !query 7 schema
+struct<grouping(c1):tinyint>
+-- !query 7 output
+0
+0
+1
+1
+
+
+-- !query 8
+SELECT -c1 AS c1 FROM (values (1,2), (3,2)) t(c1, c2) GROUP BY GROUPING SETS ((c1), (c1, c2))
+-- !query 8 schema
+struct<c1:int>
+-- !query 8 output
+-1
+-1
+-3
+-3
+
+
+-- !query 9
+SELECT a + b, b, sum(c) FROM (VALUES (1,1,1),(2,2,2)) AS t(a,b,c) GROUP BY GROUPING SETS ( (a + b), (b))
+-- !query 9 schema
+struct<(a + b):int,b:int,sum(c):bigint>
+-- !query 9 output
+2	NULL	1
+4	NULL	2
+NULL	1	1
+NULL	2	2
+
+
+-- !query 10
+SELECT a + b, b, sum(c) FROM (VALUES (1,1,1),(2,2,2)) AS t(a,b,c) GROUP BY GROUPING SETS ( (a + b), (b + a), (b))
+-- !query 10 schema
+struct<(a + b):int,b:int,sum(c):bigint>
+-- !query 10 output
+2	NULL	2
+4	NULL	4
+NULL	1	1
+NULL	2	2
+
+
+-- !query 11
+SELECT c1 AS col1, c2 AS col2
+FROM   (VALUES (1, 2), (3, 2)) t(c1, c2)
+GROUP  BY GROUPING SETS ( ( c1 ), ( c1, c2 ) )
+HAVING col2 IS NOT NULL
+ORDER  BY -col1
+-- !query 11 schema
+struct<col1:int,col2:int>
+-- !query 11 output
+3	2
+1	2
+
+
+-- !query 12
+SELECT a, b, c, count(d) FROM grouping GROUP BY WITH ROLLUP
+-- !query 12 schema
+struct<>
+-- !query 12 output
+org.apache.spark.sql.catalyst.parser.ParseException
+
+extraneous input 'ROLLUP' expecting <EOF>(line 1, pos 53)
+
+== SQL ==
+SELECT a, b, c, count(d) FROM grouping GROUP BY WITH ROLLUP
+-----------------------------------------------------^^^
+
+
+-- !query 13
+SELECT a, b, c, count(d) FROM grouping GROUP BY WITH CUBE
+-- !query 13 schema
+struct<>
+-- !query 13 output
+org.apache.spark.sql.catalyst.parser.ParseException
+
+extraneous input 'CUBE' expecting <EOF>(line 1, pos 53)
+
+== SQL ==
+SELECT a, b, c, count(d) FROM grouping GROUP BY WITH CUBE
+-----------------------------------------------------^^^
+
+
+-- !query 14
+SELECT c1 FROM (values (1,2), (3,2)) t(c1, c2) GROUP BY GROUPING SETS (())
+-- !query 14 schema
+struct<>
+-- !query 14 output
+org.apache.spark.sql.AnalysisException
+expression '`c1`' is neither present in the group by, nor is it an aggregate function. Add to group by or wrap in first() (or first_value) if you don't care which value you get.;