Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -406,6 +406,7 @@ aggregation
WITH kind=ROLLUP
| WITH kind=CUBE
| kind=GROUPING SETS '(' groupingSet (',' groupingSet)* ')')?
| GROUP BY kind=GROUPING SETS '(' groupingSet (',' groupingSet)* ')'
;

groupingSet
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -442,17 +442,35 @@ class Analyzer(
child: LogicalPlan): LogicalPlan = {
val gid = AttributeReference(VirtualColumn.groupingIdName, IntegerType, false)()

// In case of ANSI-SQL compliant syntax for GROUPING SETS, groupByExprs is optional and
// can be null. In such case, we derive the groupByExprs from the user supplied values for
// grouping sets.
val finalGroupByExpressions = if (groupByExprs == Nil) {
selectedGroupByExprs.flatten.foldLeft(Seq.empty[Expression]) { (result, currentExpr) =>
Copy link
Member

@viirya viirya Jul 20, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What if GROUP BY GROUPING SETS (())? Is it a valid query?

Copy link
Contributor Author

@dilipbiswal dilipbiswal Jul 20, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@viirya No. We should be getting an error as we don't have a group by specification. I had tried this scenario against db2 to double check.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we have a test case for it too?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@viirya Yeah.. was already adding it .. knew u would ask :-)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@dilipbiswal Thanks! :-)

// Only unique expressions are included in the group by expressions and is determined
// based on their semantic equality. Example. grouping sets ((a * b), (b * a)) results
// in grouping expression (a * b)
if (result.find(_.semanticEquals(currentExpr)).isDefined) {
result
} else {
result :+ currentExpr
}
}
} else {
groupByExprs
}

// Expand works by setting grouping expressions to null as determined by the
// `selectedGroupByExprs`. To prevent these null values from being used in an aggregate
// instead of the original value we need to create new aliases for all group by expressions
// that will only be used for the intended purpose.
val groupByAliases = constructGroupByAlias(groupByExprs)
val groupByAliases = constructGroupByAlias(finalGroupByExpressions)

val expand = constructExpand(selectedGroupByExprs, child, groupByAliases, gid)
val groupingAttrs = expand.output.drop(child.output.length)

val aggregations = constructAggregateExprs(
groupByExprs, aggregationExprs, groupByAliases, groupingAttrs, gid)
finalGroupByExpressions, aggregationExprs, groupByAliases, groupingAttrs, gid)

Aggregate(groupingAttrs, aggregations, expand)
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,34 @@ class ResolveGroupingAnalyticsSuite extends AnalysisTest {
assertAnalysisError(originalPlan3, Seq("doesn't show up in the GROUP BY list"))
}

test("grouping sets with no explicit group by expressions") {
val originalPlan = GroupingSets(Seq(Seq(), Seq(unresolved_a), Seq(unresolved_a, unresolved_b)),
Nil, r1,
Seq(unresolved_a, unresolved_b, UnresolvedAlias(count(unresolved_c))))
val expected = Aggregate(Seq(a, b, gid), Seq(a, b, count(c).as("count(c)")),
Expand(
Seq(Seq(a, b, c, nulInt, nulStr, 3), Seq(a, b, c, a, nulStr, 1), Seq(a, b, c, a, b, 0)),
Seq(a, b, c, a, b, gid),
Project(Seq(a, b, c, a.as("a"), b.as("b")), r1)))
checkAnalysis(originalPlan, expected)

// Computation of grouping expression should remove duplicate expression based on their
// semantics (semanticEqual).
val originalPlan2 = GroupingSets(Seq(Seq(Multiply(unresolved_a, Literal(2))),
Seq(Multiply(Literal(2), unresolved_a), unresolved_b)), Nil, r1,
Seq(UnresolvedAlias(Multiply(unresolved_a, Literal(2))),
unresolved_b, UnresolvedAlias(count(unresolved_c))))

val resultPlan = getAnalyzer(true).executeAndCheck(originalPlan2)
val gExpressions = resultPlan.asInstanceOf[Aggregate].groupingExpressions
assert(gExpressions.size == 3)
val firstGroupingExprAttrName =
gExpressions(0).asInstanceOf[AttributeReference].name.replaceAll("#[0-9]*", "#0")
assert(firstGroupingExprAttrName == "(a#0 * 2)")
assert(gExpressions(1).asInstanceOf[AttributeReference].name == "b")
assert(gExpressions(2).asInstanceOf[AttributeReference].name == VirtualColumn.groupingIdName)
}

test("cube") {
val originalPlan = Aggregate(Seq(Cube(Seq(unresolved_a, unresolved_b))),
Seq(unresolved_a, unresolved_b, UnresolvedAlias(count(unresolved_c))), r1)
Expand Down
36 changes: 36 additions & 0 deletions sql/core/src/test/resources/sql-tests/inputs/grouping_set.sql
Original file line number Diff line number Diff line change
Expand Up @@ -13,5 +13,41 @@ SELECT a, b, c, count(d) FROM grouping GROUP BY a, b, c GROUPING SETS ((a));
-- SPARK-17849: grouping set throws NPE #3
SELECT a, b, c, count(d) FROM grouping GROUP BY a, b, c GROUPING SETS ((c));

-- Group sets without explicit group by
SELECT c1, sum(c2) FROM (VALUES ('x', 10, 0), ('y', 20, 0)) AS t (c1, c2, c3) GROUP BY GROUPING SETS (c1);

-- Group sets without group by and with grouping
SELECT c1, sum(c2), grouping(c1) FROM (VALUES ('x', 10, 0), ('y', 20, 0)) AS t (c1, c2, c3) GROUP BY GROUPING SETS (c1);

-- Mutiple grouping within a grouping set
SELECT c1, c2, Sum(c3), grouping__id
FROM (VALUES ('x', 'a', 10), ('y', 'b', 20) ) AS t (c1, c2, c3)
GROUP BY GROUPING SETS ( ( c1 ), ( c2 ) )
HAVING GROUPING__ID > 1;

-- Group sets without explicit group by
SELECT grouping(c1) FROM (VALUES ('x', 'a', 10), ('y', 'b', 20)) AS t (c1, c2, c3) GROUP BY GROUPING SETS (c1,c2);

-- Mutiple grouping within a grouping set
SELECT -c1 AS c1 FROM (values (1,2), (3,2)) t(c1, c2) GROUP BY GROUPING SETS ((c1), (c1, c2));

-- complex expression in grouping sets
SELECT a + b, b, sum(c) FROM (VALUES (1,1,1),(2,2,2)) AS t(a,b,c) GROUP BY GROUPING SETS ( (a + b), (b));

-- complex expression in grouping sets
SELECT a + b, b, sum(c) FROM (VALUES (1,1,1),(2,2,2)) AS t(a,b,c) GROUP BY GROUPING SETS ( (a + b), (b + a), (b));

-- more query constructs with grouping sets
SELECT c1 AS col1, c2 AS col2
FROM (VALUES (1, 2), (3, 2)) t(c1, c2)
GROUP BY GROUPING SETS ( ( c1 ), ( c1, c2 ) )
HAVING col2 IS NOT NULL
ORDER BY -col1;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've manually verified the results should be correct.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@viirya Sorry Simon.. do i have to do something for this comment ?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not at all. @dilipbiswal :-)


-- negative tests - must have at least one grouping expression
SELECT a, b, c, count(d) FROM grouping GROUP BY WITH ROLLUP;

SELECT a, b, c, count(d) FROM grouping GROUP BY WITH CUBE;

SELECT c1 FROM (values (1,2), (3,2)) t(c1, c2) GROUP BY GROUPING SETS (());

126 changes: 125 additions & 1 deletion sql/core/src/test/resources/sql-tests/results/grouping_set.sql.out
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
-- Automatically generated by SQLQueryTestSuite
-- Number of queries: 4
-- Number of queries: 15


-- !query 0
Expand Down Expand Up @@ -40,3 +40,127 @@ struct<a:string,b:string,c:string,count(d):bigint>
NULL NULL 3 1
NULL NULL 6 1
NULL NULL 9 1


-- !query 4
SELECT c1, sum(c2) FROM (VALUES ('x', 10, 0), ('y', 20, 0)) AS t (c1, c2, c3) GROUP BY GROUPING SETS (c1)
-- !query 4 schema
struct<c1:string,sum(c2):bigint>
-- !query 4 output
x 10
y 20


-- !query 5
SELECT c1, sum(c2), grouping(c1) FROM (VALUES ('x', 10, 0), ('y', 20, 0)) AS t (c1, c2, c3) GROUP BY GROUPING SETS (c1)
-- !query 5 schema
struct<c1:string,sum(c2):bigint,grouping(c1):tinyint>
-- !query 5 output
x 10 0
y 20 0


-- !query 6
SELECT c1, c2, Sum(c3), grouping__id
FROM (VALUES ('x', 'a', 10), ('y', 'b', 20) ) AS t (c1, c2, c3)
GROUP BY GROUPING SETS ( ( c1 ), ( c2 ) )
HAVING GROUPING__ID > 1
-- !query 6 schema
struct<c1:string,c2:string,sum(c3):bigint,grouping__id:int>
-- !query 6 output
NULL a 10 2
NULL b 20 2


-- !query 7
SELECT grouping(c1) FROM (VALUES ('x', 'a', 10), ('y', 'b', 20)) AS t (c1, c2, c3) GROUP BY GROUPING SETS (c1,c2)
-- !query 7 schema
struct<grouping(c1):tinyint>
-- !query 7 output
0
0
1
1


-- !query 8
SELECT -c1 AS c1 FROM (values (1,2), (3,2)) t(c1, c2) GROUP BY GROUPING SETS ((c1), (c1, c2))
-- !query 8 schema
struct<c1:int>
-- !query 8 output
-1
-1
-3
-3


-- !query 9
SELECT a + b, b, sum(c) FROM (VALUES (1,1,1),(2,2,2)) AS t(a,b,c) GROUP BY GROUPING SETS ( (a + b), (b))
-- !query 9 schema
struct<(a + b):int,b:int,sum(c):bigint>
-- !query 9 output
2 NULL 1
4 NULL 2
NULL 1 1
NULL 2 2


-- !query 10
SELECT a + b, b, sum(c) FROM (VALUES (1,1,1),(2,2,2)) AS t(a,b,c) GROUP BY GROUPING SETS ( (a + b), (b + a), (b))
-- !query 10 schema
struct<(a + b):int,b:int,sum(c):bigint>
-- !query 10 output
2 NULL 2
4 NULL 4
NULL 1 1
NULL 2 2


-- !query 11
SELECT c1 AS col1, c2 AS col2
FROM (VALUES (1, 2), (3, 2)) t(c1, c2)
GROUP BY GROUPING SETS ( ( c1 ), ( c1, c2 ) )
HAVING col2 IS NOT NULL
ORDER BY -col1
-- !query 11 schema
struct<col1:int,col2:int>
-- !query 11 output
3 2
1 2


-- !query 12
SELECT a, b, c, count(d) FROM grouping GROUP BY WITH ROLLUP
-- !query 12 schema
struct<>
-- !query 12 output
org.apache.spark.sql.catalyst.parser.ParseException

extraneous input 'ROLLUP' expecting <EOF>(line 1, pos 53)

== SQL ==
SELECT a, b, c, count(d) FROM grouping GROUP BY WITH ROLLUP
-----------------------------------------------------^^^


-- !query 13
SELECT a, b, c, count(d) FROM grouping GROUP BY WITH CUBE
-- !query 13 schema
struct<>
-- !query 13 output
org.apache.spark.sql.catalyst.parser.ParseException

extraneous input 'CUBE' expecting <EOF>(line 1, pos 53)

== SQL ==
SELECT a, b, c, count(d) FROM grouping GROUP BY WITH CUBE
-----------------------------------------------------^^^


-- !query 14
SELECT c1 FROM (values (1,2), (3,2)) t(c1, c2) GROUP BY GROUPING SETS (())
-- !query 14 schema
struct<>
-- !query 14 output
org.apache.spark.sql.AnalysisException
expression '`c1`' is neither present in the group by, nor is it an aggregate function. Add to group by or wrap in first() (or first_value) if you don't care which value you get.;