Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -509,12 +509,7 @@ class Analyzer(
|| !p.pivotColumn.resolved => p
case Pivot(groupByExprsOpt, pivotColumn, pivotValues, aggregates, child) =>
// Check all aggregate expressions.
aggregates.foreach { e =>
if (!isAggregateExpression(e)) {
throw new AnalysisException(
s"Aggregate expression required for pivot, found '$e'")
}
}
aggregates.foreach(checkValidAggregateExpression)
// Group-by expressions coming from SQL are implicit and need to be deduced.
val groupByExprs = groupByExprsOpt.getOrElse(
(child.outputSet -- aggregates.flatMap(_.references) -- pivotColumn.references).toSeq)
Expand Down Expand Up @@ -586,12 +581,17 @@ class Analyzer(
}
}

private def isAggregateExpression(expr: Expression): Boolean = {
expr match {
case Alias(e, _) => isAggregateExpression(e)
case AggregateExpression(_, _, _, _) => true
case _ => false
}
// Support any aggregate expression that can appear in an Aggregate plan except Pandas UDF.
// TODO: Support Pandas UDF.
private def checkValidAggregateExpression(expr: Expression): Unit = expr match {
case _: AggregateExpression => // OK and leave the argument check to CheckAnalysis.
case expr: PythonUDF if PythonUDF.isGroupedAggPandasUDF(expr) =>
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I created a JIRA to track this support: https://issues.apache.org/jira/browse/SPARK-24796

failAnalysis("Pandas UDF aggregate expressions are currently not supported in pivot.")
case e: Attribute =>
failAnalysis(
s"Aggregate expression required for pivot, but '${e.sql}' " +
s"did not appear in any aggregate function.")
case e => e.children.foreach(checkValidAggregateExpression)
}
}

Expand Down
18 changes: 18 additions & 0 deletions sql/core/src/test/resources/sql-tests/inputs/pivot.sql
Original file line number Diff line number Diff line change
Expand Up @@ -111,3 +111,21 @@ PIVOT (
sum(earnings)
FOR year IN (2012, 2013)
);

-- pivot with complex aggregate expressions
SELECT * FROM (
SELECT year, course, earnings FROM courseSales
)
PIVOT (
ceil(sum(earnings)), avg(earnings) + 1 as a1
FOR course IN ('dotNET', 'Java')
);

-- pivot with invalid arguments in aggregate expressions
SELECT * FROM (
SELECT year, course, earnings FROM courseSales
)
PIVOT (
sum(avg(earnings))
FOR course IN ('dotNET', 'Java')
);
34 changes: 32 additions & 2 deletions sql/core/src/test/resources/sql-tests/results/pivot.sql.out
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
-- Automatically generated by SQLQueryTestSuite
-- Number of queries: 13
-- Number of queries: 15


-- !query 0
Expand Down Expand Up @@ -176,7 +176,7 @@ PIVOT (
struct<>
-- !query 11 output
org.apache.spark.sql.AnalysisException
Aggregate expression required for pivot, found 'abs(earnings#x)';
Aggregate expression required for pivot, but 'coursesales.`earnings`' did not appear in any aggregate function.;


-- !query 12
Expand All @@ -192,3 +192,33 @@ struct<>
-- !query 12 output
org.apache.spark.sql.AnalysisException
cannot resolve '`year`' given input columns: [__auto_generated_subquery_name.course, __auto_generated_subquery_name.earnings]; line 4 pos 0


-- !query 13
SELECT * FROM (
SELECT year, course, earnings FROM courseSales
)
PIVOT (
ceil(sum(earnings)), avg(earnings) + 1 as a1
FOR course IN ('dotNET', 'Java')
)
-- !query 13 schema
struct<year:int,dotNET_CEIL(sum(CAST(earnings AS BIGINT))):bigint,dotNET_a1:double,Java_CEIL(sum(CAST(earnings AS BIGINT))):bigint,Java_a1:double>
-- !query 13 output
2012 15000 7501.0 20000 20001.0
2013 48000 48001.0 30000 30001.0


-- !query 14
SELECT * FROM (
SELECT year, course, earnings FROM courseSales
)
PIVOT (
sum(avg(earnings))
FOR course IN ('dotNET', 'Java')
)
-- !query 14 schema
struct<>
-- !query 14 output
org.apache.spark.sql.AnalysisException
It is not allowed to use an aggregate function in the argument of another aggregate function. Please use the inner aggregate function in a sub-query.;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this test related to this PR? I think the output does not change with or without this PR.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You are right. I think it's still worth adding such a test for pivot.
But you reminded me that I might not need to check the aggregate function arguments here and leave it to CheckAnalysis since this check is independent of the context and always outputs the same error message. WDYT, @maropu and @gatorsmile ?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Adding this test is just to improve the test coverage. It looks reasonable.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

But you reminded me that I might not need to check the aggregate function arguments here and leave it to CheckAnalysis since this check is independent of the context and always outputs the same error message.

The general principle in our Analyzer is to do the error handling in CheckAnalysis, unless a better (more readable) error message can be issued from the rule.