-
Notifications
You must be signed in to change notification settings - Fork 28.9k
[SPARK-9830] [SQL] Remove AggregateExpression1 and Aggregate Operator used to evaluate AggregateExpression1s #9556
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
96e3e36
9921733
f126520
5a177ec
f5a389f
3d06cfd
7e77628
55841a5
d2ca290
aa8daa9
10630e1
e8d39d1
a3df1fd
5ed4970
881afe8
2451233
1ef0b92
35dfe93
6d29195
3835110
c88c97d
c6c6c09
b73156a
cb3f4d3
bf3dfb7
c22c05e
10a86fd
bad4184
76502b5
302edac
b13c3f0
5b9f4b5
c37d179
fb64896
16826e6
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -22,6 +22,7 @@ import scala.language.implicitConversions | |
| import org.apache.spark.sql.AnalysisException | ||
| import org.apache.spark.sql.catalyst.analysis._ | ||
| import org.apache.spark.sql.catalyst.expressions._ | ||
| import org.apache.spark.sql.catalyst.expressions.aggregate._ | ||
| import org.apache.spark.sql.catalyst.plans._ | ||
| import org.apache.spark.sql.catalyst.plans.logical._ | ||
| import org.apache.spark.sql.catalyst.util.DataTypeParser | ||
|
|
@@ -272,7 +273,7 @@ object SqlParser extends AbstractSparkSQLParser with DataTypeParser { | |
| protected lazy val function: Parser[Expression] = | ||
| ( ident <~ ("(" ~ "*" ~ ")") ^^ { case udfName => | ||
| if (lexical.normalizeKeyword(udfName) == "count") { | ||
| Count(Literal(1)) | ||
| AggregateExpression(Count(Literal(1)), mode = Complete, isDistinct = false) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is there a reason we can't just create unresolved functions here? (other than maybe the distinct case?)
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Seems we hard code this when we deal with count(*). We can change it in another PR. |
||
| } else { | ||
| throw new AnalysisException(s"invalid expression $udfName(*)") | ||
| } | ||
|
|
@@ -281,22 +282,25 @@ object SqlParser extends AbstractSparkSQLParser with DataTypeParser { | |
| { case udfName ~ exprs => UnresolvedFunction(udfName, exprs, isDistinct = false) } | ||
| | ident ~ ("(" ~ DISTINCT ~> repsep(expression, ",")) <~ ")" ^^ { case udfName ~ exprs => | ||
| lexical.normalizeKeyword(udfName) match { | ||
| case "sum" => SumDistinct(exprs.head) | ||
| case "count" => CountDistinct(exprs) | ||
| case "count" => | ||
| aggregate.Count(exprs).toAggregateExpression(isDistinct = true) | ||
| case _ => UnresolvedFunction(udfName, exprs, isDistinct = true) | ||
| } | ||
| } | ||
| | APPROXIMATE ~> ident ~ ("(" ~ DISTINCT ~> expression <~ ")") ^^ { case udfName ~ exp => | ||
| if (lexical.normalizeKeyword(udfName) == "count") { | ||
| ApproxCountDistinct(exp) | ||
| AggregateExpression(new HyperLogLogPlusPlus(exp), mode = Complete, isDistinct = false) | ||
| } else { | ||
| throw new AnalysisException(s"invalid function approximate $udfName") | ||
| } | ||
| } | ||
| | APPROXIMATE ~> "(" ~> unsignedFloat ~ ")" ~ ident ~ "(" ~ DISTINCT ~ expression <~ ")" ^^ | ||
| { case s ~ _ ~ udfName ~ _ ~ _ ~ exp => | ||
| if (lexical.normalizeKeyword(udfName) == "count") { | ||
| ApproxCountDistinct(exp, s.toDouble) | ||
| AggregateExpression( | ||
| HyperLogLogPlusPlus(exp, s.toDouble, 0, 0), | ||
| mode = Complete, | ||
| isDistinct = false) | ||
| } else { | ||
| throw new AnalysisException(s"invalid function approximate($s) $udfName") | ||
| } | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -20,8 +20,8 @@ package org.apache.spark.sql.catalyst.analysis | |
| import scala.collection.mutable.ArrayBuffer | ||
|
|
||
| import org.apache.spark.sql.AnalysisException | ||
| import org.apache.spark.sql.catalyst.expressions.aggregate.{Complete, AggregateExpression2, AggregateFunction2} | ||
| import org.apache.spark.sql.catalyst.expressions._ | ||
| import org.apache.spark.sql.catalyst.expressions.aggregate._ | ||
| import org.apache.spark.sql.catalyst.plans.logical._ | ||
| import org.apache.spark.sql.catalyst.rules._ | ||
| import org.apache.spark.sql.catalyst.trees.TreeNodeRef | ||
|
|
@@ -79,6 +79,7 @@ class Analyzer( | |
| ExtractWindowExpressions :: | ||
| GlobalAggregates :: | ||
| ResolveAggregateFunctions :: | ||
| DistinctAggregationRewriter(conf) :: | ||
| HiveTypeCoercion.typeCoercionRules ++ | ||
| extendedResolutionRules : _*), | ||
| Batch("Nondeterministic", Once, | ||
|
|
@@ -525,21 +526,14 @@ class Analyzer( | |
| case u @ UnresolvedFunction(name, children, isDistinct) => | ||
| withPosition(u) { | ||
| registry.lookupFunction(name, children) match { | ||
| // We get an aggregate function built based on AggregateFunction2 interface. | ||
| // So, we wrap it in AggregateExpression2. | ||
| case agg2: AggregateFunction2 => AggregateExpression2(agg2, Complete, isDistinct) | ||
| // Currently, our old aggregate function interface supports SUM(DISTINCT ...) | ||
| // and COUNT(DISTINCT ...). | ||
| case sumDistinct: SumDistinct => sumDistinct | ||
| case countDistinct: CountDistinct => countDistinct | ||
| // DISTINCT is not meaningful with Max and Min. | ||
| case max: Max if isDistinct => max | ||
| case min: Min if isDistinct => min | ||
| // For other aggregate functions, DISTINCT keyword is not supported for now. | ||
| // Once we converted to the new code path, we will allow using DISTINCT keyword. | ||
| case other: AggregateExpression1 if isDistinct => | ||
| failAnalysis(s"$name does not support DISTINCT keyword.") | ||
| // If it does not have DISTINCT keyword, we will return it as is. | ||
| // DISTINCT is not meaningful for a Max or a Min. | ||
| case max: Max if isDistinct => | ||
| AggregateExpression(max, Complete, isDistinct = false) | ||
| case min: Min if isDistinct => | ||
| AggregateExpression(min, Complete, isDistinct = false) | ||
| // We get an aggregate function, we need to wrap it in an AggregateExpression. | ||
| case agg2: AggregateFunction => AggregateExpression(agg2, Complete, isDistinct) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. agg2 -> agg
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done |
||
| // This function is not an aggregate function, just return the resolved one. | ||
| case other => other | ||
| } | ||
| } | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -19,6 +19,7 @@ package org.apache.spark.sql.catalyst.analysis | |
|
|
||
| import org.apache.spark.sql.AnalysisException | ||
| import org.apache.spark.sql.catalyst.expressions._ | ||
| import org.apache.spark.sql.catalyst.expressions.aggregate.{AggregateFunction, AggregateExpression} | ||
| import org.apache.spark.sql.catalyst.plans.logical._ | ||
| import org.apache.spark.sql.types._ | ||
|
|
||
|
|
@@ -108,7 +109,19 @@ trait CheckAnalysis { | |
|
|
||
| case Aggregate(groupingExprs, aggregateExprs, child) => | ||
| def checkValidAggregateExpression(expr: Expression): Unit = expr match { | ||
| case _: AggregateExpression => // OK | ||
| case aggExpr: AggregateExpression => | ||
| // TODO: Is it possible that the child of an agg function is another | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. After rewrite for multiple distinct, I think it's possible
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Seems our error message is not good. I will update this. We should not allow users to have nested agg functions like |
||
| // agg function? | ||
| aggExpr.aggregateFunction.children.foreach { | ||
| // This is just a sanity check, our analysis rule PullOutNondeterministic should | ||
| // already pull out those nondeterministic expressions and evaluate them in | ||
| // a Project node. | ||
| case child if !child.deterministic => | ||
| failAnalysis( | ||
| s"nondeterministic expression ${expr.prettyString} should not " + | ||
| s"appear in the arguments of an aggregate function.") | ||
| case child => // OK | ||
| } | ||
| case e: Attribute if !groupingExprs.exists(_.semanticEquals(e)) => | ||
| failAnalysis( | ||
| s"expression '${e.prettyString}' is neither present in the group by, " + | ||
|
|
@@ -120,14 +133,26 @@ trait CheckAnalysis { | |
| case e => e.children.foreach(checkValidAggregateExpression) | ||
| } | ||
|
|
||
| def checkValidGroupingExprs(expr: Expression): Unit = expr.dataType match { | ||
| case BinaryType => | ||
| failAnalysis(s"binary type expression ${expr.prettyString} cannot be used " + | ||
| "in grouping expression") | ||
| case m: MapType => | ||
| failAnalysis(s"map type expression ${expr.prettyString} cannot be used " + | ||
| "in grouping expression") | ||
| case _ => // OK | ||
| def checkValidGroupingExprs(expr: Expression): Unit = { | ||
| expr.dataType match { | ||
| case BinaryType => | ||
| failAnalysis(s"binary type expression ${expr.prettyString} cannot be used " + | ||
| "in grouping expression") | ||
| case a: ArrayType => | ||
| failAnalysis(s"array type expression ${expr.prettyString} cannot be used " + | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is this a regression?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Unfortunately, it is the case. |
||
| "in grouping expression") | ||
| case m: MapType => | ||
| failAnalysis(s"map type expression ${expr.prettyString} cannot be used " + | ||
| "in grouping expression") | ||
| case _ => // OK | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We should also check UDT and types inside StructType
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done. |
||
| } | ||
| if (!expr.deterministic) { | ||
| // This is just a sanity check, our analysis rule PullOutNondeterministic should | ||
| // already pull out those nondeterministic expressions and evaluate them in | ||
| // a Project node. | ||
| failAnalysis(s"nondeterministic expression ${expr.prettyString} should not " + | ||
| s"appear in grouping expression.") | ||
| } | ||
| } | ||
|
|
||
| aggregateExprs.foreach(checkValidAggregateExpression) | ||
|
|
@@ -179,7 +204,8 @@ trait CheckAnalysis { | |
| s"unresolved operator ${operator.simpleString}") | ||
|
|
||
| case o if o.expressions.exists(!_.deterministic) && | ||
| !o.isInstanceOf[Project] && !o.isInstanceOf[Filter] => | ||
| !o.isInstanceOf[Project] && !o.isInstanceOf[Filter] && !o.isInstanceOf[Aggregate] => | ||
| // The rule above is used to check Aggregate operator. | ||
| failAnalysis( | ||
| s"""nondeterministic expressions are only allowed in Project or Filter, found: | ||
| | ${o.expressions.map(_.prettyString).mkString(",")} | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@shivaram Seems this default value should be 0.05 instead of 0.95, right?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yeah looks like it should be 0.05 -- cc @davies
This seems to have been added back when SparkR was a separate code-base in davies/SparkR-pkg@d7b17a4
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Good catch, thanks!