Status: Closed

Commits (35)
96e3e36  Remove agg1 from the planner.  (yhuai, Nov 6, 2015)
9921733  Add needed constructors.  (yhuai, Nov 7, 2015)
f126520  Delete old agg functions.  (yhuai, Nov 7, 2015)
5a177ec  Update function registry and DataFrame functions.  (yhuai, Nov 7, 2015)
f5a389f  Update analyzer.  (yhuai, Nov 7, 2015)
3d06cfd  Update optimizer.  (yhuai, Nov 7, 2015)
7e77628  Update parsers, check analysis, and hive type coercion.  (yhuai, Nov 7, 2015)
55841a5  Update catalyst dsl.  (yhuai, Nov 7, 2015)
d2ca290  Update HiveTypeCoercion  (yhuai, Nov 7, 2015)
aa8daa9  Update logical operators.  (yhuai, Nov 8, 2015)
10630e1  Merge remote-tracking branch 'upstream/master' into removeAgg1  (yhuai, Nov 8, 2015)
e8d39d1  Remove the mapping from old agg functions to new agg functions.  (yhuai, Nov 8, 2015)
a3df1fd  Remove old agg operator.  (yhuai, Nov 8, 2015)
5ed4970  Throw exception when we have count distinct. Once https://github.com/…  (yhuai, Nov 8, 2015)
881afe8  Update tests.  (yhuai, Nov 8, 2015)
2451233  Style.  (yhuai, Nov 8, 2015)
1ef0b92  Check analysis.  (yhuai, Nov 9, 2015)
35dfe93  Fix AggregateQuerySuite.  (yhuai, Nov 9, 2015)
6d29195  Merge remote-tracking branch 'upstream/master' into removeAgg1  (yhuai, Nov 9, 2015)
3835110  Fix test.  (yhuai, Nov 9, 2015)
c88c97d  Since an agg function's children can be unresolved (when we use them …  (yhuai, Nov 9, 2015)
c6c6c09  Fix tests.  (yhuai, Nov 9, 2015)
b73156a  style  (yhuai, Nov 9, 2015)
cb3f4d3  Renaming:  (yhuai, Nov 9, 2015)
bf3dfb7  Fix style and tests.  (yhuai, Nov 9, 2015)
c22c05e  Update docs and comments. Also, move MultipleDistinctRewriter to the …  (yhuai, Nov 9, 2015)
10a86fd  Merge remote-tracking branch 'upstream/master' into removeAgg1  (yhuai, Nov 9, 2015)
bad4184  Fix tests and override checkInputDataTypes.  (yhuai, Nov 9, 2015)
76502b5  Change AggregateExpression's prettyString.  (yhuai, Nov 9, 2015)
302edac  Add spark.sql.specializeSingleDistinctAggPlanning  (yhuai, Nov 9, 2015)
b13c3f0  Update test.  (yhuai, Nov 10, 2015)
5b9f4b5  Merge remote-tracking branch 'upstream/master' into removeAgg1  (yhuai, Nov 10, 2015)
c37d179  Address comments  (yhuai, Nov 10, 2015)
fb64896  R  (yhuai, Nov 10, 2015)
16826e6  Update the prettyString of UnresolvedFunction.  (yhuai, Nov 10, 2015)
R/pkg/R/functions.R (1 addition, 1 deletion)

@@ -1339,7 +1339,7 @@ setMethod("pmod", signature(y = "Column"),
 #' @export
 setMethod("approxCountDistinct",
           signature(x = "Column"),
-          function(x, rsd = 0.95) {
+          function(x, rsd = 0.05) {
             jc <- callJStatic("org.apache.spark.sql.functions", "approxCountDistinct", x@jc, rsd)
             column(jc)
           })

Review thread on the changed line:

Contributor (Author): @shivaram Seems this default value should be 0.05 instead of 0.95, right?

Contributor: Yeah, looks like it should be 0.05 -- cc @davies. This seems to have been added back when SparkR was a separate code base, in davies/SparkR-pkg@d7b17a4.

Contributor: Good catch, thanks!
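For context: rsd is the target relative standard deviation of the underlying HyperLogLog++ estimate, so 0.05 means roughly 5% error, matching the default in the Scala and Python APIs. A minimal Scala sketch of the equivalent call (assuming a DataFrame named df with a column "id"; both names are illustrative):

    import org.apache.spark.sql.functions.{approxCountDistinct, col}

    // Approximate distinct count with at most ~5% relative error; 0.05 is the
    // default rsd, the value the SparkR signature is being corrected to above.
    val approx = df.select(approxCountDistinct(col("id"), 0.05))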
python/pyspark/sql/dataframe.py (1 addition, 1 deletion)

@@ -866,7 +866,7 @@ def selectExpr(self, *expr):
         This is a variant of :func:`select` that accepts SQL expressions.

         >>> df.selectExpr("age * 2", "abs(age)").collect()
-        [Row((age * 2)=4, 'abs(age)=2), Row((age * 2)=10, 'abs(age)=5)]
+        [Row((age * 2)=4, abs(age)=2), Row((age * 2)=10, abs(age)=5)]
         """
         if len(expr) == 1 and isinstance(expr[0], list):
             expr = expr[0]
python/pyspark/sql/functions.py (1 addition, 1 deletion)

@@ -382,7 +382,7 @@ def expr(str):
     """Parses the expression string into the column that it represents

     >>> df.select(expr("length(name)")).collect()
-    [Row('length(name)=5), Row('length(name)=3)]
+    [Row(length(name)=5), Row(length(name)=3)]
     """
     sc = SparkContext._active_spark_context
     return Column(sc._jvm.functions.expr(str))
python/pyspark/sql/tests.py (1 addition, 1 deletion)

@@ -1017,7 +1017,7 @@ def test_expr(self):
         row = Row(a="length string", b=75)
         df = self.sqlCtx.createDataFrame([row])
         result = df.select(functions.expr("length(a)")).collect()[0].asDict()
-        self.assertEqual(13, result["'length(a)"])
+        self.assertEqual(13, result["length(a)"])

     def test_replace(self):
         schema = StructType([

These three Python updates all reflect the new prettyString of UnresolvedFunction (commit 16826e6), which no longer prefixes an unresolved function name with a single quote when generating column names.
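The same renaming is visible from Scala; a small sketch (assuming a DataFrame df with a string column name):

    // Before this change the generated column name carried the unresolved-function
    // quote, e.g. 'length(name); it is now rendered as length(name).
    val colNames = df.selectExpr("length(name)").columns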
CatalystConf.scala (package org.apache.spark.sql.catalyst)

@@ -19,6 +19,8 @@ package org.apache.spark.sql.catalyst

 private[spark] trait CatalystConf {
   def caseSensitiveAnalysis: Boolean
+
+  protected[spark] def specializeSingleDistinctAggPlanning: Boolean
 }

 /**
@@ -29,7 +31,13 @@ object EmptyConf extends CatalystConf {
   override def caseSensitiveAnalysis: Boolean = {
     throw new UnsupportedOperationException
   }
+
+  protected[spark] override def specializeSingleDistinctAggPlanning: Boolean = {
+    throw new UnsupportedOperationException
+  }
 }

 /** A CatalystConf that can be used for local testing. */
-case class SimpleCatalystConf(caseSensitiveAnalysis: Boolean) extends CatalystConf
+case class SimpleCatalystConf(caseSensitiveAnalysis: Boolean) extends CatalystConf {
+  protected[spark] override def specializeSingleDistinctAggPlanning: Boolean = true
+}
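Commit 302edac surfaces this flag as spark.sql.specializeSingleDistinctAggPlanning. A hypothetical sketch of how a rewrite rule can branch on it (illustration only; the helper name shouldRewrite is invented, and since the member is protected[spark], a real caller would live under the org.apache.spark package):

    // When the specialized single-distinct planning path is enabled, leave
    // queries with exactly one DISTINCT group to the planner; otherwise rewrite
    // every distinct aggregate into regular aggregates.
    def shouldRewrite(distinctGroupCount: Int, conf: CatalystConf): Boolean =
      if (conf.specializeSingleDistinctAggPlanning) distinctGroupCount > 1
      else distinctGroupCount >= 1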
SqlParser.scala (package org.apache.spark.sql.catalyst)

@@ -22,6 +22,7 @@ import scala.language.implicitConversions
 import org.apache.spark.sql.AnalysisException
 import org.apache.spark.sql.catalyst.analysis._
 import org.apache.spark.sql.catalyst.expressions._
+import org.apache.spark.sql.catalyst.expressions.aggregate._
 import org.apache.spark.sql.catalyst.plans._
 import org.apache.spark.sql.catalyst.plans.logical._
 import org.apache.spark.sql.catalyst.util.DataTypeParser

@@ -272,7 +273,7 @@ object SqlParser extends AbstractSparkSQLParser with DataTypeParser {
   protected lazy val function: Parser[Expression] =
     ( ident <~ ("(" ~ "*" ~ ")") ^^ { case udfName =>
         if (lexical.normalizeKeyword(udfName) == "count") {
-          Count(Literal(1))
+          AggregateExpression(Count(Literal(1)), mode = Complete, isDistinct = false)
        } else {
          throw new AnalysisException(s"invalid expression $udfName(*)")
        }

Review thread on the COUNT(*) case:

Contributor: Is there a reason we can't just create unresolved functions here? (other than maybe the distinct case?)

Contributor (Author): Seems we hard-code this when we deal with count(*). We can change it in another PR.

@@ -281,22 +282,25 @@ object SqlParser extends AbstractSparkSQLParser with DataTypeParser {
       { case udfName ~ exprs => UnresolvedFunction(udfName, exprs, isDistinct = false) }
     | ident ~ ("(" ~ DISTINCT ~> repsep(expression, ",")) <~ ")" ^^ { case udfName ~ exprs =>
        lexical.normalizeKeyword(udfName) match {
-         case "sum" => SumDistinct(exprs.head)
-         case "count" => CountDistinct(exprs)
+         case "count" =>
+           aggregate.Count(exprs).toAggregateExpression(isDistinct = true)
          case _ => UnresolvedFunction(udfName, exprs, isDistinct = true)
        }
      }
     | APPROXIMATE ~> ident ~ ("(" ~ DISTINCT ~> expression <~ ")") ^^ { case udfName ~ exp =>
        if (lexical.normalizeKeyword(udfName) == "count") {
-         ApproxCountDistinct(exp)
+         AggregateExpression(new HyperLogLogPlusPlus(exp), mode = Complete, isDistinct = false)
        } else {
          throw new AnalysisException(s"invalid function approximate $udfName")
        }
      }
     | APPROXIMATE ~> "(" ~> unsignedFloat ~ ")" ~ ident ~ "(" ~ DISTINCT ~ expression <~ ")" ^^
       { case s ~ _ ~ udfName ~ _ ~ _ ~ exp =>
        if (lexical.normalizeKeyword(udfName) == "count") {
-         ApproxCountDistinct(exp, s.toDouble)
+         AggregateExpression(
+           HyperLogLogPlusPlus(exp, s.toDouble, 0, 0),
+           mode = Complete,
+           isDistinct = false)
        } else {
          throw new AnalysisException(s"invalid function approximate($s) $udfName")
        }
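Concretely, these are the SQL shapes the parser alternatives above match, sketched against a hypothetical SQLContext named sqlContext (table t and column a are illustrative):

    sqlContext.sql("SELECT COUNT(*) FROM t")                            // hard-coded Count(Literal(1))
    sqlContext.sql("SELECT COUNT(DISTINCT a) FROM t")                   // Count(...).toAggregateExpression(isDistinct = true)
    sqlContext.sql("SELECT APPROXIMATE COUNT(DISTINCT a) FROM t")       // HyperLogLogPlusPlus, default rsd
    sqlContext.sql("SELECT APPROXIMATE(0.01) COUNT(DISTINCT a) FROM t") // HyperLogLogPlusPlus with rsd = 0.01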
Analyzer.scala (package org.apache.spark.sql.catalyst.analysis)

@@ -20,8 +20,8 @@ package org.apache.spark.sql.catalyst.analysis
 import scala.collection.mutable.ArrayBuffer

 import org.apache.spark.sql.AnalysisException
-import org.apache.spark.sql.catalyst.expressions.aggregate.{Complete, AggregateExpression2, AggregateFunction2}
 import org.apache.spark.sql.catalyst.expressions._
+import org.apache.spark.sql.catalyst.expressions.aggregate._
 import org.apache.spark.sql.catalyst.plans.logical._
 import org.apache.spark.sql.catalyst.rules._
 import org.apache.spark.sql.catalyst.trees.TreeNodeRef

@@ -79,6 +79,7 @@ class Analyzer(
       ExtractWindowExpressions ::
       GlobalAggregates ::
       ResolveAggregateFunctions ::
+      DistinctAggregationRewriter(conf) ::
       HiveTypeCoercion.typeCoercionRules ++
       extendedResolutionRules : _*),
     Batch("Nondeterministic", Once,

@@ -525,21 +526,14 @@ class Analyzer(
       case u @ UnresolvedFunction(name, children, isDistinct) =>
         withPosition(u) {
           registry.lookupFunction(name, children) match {
-            // We get an aggregate function built based on AggregateFunction2 interface.
-            // So, we wrap it in AggregateExpression2.
-            case agg2: AggregateFunction2 => AggregateExpression2(agg2, Complete, isDistinct)
-            // Currently, our old aggregate function interface supports SUM(DISTINCT ...)
-            // and COUTN(DISTINCT ...).
-            case sumDistinct: SumDistinct => sumDistinct
-            case countDistinct: CountDistinct => countDistinct
-            // DISTINCT is not meaningful with Max and Min.
-            case max: Max if isDistinct => max
-            case min: Min if isDistinct => min
-            // For other aggregate functions, DISTINCT keyword is not supported for now.
-            // Once we converted to the new code path, we will allow using DISTINCT keyword.
-            case other: AggregateExpression1 if isDistinct =>
-              failAnalysis(s"$name does not support DISTINCT keyword.")
-            // If it does not have DISTINCT keyword, we will return it as is.
+            // DISTINCT is not meaningful for a Max or a Min.
+            case max: Max if isDistinct =>
+              AggregateExpression(max, Complete, isDistinct = false)
+            case min: Min if isDistinct =>
+              AggregateExpression(min, Complete, isDistinct = false)
+            // We get an aggregate function, we need to wrap it in an AggregateExpression.
+            case agg2: AggregateFunction => AggregateExpression(agg2, Complete, isDistinct)
             // This function is not an aggregate function, just return the resolved one.
             case other => other
           }
         }

Review thread on the AggregateFunction case:

Contributor: agg2 -> agg

Contributor (Author): Done
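A quick illustration of the resolution rules above (a sketch; assumes a SQLContext named sqlContext and a registered table t with a column a):

    // DISTINCT is dropped for Max and Min, so these two resolve identically:
    sqlContext.sql("SELECT MAX(DISTINCT a) FROM t")
    sqlContext.sql("SELECT MAX(a) FROM t")
    // Any other resolved AggregateFunction keeps its isDistinct flag when it is
    // wrapped in an AggregateExpression:
    sqlContext.sql("SELECT SUM(DISTINCT a) FROM t")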
CheckAnalysis.scala (package org.apache.spark.sql.catalyst.analysis)

@@ -19,6 +19,7 @@ package org.apache.spark.sql.catalyst.analysis

 import org.apache.spark.sql.AnalysisException
 import org.apache.spark.sql.catalyst.expressions._
+import org.apache.spark.sql.catalyst.expressions.aggregate.{AggregateFunction, AggregateExpression}
 import org.apache.spark.sql.catalyst.plans.logical._
 import org.apache.spark.sql.types._
@@ -108,7 +109,19 @@ trait CheckAnalysis {

       case Aggregate(groupingExprs, aggregateExprs, child) =>
         def checkValidAggregateExpression(expr: Expression): Unit = expr match {
-          case _: AggregateExpression => // OK
+          case aggExpr: AggregateExpression =>
+            // TODO: Is it possible that the child of a agg function is another
+            // agg function?
+            aggExpr.aggregateFunction.children.foreach {
+              // This is just a sanity check, our analysis rule PullOutNondeterministic should
+              // already pull out those nondeterministic expressions and evaluate them in
+              // a Project node.
+              case child if !child.deterministic =>
+                failAnalysis(
+                  s"nondeterministic expression ${expr.prettyString} should not " +
+                    s"appear in the arguments of an aggregate function.")
+              case child => // OK
+            }
           case e: Attribute if !groupingExprs.exists(_.semanticEquals(e)) =>
             failAnalysis(
               s"expression '${e.prettyString}' is neither present in the group by, " +

Review thread on the TODO:

Contributor: After the rewrite for multiple distinct, I think it's possible.

Contributor (Author): Seems our error message is not good. I will update this. We should not allow users to nest agg functions like sum(avg(...)). I will add an error message telling users to use sub-queries.
@@ -120,14 +133,26 @@ trait CheckAnalysis {
           case e => e.children.foreach(checkValidAggregateExpression)
         }

-        def checkValidGroupingExprs(expr: Expression): Unit = expr.dataType match {
-          case BinaryType =>
-            failAnalysis(s"binary type expression ${expr.prettyString} cannot be used " +
-              "in grouping expression")
-          case m: MapType =>
-            failAnalysis(s"map type expression ${expr.prettyString} cannot be used " +
-              "in grouping expression")
-          case _ => // OK
+        def checkValidGroupingExprs(expr: Expression): Unit = {
+          expr.dataType match {
+            case BinaryType =>
+              failAnalysis(s"binary type expression ${expr.prettyString} cannot be used " +
+                "in grouping expression")
+            case a: ArrayType =>
+              failAnalysis(s"array type expression ${expr.prettyString} cannot be used " +
+                "in grouping expression")
+            case m: MapType =>
+              failAnalysis(s"map type expression ${expr.prettyString} cannot be used " +
+                "in grouping expression")
+            case _ => // OK
+          }
+          if (!expr.deterministic) {
+            // This is just a sanity check, our analysis rule PullOutNondeterministic should
+            // already pull out those nondeterministic expressions and evaluate them in
+            // a Project node.
+            failAnalysis(s"nondeterministic expression ${expr.prettyString} should not " +
+              s"appear in grouping expression.")
+          }
+        }

         aggregateExprs.foreach(checkValidAggregateExpression)

Review thread on the new ArrayType case:

Contributor: Is this a regression?

Contributor (Author): Unfortunately, it is the case.

Review thread on the catch-all case:

Contributor: We should also check UDT and types inside StructType.

Contributor (Author): Done.
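Concretely, the grouping checks above now reject queries like the following (a sketch; t and its binary, array, and map columns are hypothetical):

    // Each of these fails analysis with a "... cannot be used in grouping
    // expression" error for the corresponding type:
    sqlContext.sql("SELECT count(*) FROM t GROUP BY bin_col")  // BinaryType
    sqlContext.sql("SELECT count(*) FROM t GROUP BY arr_col")  // ArrayType (newly rejected here)
    sqlContext.sql("SELECT count(*) FROM t GROUP BY map_col")  // MapType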
@@ -179,7 +204,8 @@ trait CheckAnalysis {
             s"unresolved operator ${operator.simpleString}")

         case o if o.expressions.exists(!_.deterministic) &&
-          !o.isInstanceOf[Project] && !o.isInstanceOf[Filter] =>
+          !o.isInstanceOf[Project] && !o.isInstanceOf[Filter] && !o.isInstanceOf[Aggregate] =>
+          // Aggregate operators are checked by the Aggregate case above.
           failAnalysis(
             s"""nondeterministic expressions are only allowed in Project or Filter, found:
                | ${o.expressions.map(_.prettyString).mkString(",")}