Skip to content

Commit a19fea6

Browse files
committed
Add UDAF interface.
1 parent 262d4c4 commit a19fea6

File tree

6 files changed

+415
-2
lines changed

6 files changed

+415
-2
lines changed

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate2/aggregates.scala

Lines changed: 77 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,9 @@ package org.apache.spark.sql.catalyst.expressions.aggregate2
2020
import org.apache.spark.sql.catalyst.dsl.expressions._
2121
import org.apache.spark.sql.catalyst.errors.TreeNodeException
2222
import org.apache.spark.sql.catalyst.expressions._
23-
import org.apache.spark.sql.catalyst.InternalRow
23+
import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow}
2424
import org.apache.spark.sql.types._
25+
import org.apache.spark.sql.Row
2526

2627
/** The mode of an [[AggregateFunction]]. */
2728
private[sql] sealed trait AggregateMode
@@ -277,3 +278,78 @@ case class Average(child: Expression) extends AlgebraicAggregate {
277278
override def dataType: DataType = resultType
278279
override def children: Seq[Expression] = child :: Nil
279280
}
281+
282+
/**
 * A [[Row]]-shaped view over a slice of an underlying aggregation row.
 *
 * @param toCatalystConverters per-field converters from Scala values to Catalyst values.
 * @param toScalaConverters    per-field converters from Catalyst values to Scala values.
 * @param bufferOffset         position of this buffer's first field in the underlying row.
 */
abstract class AggregationBuffer(
    toCatalystConverters: Array[Any => Any],
    toScalaConverters: Array[Any => Any],
    bufferOffset: Int)
  extends Row {

  // One slot per buffer field; the converter arrays are parallel, so either length works.
  override def length: Int = toCatalystConverters.length

  // Absolute positions of this buffer's fields inside the underlying row:
  // field i lives at bufferOffset + i.
  protected val offsets: Array[Int] = Array.tabulate(length)(i => bufferOffset + i)
}
300+
301+
/**
 * A mutable [[AggregationBuffer]] backed by a [[MutableRow]]. Reads convert the stored
 * Catalyst values to Scala values; writes convert Scala values back to Catalyst values.
 *
 * @param underlyingBuffer the mutable row holding the actual buffer values; a var so the
 *                         same wrapper can be re-pointed at different rows between calls.
 */
class MutableAggregationBuffer(
    toCatalystConverters: Array[Any => Any],
    toScalaConverters: Array[Any => Any],
    bufferOffset: Int,
    var underlyingBuffer: MutableRow)
  extends AggregationBuffer(toCatalystConverters, toScalaConverters, bufferOffset) {

  /** Returns the i-th buffer value as a Scala value. */
  override def apply(i: Int): Any = {
    if (i < 0 || i >= length) {
      throw new IllegalArgumentException(
        s"Could not access ${i}th value in this buffer because it only has $length values.")
    }
    val catalystValue = underlyingBuffer(offsets(i))
    toScalaConverters(i)(catalystValue)
  }

  /** Stores a Scala value into the i-th buffer slot, converting it to Catalyst form. */
  def update(i: Int, value: Any): Unit = {
    if (i < 0 || i >= length) {
      throw new IllegalArgumentException(
        s"Could not update ${i}th value in this buffer because it only has $length values.")
    }
    val catalystValue = toCatalystConverters(i)(value)
    underlyingBuffer.update(offsets(i), catalystValue)
  }

  /** Returns a new wrapper sharing the same converters, offset, and underlying row. */
  override def copy(): MutableAggregationBuffer =
    new MutableAggregationBuffer(
      toCatalystConverters,
      toScalaConverters,
      bufferOffset,
      underlyingBuffer)
}
332+
333+
/**
 * A read-only [[AggregationBuffer]] backed by an input [[Row]]. Reads convert the stored
 * Catalyst values to Scala values before handing them to user code.
 *
 * @param underlyingInputBuffer the row holding the actual buffer values; a var so the
 *                              same wrapper can be re-pointed at different rows between calls.
 */
class InputAggregationBuffer(
    toCatalystConverters: Array[Any => Any],
    toScalaConverters: Array[Any => Any],
    bufferOffset: Int,
    var underlyingInputBuffer: Row)
  extends AggregationBuffer(toCatalystConverters, toScalaConverters, bufferOffset) {

  /** Returns the i-th buffer value as a Scala value. */
  override def apply(i: Int): Any = {
    if (i < 0 || i >= length) {
      throw new IllegalArgumentException(
        s"Could not access ${i}th value in this buffer because it only has $length values.")
    }
    val catalystValue = underlyingInputBuffer(offsets(i))
    toScalaConverters(i)(catalystValue)
  }

  /** Returns a new wrapper sharing the same converters, offset, and underlying row. */
  override def copy(): InputAggregationBuffer =
    new InputAggregationBuffer(
      toCatalystConverters,
      toScalaConverters,
      bufferOffset,
      underlyingInputBuffer)
}

sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -287,6 +287,9 @@ class SQLContext(@transient val sparkContext: SparkContext)
287287
@transient
288288
val udf: UDFRegistration = new UDFRegistration(this)
289289

290+
@transient
291+
val udaf: UDAFRegistration = new UDAFRegistration(this)
292+
290293
/**
291294
* Returns true if the table is currently cached in-memory.
292295
* @group cachemgmt
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
18+
package org.apache.spark.sql
19+
20+
import org.apache.spark.Logging
21+
import org.apache.spark.sql.catalyst.expressions.{Expression}
22+
import org.apache.spark.sql.execution.expressions.aggregate2.{ScalaUDAF, UserDefinedAggregateFunction}
23+
24+
25+
/**
 * Functions for registering user-defined aggregate functions (UDAFs) with a
 * [[SQLContext]] so they can be invoked by name from SQL.
 */
class UDAFRegistration private[sql] (sqlContext: SQLContext) extends Logging {

  private val functionRegistry = sqlContext.functionRegistry

  /**
   * Registers `func` under `name` and returns it, so the caller can also keep a
   * handle for programmatic use.
   */
  def register(
      name: String,
      func: UserDefinedAggregateFunction): UserDefinedAggregateFunction = {
    // Each invocation of the registered function wraps the UDAF in a ScalaUDAF
    // expression over the call-site arguments.
    functionRegistry.registerFunction(
      name, (children: Seq[Expression]) => ScalaUDAF(children, func))
    func
  }
}
Lines changed: 149 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,149 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
18+
package org.apache.spark.sql.execution.expressions.aggregate2
19+
20+
import org.apache.spark.Logging
21+
import org.apache.spark.sql.catalyst.expressions.codegen.GenerateMutableProjection
22+
import org.apache.spark.sql.catalyst.{InternalRow, CatalystTypeConverters}
23+
import org.apache.spark.sql.catalyst.expressions._
24+
import org.apache.spark.sql.catalyst.expressions.aggregate2.{InputAggregationBuffer, AggregateFunction2, MutableAggregationBuffer}
25+
import org.apache.spark.sql.types._
26+
import org.apache.spark.sql.Row
27+
28+
/**
 * The base class for implementing a user-defined aggregate function (UDAF).
 * Implementations must be serializable because instances are shipped to executors.
 */
abstract class UserDefinedAggregateFunction extends Serializable {

  /** A [[StructType]] describing the input arguments of this aggregate function. */
  def inputDataType: StructType

  /** A [[StructType]] describing the intermediate aggregation buffer of this function. */
  def bufferSchema: StructType

  /** The [[DataType]] of the value returned by [[evaluate]]. */
  def returnDataType: DataType

  /** Whether this function always returns the same output for the same input. */
  def deterministic: Boolean

  /** Sets the initial values of the given aggregation buffer. */
  def initialize(buffer: MutableAggregationBuffer): Unit

  /** Updates the buffer with a new input row. */
  def update(buffer: MutableAggregationBuffer, input: Row): Unit

  /** Merges a second aggregation buffer into the first. */
  def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit

  /** Computes the final result of this aggregate function from a buffer. */
  def evaluate(buffer: Row): Any
}
47+
48+
/**
 * The internal wrapper that adapts a [[UserDefinedAggregateFunction]] to the
 * [[AggregateFunction2]] interface used by the execution engine. It converts values
 * between Catalyst's internal representation and the Scala types seen by user code.
 *
 * @param children the input expressions of this aggregate function call.
 * @param udaf     the user-defined aggregate function being wrapped.
 */
case class ScalaUDAF(
    children: Seq[Expression],
    udaf: UserDefinedAggregateFunction)
  extends AggregateFunction2 with ImplicitCastInputTypes with Logging {

  require(
    children.length == udaf.inputDataType.length,
    s"$udaf only accepts ${udaf.inputDataType.length} arguments, " +
      s"but ${children.length} are provided.")

  override def nullable: Boolean = true

  override def dataType: DataType = udaf.returnDataType

  override def deterministic: Boolean = udaf.deterministic

  override val inputTypes: Seq[DataType] = udaf.inputDataType.map(_.dataType)

  override val bufferSchema: StructType = udaf.bufferSchema

  override val bufferAttributes: Seq[AttributeReference] = bufferSchema.toAttributes

  override lazy val cloneBufferAttributes = bufferAttributes.map(_.newInstance())

  /** Schema of the input expressions, used to build the Scala converter for inputs. */
  val childrenSchema: StructType = {
    val inputFields = children.zipWithIndex.map {
      case (child, index) =>
        StructField(s"input$index", child.dataType, child.nullable, Metadata.empty)
    }
    StructType(inputFields)
  }

  // Projects an input InternalRow down to just this function's arguments. Falls back to
  // the interpreted projection if code generation fails.
  lazy val inputProjection = {
    val inputAttributes = childrenSchema.toAttributes
    log.debug(
      s"Creating MutableProj: $children, inputSchema: $inputAttributes.")
    try {
      GenerateMutableProjection.generate(children, inputAttributes)()
    } catch {
      case e: Exception =>
        log.error("Failed to generate mutable projection, fallback to interpreted", e)
        new InterpretedMutableProjection(children, inputAttributes)
    }
  }

  // Converts a projected Catalyst input row to the Scala Row handed to udaf.update.
  val inputToScalaConverters: Any => Any =
    CatalystTypeConverters.createToScalaConverter(childrenSchema)

  // Per-field Scala -> Catalyst converters for buffer writes.
  val bufferValuesToCatalystConverters: Array[Any => Any] = bufferSchema.fields.map { field =>
    CatalystTypeConverters.createToCatalystConverter(field.dataType)
  }

  // Per-field Catalyst -> Scala converters for buffer reads.
  val bufferValuesToScalaConverters: Array[Any => Any] = bufferSchema.fields.map { field =>
    CatalystTypeConverters.createToScalaConverter(field.dataType)
  }

  // Reusable read-only buffer wrapper; its underlying row is swapped per call.
  lazy val inputAggregateBuffer: InputAggregationBuffer =
    new InputAggregationBuffer(
      bufferValuesToCatalystConverters,
      bufferValuesToScalaConverters,
      bufferOffset,
      null)

  // Reusable mutable buffer wrapper; its underlying row is swapped per call.
  lazy val mutableAggregateBuffer: MutableAggregationBuffer =
    new MutableAggregationBuffer(
      bufferValuesToCatalystConverters,
      bufferValuesToScalaConverters,
      bufferOffset,
      null)

  override def initialize(buffer: MutableRow): Unit = {
    mutableAggregateBuffer.underlyingBuffer = buffer

    udaf.initialize(mutableAggregateBuffer)
  }

  override def update(buffer: MutableRow, input: InternalRow): Unit = {
    mutableAggregateBuffer.underlyingBuffer = buffer

    udaf.update(
      mutableAggregateBuffer,
      inputToScalaConverters(inputProjection(input)).asInstanceOf[Row])
  }

  override def merge(buffer1: MutableRow, buffer2: InternalRow): Unit = {
    mutableAggregateBuffer.underlyingBuffer = buffer1
    inputAggregateBuffer.underlyingInputBuffer = buffer2

    // BUG FIX: this previously called udaf.update, so the user's merge logic was never
    // invoked when combining partial aggregation buffers. Delegate to udaf.merge.
    udaf.merge(mutableAggregateBuffer, inputAggregateBuffer)
  }

  override def eval(buffer: InternalRow = null): Any = {
    inputAggregateBuffer.underlyingInputBuffer = buffer

    udaf.evaluate(inputAggregateBuffer)
  }

  override def toString: String = {
    s"""${udaf.getClass.getSimpleName}(${children.mkString(",")})"""
  }
}
Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
18+
package test.org.apache.spark.sql.hive.aggregate2;
19+
20+
import java.util.ArrayList;
21+
import java.util.List;
22+
23+
import org.apache.spark.sql.catalyst.expressions.aggregate2.MutableAggregationBuffer;
24+
import org.apache.spark.sql.execution.expressions.aggregate2.UserDefinedAggregateFunction;
25+
import org.apache.spark.sql.types.StructField;
26+
import org.apache.spark.sql.types.StructType;
27+
import org.apache.spark.sql.types.DataType;
28+
import org.apache.spark.sql.types.DataTypes;
29+
import org.apache.spark.sql.Row;
30+
31+
public class MyJavaUDAF extends UserDefinedAggregateFunction {
32+
33+
private StructType _inputDataType;
34+
35+
private StructType _bufferSchema;
36+
37+
private DataType _returnDataType;
38+
39+
public MyJavaUDAF() {
40+
List<StructField> inputfields = new ArrayList<StructField>();
41+
inputfields.add(DataTypes.createStructField("inputDouble", DataTypes.DoubleType, true));
42+
_inputDataType = DataTypes.createStructType(inputfields);
43+
44+
List<StructField> bufferFields = new ArrayList<StructField>();
45+
bufferFields.add(DataTypes.createStructField("bufferDouble", DataTypes.DoubleType, true));
46+
_bufferSchema = DataTypes.createStructType(bufferFields);
47+
48+
_returnDataType = DataTypes.DoubleType;
49+
}
50+
51+
@Override public StructType inputDataType() {
52+
return _inputDataType;
53+
}
54+
55+
@Override public StructType bufferSchema() {
56+
return _bufferSchema;
57+
}
58+
59+
@Override public DataType returnDataType() {
60+
return _returnDataType;
61+
}
62+
63+
@Override public boolean deterministic() {
64+
return true;
65+
}
66+
67+
@Override public void initialize(MutableAggregationBuffer buffer) {
68+
buffer.update(0, null);
69+
}
70+
71+
@Override public void update(MutableAggregationBuffer buffer, Row input) {
72+
if (!input.isNullAt(0)) {
73+
if (buffer.isNullAt(0)) {
74+
buffer.update(0, input.getDouble(0));
75+
} else {
76+
Double newValue = input.getDouble(0) * buffer.getDouble(0);
77+
buffer.update(0, newValue);
78+
}
79+
}
80+
}
81+
82+
@Override public void merge(MutableAggregationBuffer buffer1, Row buffer2) {
83+
if (!buffer2.isNullAt(0)) {
84+
if (buffer1.isNullAt(0)) {
85+
buffer1.update(0, buffer2.getDouble(0));
86+
} else {
87+
Double newValue = buffer2.getDouble(0) * buffer1.getDouble(0);
88+
buffer1.update(0, newValue);
89+
}
90+
}
91+
}
92+
93+
@Override public Object evaluate(Row buffer) {
94+
if (buffer.isNullAt(0)) {
95+
return null;
96+
} else {
97+
return buffer.getDouble(0);
98+
}
99+
}
100+
}

0 commit comments

Comments
 (0)