From f3fa6580bca70f3307d70e938ef8531c928d958b Mon Sep 17 00:00:00 2001 From: Timothy Hunter Date: Fri, 3 Mar 2017 10:36:02 -0800 Subject: [PATCH 01/26] work --- .../org/apache/spark/ml/stat/Summarizer.scala | 582 ++++++++++++++++++ .../spark/ml/stat/SummarizerSuite.scala | 61 ++ 2 files changed, 643 insertions(+) create mode 100644 mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala create mode 100644 mllib/src/test/scala/org/apache/spark/ml/stat/SummarizerSuite.scala diff --git a/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala b/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala new file mode 100644 index 000000000000..f94fc10721bb --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala @@ -0,0 +1,582 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml.stat + +import org.apache.spark.annotation.{DeveloperApi, Since} +import org.apache.spark.ml.linalg.{SQLDataTypes, Vector, Vectors} +import org.apache.spark.sql.{Column, Row} +import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction} +import org.apache.spark.sql.types._ + +/** + * A builder object that provides summary statistics about a given column. + * + * Users should not directly create such builders, but instead use one of the methods in + * [[Summarizer]]. + */ +@Since("2.2.0") +abstract class SummaryBuilder { + /** + * Returns an aggregate object that contains the summary of the column with the requested metrics. + * @param column a column that contains Vector object. + * @return an aggregate column that contains the statistics. The exact content of this + * structure is determined during the creation of the builder. It is also provided by + * [[summaryStructure]]. + */ + @Since("2.2.0") + def summary(column: Column): Column + + /** + * The structure of the summary that will be returned by this builder. + * @return the structure of the aggregate data returned by this builder. + */ + @Since("2.2.0") + @DeveloperApi + private[spark] def summaryStructure: StructType +} + +/** + * Tools for vectorized statistics on MLlib Vectors. + * + * The methods in this package provide various statistics for Vectors contained inside DataFrames. + * + * This class lets users pick the statistics they would like to extract for a given column. Here is + * an example in Scala: + * {{{ + * val dataframe = ... 
// Some dataframe containing a feature column + * val allStats = dataframe.select(Summarizer.metrics("min", "max").summary($"features")) + * val Row(min_, max_) = allStats.first() + * }}} + * + * If one wants to get a single metric, shortcuts are also available: + * {{{ + * val meanDF = dataframe.select(Summarizer.mean($"features")) + * val Row(mean_) = meanDF.first() + * }}} + */ +@Since("2.2.0") +object Summarizer { + + import SummaryBuilderImpl._ + + /** + * Given a list of metrics, provides a builder that it turns computes metrics from a column. + * + * See the documentation of [[Summarizer]] for an example. + * + * @param firstMetric the metric being provided + * @param metrics additional metrics that can be provided. + * @return a builder. + * @throws IllegalArgumentException if one of the metric names is not understood. + */ + @Since("2.2.0") + def metrics(firstMetric: String, metrics: String*): SummaryBuilder = { + val (typedMetrics, computeMetrics) = getRelevantMetrics(Seq(firstMetric) ++ metrics) + new SummaryBuilderImpl( + SummaryBuilderImpl.buildUdafForMetrics(computeMetrics), + SummaryBuilderImpl.structureForMetrics(typedMetrics)) + } + + def mean(col: Column): Column = metrics("mean").summary(col) +} + +private[ml] class SummaryBuilderImpl( + udaf: UserDefinedAggregateFunction, + structType: StructType) extends SummaryBuilder { + + override def summary(column: Column): Column = udaf.apply(column) + + override val summaryStructure: StructType = structType +} + +private[ml] +object SummaryBuilderImpl { + + def implementedMetrics: Seq[String] = allMetrics.map(_._1).sorted + + @throws[IllegalArgumentException]("When the list is empty or not a subset of known metrics") + def getRelevantMetrics(requested: Seq[String]): (Seq[Metrics], Seq[ComputeMetrics]) = { + val all = requested.map { req => + val (_, metric, deps) = allMetrics.find(tup => tup._1 == req).getOrElse { + throw new IllegalArgumentException(s"Metric $req cannot be found." + + s" Valid metris are $implementedMetrics") + } + metric -> deps + } + // TODO: do not sort, otherwise the user has to look the schema to see the order that it + // is going to be given in. + val metrics = all.map(_._1).distinct.sortBy(_.toString) + val computeMetrics = all.flatMap(_._2).distinct.sortBy(_.toString) + metrics -> computeMetrics + } + + def structureForMetrics(metrics: Seq[Metrics]): StructType = { + val fields = metrics.map { m => + // All types are vectors, except for one. + val tpe = if (m == Count) { LongType } else { SQLDataTypes.VectorType } + // We know the metric is part of the list. + val (name, _, _) = allMetrics.find(tup => tup._2 == m).get + StructField(name, tpe, nullable = false) + } + StructType(fields) + } + + def buildUdafForMetrics(metrics: Seq[ComputeMetrics]): UserDefinedAggregateFunction = null + + /** + * All the metrics that can be currently computed by Spark for vectors. + * + * This list associates the user name, the internal (typed) name, and the list of computation + * metrics that need to de computed internally to get the final result. 
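+ * For example, the "mean" metric depends on ComputeMean, ComputeWeightSum and
+ * ComputeTotalWeightSum, while "count" only depends on ComputeCount.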
+ */ + private val allMetrics = Seq( + ("mean", Mean, Seq(ComputeMean, ComputeWeightSum, ComputeTotalWeightSum)), + ("variance", Variance, Seq(ComputeTotalWeightSum, ComputeWeightSquareSum, ComputeMean, + ComputeM2n)), + ("count", Count, Seq(ComputeCount)), + ("numNonZeros", NumNonZeros, Seq(ComputeNNZ)), + ("max", Max, Seq(ComputeMax, ComputeTotalWeightSum, ComputeNNZ, ComputeCount)), + ("min", Min, Seq(ComputeMin, ComputeTotalWeightSum, ComputeNNZ, ComputeCount)), + ("normL2", NormL2, Seq(ComputeTotalWeightSum, ComputeM2)), + ("normL1", Min, Seq(ComputeTotalWeightSum, ComputeL1)) + ) + + /** + * The metrics that are currently implemented. + */ + sealed trait Metrics + case object Mean extends Metrics + case object Variance extends Metrics + case object Count extends Metrics + case object NumNonZeros extends Metrics + case object Max extends Metrics + case object Min extends Metrics + case object NormL2 extends Metrics + case object NormL1 extends Metrics + + /** + * The running metrics that are going to be computing. + * + * There is a bipartite graph between the metrics and the computed metrics. + */ + private sealed trait ComputeMetrics + case object ComputeMean extends ComputeMetrics + case object ComputeM2n extends ComputeMetrics + case object ComputeM2 extends ComputeMetrics + case object ComputeL1 extends ComputeMetrics + case object ComputeCount extends ComputeMetrics + case object ComputeTotalWeightSum extends ComputeMetrics + case object ComputeWeightSquareSum extends ComputeMetrics + case object ComputeWeightSum extends ComputeMetrics + case object ComputeNNZ extends ComputeMetrics + case object ComputeMax extends ComputeMetrics + case object ComputeMin extends ComputeMetrics + + // The order in which the metrics will be stored in the buffer. + // This order replicates the order above. + private val metricsWithOrder = Seq(ComputeMean, ComputeM2n, ComputeM2, ComputeL1, ComputeCount, + ComputeTotalWeightSum, ComputeWeightSquareSum, ComputeWeightSum, ComputeNNZ, ComputeMax, + ComputeMin).zipWithIndex + + + private class MetricsUDAF( + requested: Seq[Metrics], + metrics: Seq[ComputeMetrics]) extends UserDefinedAggregateFunction { + + // All of these are the indexes of the value in the temporary buffer, or -1 if these values are + // not requested. + // All these values are unrolled so that we do not pay too steep a penalty for the extra level + // of indirection associated with selectively computing some metrics. + private[this] val computeMean = getIndexFor(ComputeMean) + private[this] val computeM2n = getIndexFor(ComputeM2n) + private[this] val computeM2 = getIndexFor(ComputeM2) + private[this] val computeL1 = getIndexFor(ComputeL1) + private[this] val computeTotalCount = getIndexFor(ComputeCount) + // TODO: why is it not used? + private[this] val computeTotalWeightSum = getIndexFor(ComputeTotalWeightSum) + private[this] val computeWeightSquareSum = getIndexFor(ComputeWeightSquareSum) + private[this] val computeWeightSum = getIndexFor(ComputeWeightSum) + private[this] val computeNNZ = getIndexFor(ComputeNNZ) + private[this] val computeCurrMax = getIndexFor(ComputeMax) + private[this] val computeCurrMin = getIndexFor(ComputeMin) + // Always computed, and put at the end. + private[this] val computeN = metricsWithOrder.size + + override def inputSchema: StructType = { + StructType(Seq(StructField("features", SQLDataTypes.VectorType))) + } + + // The schema for the buffer contains all the structures, but a subset of them may be null. 
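As an illustration of the buffer layout described above, here is a hedged sketch of such a schema for a mean-only summary; the field selection and names are illustrative, not the exact schema built below:

import org.apache.spark.ml.linalg.SQLDataTypes
import org.apache.spark.sql.types._

// Sketch only: one nullable slot per running metric, plus the non-nullable vector size "n".
val exampleBufferSchema: StructType = StructType(Seq(
  StructField("ComputeMean", SQLDataTypes.VectorType, nullable = true),
  StructField("ComputeTotalWeightSum", DoubleType, nullable = true),
  StructField("ComputeCount", LongType, nullable = true),
  StructField("n", IntegerType, nullable = false)))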
+ override def bufferSchema: StructType = { + val fields = metricsWithOrder.map { case (m, _) => + // TODO(thunterdb) this could be merged with the types above so that there is no confusion + // when adding other types later, and with the initialization below. + val tpe = m match { + case ComputeCount => LongType + case ComputeTotalWeightSum => DoubleType + case ComputeWeightSquareSum => DoubleType + case _ => SQLDataTypes.VectorType + } + StructField(m.toString, tpe, nullable = true) + } :+ StructField("n", IntegerType, nullable = false) + StructType(fields) + } + + override def dataType: DataType = { + structureForMetrics(requested) + } + + override def deterministic: Boolean = true + + // The initialization does not do anything because we do not know the size of the vectors yet. + override def initialize(buffer: MutableAggregationBuffer): Unit = { + // Set n to -1 to indicate that we do not know the size yet. + buffer.update(computeN, -1) + } + + override def update(buffer: MutableAggregationBuffer, input: Row): Unit = { + val v = input.getAs[Vector](0) + // TODO: add weights later. + update(buffer, v, 1.0) + } + + override def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = { + val thisUsed = ((computeTotalWeightSum >= 0 && buffer1.getDouble(computeTotalWeightSum) > 0) + || (computeTotalCount >= 0 && buffer1.getLong(computeTotalCount) > 0)) + val otherUsed = ((computeTotalWeightSum >= 0 && buffer2.getDouble(computeTotalWeightSum) > 0) + || (computeTotalCount >= 0 && buffer2.getLong(computeTotalCount) > 0)) + if (thisUsed && otherUsed) { + // Merge onto this one. + val n1 = getN(buffer1) + val n2 = getN(buffer2) + require(n1 == n2, s"Dimensions mismatch between summarizers: expected $n1 but got $n2") + require(n1 > 0, s"Expected the summarizers to be initialized") + // The scalar values. + if (computeTotalCount >= 0) { + buffer1.update(computeTotalCount, + buffer1.getLong(computeTotalCount) + buffer2.getLong(computeTotalCount)) + } + if (computeTotalWeightSum >= 0) { + buffer1.update(computeTotalWeightSum, + buffer1.getDouble(computeTotalWeightSum) + buffer2.getDouble(computeTotalWeightSum)) + } + if (computeWeightSquareSum >= 0) { + buffer1.update(computeWeightSquareSum, + buffer1.getDouble(computeWeightSquareSum) + buffer2.getDouble(computeWeightSquareSum)) + } + // The vector values. + val currMean = exposeDArray(buffer1, computeMean) + val currM2n = exposeDArray(buffer1, computeM2n) + val currM2 = exposeDArray(buffer1, computeM2) + val currL1 = exposeDArray(buffer1, computeL1) + val weightSum = exposeDArray(buffer1, computeWeightSum) + val numNonzeros = exposeDArray(buffer1, computeNNZ) + val currMax = exposeDArray(buffer1, computeCurrMax) + val currMin = exposeDArray(buffer1, computeCurrMin) + + val otherMean = exposeDArray2(buffer2, computeMean) + val otherM2n = exposeDArray2(buffer2, computeM2n) + val otherM2 = exposeDArray2(buffer2, computeM2) + val otherL1 = exposeDArray2(buffer2, computeL1) + val otherWeightSum = exposeDArray2(buffer2, computeWeightSum) + val otherNumNonzeros = exposeDArray2(buffer2, computeNNZ) + val otherMax = exposeDArray2(buffer2, computeCurrMax) + val otherMin = exposeDArray2(buffer2, computeCurrMin) + + + var i = 0 + while (i < n1) { + // This case is written do maximize conformance with the current code. + // For more proper implementation, the differnt loops should be disentangled. + // The case of 0.0 values being used should not happen if the proper selection + // of metrics is done. 
+ // TODO: should it be NaN so that any computation with it will indicate an error, + // instead of failing silently? + // Using exceptions is not great for performance in this loop. + val thisNnz = if (weightSum == null) { 0.0 } else { weightSum(i) } + val otherNnz = if (otherWeightSum == null) { 0.0 } else { otherWeightSum(i) } + val totalNnz = thisNnz + otherNnz + val totalCnnz = if (numNonzeros == null || numNonzeros == null) { 0.0 } else { + numNonzeros(i) + otherNumNonzeros(i) + } + if (totalNnz != 0.0) { + val deltaMean = if (computeMean >= 0) { otherMean(i) - currMean(i) } else { 0.0 } + // merge mean together + if (computeMean >= 0) { + currMean(i) += deltaMean * otherNnz / totalNnz + } + // merge m2n together + if (computeM2n >= 0) { + currM2n(i) += otherM2n(i) + deltaMean * deltaMean * thisNnz * otherNnz / totalNnz + } + // merge m2 together + if (computeM2 >= 0) { + currM2(i) += otherM2(i) + } + // merge l1 together + if (computeL1 >= 0) { + currL1(i) += otherL1(i) + } + // merge max and min + if (computeCurrMax >= 0) { + currMax(i) = math.max(currMax(i), otherMax(i)) + } + if (computeCurrMin >= 0) { + currMin(i) = math.min(currMin(i), otherMin(i)) + } + } + if (computeWeightSum >= 0) { + weightSum(i) = totalNnz + } + if (computeNNZ >= 0) { + numNonzeros(i) = totalCnnz + } + + i += 1 + } + } else if (otherUsed) { + // Copy buffer1 onto buffer2. + // TODO + } + } + + // This function packs the requested values, in order. + override def evaluate(buffer: Row): Row = { + val values = requested.map { + case Mean => evaluateMean(buffer) + case Variance => evaluateVariance(buffer) + case NumNonZeros => evaluateNNZ(buffer) + case Max => evaluateMax(buffer) + case Min => evaluateMin(buffer) + case NormL1 => evaluateNormL1(buffer) + case NormL2 => evaluateNormL2(buffer) + } + Row(values: _*) + } + + private def evaluateMean(buffer: Row): Vector = { + assert(computeMean >= 0) // This is a programming error. + assert(computeWeightSum >= 0) + assert(computeTotalWeightSum >= 0) + val n = getN(buffer) + assert(n >= 0) + val realMean = Array.ofDim[Double](n) + val currMean = exposeDArray2(buffer, computeMean) + val weightSum = exposeDArray2(buffer, computeWeightSum) + val totalWeightSum = buffer.getDouble(computeTotalWeightSum) + // TODO: should this be turned into NaN instead? That would be more consistent than + // an error. + require(totalWeightSum > 0, "Data has zero weight") + var i = 0 + while (i < n) { + realMean(i) = currMean(i) * (weightSum(i) / totalWeightSum) + i += 1 + } + Vectors.dense(realMean) + } + + private def evaluateVariance(buffer: Row): Vector = { + assert(computeTotalWeightSum >= 0) + assert(computeWeightSquareSum >= 0) + assert(computeWeightSum >= 0) + assert(computeM2 >= 0) + assert(computeM2n >= 0) + + val totalWeightSum = buffer.getDouble(computeTotalWeightSum) + require(totalWeightSum > 0, s"Nothing has been added to this summarizer.") + + val n = getN(buffer) + assert(n >= 0) + + val weightSquareSum = buffer.getDouble(computeWeightSquareSum) + val realVariance = Array.ofDim[Double](n) + + val currMean = exposeDArray2(buffer, computeMean) + val currM2n = exposeDArray2(buffer, computeM2n) + val weightSum = exposeDArray2(buffer, computeWeightSum) + val denominator = totalWeightSum - (weightSquareSum / totalWeightSum) + + // Sample variance is computed, if the denominator is less than 0, the variance is just 0. 
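The branch below applies the usual unbiased weighted sample variance. A scalar sketch of the same arithmetic for one coefficient (names are illustrative, not part of the patch), with running mean m, centered second moment m2n, per-coefficient weight wi, total weight w and total squared weight w2:

// denom = w - w2 / w; variance = (m2n + m * m * wi * (w - wi) / w) / denom, or 0.0 if denom <= 0.
def coefficientVariance(m2n: Double, m: Double, wi: Double, w: Double, w2: Double): Double = {
  val denom = w - w2 / w
  if (denom > 0.0) (m2n + m * m * wi * (w - wi) / w) / denom else 0.0
}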
+ if (denominator > 0.0) { + val deltaMean = currMean + var i = 0 + val len = currM2n.length + while (i < len) { + realVariance(i) = (currM2n(i) + deltaMean(i) * deltaMean(i) * weightSum(i) * + (totalWeightSum - weightSum(i)) / totalWeightSum) / denominator + i += 1 + } + } + Vectors.dense(realVariance) + } + + private def evaluateNNZ(buffer: Row): Vector = { + null // TODO + } + + private def evaluateMax(buffer: Row): Vector = { + null // TODO + } + + private def evaluateMin(buffer: Row): Vector = { + null // TODO + } + + private def evaluateNormL1(buffer: Row): Vector = { + null // TODO + } + + private def evaluateNormL2(buffer: Row): Vector = { + null // TODO + } + + private def update(buffer: MutableAggregationBuffer, instance: Vector, weight: Double): Unit = { + val startN = getN(buffer) + if (startN == -1) { + initialize(buffer, instance.size) + } else { + require(startN == instance.size, + s"Trying to insert a vector of size $instance into a buffer that " + + s"has been sized with $startN") + } + // It should be initialized. + assert(getN(buffer) > 0) + // Update the scalars that do not require going through the vector first. + if (computeTotalWeightSum >= 0) { + buffer.update(computeTotalWeightSum, buffer.getDouble(computeTotalWeightSum) + weight) + } + if (computeWeightSquareSum >= 0) { + buffer.update(computeWeightSquareSum, + buffer.getDouble(computeWeightSquareSum) + weight * weight) + } + if (computeTotalCount >= 0) { + buffer.update(computeTotalCount, buffer.getLong(computeTotalCount) + 1) + } + + // All these may be null pointers. + val localCurrMean = exposeDArray(buffer, computeMean) + val localCurrM2n = exposeDArray(buffer, computeM2n) + val localCurrM2 = exposeDArray(buffer, computeM2) + val localCurrL1 = exposeDArray(buffer, computeL1) + val localWeightSum = exposeDArray(buffer, computeWeightSum) + val localNumNonzeros = exposeDArray(buffer, computeNNZ) + val localCurrMax = exposeDArray(buffer, computeCurrMax) + val localCurrMin = exposeDArray(buffer, computeCurrMin) + + // Update the requested vector values. + // All these changes are in place. There is no need to make copies. + instance.foreachActive { (index, value) => + if (value != 0.0) { + if (localCurrMax != null && localCurrMax(index) < value) { + localCurrMax(index) = value + } + if (localCurrMin != null && localCurrMin(index) > value) { + localCurrMin(index) = value + } + + if (localCurrMean != null) { + val prevMean = localCurrMean(index) + val diff = value - prevMean + localCurrMean(index) = prevMean + weight * diff / (localWeightSum(index) + weight) + if (localCurrM2n != null) { + // require: localCurrMean != null. + localCurrM2n(index) += weight * (value - localCurrMean(index)) * diff + } + } + + if (localCurrM2 != null) { + localCurrM2(index) += weight * value * value + } + if (localCurrL1 != null) { + localCurrL1(index) += weight * math.abs(value) + } + if (localWeightSum != null) { + localWeightSum(index) += weight + } + if (localNumNonzeros != null) { + localNumNonzeros(index) += 1 + } + } + } + } + + // the current vector size, or -1 if not available yet. 
+ private def getN(buffer: Row): Int = { + buffer.getInt(computeN) + } + + + private[this] def exposeDArray2(buffer: Row, index: Int): Array[Double] = { + if (index == -1) { + null + } else { + buffer.getAs[Array[Double]](index) + } + } + + private[this] def exposeDArray(buffer: MutableAggregationBuffer, index: Int): Array[Double] = { + if (index == -1) { + null + } else { + buffer.getAs[Array[Double]](index) + } + } + + // Performs the initialization of the requested fields, if required. + private def initialize(buffer: MutableAggregationBuffer, n: Int): Unit = { + require(n > 0, s"Cannot set n to $n: must be positive") + val currentN = buffer.getInt(computeN) + require(currentN == -1 || (currentN == n), + s"The vector size has already been set with $currentN, requested change to $n") + if (currentN == -1) { + // Set all the vectors to the given value. + // This is not a tight loop, so there should be no performance issue in + // making lookups for the values. + buffer.update(computeN, n) + // Only initialize the metricts that we want to compute: + metricsWithOrder.filter(p => metrics.contains(p._1)).foreach { case (m, idx) => + // Special cases for the scalar values. + m match { + case ComputeCount => + buffer.update(idx, 0L) + case ComputeTotalWeightSum => + buffer.update(idx, 0.0) + case ComputeWeightSquareSum => + buffer.update(idx, 0.0) + case _ => + buffer.update(idx, Array.ofDim[Double](n)) + } + } + } + } + + /** + * Returns a fixed index for the compute metric if it is in the list of requested metrics, + * and -1 otherwise. + */ + private def getIndexFor(m: ComputeMetrics): Int = { + if (metrics.contains(m)) { + metricsWithOrder.find(p => p._1 == m).get._2 + } else { + -1 + } + } + + } + +} diff --git a/mllib/src/test/scala/org/apache/spark/ml/stat/SummarizerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/stat/SummarizerSuite.scala new file mode 100644 index 000000000000..685fea8575c9 --- /dev/null +++ b/mllib/src/test/scala/org/apache/spark/ml/stat/SummarizerSuite.scala @@ -0,0 +1,61 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.ml.stat + + +import org.apache.spark.SparkFunSuite +import org.apache.spark.ml.linalg.Vectors +import org.apache.spark.mllib.util.MLlibTestSparkContext +import org.apache.spark.mllib.util.TestingUtils._ +import org.apache.spark.sql.{DataFrame, Dataset, Row} + + +class SummarizerSuite extends SparkFunSuite with MLlibTestSparkContext { + + import testImplicits._ + import Summarizer._ + + private val seed = 42 + private val eps: Double = 1e-5 + + private def denseData(input: Seq[Seq[Double]]): DataFrame = { + val data = input.map(_.toArray).map(Vectors.dense).map(Tuple1.apply) + sc.parallelize(data).toDF("features") + } + + private def compare(df: DataFrame, exp: Seq[Any]): Unit = { + val res = df.first().toSeq + val names = df.schema.fieldNames.zipWithIndex.map { case (n, idx) => s"$n ($idx)" } + assert(res.size === exp.size) + for (((x1, x2), name) <- res.zip(exp).zip(names)) { + (x1, x2) match { + case (d1: Double, d2: Double) => + assert(Vectors.dense(d1) ~== Vectors.dense(d2) absTol 1e-4, name) + } + } + } + + test("basic error handling") { + val df = denseData(Nil) + val c = df.col("features") + val res = df.select(metrics("mean").summary(c), mean(c)) + intercept[IllegalArgumentException] { + compare(res, Seq.empty) + } + } +} From 7539835dad863a6b73d88d79983342f9ddb7fb9d Mon Sep 17 00:00:00 2001 From: Timothy Hunter Date: Mon, 6 Mar 2017 14:38:41 -0800 Subject: [PATCH 02/26] work on the test suite --- .../org/apache/spark/ml/stat/Summarizer.scala | 88 +++++++++++++------ .../spark/ml/stat/SummarizerSuite.scala | 87 +++++++++++++++--- 2 files changed, 137 insertions(+), 38 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala b/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala index f94fc10721bb..1de782af9ca1 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala @@ -17,12 +17,17 @@ package org.apache.spark.ml.stat +import scala.collection.mutable + import org.apache.spark.annotation.{DeveloperApi, Since} +import org.apache.spark.internal.Logging import org.apache.spark.ml.linalg.{SQLDataTypes, Vector, Vectors} import org.apache.spark.sql.{Column, Row} import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction} import org.apache.spark.sql.types._ +// scalastyle:off println + /** * A builder object that provides summary statistics about a given column. * @@ -70,7 +75,7 @@ abstract class SummaryBuilder { * }}} */ @Since("2.2.0") -object Summarizer { +object Summarizer extends Logging { import SummaryBuilderImpl._ @@ -79,6 +84,15 @@ object Summarizer { * * See the documentation of [[Summarizer]] for an example. * + * The following metrics are accepted (case sensitive): + * - mean: a vector that contains the coefficient-wise mean. + * - variance: a vector tha contains the coefficient-wise variance. + * - count: the count of all vectors seen. + * - numNonzeros: a vector with the number of non-zeros for each coefficients + * - max: the maximum for each coefficient. + * - min: the minimum for each coefficient. + * - normL2: the Euclidian norm for each coefficient. + * - normL1: the L1 norm of each coefficient (sum of the absolute values). * @param firstMetric the metric being provided * @param metrics additional metrics that can be provided. * @return a builder. 
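As a usage sketch of the metrics documented above (the dataframe and column names are illustrative and assume import spark.implicits._; this mirrors the class-level example rather than defining new API):

// Illustrative only: gather mean, variance and count in a single aggregation pass.
val stats = dataframe.select(
  Summarizer.metrics("mean", "variance", "count").summary($"features").as("stats"))
stats.select("stats.mean", "stats.variance", "stats.count").show()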
@@ -88,7 +102,7 @@ object Summarizer { def metrics(firstMetric: String, metrics: String*): SummaryBuilder = { val (typedMetrics, computeMetrics) = getRelevantMetrics(Seq(firstMetric) ++ metrics) new SummaryBuilderImpl( - SummaryBuilderImpl.buildUdafForMetrics(computeMetrics), + SummaryBuilderImpl.buildUdafForMetrics(typedMetrics, computeMetrics), SummaryBuilderImpl.structureForMetrics(typedMetrics)) } @@ -105,7 +119,7 @@ private[ml] class SummaryBuilderImpl( } private[ml] -object SummaryBuilderImpl { +object SummaryBuilderImpl extends Logging { def implementedMetrics: Seq[String] = allMetrics.map(_._1).sorted @@ -118,9 +132,9 @@ object SummaryBuilderImpl { } metric -> deps } - // TODO: do not sort, otherwise the user has to look the schema to see the order that it + // Do not sort, otherwise the user has to look the schema to see the order that it // is going to be given in. - val metrics = all.map(_._1).distinct.sortBy(_.toString) + val metrics = all.map(_._1) val computeMetrics = all.flatMap(_._2).distinct.sortBy(_.toString) metrics -> computeMetrics } @@ -136,7 +150,11 @@ object SummaryBuilderImpl { StructType(fields) } - def buildUdafForMetrics(metrics: Seq[ComputeMetrics]): UserDefinedAggregateFunction = null + def buildUdafForMetrics( + metrics: Seq[Metrics], + computeMetrics: Seq[ComputeMetrics]): UserDefinedAggregateFunction = { + new MetricsUDAF(metrics, computeMetrics) + } /** * All the metrics that can be currently computed by Spark for vectors. @@ -145,15 +163,15 @@ object SummaryBuilderImpl { * metrics that need to de computed internally to get the final result. */ private val allMetrics = Seq( - ("mean", Mean, Seq(ComputeMean, ComputeWeightSum, ComputeTotalWeightSum)), + ("mean", Mean, Seq(ComputeMean, ComputeWeightSum, ComputeTotalWeightSum)), // Vector ("variance", Variance, Seq(ComputeTotalWeightSum, ComputeWeightSquareSum, ComputeMean, - ComputeM2n)), - ("count", Count, Seq(ComputeCount)), - ("numNonZeros", NumNonZeros, Seq(ComputeNNZ)), - ("max", Max, Seq(ComputeMax, ComputeTotalWeightSum, ComputeNNZ, ComputeCount)), - ("min", Min, Seq(ComputeMin, ComputeTotalWeightSum, ComputeNNZ, ComputeCount)), - ("normL2", NormL2, Seq(ComputeTotalWeightSum, ComputeM2)), - ("normL1", Min, Seq(ComputeTotalWeightSum, ComputeL1)) + ComputeM2n)), // Vector + ("count", Count, Seq(ComputeCount)), // Long + ("numNonZeros", NumNonZeros, Seq(ComputeNNZ)), // Vector + ("max", Max, Seq(ComputeMax, ComputeTotalWeightSum, ComputeNNZ, ComputeCount)), // Vector + ("min", Min, Seq(ComputeMin, ComputeTotalWeightSum, ComputeNNZ, ComputeCount)), // Vector + ("normL2", NormL2, Seq(ComputeTotalWeightSum, ComputeM2)), // Vector + ("normL1", Min, Seq(ComputeTotalWeightSum, ComputeL1)) // Vector ) /** @@ -174,13 +192,13 @@ object SummaryBuilderImpl { * * There is a bipartite graph between the metrics and the computed metrics. 
*/ - private sealed trait ComputeMetrics + sealed trait ComputeMetrics case object ComputeMean extends ComputeMetrics case object ComputeM2n extends ComputeMetrics case object ComputeM2 extends ComputeMetrics case object ComputeL1 extends ComputeMetrics - case object ComputeCount extends ComputeMetrics - case object ComputeTotalWeightSum extends ComputeMetrics + case object ComputeCount extends ComputeMetrics // Always computed + case object ComputeTotalWeightSum extends ComputeMetrics // Always computed case object ComputeWeightSquareSum extends ComputeMetrics case object ComputeWeightSum extends ComputeMetrics case object ComputeNNZ extends ComputeMetrics @@ -206,9 +224,8 @@ object SummaryBuilderImpl { private[this] val computeM2n = getIndexFor(ComputeM2n) private[this] val computeM2 = getIndexFor(ComputeM2) private[this] val computeL1 = getIndexFor(ComputeL1) - private[this] val computeTotalCount = getIndexFor(ComputeCount) - // TODO: why is it not used? - private[this] val computeTotalWeightSum = getIndexFor(ComputeTotalWeightSum) + private[this] val computeTotalCount = getForcedIndexFor(ComputeCount) // Always compute it. + private[this] val computeTotalWeightSum = getForcedIndexFor(ComputeTotalWeightSum) // Always. private[this] val computeWeightSquareSum = getIndexFor(ComputeWeightSquareSum) private[this] val computeWeightSum = getIndexFor(ComputeWeightSum) private[this] val computeNNZ = getIndexFor(ComputeNNZ) @@ -230,7 +247,7 @@ object SummaryBuilderImpl { case ComputeCount => LongType case ComputeTotalWeightSum => DoubleType case ComputeWeightSquareSum => DoubleType - case _ => SQLDataTypes.VectorType + case _ => ArrayType(DoubleType, containsNull = false) } StructField(m.toString, tpe, nullable = true) } :+ StructField("n", IntegerType, nullable = false) @@ -246,7 +263,11 @@ object SummaryBuilderImpl { // The initialization does not do anything because we do not know the size of the vectors yet. override def initialize(buffer: MutableAggregationBuffer): Unit = { // Set n to -1 to indicate that we do not know the size yet. + println("initialize: size=" + buffer.size + " computeN = " + computeN) + // Thes metrics are always computed. buffer.update(computeN, -1) + buffer.update(computeTotalCount, 0L) + buffer.update(computeTotalWeightSum, 0.0) } override def update(buffer: MutableAggregationBuffer, input: Row): Unit = { @@ -256,10 +277,11 @@ object SummaryBuilderImpl { } override def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = { - val thisUsed = ((computeTotalWeightSum >= 0 && buffer1.getDouble(computeTotalWeightSum) > 0) - || (computeTotalCount >= 0 && buffer1.getLong(computeTotalCount) > 0)) - val otherUsed = ((computeTotalWeightSum >= 0 && buffer2.getDouble(computeTotalWeightSum) > 0) - || (computeTotalCount >= 0 && buffer2.getLong(computeTotalCount) > 0)) + // The metrics that are used to determine if the buffers are used are always computed. + val thisUsed = (buffer1.getDouble(computeTotalWeightSum) > 0.0 + || buffer1.getLong(computeTotalCount) > 0L) + val otherUsed = (buffer2.getDouble(computeTotalWeightSum) > 0.0 + || buffer2.getLong(computeTotalCount) > 0L) if (thisUsed && otherUsed) { // Merge onto this one. 
val n1 = getN(buffer1) @@ -360,6 +382,7 @@ object SummaryBuilderImpl { val values = requested.map { case Mean => evaluateMean(buffer) case Variance => evaluateVariance(buffer) + case Count => evaluateCount(buffer) case NumNonZeros => evaluateNNZ(buffer) case Max => evaluateMax(buffer) case Min => evaluateMin(buffer) @@ -374,7 +397,9 @@ object SummaryBuilderImpl { assert(computeWeightSum >= 0) assert(computeTotalWeightSum >= 0) val n = getN(buffer) - assert(n >= 0) + if (n < 0) { + throw new IllegalArgumentException("No data point has been seen so far") + } val realMean = Array.ofDim[Double](n) val currMean = exposeDArray2(buffer, computeMean) val weightSum = exposeDArray2(buffer, computeWeightSum) @@ -425,6 +450,11 @@ object SummaryBuilderImpl { Vectors.dense(realVariance) } + private def evaluateCount(buffer: Row): Long = { + assert(computeTotalCount >= 0) + buffer.getLong(computeTotalCount) + } + private def evaluateNNZ(buffer: Row): Vector = { null // TODO } @@ -533,7 +563,7 @@ object SummaryBuilderImpl { if (index == -1) { null } else { - buffer.getAs[Array[Double]](index) + buffer.getAs[mutable.WrappedArray[Double]](index).array } } @@ -577,6 +607,10 @@ object SummaryBuilderImpl { } } + private def getForcedIndexFor(m: ComputeMetrics): Int = { + metricsWithOrder.find(p => p._1 == m).get._2 + } + } } diff --git a/mllib/src/test/scala/org/apache/spark/ml/stat/SummarizerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/stat/SummarizerSuite.scala index 685fea8575c9..38e7bcd157d4 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/stat/SummarizerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/stat/SummarizerSuite.scala @@ -17,13 +17,13 @@ package org.apache.spark.ml.stat - -import org.apache.spark.SparkFunSuite -import org.apache.spark.ml.linalg.Vectors +import org.apache.spark.{SparkException, SparkFunSuite} +import org.apache.spark.ml.linalg.{Vector, Vectors} +import org.apache.spark.ml.util.TestingUtils._ import org.apache.spark.mllib.util.MLlibTestSparkContext -import org.apache.spark.mllib.util.TestingUtils._ import org.apache.spark.sql.{DataFrame, Dataset, Row} +// scalastyle:off println class SummarizerSuite extends SparkFunSuite with MLlibTestSparkContext { @@ -33,29 +33,94 @@ class SummarizerSuite extends SparkFunSuite with MLlibTestSparkContext { private val seed = 42 private val eps: Double = 1e-5 + private case class ExpectedMetrics(mean: Vector, min: Vector) + + // The input is expected to be either a sparse vector, a dense vector or an array of doubles + // (which will be converted to a dense vector) + // The expected is the list of all the known metrics. + // + // The tests take an list of input vectors and a list of all the summary values that + // are expected for this input. They currently test against some fixed subset of the + // metrics, but should be made fuzzy in the future. 
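For illustration, a test written against this helper could look like the following sketch (the input vectors and expected values are made up):

// Hypothetical example: two dense vectors with their expected mean and min.
testExample(
  Seq(Array(0.0, 1.0), Array(2.0, 3.0)),
  ExpectedMetrics(mean = Vectors.dense(1.0, 2.0), min = Vectors.dense(0.0, 1.0)))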
+ + private def testExample(input: Seq[Any], exp: ExpectedMetrics): Unit = { + val inputVec: Seq[Vector] = input.map { + case x: Array[Double] => Vectors.dense(x) + case x: Vector => x + case x => throw new Exception(x.toString) + } + val df = sc.parallelize(inputVec).map(Tuple1.apply).toDF("features") + val c = df.col("features") + + compare(df.select(metrics("mean").summary(c), mean(c)), Seq(exp.mean, exp.mean)) + } + private def denseData(input: Seq[Seq[Double]]): DataFrame = { val data = input.map(_.toArray).map(Vectors.dense).map(Tuple1.apply) sc.parallelize(data).toDF("features") } private def compare(df: DataFrame, exp: Seq[Any]): Unit = { - val res = df.first().toSeq + println(s"compare: df=$df") + val coll = df.collect().toSeq + println(s"compare: coll=$coll") + val Seq(row) = coll + val res = row.toSeq + println(s"compare: ress=${res.map(_.getClass)}") val names = df.schema.fieldNames.zipWithIndex.map { case (n, idx) => s"$n ($idx)" } - assert(res.size === exp.size) + assert(res.size === exp.size, (res.size, exp.size)) for (((x1, x2), name) <- res.zip(exp).zip(names)) { - (x1, x2) match { - case (d1: Double, d2: Double) => - assert(Vectors.dense(d1) ~== Vectors.dense(d2) absTol 1e-4, name) - } + compareStructures(x1, x2, name) } } + // Compares structured content. + private def compareStructures(x1: Any, x2: Any, name: String): Unit = (x1, x2) match { + case (d1: Double, d2: Double) => + assert(Vectors.dense(d1) ~== Vectors.dense(d2) absTol 1e-4, name) + case (r1: Row, r2: Row) => + assert(r1.size === r2.size, (r1, r2)) + for ((x1, x2) <- r1.toSeq.zip(r2.toSeq)) { compareStructures(x1, x2, name) } + case (v1: Vector, v2: Vector) => + assert(v1 ~== v2 absTol 1e-4, name) + case (l1: Long, l2: Long) => assert(l1 === l2) + } + test("basic error handling") { val df = denseData(Nil) val c = df.col("features") val res = df.select(metrics("mean").summary(c), mean(c)) - intercept[IllegalArgumentException] { + intercept[SparkException] { compare(res, Seq.empty) } } + + test("no element, working metrics") { + val df = denseData(Nil) + val c = df.col("features") + val res = df.select(metrics("count").summary(c)) + compare(res, Seq(Row(0L))) + } + + test("single element in vector, all metrics") { + testExample(Seq(Array(1.0, 2.0)), ExpectedMetrics( + Vectors.dense(1.0, 2.0), + Vectors.dense(1.0, 2.0) + )) + } + + test("repeated metrics") { + + } + + test("dense vector input") { + + } + + test("sparse vector input") { + + } + + test("merging summarizer when one side has zero mean (SPARK-4355)") { + } } From 673943f334b94e5d1ecd8874cb82bbc875d739c6 Mon Sep 17 00:00:00 2001 From: Timothy Hunter Date: Mon, 6 Mar 2017 16:01:30 -0800 Subject: [PATCH 03/26] last work --- .../org/apache/spark/ml/stat/Summarizer.scala | 51 ++++++++++++------- 1 file changed, 33 insertions(+), 18 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala b/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala index 1de782af9ca1..625e1e2145c9 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala @@ -331,35 +331,38 @@ object SummaryBuilderImpl extends Logging { // instead of failing silently? // Using exceptions is not great for performance in this loop. 
val thisNnz = if (weightSum == null) { 0.0 } else { weightSum(i) } - val otherNnz = if (otherWeightSum == null) { 0.0 } else { otherWeightSum(i) } + val otherNnz = if (otherWeightSum == null) { 0.0 } else { otherWeightSum.getDouble(i) } val totalNnz = thisNnz + otherNnz val totalCnnz = if (numNonzeros == null || numNonzeros == null) { 0.0 } else { - numNonzeros(i) + otherNumNonzeros(i) + numNonzeros(i) + otherNumNonzeros.getDouble(i) } if (totalNnz != 0.0) { - val deltaMean = if (computeMean >= 0) { otherMean(i) - currMean(i) } else { 0.0 } + val deltaMean = if (computeMean >= 0) { + otherMean.getDouble(i) - currMean(i) + } else { 0.0 } // merge mean together if (computeMean >= 0) { currMean(i) += deltaMean * otherNnz / totalNnz } // merge m2n together if (computeM2n >= 0) { - currM2n(i) += otherM2n(i) + deltaMean * deltaMean * thisNnz * otherNnz / totalNnz + val incr = deltaMean * deltaMean * thisNnz * otherNnz / totalNnz + currM2n(i) += otherM2n.getDouble(i) + incr } // merge m2 together if (computeM2 >= 0) { - currM2(i) += otherM2(i) + currM2(i) += otherM2.getDouble(i) } // merge l1 together if (computeL1 >= 0) { - currL1(i) += otherL1(i) + currL1(i) += otherL1.getDouble(i) } // merge max and min if (computeCurrMax >= 0) { - currMax(i) = math.max(currMax(i), otherMax(i)) + currMax(i) = math.max(currMax(i), otherMax.getDouble(i)) } if (computeCurrMin >= 0) { - currMin(i) = math.min(currMin(i), otherMin(i)) + currMin(i) = math.min(currMin(i), otherMin.getDouble(i)) } } if (computeWeightSum >= 0) { @@ -409,7 +412,7 @@ object SummaryBuilderImpl extends Logging { require(totalWeightSum > 0, "Data has zero weight") var i = 0 while (i < n) { - realMean(i) = currMean(i) * (weightSum(i) / totalWeightSum) + realMean(i) = currMean.getDouble(i) * (weightSum.getDouble(i) / totalWeightSum) i += 1 } Vectors.dense(realMean) @@ -442,8 +445,10 @@ object SummaryBuilderImpl extends Logging { var i = 0 val len = currM2n.length while (i < len) { - realVariance(i) = (currM2n(i) + deltaMean(i) * deltaMean(i) * weightSum(i) * - (totalWeightSum - weightSum(i)) / totalWeightSum) / denominator + val m = deltaMean.getDouble(i) + val num = currM2n.getDouble(i) + m * m * weightSum.getDouble(i) + realVariance(i) = (num * + (totalWeightSum - weightSum.getDouble(i)) / totalWeightSum) / denominator i += 1 } } @@ -499,7 +504,7 @@ object SummaryBuilderImpl extends Logging { } // All these may be null pointers. - val localCurrMean = exposeDArray(buffer, computeMean) + val localCurrMean = exposeDArray3(buffer, computeMean) val localCurrM2n = exposeDArray(buffer, computeM2n) val localCurrM2 = exposeDArray(buffer, computeM2) val localCurrL1 = exposeDArray(buffer, computeL1) @@ -520,12 +525,13 @@ object SummaryBuilderImpl extends Logging { } if (localCurrMean != null) { - val prevMean = localCurrMean(index) + val prevMean = localCurrMean.getDouble(index) val diff = value - prevMean - localCurrMean(index) = prevMean + weight * diff / (localWeightSum(index) + weight) + localCurrMean.update(index, prevMean + weight * diff / (localWeightSum(index) + weight)) +// localCurrMean(index) = prevMean + weight * diff / (localWeightSum(index) + weight) if (localCurrM2n != null) { // require: localCurrMean != null. 
- localCurrM2n(index) += weight * (value - localCurrMean(index)) * diff + localCurrM2n(index) += weight * (value - localCurrMean.getDouble(index)) * diff } } @@ -550,12 +556,11 @@ object SummaryBuilderImpl extends Logging { buffer.getInt(computeN) } - - private[this] def exposeDArray2(buffer: Row, index: Int): Array[Double] = { + private[this] def exposeDArray2(buffer: Row, index: Int): Row = { if (index == -1) { null } else { - buffer.getAs[Array[Double]](index) + buffer.getStruct(index) } } @@ -563,10 +568,20 @@ object SummaryBuilderImpl extends Logging { if (index == -1) { null } else { + buffer.getAs[MutableAggregationBuffer](index) buffer.getAs[mutable.WrappedArray[Double]](index).array } } + private[this] def exposeDArray3( + buffer: MutableAggregationBuffer, index: Int): MutableAggregationBuffer = { + if (index == -1) { + null + } else { + buffer.getAs[MutableAggregationBuffer](index) + } + } + // Performs the initialization of the requested fields, if required. private def initialize(buffer: MutableAggregationBuffer, n: Int): Unit = { require(n > 0, s"Cannot set n to $n: must be positive") From 202b672afec127f4e0885cf3a58f4dfc97031fc6 Mon Sep 17 00:00:00 2001 From: Timothy Hunter Date: Mon, 13 Mar 2017 15:48:47 -0700 Subject: [PATCH 04/26] work on using imperative aggregators --- .../org/apache/spark/ml/stat/Summarizer.scala | 133 +++++++++++++++++- 1 file changed, 132 insertions(+), 1 deletion(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala b/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala index 625e1e2145c9..93e0c7429d10 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala @@ -17,11 +17,16 @@ package org.apache.spark.ml.stat -import scala.collection.mutable +import org.apache.spark.SparkException +import scala.collection.mutable import org.apache.spark.annotation.{DeveloperApi, Since} import org.apache.spark.internal.Logging import org.apache.spark.ml.linalg.{SQLDataTypes, Vector, Vectors} +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.{Expression, UnsafeProjection, UnsafeRow} +import org.apache.spark.sql.catalyst.expressions.aggregate.TypedImperativeAggregate +import org.apache.spark.sql.catalyst.util.GenericArrayData import org.apache.spark.sql.{Column, Row} import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction} import org.apache.spark.sql.types._ @@ -211,6 +216,132 @@ object SummaryBuilderImpl extends Logging { ComputeTotalWeightSum, ComputeWeightSquareSum, ComputeWeightSum, ComputeNNZ, ComputeMax, ComputeMin).zipWithIndex + /** + * The buffer that contains all the summary statistics. If the value is null, it is considered + * to be not required. + * + * If it is required but the size of the vectors (n) is not yet know, it is initialized to + * an empty array. + */ + private case class Buffer private ( + var n: Int = -1, + var mean: Array[Double] = null, + var m2n: Array[Double] = null, + var m2: Array[Double] = null, + var l1: Array[Double] = null, + var totalCount: Long = 0, + var totalWeightSum: Double = 0.0, + var weightSquareSum: Array[Double] = null, + var weightSum: Array[Double] = null, + var nnz: Array[Long] = null, + var max: Array[Double] = null, + var min: Array[Double] = null) + + object Buffer { + // Recursive function, but the number of cases is really small. 
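For instance (illustrative only, within this object), requesting the metrics behind a mean would leave every other per-coefficient field null so it is never allocated:

// Sketch: mean and weightSum are marked required (Array.empty), m2n and the rest stay null.
val meanBuffer = Buffer.fromMetrics(Seq(ComputeMean, ComputeWeightSum, ComputeTotalWeightSum))
assert(meanBuffer.mean != null && meanBuffer.weightSum != null && meanBuffer.m2n == null)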
+ def fromMetrics(requested: Seq[ComputeMetrics]): Buffer = { + if (requested.isEmpty) { + new Buffer() + } else { + val b = fromMetrics(requested.tail) + requested.head match { + case ComputeMean => b.copy(mean = Array.empty) + case ComputeM2n => b.copy(m2n = Array.empty) + case ComputeM2 => b.copy(m2 = Array.empty) + case ComputeL1 => b.copy(l1 = Array.empty) + case ComputeWeightSquareSum => b.copy(weightSquareSum = Array.empty) + case ComputeWeightSum => b.copy(weightSum = Array.empty) + case ComputeNNZ => b.copy(nnz = Array.empty) + case ComputeMax => b.copy(max = Array.empty) + case ComputeMin => b.copy(min = Array.empty) + case _ => b // These cases are already being computed + } + } + } + + def bufferSchema: StructType = { + // TODO: there is room for optimization here: we could take as argument the initialized + // start buffer and only attempt to compute the fields we know will be requested. + val fields = metricsWithOrder.map { case (m, _) => + // TODO(thunterdb) this could be merged with the types above so that there is no confusion + // when adding other types later, and with the initialization below. + val tpe = m match { + case ComputeCount => LongType + case ComputeTotalWeightSum => DoubleType + case ComputeWeightSquareSum => DoubleType + case _ => ArrayType(DoubleType, containsNull = false) + } + StructField(m.toString, tpe, nullable = true) + } + val n = StructField("n", IntegerType, nullable = false) + StructType(n +: fields) + } + + def updateInPlace(buffer: Buffer, v: Vector, w: Double): Unit = { + ??? + } + + @throws[SparkException]("When the buffers are not compatible") + def mergeBuffers(buffer: Buffer, other: Buffer): Buffer = { + ??? + } + + + } + + private case class MetricsAggregate( + requested: Seq[Metrics], + startBuffer: Buffer, + child: Expression) + extends TypedImperativeAggregate[Buffer] { + + override def eval(buff: Buffer): Buffer = Buffer + + override def children: Seq[Expression] = child :: Nil + + override def update(buff: Buffer, row: InternalRow): Buffer = { + val v = row.get(0, SQLDataTypes.VectorType).asInstanceOf[Vector] + + val w = row.numFields match { + case 1 => 1.0 + case 2 => row.getDouble(1) + case x => throw new SparkException(s"Expected 1 or 2 fields, got $x fields.") + } + Buffer.updateInPlace(buff, v, w) + buff + } + + override def merge(buff: Buffer, other: Buffer): Buffer = { + Buffer.mergeBuffers(buff, other) + } + + override def nullable: Boolean = false + + // Make a copy of the start buffer so that the current aggregator can be safely copied around. 
+ override def createAggregationBuffer(): Buffer = startBuffer.copy() + + override def serialize(buff: Buffer): Array[Byte] = { + val array = new GenericArrayData(buff.productIterator.toArray) + projection.apply(InternalRow.apply(array)).getBytes + } + + override def deserialize(bytes: Array[Byte]): Buffer = { + val buffer = createAggregationBuffer() + row.pointTo(bytes, bytes.length) + row.getArray(0).foreach(child.dataType, (_, x: Any) => buffer += x) + buffer + } + + + + private lazy val projection = UnsafeProjection.create( + Array[DataType](ArrayType(elementType = child.dataType, containsNull = false))) + private lazy val row = new UnsafeRow(1) + + + } + + private class MetricsUDAF( requested: Seq[Metrics], From a983284cfeddabd017792e3991cf99a7d3ab1e16 Mon Sep 17 00:00:00 2001 From: Timothy Hunter Date: Fri, 17 Mar 2017 17:14:40 -0700 Subject: [PATCH 05/26] more work on summarizer --- .../org/apache/spark/ml/stat/Summarizer.scala | 182 +++++++++++++++++- 1 file changed, 174 insertions(+), 8 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala b/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala index 93e0c7429d10..728faab71b37 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala @@ -17,17 +17,17 @@ package org.apache.spark.ml.stat -import org.apache.spark.SparkException +import breeze.linalg.{DenseVector => BDV, SparseVector => BSV, Vector => BV} -import scala.collection.mutable +import org.apache.spark.SparkException import org.apache.spark.annotation.{DeveloperApi, Since} import org.apache.spark.internal.Logging -import org.apache.spark.ml.linalg.{SQLDataTypes, Vector, Vectors} +import org.apache.spark.ml.linalg.{BLAS, SQLDataTypes, Vector, Vectors} +import org.apache.spark.sql.{Column, Row} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Expression, UnsafeProjection, UnsafeRow} import org.apache.spark.sql.catalyst.expressions.aggregate.TypedImperativeAggregate import org.apache.spark.sql.catalyst.util.GenericArrayData -import org.apache.spark.sql.{Column, Row} import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction} import org.apache.spark.sql.types._ @@ -231,7 +231,7 @@ object SummaryBuilderImpl extends Logging { var l1: Array[Double] = null, var totalCount: Long = 0, var totalWeightSum: Double = 0.0, - var weightSquareSum: Array[Double] = null, + var totalWeightSquareSum: Double = 0.0, var weightSum: Array[Double] = null, var nnz: Array[Long] = null, var max: Array[Double] = null, @@ -249,7 +249,6 @@ object SummaryBuilderImpl extends Logging { case ComputeM2n => b.copy(m2n = Array.empty) case ComputeM2 => b.copy(m2 = Array.empty) case ComputeL1 => b.copy(l1 = Array.empty) - case ComputeWeightSquareSum => b.copy(weightSquareSum = Array.empty) case ComputeWeightSum => b.copy(weightSum = Array.empty) case ComputeNNZ => b.copy(nnz = Array.empty) case ComputeMax => b.copy(max = Array.empty) @@ -278,15 +277,182 @@ object SummaryBuilderImpl extends Logging { } def updateInPlace(buffer: Buffer, v: Vector, w: Double): Unit = { - ??? + val startN = buffer.n + if (startN == -1) { + buffer.n = v.size + } else { + require(startN == v.size, + s"Trying to insert a vector of size $v into a buffer that " + + s"has been sized with $startN") + } + val n = buffer.n + assert(n > 0, n) + // Always update the following fields. 
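The per-coefficient update done further down in this method is the weighted online (Welford-style) update. A scalar sketch (hypothetical helper, not part of the patch), with previously accumulated weight s, running mean m and centered second moment m2n:

//   diff = x - m;  m' = m + w * diff / (s + w);  m2n' = m2n + w * (x - m') * diff
def updatePartial(s: Double, m: Double, m2n: Double, x: Double, w: Double): (Double, Double, Double) = {
  val diff = x - m
  val newMean = m + w * diff / (s + w)
  (s + w, newMean, m2n + w * (x - newMean) * diff)
}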
+ buffer.totalWeightSum += w + buffer.totalCount += 1 + buffer.totalWeightSquareSum += w * w + // All the fields that we compute on demand: + // TODO: the most common case is dense vectors. In that case we should + // directly use BLAS instructions instead of iterating through a scala iterator. + v.foreachActive { (index, value) => + if (value != 0.0) { + if (buffer.max != null && buffer.max(index) < value) { + buffer.max(index) = value + } + if (buffer.min != null && buffer.min(index) > value) { + buffer.min(index) = value + } + + if (buffer.mean != null) { + val prevMean = buffer.mean(index) + val diff = value - prevMean + buffer.mean(index) += w * diff / (buffer.weightSum(index) + w) + if (buffer.m2n != null) { + buffer.m2n(index) += w * (value - buffer.mean(index)) * diff + } + } + if (buffer.m2 != null) { + buffer.m2(index) += w * value * value + } + if (buffer.l1 != null) { + buffer.l1(index) += w * math.abs(value) + } + if (buffer.weightSum != null) { + buffer.weightSum(index) += w + } + if (buffer.nnz != null) { + buffer.nnz(index) += 1 + } + } + } } @throws[SparkException]("When the buffers are not compatible") def mergeBuffers(buffer: Buffer, other: Buffer): Buffer = { - ??? + if (buffer.n == -1) { + // buffer is not initialized. + if (other.n == -1) { + // Both are not initialized. + buffer + } else { + // other is initialized + other + } + } else { + // Buffer is initialized. + if (other.n == -1) { + buffer + } else { + mergeBuffers0(buffer, other) + buffer + } + } + } + + private def axpy(a: Double, x: Array[Double], y: Array[Double]): Unit = { + BLAS.axpy(a, Vectors.dense(x), Vectors.dense(y)) + } + + private def b(x: Array[Double]): BV[Double] = Vectors.dense(x).asBreeze + + private def bl(x: Array[Long]): BV[Long] = BV.apply(x) + + private def maxInPlace(x: Array[Double], y: Array[Double]): Unit = { + var i = 0 + while(i < x.length) { + // Note: do not use conditions, it is wrong when handling NaNs. + x(i) = Math.max(x(i), y(i)) + i += 1 + } + } + + private def minInPlace(x: Array[Double], y: Array[Double]): Unit = { + var i = 0 + while(i < x.length) { + // Note: do not use conditions, it is wrong when handling NaNs. + x(i) = Math.min(x(i), y(i)) + i += 1 + } + } + + + /** + * Merges other into buffer. + */ + private def mergeBuffers0(buffer: Buffer, other: Buffer): Unit = { + // Each buffer needs to be properly initialized. + require(buffer.n > 0 && other.n > 0, (buffer.n, other.n)) + require(buffer.n == other.n, (buffer.n, other.n)) + // Mandatory scalar values + buffer.totalWeightSquareSum += other.totalWeightSquareSum + buffer.totalWeightSum += other.totalWeightSum + buffer.totalCount += buffer.totalCount + // Keep the original weight sums. + val weightSum1 = if (buffer.weightSum == null) null else { buffer.weightSum.clone() } + val weightSum2 = if (other.weightSum == null) null else { other.weightSum.clone() } + + val weightSum = if (weightSum1 == null) null else { + require(weightSum2 != null) + val arr: Array[Double] = Array.ofDim(buffer.n) + b(arr) :+= b(weightSum1) :- b(weightSum1) + arr + } + + + // Since the operations are dense, we can directly use BLAS calls here. 
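A minimal illustration of the in-place breeze operators relied on below (assumes only the breeze dependency already imported at the top of this file; the vectors are made up):

import breeze.linalg.{DenseVector => BDV}

val x = BDV(1.0, 2.0)
val y = BDV(3.0, 4.0)
// In-place element-wise update: y := y + (x :* 2.0), so y becomes DenseVector(5.0, 8.0).
y :+= x :* 2.0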
+ val deltaMean: Array[Double] = if (buffer.mean != null) { + require(other.mean != null) + val arr: Array[Double] = Array.ofDim(buffer.n) + b(arr) :+= b(other.mean) :- b(buffer.mean) + arr + } else { null } + + if (buffer.mean != null) { + require(other.mean != null) + require(weightSum != null) + b(buffer.mean) :+= b(deltaMean) :* (b(weightSum2) / b(weightSum)) + } + + if (buffer.m2n != null) { + require(other.m2n != null) + val w = (b(weightSum1) :* b(weightSum2)) :/ b(weightSum) + b(buffer.m2n) :+= b(other.m2n) :+ (b(deltaMean) :* b(deltaMean)) :* w + } + + if (buffer.m2 != null) { + require(other.m2 != null) + b(buffer.m2) :+= b(other.m2) + } + + if (buffer.l1 != null) { + require(other.l1 != null) + b(buffer.l1) :+= b(other.l1) + } + + if (buffer.max != null) { + require(other.max != null) + maxInPlace(buffer.max, other.max) + } + + if (buffer.min != null) { + require(other.min != null) + minInPlace(buffer.min, other.min) + } + + if (buffer.nnz != null) { + require(other.nnz != null) + bl(buffer.nnz) :+= bl(other.nnz) + } + + if (buffer.weightSum != null) { + require(other.weightSum != null) + b(buffer.weightSum) :+= b(other.weightSum) + } } + + } private case class MetricsAggregate( From 647a4fecb17d478c3c8cd68d40f2a9456eb10c66 Mon Sep 17 00:00:00 2001 From: Timothy Hunter Date: Tue, 21 Mar 2017 10:47:30 -0700 Subject: [PATCH 06/26] work --- .../org/apache/spark/ml/stat/Summarizer.scala | 190 +++++++++++++++--- 1 file changed, 159 insertions(+), 31 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala b/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala index 728faab71b37..abd5fda96e12 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala @@ -18,6 +18,7 @@ package org.apache.spark.ml.stat import breeze.linalg.{DenseVector => BDV, SparseVector => BSV, Vector => BV} +import breeze.numerics import org.apache.spark.SparkException import org.apache.spark.annotation.{DeveloperApi, Since} @@ -28,7 +29,7 @@ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Expression, UnsafeProjection, UnsafeRow} import org.apache.spark.sql.catalyst.expressions.aggregate.TypedImperativeAggregate import org.apache.spark.sql.catalyst.util.GenericArrayData -import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction} +import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction, UserDefinedFunction} import org.apache.spark.sql.types._ // scalastyle:off println @@ -224,18 +225,18 @@ object SummaryBuilderImpl extends Logging { * an empty array. 
*/ private case class Buffer private ( - var n: Int = -1, - var mean: Array[Double] = null, - var m2n: Array[Double] = null, - var m2: Array[Double] = null, - var l1: Array[Double] = null, - var totalCount: Long = 0, - var totalWeightSum: Double = 0.0, - var totalWeightSquareSum: Double = 0.0, - var weightSum: Array[Double] = null, - var nnz: Array[Long] = null, - var max: Array[Double] = null, - var min: Array[Double] = null) + var n: Int = -1, // 0 + var mean: Array[Double] = null, // 1 + var m2n: Array[Double] = null, // 2 + var m2: Array[Double] = null, // 3 + var l1: Array[Double] = null, // 4 + var totalCount: Long = 0, // 5 + var totalWeightSum: Double = 0.0, // 6 + var totalWeightSquareSum: Double = 0.0, // 7 + var weightSum: Array[Double] = null, // 8 + var nnz: Array[Long] = null, // 9 + var max: Array[Double] = null, // 10 + var min: Array[Double] = null) // 11 object Buffer { // Recursive function, but the number of cases is really small. @@ -327,6 +328,9 @@ object SummaryBuilderImpl extends Logging { } } + /** + * Updates 'buffer' with the content of 'other', and returns 'buffer'. + */ @throws[SparkException]("When the buffers are not compatible") def mergeBuffers(buffer: Buffer, other: Buffer): Buffer = { if (buffer.n == -1) { @@ -349,8 +353,120 @@ object SummaryBuilderImpl extends Logging { } } - private def axpy(a: Double, x: Array[Double], y: Array[Double]): Unit = { - BLAS.axpy(a, Vectors.dense(x), Vectors.dense(y)) + /** + * Reads a buffer from a serialized form, using the row object as an assistant. + */ + def read(bytes: Array[Byte], row: UnsafeRow): Buffer = { + row.pointTo(bytes, bytes.length) + new Buffer( + n = row.getInt(0), + mean = nullableArrayD(row, 1), + m2n = nullableArrayD(row, 2), + m2 = nullableArrayD(row, 3), + l1 = nullableArrayD(row, 4), + totalCount = row.getLong(5), + totalWeightSum = row.getDouble(6), + totalWeightSquareSum = row.getDouble(7), + weightSum = nullableArrayD(row, 8), + nnz = nullableArrayL(row, 9), + max = nullableArrayD(row, 10), + min = nullableArrayD(row, 11) + ) + } + + + def write(buffer: Buffer): Array[Byte] = { + val ir = InternalRow.apply( + buffer.n, + buffer.mean, + buffer.m2n, + buffer.m2, + buffer.l1, + buffer.totalCount, + buffer.totalWeightSum, + buffer.totalWeightSquareSum, + buffer.weightSum, + buffer.nnz, + buffer.max, + buffer.min + ) + projection.apply(ir).getBytes + } + + def mean(buffer: Buffer): Array[Double] = { + require(buffer.totalWeightSum > 0) + require(buffer.mean != null) + require(buffer.weightSum != null) + val res = b(buffer.mean) :* b(buffer.weightSum) :/ buffer.totalWeightSum + res.toArray + } + + def variance(buffer: Buffer): Array[Double] = { + import buffer._ + require(n >= 0) + require(totalWeightSum > 0) + require(totalWeightSquareSum > 0) + require(mean != null) + require(m2n != null) + require(weightSum != null) + val denom = totalWeightSum - (totalWeightSquareSum / totalWeightSum) + if (denom > 0.0) { + val normWs = b(weightSum) :/ totalWeightSum + val x = b(mean) :* b(mean) :* b(weightSum) :* (1.0 - normWs) + val res = (b(m2n) :+ x) :/ denom + res.toArray + } else { + Array.ofDim(n) // Return 0.0 instead. 
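+        // For reference, the non-degenerate branch above computes, per coordinate i, with
+        // W = totalWeightSum, W2 = totalWeightSquareSum and ws(i) the weight sum of the
+        // non-zero samples seen at coordinate i:
+        //   denom  = W - W2 / W
+        //   var(i) = (m2n(i) + mean(i) * mean(i) * ws(i) * (1 - ws(i) / W)) / denom
+        // mirroring the unbiased weighted variance of mllib's MultivariateOnlineSummarizer.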
+ } + } + + def totalCount(buffer: Buffer): Long = buffer.totalCount + + def nnz(buffer: Buffer): Array[Long] = { + require(buffer.nnz != null) + buffer.nnz + } + + def max(buffer: Buffer): Array[Double] = { + require(buffer.max != null) + buffer.max + } + + def min(buffer: Buffer): Array[Double] = { + require(buffer.min != null) + buffer.min + } + + def l2(buffer: Buffer): Array[Double] = { + import buffer._ + require(totalWeightSum > 0.0) + require(m2 != null) + numerics.sqrt(b(m2)).toArray + } + + def l1(buffer: Buffer): Array[Double] = { + require(buffer.l1 != null) + buffer.l1 + } + + private[this] lazy val projection = UnsafeProjection.create(bufferSchema) + + // Returns the array at a given index, or null if the array is null. + private def nullableArrayD(row: UnsafeRow, ordinal: Int): Array[Double] = { + if (row.isNullAt(ordinal)) { + null + } else { + row.getArray(ordinal).toDoubleArray + } + } + + // Returns the array at a given index, or null if the array is null. + private def nullableArrayL(row: UnsafeRow, ordinal: Int): Array[Long] = { + if (row.isNullAt(ordinal)) { + null + } else { + row.getArray(ordinal).toLongArray + } } private def b(x: Array[Double]): BV[Double] = Vectors.dense(x).asBreeze @@ -449,19 +565,31 @@ object SummaryBuilderImpl extends Logging { b(buffer.weightSum) :+= b(other.weightSum) } } - - - - } private case class MetricsAggregate( requested: Seq[Metrics], startBuffer: Buffer, - child: Expression) + child: Expression, + mutableAggBufferOffset: Int, + inputAggBufferOffset: Int) extends TypedImperativeAggregate[Buffer] { - override def eval(buff: Buffer): Buffer = Buffer + private lazy val row = new UnsafeRow(1) + + override def eval(buff: Buffer): Row = { + val metrics = requested.map({ + case Mean => Buffer.mean(buff) + case Variance => Buffer.variance(buff) + case Count => Buffer.totalCount(buff) + case NumNonZeros => Buffer.nnz(buff) + case Max => Buffer.max(buff) + case Min => Buffer.min(buff) + case NormL2 => Buffer.l2(buff) + case NormL1 => Buffer.l1(buff) + }) + Row(metrics: _*) + } override def children: Seq[Expression] = child :: Nil @@ -487,24 +615,24 @@ object SummaryBuilderImpl extends Logging { override def createAggregationBuffer(): Buffer = startBuffer.copy() override def serialize(buff: Buffer): Array[Byte] = { - val array = new GenericArrayData(buff.productIterator.toArray) - projection.apply(InternalRow.apply(array)).getBytes + Buffer.write(buff) } override def deserialize(bytes: Array[Byte]): Buffer = { - val buffer = createAggregationBuffer() - row.pointTo(bytes, bytes.length) - row.getArray(0).foreach(child.dataType, (_, x: Any) => buffer += x) - buffer + Buffer.read(bytes, row) } + override def withNewMutableAggBufferOffset(newMutableAggBufferOffset: Int): MetricsAggregate = { + copy(mutableAggBufferOffset = newMutableAggBufferOffset) + } + override def withNewInputAggBufferOffset(newInputAggBufferOffset: Int): MetricsAggregate = { + copy(inputAggBufferOffset = newInputAggBufferOffset) + } - private lazy val projection = UnsafeProjection.create( - Array[DataType](ArrayType(elementType = child.dataType, containsNull = false))) - private lazy val row = new UnsafeRow(1) - + override def dataType: DataType = Buffer.bufferSchema + override def prettyName: String = "aggregate_metrics" } From 3c4bef772a3cbc759e43223af658a357c5ca6bc2 Mon Sep 17 00:00:00 2001 From: Timothy Hunter Date: Tue, 21 Mar 2017 11:54:16 -0700 Subject: [PATCH 07/26] changes --- .../org/apache/spark/ml/stat/Summarizer.scala | 475 +----------------- 
.../spark/ml/stat/SummarizerSuite.scala | 13 + 2 files changed, 36 insertions(+), 452 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala b/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala index abd5fda96e12..018fb4cbc092 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala @@ -24,10 +24,11 @@ import org.apache.spark.SparkException import org.apache.spark.annotation.{DeveloperApi, Since} import org.apache.spark.internal.Logging import org.apache.spark.ml.linalg.{BLAS, SQLDataTypes, Vector, Vectors} +import org.apache.spark.ml.stat.SummaryBuilderImpl.MetricsAggregate import org.apache.spark.sql.{Column, Row} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Expression, UnsafeProjection, UnsafeRow} -import org.apache.spark.sql.catalyst.expressions.aggregate.TypedImperativeAggregate +import org.apache.spark.sql.catalyst.expressions.aggregate.{AggregateExpression, Final, TypedImperativeAggregate} import org.apache.spark.sql.catalyst.util.GenericArrayData import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction, UserDefinedFunction} import org.apache.spark.sql.types._ @@ -46,19 +47,10 @@ abstract class SummaryBuilder { * Returns an aggregate object that contains the summary of the column with the requested metrics. * @param column a column that contains Vector object. * @return an aggregate column that contains the statistics. The exact content of this - * structure is determined during the creation of the builder. It is also provided by - * [[summaryStructure]]. + * structure is determined during the creation of the builder. */ @Since("2.2.0") def summary(column: Column): Column - - /** - * The structure of the summary that will be returned by this builder. - * @return the structure of the aggregate data returned by this builder. - */ - @Since("2.2.0") - @DeveloperApi - private[spark] def summaryStructure: StructType } /** @@ -107,21 +99,26 @@ object Summarizer extends Logging { @Since("2.2.0") def metrics(firstMetric: String, metrics: String*): SummaryBuilder = { val (typedMetrics, computeMetrics) = getRelevantMetrics(Seq(firstMetric) ++ metrics) - new SummaryBuilderImpl( - SummaryBuilderImpl.buildUdafForMetrics(typedMetrics, computeMetrics), - SummaryBuilderImpl.structureForMetrics(typedMetrics)) + new SummaryBuilderImpl(typedMetrics, computeMetrics) } def mean(col: Column): Column = metrics("mean").summary(col) } private[ml] class SummaryBuilderImpl( - udaf: UserDefinedAggregateFunction, - structType: StructType) extends SummaryBuilder { - - override def summary(column: Column): Column = udaf.apply(column) - - override val summaryStructure: StructType = structType + requestedMetrics: Seq[SummaryBuilderImpl.Metrics], + requestedCompMetrics: Seq[SummaryBuilderImpl.ComputeMetrics]) extends SummaryBuilder { + + override def summary(column: Column): Column = { + val start = SummaryBuilderImpl.Buffer.fromMetrics(requestedCompMetrics) + val agg = SummaryBuilderImpl.MetricsAggregate( + requestedMetrics, + start, + column.expr, + mutableAggBufferOffset = 0, + inputAggBufferOffset = 0) + new Column(AggregateExpression(agg, mode = Final, isDistinct = false)) + } } private[ml] @@ -148,7 +145,8 @@ object SummaryBuilderImpl extends Logging { def structureForMetrics(metrics: Seq[Metrics]): StructType = { val fields = metrics.map { m => // All types are vectors, except for one. 
- val tpe = if (m == Count) { LongType } else { SQLDataTypes.VectorType } +// val tpe = if (m == Count) { LongType } else { SQLDataTypes.VectorType } + val tpe = if (m == Count) { LongType } else { ArrayType(DoubleType, containsNull = false) } // We know the metric is part of the list. val (name, _, _) = allMetrics.find(tup => tup._2 == m).get StructField(name, tpe, nullable = false) @@ -156,11 +154,6 @@ object SummaryBuilderImpl extends Logging { StructType(fields) } - def buildUdafForMetrics( - metrics: Seq[Metrics], - computeMetrics: Seq[ComputeMetrics]): UserDefinedAggregateFunction = { - new MetricsUDAF(metrics, computeMetrics) - } /** * All the metrics that can be currently computed by Spark for vectors. @@ -224,7 +217,7 @@ object SummaryBuilderImpl extends Logging { * If it is required but the size of the vectors (n) is not yet know, it is initialized to * an empty array. */ - private case class Buffer private ( + case class Buffer private ( var n: Int = -1, // 0 var mean: Array[Double] = null, // 1 var m2n: Array[Double] = null, // 2 @@ -260,11 +253,7 @@ object SummaryBuilderImpl extends Logging { } def bufferSchema: StructType = { - // TODO: there is room for optimization here: we could take as argument the initialized - // start buffer and only attempt to compute the fields we know will be requested. val fields = metricsWithOrder.map { case (m, _) => - // TODO(thunterdb) this could be merged with the types above so that there is no confusion - // when adding other types later, and with the initialization below. val tpe = m match { case ComputeCount => LongType case ComputeTotalWeightSum => DoubleType @@ -406,13 +395,13 @@ object SummaryBuilderImpl extends Logging { require(n >= 0) require(totalWeightSum > 0) require(totalWeightSquareSum > 0) - require(mean != null) + require(buffer.mean != null) require(m2n != null) require(weightSum != null) val denom = totalWeightSum - (totalWeightSquareSum / totalWeightSum) if (denom > 0.0) { val normWs = b(weightSum) :/ totalWeightSum - val x = b(mean) :* b(mean) :* b(weightSum) :* (1.0 - normWs) + val x = b(buffer.mean) :* b(buffer.mean) :* b(weightSum) :* (- normWs :+ 1.0) val res = (b(m2n) :+ x) :/ denom res.toArray } else { @@ -630,427 +619,9 @@ object SummaryBuilderImpl extends Logging { copy(inputAggBufferOffset = newInputAggBufferOffset) } - override def dataType: DataType = Buffer.bufferSchema + override lazy val dataType: DataType = structureForMetrics(requested) override def prettyName: String = "aggregate_metrics" } - - - private class MetricsUDAF( - requested: Seq[Metrics], - metrics: Seq[ComputeMetrics]) extends UserDefinedAggregateFunction { - - // All of these are the indexes of the value in the temporary buffer, or -1 if these values are - // not requested. - // All these values are unrolled so that we do not pay too steep a penalty for the extra level - // of indirection associated with selectively computing some metrics. - private[this] val computeMean = getIndexFor(ComputeMean) - private[this] val computeM2n = getIndexFor(ComputeM2n) - private[this] val computeM2 = getIndexFor(ComputeM2) - private[this] val computeL1 = getIndexFor(ComputeL1) - private[this] val computeTotalCount = getForcedIndexFor(ComputeCount) // Always compute it. - private[this] val computeTotalWeightSum = getForcedIndexFor(ComputeTotalWeightSum) // Always. 
- private[this] val computeWeightSquareSum = getIndexFor(ComputeWeightSquareSum) - private[this] val computeWeightSum = getIndexFor(ComputeWeightSum) - private[this] val computeNNZ = getIndexFor(ComputeNNZ) - private[this] val computeCurrMax = getIndexFor(ComputeMax) - private[this] val computeCurrMin = getIndexFor(ComputeMin) - // Always computed, and put at the end. - private[this] val computeN = metricsWithOrder.size - - override def inputSchema: StructType = { - StructType(Seq(StructField("features", SQLDataTypes.VectorType))) - } - - // The schema for the buffer contains all the structures, but a subset of them may be null. - override def bufferSchema: StructType = { - val fields = metricsWithOrder.map { case (m, _) => - // TODO(thunterdb) this could be merged with the types above so that there is no confusion - // when adding other types later, and with the initialization below. - val tpe = m match { - case ComputeCount => LongType - case ComputeTotalWeightSum => DoubleType - case ComputeWeightSquareSum => DoubleType - case _ => ArrayType(DoubleType, containsNull = false) - } - StructField(m.toString, tpe, nullable = true) - } :+ StructField("n", IntegerType, nullable = false) - StructType(fields) - } - - override def dataType: DataType = { - structureForMetrics(requested) - } - - override def deterministic: Boolean = true - - // The initialization does not do anything because we do not know the size of the vectors yet. - override def initialize(buffer: MutableAggregationBuffer): Unit = { - // Set n to -1 to indicate that we do not know the size yet. - println("initialize: size=" + buffer.size + " computeN = " + computeN) - // Thes metrics are always computed. - buffer.update(computeN, -1) - buffer.update(computeTotalCount, 0L) - buffer.update(computeTotalWeightSum, 0.0) - } - - override def update(buffer: MutableAggregationBuffer, input: Row): Unit = { - val v = input.getAs[Vector](0) - // TODO: add weights later. - update(buffer, v, 1.0) - } - - override def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = { - // The metrics that are used to determine if the buffers are used are always computed. - val thisUsed = (buffer1.getDouble(computeTotalWeightSum) > 0.0 - || buffer1.getLong(computeTotalCount) > 0L) - val otherUsed = (buffer2.getDouble(computeTotalWeightSum) > 0.0 - || buffer2.getLong(computeTotalCount) > 0L) - if (thisUsed && otherUsed) { - // Merge onto this one. - val n1 = getN(buffer1) - val n2 = getN(buffer2) - require(n1 == n2, s"Dimensions mismatch between summarizers: expected $n1 but got $n2") - require(n1 > 0, s"Expected the summarizers to be initialized") - // The scalar values. - if (computeTotalCount >= 0) { - buffer1.update(computeTotalCount, - buffer1.getLong(computeTotalCount) + buffer2.getLong(computeTotalCount)) - } - if (computeTotalWeightSum >= 0) { - buffer1.update(computeTotalWeightSum, - buffer1.getDouble(computeTotalWeightSum) + buffer2.getDouble(computeTotalWeightSum)) - } - if (computeWeightSquareSum >= 0) { - buffer1.update(computeWeightSquareSum, - buffer1.getDouble(computeWeightSquareSum) + buffer2.getDouble(computeWeightSquareSum)) - } - // The vector values. 
- val currMean = exposeDArray(buffer1, computeMean) - val currM2n = exposeDArray(buffer1, computeM2n) - val currM2 = exposeDArray(buffer1, computeM2) - val currL1 = exposeDArray(buffer1, computeL1) - val weightSum = exposeDArray(buffer1, computeWeightSum) - val numNonzeros = exposeDArray(buffer1, computeNNZ) - val currMax = exposeDArray(buffer1, computeCurrMax) - val currMin = exposeDArray(buffer1, computeCurrMin) - - val otherMean = exposeDArray2(buffer2, computeMean) - val otherM2n = exposeDArray2(buffer2, computeM2n) - val otherM2 = exposeDArray2(buffer2, computeM2) - val otherL1 = exposeDArray2(buffer2, computeL1) - val otherWeightSum = exposeDArray2(buffer2, computeWeightSum) - val otherNumNonzeros = exposeDArray2(buffer2, computeNNZ) - val otherMax = exposeDArray2(buffer2, computeCurrMax) - val otherMin = exposeDArray2(buffer2, computeCurrMin) - - - var i = 0 - while (i < n1) { - // This case is written do maximize conformance with the current code. - // For more proper implementation, the differnt loops should be disentangled. - // The case of 0.0 values being used should not happen if the proper selection - // of metrics is done. - // TODO: should it be NaN so that any computation with it will indicate an error, - // instead of failing silently? - // Using exceptions is not great for performance in this loop. - val thisNnz = if (weightSum == null) { 0.0 } else { weightSum(i) } - val otherNnz = if (otherWeightSum == null) { 0.0 } else { otherWeightSum.getDouble(i) } - val totalNnz = thisNnz + otherNnz - val totalCnnz = if (numNonzeros == null || numNonzeros == null) { 0.0 } else { - numNonzeros(i) + otherNumNonzeros.getDouble(i) - } - if (totalNnz != 0.0) { - val deltaMean = if (computeMean >= 0) { - otherMean.getDouble(i) - currMean(i) - } else { 0.0 } - // merge mean together - if (computeMean >= 0) { - currMean(i) += deltaMean * otherNnz / totalNnz - } - // merge m2n together - if (computeM2n >= 0) { - val incr = deltaMean * deltaMean * thisNnz * otherNnz / totalNnz - currM2n(i) += otherM2n.getDouble(i) + incr - } - // merge m2 together - if (computeM2 >= 0) { - currM2(i) += otherM2.getDouble(i) - } - // merge l1 together - if (computeL1 >= 0) { - currL1(i) += otherL1.getDouble(i) - } - // merge max and min - if (computeCurrMax >= 0) { - currMax(i) = math.max(currMax(i), otherMax.getDouble(i)) - } - if (computeCurrMin >= 0) { - currMin(i) = math.min(currMin(i), otherMin.getDouble(i)) - } - } - if (computeWeightSum >= 0) { - weightSum(i) = totalNnz - } - if (computeNNZ >= 0) { - numNonzeros(i) = totalCnnz - } - - i += 1 - } - } else if (otherUsed) { - // Copy buffer1 onto buffer2. - // TODO - } - } - - // This function packs the requested values, in order. - override def evaluate(buffer: Row): Row = { - val values = requested.map { - case Mean => evaluateMean(buffer) - case Variance => evaluateVariance(buffer) - case Count => evaluateCount(buffer) - case NumNonZeros => evaluateNNZ(buffer) - case Max => evaluateMax(buffer) - case Min => evaluateMin(buffer) - case NormL1 => evaluateNormL1(buffer) - case NormL2 => evaluateNormL2(buffer) - } - Row(values: _*) - } - - private def evaluateMean(buffer: Row): Vector = { - assert(computeMean >= 0) // This is a programming error. 
- assert(computeWeightSum >= 0) - assert(computeTotalWeightSum >= 0) - val n = getN(buffer) - if (n < 0) { - throw new IllegalArgumentException("No data point has been seen so far") - } - val realMean = Array.ofDim[Double](n) - val currMean = exposeDArray2(buffer, computeMean) - val weightSum = exposeDArray2(buffer, computeWeightSum) - val totalWeightSum = buffer.getDouble(computeTotalWeightSum) - // TODO: should this be turned into NaN instead? That would be more consistent than - // an error. - require(totalWeightSum > 0, "Data has zero weight") - var i = 0 - while (i < n) { - realMean(i) = currMean.getDouble(i) * (weightSum.getDouble(i) / totalWeightSum) - i += 1 - } - Vectors.dense(realMean) - } - - private def evaluateVariance(buffer: Row): Vector = { - assert(computeTotalWeightSum >= 0) - assert(computeWeightSquareSum >= 0) - assert(computeWeightSum >= 0) - assert(computeM2 >= 0) - assert(computeM2n >= 0) - - val totalWeightSum = buffer.getDouble(computeTotalWeightSum) - require(totalWeightSum > 0, s"Nothing has been added to this summarizer.") - - val n = getN(buffer) - assert(n >= 0) - - val weightSquareSum = buffer.getDouble(computeWeightSquareSum) - val realVariance = Array.ofDim[Double](n) - - val currMean = exposeDArray2(buffer, computeMean) - val currM2n = exposeDArray2(buffer, computeM2n) - val weightSum = exposeDArray2(buffer, computeWeightSum) - val denominator = totalWeightSum - (weightSquareSum / totalWeightSum) - - // Sample variance is computed, if the denominator is less than 0, the variance is just 0. - if (denominator > 0.0) { - val deltaMean = currMean - var i = 0 - val len = currM2n.length - while (i < len) { - val m = deltaMean.getDouble(i) - val num = currM2n.getDouble(i) + m * m * weightSum.getDouble(i) - realVariance(i) = (num * - (totalWeightSum - weightSum.getDouble(i)) / totalWeightSum) / denominator - i += 1 - } - } - Vectors.dense(realVariance) - } - - private def evaluateCount(buffer: Row): Long = { - assert(computeTotalCount >= 0) - buffer.getLong(computeTotalCount) - } - - private def evaluateNNZ(buffer: Row): Vector = { - null // TODO - } - - private def evaluateMax(buffer: Row): Vector = { - null // TODO - } - - private def evaluateMin(buffer: Row): Vector = { - null // TODO - } - - private def evaluateNormL1(buffer: Row): Vector = { - null // TODO - } - - private def evaluateNormL2(buffer: Row): Vector = { - null // TODO - } - - private def update(buffer: MutableAggregationBuffer, instance: Vector, weight: Double): Unit = { - val startN = getN(buffer) - if (startN == -1) { - initialize(buffer, instance.size) - } else { - require(startN == instance.size, - s"Trying to insert a vector of size $instance into a buffer that " + - s"has been sized with $startN") - } - // It should be initialized. - assert(getN(buffer) > 0) - // Update the scalars that do not require going through the vector first. - if (computeTotalWeightSum >= 0) { - buffer.update(computeTotalWeightSum, buffer.getDouble(computeTotalWeightSum) + weight) - } - if (computeWeightSquareSum >= 0) { - buffer.update(computeWeightSquareSum, - buffer.getDouble(computeWeightSquareSum) + weight * weight) - } - if (computeTotalCount >= 0) { - buffer.update(computeTotalCount, buffer.getLong(computeTotalCount) + 1) - } - - // All these may be null pointers. 
- val localCurrMean = exposeDArray3(buffer, computeMean) - val localCurrM2n = exposeDArray(buffer, computeM2n) - val localCurrM2 = exposeDArray(buffer, computeM2) - val localCurrL1 = exposeDArray(buffer, computeL1) - val localWeightSum = exposeDArray(buffer, computeWeightSum) - val localNumNonzeros = exposeDArray(buffer, computeNNZ) - val localCurrMax = exposeDArray(buffer, computeCurrMax) - val localCurrMin = exposeDArray(buffer, computeCurrMin) - - // Update the requested vector values. - // All these changes are in place. There is no need to make copies. - instance.foreachActive { (index, value) => - if (value != 0.0) { - if (localCurrMax != null && localCurrMax(index) < value) { - localCurrMax(index) = value - } - if (localCurrMin != null && localCurrMin(index) > value) { - localCurrMin(index) = value - } - - if (localCurrMean != null) { - val prevMean = localCurrMean.getDouble(index) - val diff = value - prevMean - localCurrMean.update(index, prevMean + weight * diff / (localWeightSum(index) + weight)) -// localCurrMean(index) = prevMean + weight * diff / (localWeightSum(index) + weight) - if (localCurrM2n != null) { - // require: localCurrMean != null. - localCurrM2n(index) += weight * (value - localCurrMean.getDouble(index)) * diff - } - } - - if (localCurrM2 != null) { - localCurrM2(index) += weight * value * value - } - if (localCurrL1 != null) { - localCurrL1(index) += weight * math.abs(value) - } - if (localWeightSum != null) { - localWeightSum(index) += weight - } - if (localNumNonzeros != null) { - localNumNonzeros(index) += 1 - } - } - } - } - - // the current vector size, or -1 if not available yet. - private def getN(buffer: Row): Int = { - buffer.getInt(computeN) - } - - private[this] def exposeDArray2(buffer: Row, index: Int): Row = { - if (index == -1) { - null - } else { - buffer.getStruct(index) - } - } - - private[this] def exposeDArray(buffer: MutableAggregationBuffer, index: Int): Array[Double] = { - if (index == -1) { - null - } else { - buffer.getAs[MutableAggregationBuffer](index) - buffer.getAs[mutable.WrappedArray[Double]](index).array - } - } - - private[this] def exposeDArray3( - buffer: MutableAggregationBuffer, index: Int): MutableAggregationBuffer = { - if (index == -1) { - null - } else { - buffer.getAs[MutableAggregationBuffer](index) - } - } - - // Performs the initialization of the requested fields, if required. - private def initialize(buffer: MutableAggregationBuffer, n: Int): Unit = { - require(n > 0, s"Cannot set n to $n: must be positive") - val currentN = buffer.getInt(computeN) - require(currentN == -1 || (currentN == n), - s"The vector size has already been set with $currentN, requested change to $n") - if (currentN == -1) { - // Set all the vectors to the given value. - // This is not a tight loop, so there should be no performance issue in - // making lookups for the values. - buffer.update(computeN, n) - // Only initialize the metricts that we want to compute: - metricsWithOrder.filter(p => metrics.contains(p._1)).foreach { case (m, idx) => - // Special cases for the scalar values. - m match { - case ComputeCount => - buffer.update(idx, 0L) - case ComputeTotalWeightSum => - buffer.update(idx, 0.0) - case ComputeWeightSquareSum => - buffer.update(idx, 0.0) - case _ => - buffer.update(idx, Array.ofDim[Double](n)) - } - } - } - } - - /** - * Returns a fixed index for the compute metric if it is in the list of requested metrics, - * and -1 otherwise. 
- */ - private def getIndexFor(m: ComputeMetrics): Int = { - if (metrics.contains(m)) { - metricsWithOrder.find(p => p._1 == m).get._2 - } else { - -1 - } - } - - private def getForcedIndexFor(m: ComputeMetrics): Int = { - metricsWithOrder.find(p => p._1 == m).get._2 - } - - } - } diff --git a/mllib/src/test/scala/org/apache/spark/ml/stat/SummarizerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/stat/SummarizerSuite.scala index 38e7bcd157d4..925b07e34841 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/stat/SummarizerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/stat/SummarizerSuite.scala @@ -86,6 +86,19 @@ class SummarizerSuite extends SparkFunSuite with MLlibTestSparkContext { case (l1: Long, l2: Long) => assert(l1 === l2) } + test("debugging test") { + val df = denseData(Nil) + val c = df.col("features") + println(s">>>>> c=$c") + val c1 = metrics("mean").summary(c) + println(s">>>>> c1=$c1") + val res = df.select(c1) + println(s">>>>> res=$res") + intercept[SparkException] { + compare(res, Seq.empty) + } + } + test("basic error handling") { val df = denseData(Nil) val c = df.col("features") From c3f236c4422031ae818cb6bbec2415b3f1bf7b70 Mon Sep 17 00:00:00 2001 From: Timothy Hunter Date: Tue, 21 Mar 2017 12:03:07 -0700 Subject: [PATCH 08/26] cleanup --- .../org/apache/spark/ml/stat/Summarizer.scala | 36 +++++++++---------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala b/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala index 018fb4cbc092..8ae2eacfd615 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala @@ -187,7 +187,7 @@ object SummaryBuilderImpl extends Logging { case object NormL1 extends Metrics /** - * The running metrics that are going to be computing. + * The running metrics that are going to be computed. * * There is a bipartite graph between the metrics and the computed metrics. */ @@ -204,12 +204,6 @@ object SummaryBuilderImpl extends Logging { case object ComputeMax extends ComputeMetrics case object ComputeMin extends ComputeMetrics - // The order in which the metrics will be stored in the buffer. - // This order replicates the order above. - private val metricsWithOrder = Seq(ComputeMean, ComputeM2n, ComputeM2, ComputeL1, ComputeCount, - ComputeTotalWeightSum, ComputeWeightSquareSum, ComputeWeightSum, ComputeNNZ, ComputeMax, - ComputeMin).zipWithIndex - /** * The buffer that contains all the summary statistics. If the value is null, it is considered * to be not required. 
@@ -253,17 +247,23 @@ object SummaryBuilderImpl extends Logging { } def bufferSchema: StructType = { - val fields = metricsWithOrder.map { case (m, _) => - val tpe = m match { - case ComputeCount => LongType - case ComputeTotalWeightSum => DoubleType - case ComputeWeightSquareSum => DoubleType - case _ => ArrayType(DoubleType, containsNull = false) - } - StructField(m.toString, tpe, nullable = true) - } - val n = StructField("n", IntegerType, nullable = false) - StructType(n +: fields) + val at = ArrayType(DoubleType, containsNull = false) + val al = ArrayType(LongType, containsNull = false) + val fields = Seq( + "n" -> IntegerType, + "mean" -> at, + "m2n" -> at, + "m2" -> at, + "l1" -> at, + "totalCount" -> LongType, + "totalWeightSum" -> DoubleType, + "totalWeightSquareSum" -> DoubleType, + "weightSum" -> at, + "nnz" -> al, + "max" -> at, + "min" -> at + ) + StructType(fields.map { case (name, t) => StructField(name, t, nullable = true)}) } def updateInPlace(buffer: Buffer, v: Vector, w: Double): Unit = { From ef955c00275705f14342f3e4ed970a78f0f3c141 Mon Sep 17 00:00:00 2001 From: Timothy Hunter Date: Tue, 21 Mar 2017 15:42:42 -0700 Subject: [PATCH 09/26] debugging --- .../org/apache/spark/ml/stat/Summarizer.scala | 38 +++++++++++++------ 1 file changed, 27 insertions(+), 11 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala b/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala index 8ae2eacfd615..16c842c3f8cb 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala @@ -27,8 +27,8 @@ import org.apache.spark.ml.linalg.{BLAS, SQLDataTypes, Vector, Vectors} import org.apache.spark.ml.stat.SummaryBuilderImpl.MetricsAggregate import org.apache.spark.sql.{Column, Row} import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.catalyst.expressions.{Expression, UnsafeProjection, UnsafeRow} -import org.apache.spark.sql.catalyst.expressions.aggregate.{AggregateExpression, Final, TypedImperativeAggregate} +import org.apache.spark.sql.catalyst.expressions.{Expression, UnsafeArrayData, UnsafeProjection, UnsafeRow} +import org.apache.spark.sql.catalyst.expressions.aggregate.{AggregateExpression, Complete, Final, TypedImperativeAggregate} import org.apache.spark.sql.catalyst.util.GenericArrayData import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction, UserDefinedFunction} import org.apache.spark.sql.types._ @@ -117,7 +117,7 @@ private[ml] class SummaryBuilderImpl( column.expr, mutableAggBufferOffset = 0, inputAggBufferOffset = 0) - new Column(AggregateExpression(agg, mode = Final, isDistinct = false)) + new Column(AggregateExpression(agg, mode = Complete, isDistinct = false)) } } @@ -367,17 +367,17 @@ object SummaryBuilderImpl extends Logging { def write(buffer: Buffer): Array[Byte] = { val ir = InternalRow.apply( buffer.n, - buffer.mean, - buffer.m2n, - buffer.m2, - buffer.l1, + gadD(buffer.mean), + gadD(buffer.m2n), + gadD(buffer.m2), + gadD(buffer.l1), buffer.totalCount, buffer.totalWeightSum, buffer.totalWeightSquareSum, - buffer.weightSum, - buffer.nnz, - buffer.max, - buffer.min + gadD(buffer.weightSum), + gadL(buffer.nnz), + gadD(buffer.max), + gadD(buffer.min) ) projection.apply(ir).getBytes } @@ -438,6 +438,22 @@ object SummaryBuilderImpl extends Logging { buffer.l1 } + private def gadD(arr: Array[Double]): UnsafeArrayData = { + if (arr == null) { + null + } else { + 
UnsafeArrayData.fromPrimitiveArray(arr) + } + } + + private def gadL(arr: Array[Long]): UnsafeArrayData = { + if (arr == null) { + null + } else { + UnsafeArrayData.fromPrimitiveArray(arr) + } + } + private[this] lazy val projection = UnsafeProjection.create(bufferSchema) // Returns the array at a given index, or null if the array is null. From a04f923913ca1118a61d66bd53b8514af62594d7 Mon Sep 17 00:00:00 2001 From: Timothy Hunter Date: Tue, 21 Mar 2017 16:14:23 -0700 Subject: [PATCH 10/26] work --- .../org/apache/spark/ml/stat/Summarizer.scala | 56 +++++++++++++++++-- 1 file changed, 50 insertions(+), 6 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala b/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala index 16c842c3f8cb..8894dc596e51 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala @@ -24,7 +24,7 @@ import org.apache.spark.SparkException import org.apache.spark.annotation.{DeveloperApi, Since} import org.apache.spark.internal.Logging import org.apache.spark.ml.linalg.{BLAS, SQLDataTypes, Vector, Vectors} -import org.apache.spark.ml.stat.SummaryBuilderImpl.MetricsAggregate +import org.apache.spark.ml.linalg.VectorUDT import org.apache.spark.sql.{Column, Row} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Expression, UnsafeArrayData, UnsafeProjection, UnsafeRow} @@ -269,7 +269,7 @@ object SummaryBuilderImpl extends Logging { def updateInPlace(buffer: Buffer, v: Vector, w: Double): Unit = { val startN = buffer.n if (startN == -1) { - buffer.n = v.size + resizeArraysInPlace(buffer, v.size) } else { require(startN == v.size, s"Trying to insert a vector of size $v into a buffer that " + @@ -345,8 +345,9 @@ object SummaryBuilderImpl extends Logging { /** * Reads a buffer from a serialized form, using the row object as an assistant. */ - def read(bytes: Array[Byte], row: UnsafeRow): Buffer = { - row.pointTo(bytes, bytes.length) + def read(bytes: Array[Byte], row2: UnsafeRow): Buffer = { + row2.pointTo(bytes, bytes.length) + val row = row2.getStruct(0, 12) new Buffer( n = row.getInt(0), mean = nullableArrayD(row, 1), @@ -438,6 +439,40 @@ object SummaryBuilderImpl extends Logging { buffer.l1 } + private def resizeArraysInPlace(buffer: Buffer, n: Int): Unit = { + // It should either be unsized or of the same size. + require(buffer.n == -1 || buffer.n == n, (buffer.n, n)) + if (buffer.n == n) { + return + } + buffer.n = n + // Conditional resize of the non-null arrays + if (buffer.mean != null) { + buffer.mean = Array.ofDim(n) + } + if (buffer.m2n != null) { + buffer.m2n = Array.ofDim(n) + } + if (buffer.m2 != null) { + buffer.m2 = Array.ofDim(n) + } + if (buffer.l1 != null) { + buffer.l1 = Array.ofDim(n) + } + if (buffer.weightSum != null) { + buffer.weightSum = Array.ofDim(n) + } + if (buffer.nnz != null) { + buffer.nnz = Array.ofDim(n) + } + if (buffer.max != null) { + buffer.max = Array.ofDim(n) + } + if (buffer.min != null) { + buffer.min = Array.ofDim(n) + } + } + private def gadD(arr: Array[Double]): UnsafeArrayData = { if (arr == null) { null @@ -599,7 +634,10 @@ object SummaryBuilderImpl extends Logging { override def children: Seq[Expression] = child :: Nil override def update(buff: Buffer, row: InternalRow): Buffer = { - val v = row.get(0, SQLDataTypes.VectorType).asInstanceOf[Vector] + println(s"UPDATE: ROW=$row") + // Unsafe rows do not play well with UDTs, it seems. 
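+      // (VectorUDT.sqlType is a struct with four fields: type, size, indices, values; so
+      // getStruct(0, udt.sqlType.size) extracts that nested struct for the deserializer.)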
+ // Directly call the deserializer. + val v = udt.deserialize(row.getStruct(0, udt.sqlType.size)) val w = row.numFields match { case 1 => 1.0 @@ -620,7 +658,11 @@ object SummaryBuilderImpl extends Logging { override def createAggregationBuffer(): Buffer = startBuffer.copy() override def serialize(buff: Buffer): Array[Byte] = { - Buffer.write(buff) + val x = Buffer.write(buff) + println(s"serialize: buff=$buff") + val b2 = deserialize(x) + println(s"serialize: b2=$b2") + x } override def deserialize(bytes: Array[Byte]): Buffer = { @@ -640,4 +682,6 @@ object SummaryBuilderImpl extends Logging { override def prettyName: String = "aggregate_metrics" } + private[this] val udt = new VectorUDT + } From 201eb7712054967cd5093d3a908f4ebbd73f30a8 Mon Sep 17 00:00:00 2001 From: Timothy Hunter Date: Wed, 22 Mar 2017 14:19:57 -0700 Subject: [PATCH 11/26] debug --- .../test/scala/org/apache/spark/ml/stat/SummarizerSuite.scala | 1 + 1 file changed, 1 insertion(+) diff --git a/mllib/src/test/scala/org/apache/spark/ml/stat/SummarizerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/stat/SummarizerSuite.scala index 925b07e34841..746357924a25 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/stat/SummarizerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/stat/SummarizerSuite.scala @@ -88,6 +88,7 @@ class SummarizerSuite extends SparkFunSuite with MLlibTestSparkContext { test("debugging test") { val df = denseData(Nil) + println(s">>> df=${df.collect().toSeq}") val c = df.col("features") println(s">>>>> c=$c") val c1 = metrics("mean").summary(c) From f4dec88a49d0a20e1b328617fd721633fd8c201a Mon Sep 17 00:00:00 2001 From: Timothy Hunter Date: Thu, 23 Mar 2017 11:27:19 -0700 Subject: [PATCH 12/26] trying to debug serialization issue --- .../org/apache/spark/ml/stat/Summarizer.scala | 105 ++++++++++++------ .../spark/ml/stat/SummarizerSuite.scala | 80 +++++++++++-- .../expressions/aggregate/interfaces.scala | 6 + 3 files changed, 148 insertions(+), 43 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala b/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala index 8894dc596e51..10b9eafcbae6 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala @@ -35,6 +35,7 @@ import org.apache.spark.sql.types._ // scalastyle:off println + /** * A builder object that provides summary statistics about a given column. 
* @@ -102,7 +103,26 @@ object Summarizer extends Logging { new SummaryBuilderImpl(typedMetrics, computeMetrics) } - def mean(col: Column): Column = metrics("mean").summary(col) + def mean(col: Column): Column = getSingleMetric(col, "mean") + + def variance(col: Column): Column = getSingleMetric(col, "variance") + + def count(col: Column): Column = getSingleMetric(col, "count") + + def numNonZeros(col: Column): Column = getSingleMetric(col, "numNonZeros") + + def max(col: Column): Column = getSingleMetric(col, "max") + + def min(col: Column): Column = getSingleMetric(col, "min") + + def normL1(col: Column): Column = getSingleMetric(col, "normL1") + + def normL2(col: Column): Column = getSingleMetric(col, "normL2") + + private def getSingleMetric(col: Column, metric: String): Column = { + val c1 = metrics(metric).summary(col) + c1.getField(metric).as(s"$metric($col)") + } } private[ml] class SummaryBuilderImpl( @@ -111,6 +131,9 @@ private[ml] class SummaryBuilderImpl( override def summary(column: Column): Column = { val start = SummaryBuilderImpl.Buffer.fromMetrics(requestedCompMetrics) + println(s"summary: requestedMetrics=$requestedMetrics") + println(s"summary: requestedCompMetrics=$requestedCompMetrics") + println(s"summary: start=$start") val agg = SummaryBuilderImpl.MetricsAggregate( requestedMetrics, start, @@ -129,9 +152,9 @@ object SummaryBuilderImpl extends Logging { @throws[IllegalArgumentException]("When the list is empty or not a subset of known metrics") def getRelevantMetrics(requested: Seq[String]): (Seq[Metrics], Seq[ComputeMetrics]) = { val all = requested.map { req => - val (_, metric, deps) = allMetrics.find(tup => tup._1 == req).getOrElse { + val (_, metric, _, deps) = allMetrics.find(tup => tup._1 == req).getOrElse { throw new IllegalArgumentException(s"Metric $req cannot be found." + - s" Valid metris are $implementedMetrics") + s" Valid metrics are $implementedMetrics") } metric -> deps } @@ -143,17 +166,15 @@ object SummaryBuilderImpl extends Logging { } def structureForMetrics(metrics: Seq[Metrics]): StructType = { - val fields = metrics.map { m => - // All types are vectors, except for one. -// val tpe = if (m == Count) { LongType } else { SQLDataTypes.VectorType } - val tpe = if (m == Count) { LongType } else { ArrayType(DoubleType, containsNull = false) } - // We know the metric is part of the list. - val (name, _, _) = allMetrics.find(tup => tup._2 == m).get - StructField(name, tpe, nullable = false) + val dct = allMetrics.map { case (n, m, dt, _) => m -> (n, dt) }.toMap + val fields = metrics.map(dct.apply).map { case (n, dt) => + StructField(n, dt, nullable = false) } StructType(fields) } + private val arrayDType = ArrayType(DoubleType, containsNull = false) + private val arrayLType = ArrayType(LongType, containsNull = false) /** * All the metrics that can be currently computed by Spark for vectors. @@ -161,16 +182,16 @@ object SummaryBuilderImpl extends Logging { * This list associates the user name, the internal (typed) name, and the list of computation * metrics that need to de computed internally to get the final result. 
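 * Each entry now also carries the SQL DataType of the metric's output column (LongType for
 * count, array types for the vector-valued metrics); structureForMetrics uses it to build
 * the result schema.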
*/ - private val allMetrics = Seq( - ("mean", Mean, Seq(ComputeMean, ComputeWeightSum, ComputeTotalWeightSum)), // Vector - ("variance", Variance, Seq(ComputeTotalWeightSum, ComputeWeightSquareSum, ComputeMean, - ComputeM2n)), // Vector - ("count", Count, Seq(ComputeCount)), // Long - ("numNonZeros", NumNonZeros, Seq(ComputeNNZ)), // Vector - ("max", Max, Seq(ComputeMax, ComputeTotalWeightSum, ComputeNNZ, ComputeCount)), // Vector - ("min", Min, Seq(ComputeMin, ComputeTotalWeightSum, ComputeNNZ, ComputeCount)), // Vector - ("normL2", NormL2, Seq(ComputeTotalWeightSum, ComputeM2)), // Vector - ("normL1", Min, Seq(ComputeTotalWeightSum, ComputeL1)) // Vector + private val allMetrics: Seq[(String, Metrics, DataType, Seq[ComputeMetrics])] = Seq( + ("mean", Mean, arrayDType, Seq(ComputeMean, ComputeWeightSum, ComputeTotalWeightSum)), + ("variance", Variance, arrayDType, Seq(ComputeTotalWeightSum, ComputeWeightSum, + ComputeWeightSquareSum, ComputeMean, ComputeM2n)), + ("count", Count, LongType, Seq(ComputeCount)), + ("numNonZeros", NumNonZeros, arrayLType, Seq(ComputeNNZ)), + ("max", Max, arrayDType, Seq(ComputeMax)), + ("min", Min, arrayDType, Seq(ComputeMin)), + ("normL2", NormL2, arrayDType, Seq(ComputeTotalWeightSum, ComputeM2)), + ("normL1", Min, arrayDType, Seq(ComputeTotalWeightSum, ComputeL1)) ) /** @@ -223,8 +244,18 @@ object SummaryBuilderImpl extends Logging { var weightSum: Array[Double] = null, // 8 var nnz: Array[Long] = null, // 9 var max: Array[Double] = null, // 10 - var min: Array[Double] = null) // 11 - + var min: Array[Double] = null // 11 + ) { + override def toString: String = { + def v(x: Array[Double]) = if (x==null) "null" else x.toSeq.mkString("[", " ", "]") + def vl(x: Array[Long]) = if (x==null) "null" else x.toSeq.mkString("[", " ", "]") + + s"Buffer(n=$n mean=${v(mean)} m2n=${v(m2n)} m2=${v(m2)} l1=${v(l1)}" + + s" totalCount=$totalCount totalWeightSum=$totalWeightSum" + + s" totalWeightSquareSum=$totalWeightSquareSum weightSum=${v(weightSum)} nnz=${vl(nnz)}" + + s" max=${v(max)} min=${v(min)})" + } + } object Buffer { // Recursive function, but the number of cases is really small. 
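    // For example (illustrative): fromMetrics(Seq(ComputeMean, ComputeWeightSum,
    // ComputeTotalWeightSum)) is expected to return a Buffer whose mean and weightSum slots are
    // non-null (sized once the first vector fixes n) while all other vector slots stay null.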
def fromMetrics(requested: Seq[ComputeMetrics]): Buffer = { @@ -294,6 +325,7 @@ object SummaryBuilderImpl extends Logging { } if (buffer.mean != null) { + assert(buffer.weightSum != null) val prevMean = buffer.mean(index) val diff = value - prevMean buffer.mean(index) += w * diff / (buffer.weightSum(index) + w) @@ -548,7 +580,7 @@ object SummaryBuilderImpl extends Logging { val weightSum2 = if (other.weightSum == null) null else { other.weightSum.clone() } val weightSum = if (weightSum1 == null) null else { - require(weightSum2 != null) + require(weightSum2 != null, s"buffer=$buffer other=$other") val arr: Array[Double] = Array.ofDim(buffer.n) b(arr) :+= b(weightSum1) :- b(weightSum1) arr @@ -615,20 +647,20 @@ object SummaryBuilderImpl extends Logging { inputAggBufferOffset: Int) extends TypedImperativeAggregate[Buffer] { - private lazy val row = new UnsafeRow(1) +// private lazy val row = new UnsafeRow(1) - override def eval(buff: Buffer): Row = { + override def eval(buff: Buffer): InternalRow = { val metrics = requested.map({ - case Mean => Buffer.mean(buff) - case Variance => Buffer.variance(buff) + case Mean => UnsafeArrayData.fromPrimitiveArray(Buffer.mean(buff)) + case Variance => UnsafeArrayData.fromPrimitiveArray(Buffer.variance(buff)) case Count => Buffer.totalCount(buff) - case NumNonZeros => Buffer.nnz(buff) - case Max => Buffer.max(buff) - case Min => Buffer.min(buff) - case NormL2 => Buffer.l2(buff) - case NormL1 => Buffer.l1(buff) + case NumNonZeros => UnsafeArrayData.fromPrimitiveArray(Buffer.nnz(buff)) + case Max => UnsafeArrayData.fromPrimitiveArray(Buffer.max(buff)) + case Min => UnsafeArrayData.fromPrimitiveArray(Buffer.min(buff)) + case NormL2 => UnsafeArrayData.fromPrimitiveArray(Buffer.l2(buff)) + case NormL1 => UnsafeArrayData.fromPrimitiveArray(Buffer.l1(buff)) }) - Row(metrics: _*) + InternalRow.apply(metrics: _*) } override def children: Seq[Expression] = child :: Nil @@ -659,14 +691,15 @@ object SummaryBuilderImpl extends Logging { override def serialize(buff: Buffer): Array[Byte] = { val x = Buffer.write(buff) - println(s"serialize: buff=$buff") + println(s"serialize: ${buff.hashCode()} x=${x.length} buff=$buff") val b2 = deserialize(x) - println(s"serialize: b2=$b2") + println(s"serialize: ${buff.hashCode()} b2=$b2") x } override def deserialize(bytes: Array[Byte]): Buffer = { - Buffer.read(bytes, row) +// Buffer.read(bytes, row) + Buffer.read(bytes, new UnsafeRow(1)) } override def withNewMutableAggBufferOffset(newMutableAggBufferOffset: Int): MetricsAggregate = { diff --git a/mllib/src/test/scala/org/apache/spark/ml/stat/SummarizerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/stat/SummarizerSuite.scala index 746357924a25..0479117498ff 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/stat/SummarizerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/stat/SummarizerSuite.scala @@ -22,6 +22,7 @@ import org.apache.spark.ml.linalg.{Vector, Vectors} import org.apache.spark.ml.util.TestingUtils._ import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.{DataFrame, Dataset, Row} +import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema // scalastyle:off println @@ -33,7 +34,15 @@ class SummarizerSuite extends SparkFunSuite with MLlibTestSparkContext { private val seed = 42 private val eps: Double = 1e-5 - private case class ExpectedMetrics(mean: Vector, min: Vector) + private case class ExpectedMetrics( + mean: Seq[Double], + variance: Seq[Double], + count: Long, + numNonZeros: Seq[Long], + max: 
Seq[Double], + min: Seq[Double], + normL2: Seq[Double], + normL1: Seq[Double]) // The input is expected to be either a sparse vector, a dense vector or an array of doubles // (which will be converted to a dense vector) @@ -46,13 +55,49 @@ class SummarizerSuite extends SparkFunSuite with MLlibTestSparkContext { private def testExample(input: Seq[Any], exp: ExpectedMetrics): Unit = { val inputVec: Seq[Vector] = input.map { case x: Array[Double] => Vectors.dense(x) + case x: Seq[Double] => Vectors.dense(x.toArray) case x: Vector => x case x => throw new Exception(x.toString) } val df = sc.parallelize(inputVec).map(Tuple1.apply).toDF("features") val c = df.col("features") - compare(df.select(metrics("mean").summary(c), mean(c)), Seq(exp.mean, exp.mean)) + compare(df.select(metrics("mean").summary(c), mean(c)), Seq(Row(exp.mean), exp.mean)) + + compare(df.select(metrics("variance").summary(c), variance(c)), + Seq(Row(exp.variance), exp.variance)) + + compare(df.select(metrics("count").summary(c), count(c)), + Seq(Row(exp.count), exp.count)) + + compare(df.select(metrics("numNonZeros").summary(c), numNonZeros(c)), + Seq(Row(exp.numNonZeros), exp.numNonZeros)) + + println("STARTING MIN") + + compare(df.select(metrics("min").summary(c), min(c)), + Seq(Row(exp.min), exp.min)) + + println("STARTING MAX") + + compare(df.select(metrics("max").summary(c), max(c)), + Seq(Row(exp.max), exp.max)) + + println("STARTING NORML1") + + compare(df.select(metrics("normL1").summary(c), normL1(c)), + Seq(Row(exp.normL1), exp.normL1)) + + println("STARTING NORML2") + + compare(df.select(metrics("normL2").summary(c), normL2(c)), + Seq(Row(exp.normL2), exp.normL2)) + + compare(df.select( + metrics("mean", "variance", "count", "numNonZeros").summary(c), + mean(c), variance(c), count(c), numNonZeros(c)), + Seq(Row(exp.mean, exp.variance, exp.count, exp.numNonZeros), + exp.mean, exp.variance, exp.count, exp.numNonZeros)) } private def denseData(input: Seq[Seq[Double]]): DataFrame = { @@ -67,6 +112,7 @@ class SummarizerSuite extends SparkFunSuite with MLlibTestSparkContext { val Seq(row) = coll val res = row.toSeq println(s"compare: ress=${res.map(_.getClass)}") + println(s"compare: row=${row}") val names = df.schema.fieldNames.zipWithIndex.map { case (n, idx) => s"$n ($idx)" } assert(res.size === exp.size, (res.size, exp.size)) for (((x1, x2), name) <- res.zip(exp).zip(names)) { @@ -78,12 +124,25 @@ class SummarizerSuite extends SparkFunSuite with MLlibTestSparkContext { private def compareStructures(x1: Any, x2: Any, name: String): Unit = (x1, x2) match { case (d1: Double, d2: Double) => assert(Vectors.dense(d1) ~== Vectors.dense(d2) absTol 1e-4, name) + case (r1: GenericRowWithSchema, r2: Row) => + assert(r1.size === r2.size, (r1, r2)) + for (((fname, x1), x2) <- r1.schema.fieldNames.zip(r1.toSeq).zip(r2.toSeq)) { + compareStructures(x1, x2, s"$name.$fname") + } case (r1: Row, r2: Row) => assert(r1.size === r2.size, (r1, r2)) for ((x1, x2) <- r1.toSeq.zip(r2.toSeq)) { compareStructures(x1, x2, name) } case (v1: Vector, v2: Vector) => assert(v1 ~== v2 absTol 1e-4, name) case (l1: Long, l2: Long) => assert(l1 === l2) + case (s1: Seq[_], s2: Seq[_]) => + assert(s1.size === s2.size, s"$name ${(s1, s2)}") + for (((x1, idx), x2) <- s1.zipWithIndex.zip(s2)) { + compareStructures(x1, x2, s"$name.$idx") + } + case (arr1: Array[_], arr2: Array[_]) => + assert(arr1.toSeq === arr2.toSeq) + case _ => throw new Exception(s"$name: ${x1.getClass} ${x2.getClass} $x1 $x2") } test("debugging test") { @@ -112,14 +171,21 @@ class 
SummarizerSuite extends SparkFunSuite with MLlibTestSparkContext { test("no element, working metrics") { val df = denseData(Nil) val c = df.col("features") - val res = df.select(metrics("count").summary(c)) - compare(res, Seq(Row(0L))) + val res = df.select(metrics("count").summary(c), count(c)) + compare(res, Seq(Row(0L), 0L)) } test("single element in vector, all metrics") { - testExample(Seq(Array(1.0, 2.0)), ExpectedMetrics( - Vectors.dense(1.0, 2.0), - Vectors.dense(1.0, 2.0) + val x = Seq(1.0, 2.0) + testExample(Seq(x), ExpectedMetrics( + mean = x, + variance = Seq(0.0, 0.0), + count = 1, + numNonZeros = Seq(1, 1), + max = x, + min = x, + normL1 = x, + normL2 = x )) } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/interfaces.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/interfaces.scala index 80c25d0b0fb7..3c4c9690f378 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/interfaces.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/interfaces.scala @@ -495,6 +495,12 @@ abstract class TypedImperativeAggregate[T] extends ImperativeAggregate { * Generates the final aggregation result value for current key group with the aggregation buffer * object. * + * Developer note: the only return types accepted by Spark are: + * - primitive types + * - InternalRow and subclasses + * - ArrayData + * - MapData + * * @param buffer aggregation buffer object. * @return The aggregation result of current key group */ From 4af0f47d326ef91d7cf9ccaf6a45ee3f904b191f Mon Sep 17 00:00:00 2001 From: Timothy Hunter Date: Thu, 23 Mar 2017 16:16:10 -0700 Subject: [PATCH 13/26] better tests --- .../org/apache/spark/ml/stat/Summarizer.scala | 151 ++++++++++++------ .../spark/ml/stat/SummarizerSuite.scala | 118 ++++++++------ 2 files changed, 169 insertions(+), 100 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala b/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala index 10b9eafcbae6..330a98090901 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala @@ -23,8 +23,7 @@ import breeze.numerics import org.apache.spark.SparkException import org.apache.spark.annotation.{DeveloperApi, Since} import org.apache.spark.internal.Logging -import org.apache.spark.ml.linalg.{BLAS, SQLDataTypes, Vector, Vectors} -import org.apache.spark.ml.linalg.VectorUDT +import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors, VectorUDT} import org.apache.spark.sql.{Column, Row} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Expression, UnsafeArrayData, UnsafeProjection, UnsafeRow} @@ -191,7 +190,7 @@ object SummaryBuilderImpl extends Logging { ("max", Max, arrayDType, Seq(ComputeMax)), ("min", Min, arrayDType, Seq(ComputeMin)), ("normL2", NormL2, arrayDType, Seq(ComputeTotalWeightSum, ComputeM2)), - ("normL1", Min, arrayDType, Seq(ComputeTotalWeightSum, ComputeL1)) + ("normL1", NormL1, arrayDType, Seq(ComputeTotalWeightSum, ComputeL1)) ) /** @@ -277,30 +276,32 @@ object SummaryBuilderImpl extends Logging { } } - def bufferSchema: StructType = { - val at = ArrayType(DoubleType, containsNull = false) - val al = ArrayType(LongType, containsNull = false) + val bufferSchema: StructType = { val fields = Seq( "n" -> IntegerType, - "mean" -> at, - "m2n" -> at, - "m2" -> at, 
- "l1" -> at, + "mean" -> arrayDType, + "m2n" -> arrayDType, + "m2" -> arrayDType, + "l1" -> arrayDType, "totalCount" -> LongType, "totalWeightSum" -> DoubleType, "totalWeightSquareSum" -> DoubleType, - "weightSum" -> at, - "nnz" -> al, - "max" -> at, - "min" -> at + "weightSum" -> arrayDType, + "nnz" -> arrayLType, + "max" -> arrayDType, + "min" -> arrayDType ) StructType(fields.map { case (name, t) => StructField(name, t, nullable = true)}) } + val numFields = bufferSchema.fields.length + def updateInPlace(buffer: Buffer, v: Vector, w: Double): Unit = { val startN = buffer.n if (startN == -1) { - resizeArraysInPlace(buffer, v.size) + // The buffer was not initialized, we initialize it with the incoming row. + fillBufferWithRow(buffer, v, w) + return } else { require(startN == v.size, s"Trying to insert a vector of size $v into a buffer that " + @@ -368,7 +369,7 @@ object SummaryBuilderImpl extends Logging { if (other.n == -1) { buffer } else { - mergeBuffers0(buffer, other) + mergeInitializedBuffers(buffer, other) buffer } } @@ -379,7 +380,7 @@ object SummaryBuilderImpl extends Logging { */ def read(bytes: Array[Byte], row2: UnsafeRow): Buffer = { row2.pointTo(bytes, bytes.length) - val row = row2.getStruct(0, 12) + val row = row2.getStruct(0, numFields) new Buffer( n = row.getInt(0), mean = nullableArrayD(row, 1), @@ -471,39 +472,6 @@ object SummaryBuilderImpl extends Logging { buffer.l1 } - private def resizeArraysInPlace(buffer: Buffer, n: Int): Unit = { - // It should either be unsized or of the same size. - require(buffer.n == -1 || buffer.n == n, (buffer.n, n)) - if (buffer.n == n) { - return - } - buffer.n = n - // Conditional resize of the non-null arrays - if (buffer.mean != null) { - buffer.mean = Array.ofDim(n) - } - if (buffer.m2n != null) { - buffer.m2n = Array.ofDim(n) - } - if (buffer.m2 != null) { - buffer.m2 = Array.ofDim(n) - } - if (buffer.l1 != null) { - buffer.l1 = Array.ofDim(n) - } - if (buffer.weightSum != null) { - buffer.weightSum = Array.ofDim(n) - } - if (buffer.nnz != null) { - buffer.nnz = Array.ofDim(n) - } - if (buffer.max != null) { - buffer.max = Array.ofDim(n) - } - if (buffer.min != null) { - buffer.min = Array.ofDim(n) - } - } private def gadD(arr: Array[Double]): UnsafeArrayData = { if (arr == null) { @@ -563,11 +531,88 @@ object SummaryBuilderImpl extends Logging { } } + /** + * Sets the content of a buffer based on a single row (initialization). + * + * The buffer must be uninitialized first. + */ + private def fillBufferWithRow(buffer: Buffer, v: Vector, w: Double): Unit = { + require(buffer.n == -1, (buffer.n, buffer)) + buffer.n = v.size + buffer.totalCount = 1L + buffer.totalWeightSum = w + buffer.totalWeightSquareSum = w * w + + val arr = v.toArray + if (buffer.mean != null) { + buffer.mean = arr.clone() + } + if (buffer.m2n != null) { + buffer.m2n = Array.ofDim(buffer.n) + } + if (buffer.max != null) { + buffer.max = arr.clone() + } + if (buffer.min != null) { + buffer.min = arr.clone() + } + + // The rest of these operations have efficient bulk versions. 
+ v match { + case dv: DenseVector => + if (buffer.m2 != null) { + buffer.m2 = Array.ofDim(buffer.n) + b(buffer.m2) := w * (b(arr) :* b(arr)) + } + if (buffer.l1 != null) { + buffer.l1 = Array.ofDim(buffer.n) + b(buffer.l1) := numerics.abs(b(arr)) + } + + case sv: SparseVector => + if (buffer.m2 != null) { + buffer.m2 = Array.ofDim(buffer.n) + v.foreachActive { (index, value) => + buffer.weightSum(index) = w * value * value + } + } + + if (buffer.l1 != null) { + buffer.l1 = Array.ofDim(buffer.n) + v.foreachActive { (index, value) => + buffer.weightSum(index) = w * math.abs(value) + } + } + + + // In the case of the weightSum and NNZ, we also have to account for the value of + // the elements. + if (buffer.weightSum != null) { + buffer.weightSum = Array.ofDim(buffer.n) + v.foreachActive { (index, value) => + if (value != 0.0) { + buffer.weightSum(index) = w + } + } + } + + if (buffer.nnz != null) { + buffer.nnz = Array.ofDim(buffer.n) + v.foreachActive { (index, value) => + if (value != 0.0) { + buffer.nnz(index) = 1L + } + } + } + + } + } + /** * Merges other into buffer. */ - private def mergeBuffers0(buffer: Buffer, other: Buffer): Unit = { + private def mergeInitializedBuffers(buffer: Buffer, other: Buffer): Unit = { // Each buffer needs to be properly initialized. require(buffer.n > 0 && other.n > 0, (buffer.n, other.n)) require(buffer.n == other.n, (buffer.n, other.n)) @@ -647,7 +692,7 @@ object SummaryBuilderImpl extends Logging { inputAggBufferOffset: Int) extends TypedImperativeAggregate[Buffer] { -// private lazy val row = new UnsafeRow(1) + private lazy val row = new UnsafeRow(Buffer.numFields) override def eval(buff: Buffer): InternalRow = { val metrics = requested.map({ @@ -699,7 +744,7 @@ object SummaryBuilderImpl extends Logging { override def deserialize(bytes: Array[Byte]): Buffer = { // Buffer.read(bytes, row) - Buffer.read(bytes, new UnsafeRow(1)) + Buffer.read(bytes, new UnsafeRow(Buffer.numFields)) } override def withNewMutableAggBufferOffset(newMutableAggBufferOffset: Int): MetricsAggregate = { diff --git a/mllib/src/test/scala/org/apache/spark/ml/stat/SummarizerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/stat/SummarizerSuite.scala index 0479117498ff..c8d842125cdc 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/stat/SummarizerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/stat/SummarizerSuite.scala @@ -52,52 +52,75 @@ class SummarizerSuite extends SparkFunSuite with MLlibTestSparkContext { // are expected for this input. They currently test against some fixed subset of the // metrics, but should be made fuzzy in the future. 
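The refactor below turns testExample into a generator of individually named tests: registerTest, inherited from ScalaTest's FunSuite (via SparkFunSuite), registers a test while the suite is being constructed, so each metric gets its own test case and a failure in one comparison no longer masks the others. A minimal sketch of the pattern, with hypothetical names:

    // checkMean and computeMean are placeholders for illustration only.
    def checkMean(name: String, expected: Double): Unit = {
      registerTest(s"$name - mean only") {
        // build the input lazily, inside the running test
        assert(computeMean(name) === expected)
      }
    }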
- private def testExample(input: Seq[Any], exp: ExpectedMetrics): Unit = { - val inputVec: Seq[Vector] = input.map { + private def testExample(name: String, input: Seq[Any], exp: ExpectedMetrics): Unit = { + def inputVec: Seq[Vector] = input.map { case x: Array[Double] => Vectors.dense(x) case x: Seq[Double] => Vectors.dense(x.toArray) case x: Vector => x case x => throw new Exception(x.toString) } - val df = sc.parallelize(inputVec).map(Tuple1.apply).toDF("features") - val c = df.col("features") - - compare(df.select(metrics("mean").summary(c), mean(c)), Seq(Row(exp.mean), exp.mean)) - - compare(df.select(metrics("variance").summary(c), variance(c)), - Seq(Row(exp.variance), exp.variance)) - compare(df.select(metrics("count").summary(c), count(c)), - Seq(Row(exp.count), exp.count)) - - compare(df.select(metrics("numNonZeros").summary(c), numNonZeros(c)), - Seq(Row(exp.numNonZeros), exp.numNonZeros)) + def wrapped() = { + val df = sc.parallelize(inputVec).map(Tuple1.apply).toDF("features") + val c = df.col("features") + df -> c + } - println("STARTING MIN") + registerTest(s"$name - mean only") { + val (df, c) = wrapped() + compare(df.select(metrics("mean").summary(c), mean(c)), Seq(Row(exp.mean), exp.mean)) + } - compare(df.select(metrics("min").summary(c), min(c)), - Seq(Row(exp.min), exp.min)) + registerTest(s"$name - variance only") { + val (df, c) = wrapped() + compare(df.select(metrics("variance").summary(c), variance(c)), + Seq(Row(exp.variance), exp.variance)) + } - println("STARTING MAX") + registerTest(s"$name - count only") { + val (df, c) = wrapped() + compare(df.select(metrics("count").summary(c), count(c)), + Seq(Row(exp.count), exp.count)) + } - compare(df.select(metrics("max").summary(c), max(c)), - Seq(Row(exp.max), exp.max)) + registerTest(s"$name - numNonZeros only") { + val (df, c) = wrapped() + compare(df.select(metrics("numNonZeros").summary(c), numNonZeros(c)), + Seq(Row(exp.numNonZeros), exp.numNonZeros)) + } - println("STARTING NORML1") + registerTest(s"$name - min only") { + val (df, c) = wrapped() + compare(df.select(metrics("min").summary(c), min(c)), + Seq(Row(exp.min), exp.min)) + } - compare(df.select(metrics("normL1").summary(c), normL1(c)), - Seq(Row(exp.normL1), exp.normL1)) + registerTest(s"$name - max only") { + val (df, c) = wrapped() + compare(df.select(metrics("max").summary(c), max(c)), + Seq(Row(exp.max), exp.max)) + } - println("STARTING NORML2") + registerTest(s"$name - normL1 only") { + val (df, c) = wrapped() + compare(df.select(metrics("normL1").summary(c), normL1(c)), + Seq(Row(exp.normL1), exp.normL1)) + } - compare(df.select(metrics("normL2").summary(c), normL2(c)), - Seq(Row(exp.normL2), exp.normL2)) + registerTest(s"$name - normL2 only") { + val (df, c) = wrapped() + compare(df.select(metrics("normL2").summary(c), normL2(c)), + Seq(Row(exp.normL2), exp.normL2)) + } - compare(df.select( - metrics("mean", "variance", "count", "numNonZeros").summary(c), - mean(c), variance(c), count(c), numNonZeros(c)), - Seq(Row(exp.mean, exp.variance, exp.count, exp.numNonZeros), - exp.mean, exp.variance, exp.count, exp.numNonZeros)) + registerTest(s"$name - all metrics at once") { + val (df, c) = wrapped() + compare(df.select( + metrics("mean", "variance", "count", "numNonZeros").summary(c), + mean(c), variance(c), count(c), numNonZeros(c)), + Seq(Row(exp.mean, exp.variance, exp.count, exp.numNonZeros), + exp.mean, exp.variance, exp.count, exp.numNonZeros)) + } } private def denseData(input: Seq[Seq[Double]]): DataFrame = { @@ -145,6 +168,8 @@ class 
SummarizerSuite extends SparkFunSuite with MLlibTestSparkContext { case _ => throw new Exception(s"$name: ${x1.getClass} ${x2.getClass} $x1 $x2") } + + test("debugging test") { val df = denseData(Nil) println(s">>> df=${df.collect().toSeq}") @@ -175,13 +200,13 @@ class SummarizerSuite extends SparkFunSuite with MLlibTestSparkContext { compare(res, Seq(Row(0L), 0L)) } - test("single element in vector, all metrics") { - val x = Seq(1.0, 2.0) - testExample(Seq(x), ExpectedMetrics( + { + val x = Seq(0.0, 1.0, 2.0) + testExample("single element", Seq(x), ExpectedMetrics( mean = x, - variance = Seq(0.0, 0.0), + variance = Seq(0.0, 0.0, 0.0), count = 1, - numNonZeros = Seq(1, 1), + numNonZeros = Seq(0, 1, 1), max = x, min = x, normL1 = x, @@ -189,18 +214,17 @@ class SummarizerSuite extends SparkFunSuite with MLlibTestSparkContext { )) } - test("repeated metrics") { + testExample("two elements", Seq(Seq(0.0, 1.0, 2.0), Seq(0.0, -1.0, -2.0)), ExpectedMetrics( + mean = Seq(0.0, 0.0, 0.0), + variance = Seq(0.0, 1.0, 2.0), + count = 2, + numNonZeros = Seq(0, 2, 2), + max = Seq(0.0, 1.0, 2.0), + min = Seq(0.0, -1.0, -2.0), + normL1 = Seq(0.0, 2.0, 4.0), + normL2 = Seq(0.0, 2.0, 4.0) + )) - } - test("dense vector input") { - } - - test("sparse vector input") { - - } - - test("merging summarizer when one side has zero mean (SPARK-4355)") { - } } From 9f29030f75089884156bdc4ee634857b3730114d Mon Sep 17 00:00:00 2001 From: Timothy Hunter Date: Thu, 23 Mar 2017 17:12:28 -0700 Subject: [PATCH 14/26] changes --- .../org/apache/spark/ml/stat/Summarizer.scala | 75 +++++++++++-------- .../spark/ml/stat/SummarizerSuite.scala | 11 +++ 2 files changed, 55 insertions(+), 31 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala b/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala index 330a98090901..57bfa17fa900 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala @@ -255,7 +255,7 @@ object SummaryBuilderImpl extends Logging { s" max=${v(max)} min=${v(min)})" } } - object Buffer { + object Buffer extends Logging { // Recursive function, but the number of cases is really small. def fromMetrics(requested: Seq[ComputeMetrics]): Buffer = { if (requested.isEmpty) { @@ -297,6 +297,7 @@ object SummaryBuilderImpl extends Logging { val numFields = bufferSchema.fields.length def updateInPlace(buffer: Buffer, v: Vector, w: Double): Unit = { + println(s"updateInPlace: buffer=$buffer v=$v w=$w") val startN = buffer.n if (startN == -1) { // The buffer was not initialized, we initialize it with the incoming row. @@ -348,6 +349,7 @@ object SummaryBuilderImpl extends Logging { } } } + println(s"updateInPlace:2: buffer=$buffer v=$v w=$w") } /** @@ -355,6 +357,7 @@ object SummaryBuilderImpl extends Logging { */ @throws[SparkException]("When the buffers are not compatible") def mergeBuffers(buffer: Buffer, other: Buffer): Buffer = { + println(s"mergeBuffers: buffer=$buffer other=$buffer") if (buffer.n == -1) { // buffer is not initialized. if (other.n == -1) { @@ -378,9 +381,11 @@ object SummaryBuilderImpl extends Logging { /** * Reads a buffer from a serialized form, using the row object as an assistant. 
*/ - def read(bytes: Array[Byte], row2: UnsafeRow): Buffer = { - row2.pointTo(bytes, bytes.length) - val row = row2.getStruct(0, numFields) + def read(bytes: Array[Byte]): Buffer = { + assert(numFields == 12, numFields) + val row3 = new UnsafeRow(numFields) + row3.pointTo(bytes.clone(), bytes.length) + val row = row3.getStruct(0, numFields) new Buffer( n = row.getInt(0), mean = nullableArrayD(row, 1), @@ -417,18 +422,21 @@ object SummaryBuilderImpl extends Logging { } def mean(buffer: Buffer): Array[Double] = { + println(s"Buffer:mean: buffer=$buffer") require(buffer.totalWeightSum > 0) require(buffer.mean != null) require(buffer.weightSum != null) val res = b(buffer.mean) :* b(buffer.weightSum) :/ buffer.totalWeightSum + println(s"Buffer:mean: buffer=$buffer res=$res") res.toArray } def variance(buffer: Buffer): Array[Double] = { + println(s"Buffer:variance: buffer=$buffer") import buffer._ - require(n >= 0) - require(totalWeightSum > 0) - require(totalWeightSquareSum > 0) + require(n >= 0, n) + require(totalWeightSum > 0, totalWeightSum) + require(totalWeightSquareSum > 0, totalWeightSquareSum) require(buffer.mean != null) require(m2n != null) require(weightSum != null) @@ -538,17 +546,19 @@ object SummaryBuilderImpl extends Logging { */ private def fillBufferWithRow(buffer: Buffer, v: Vector, w: Double): Unit = { require(buffer.n == -1, (buffer.n, buffer)) - buffer.n = v.size + val n = v.size + buffer.n = n buffer.totalCount = 1L buffer.totalWeightSum = w buffer.totalWeightSquareSum = w * w val arr = v.toArray + assert(arr.length == n, (arr.toSeq, n)) if (buffer.mean != null) { buffer.mean = arr.clone() } if (buffer.m2n != null) { - buffer.m2n = Array.ofDim(buffer.n) + buffer.m2n = Array.ofDim(n) } if (buffer.max != null) { buffer.max = arr.clone() @@ -561,51 +571,51 @@ object SummaryBuilderImpl extends Logging { v match { case dv: DenseVector => if (buffer.m2 != null) { - buffer.m2 = Array.ofDim(buffer.n) + buffer.m2 = Array.ofDim(n) b(buffer.m2) := w * (b(arr) :* b(arr)) } if (buffer.l1 != null) { - buffer.l1 = Array.ofDim(buffer.n) + buffer.l1 = Array.ofDim(n) b(buffer.l1) := numerics.abs(b(arr)) } case sv: SparseVector => if (buffer.m2 != null) { - buffer.m2 = Array.ofDim(buffer.n) + buffer.m2 = Array.ofDim(n) v.foreachActive { (index, value) => buffer.weightSum(index) = w * value * value } } if (buffer.l1 != null) { - buffer.l1 = Array.ofDim(buffer.n) + buffer.l1 = Array.ofDim(n) v.foreachActive { (index, value) => buffer.weightSum(index) = w * math.abs(value) } } + } - - // In the case of the weightSum and NNZ, we also have to account for the value of - // the elements. - if (buffer.weightSum != null) { - buffer.weightSum = Array.ofDim(buffer.n) - v.foreachActive { (index, value) => - if (value != 0.0) { - buffer.weightSum(index) = w - } + // In the case of the weightSum and NNZ, we also have to account for the value of + // the elements. + // TODO It would be nice to vectorize these operations too. + if (buffer.weightSum != null) { + buffer.weightSum = Array.ofDim(n) + v.foreachActive { (index, value) => + if (value != 0.0) { + buffer.weightSum(index) = w } } + } - if (buffer.nnz != null) { - buffer.nnz = Array.ofDim(buffer.n) - v.foreachActive { (index, value) => - if (value != 0.0) { - buffer.nnz(index) = 1L - } + if (buffer.nnz != null) { + buffer.nnz = Array.ofDim(n) + v.foreachActive { (index, value) => + if (value != 0.0) { + buffer.nnz(index) = 1L } } - } + } @@ -613,6 +623,7 @@ object SummaryBuilderImpl extends Logging { * Merges other into buffer. 
*/ private def mergeInitializedBuffers(buffer: Buffer, other: Buffer): Unit = { + println(s"mergeInitializedBuffers: buffer=$buffer other=$other") // Each buffer needs to be properly initialized. require(buffer.n > 0 && other.n > 0, (buffer.n, other.n)) require(buffer.n == other.n, (buffer.n, other.n)) @@ -681,6 +692,7 @@ object SummaryBuilderImpl extends Logging { require(other.weightSum != null) b(buffer.weightSum) :+= b(other.weightSum) } + println(s"mergeInitializedBuffers:2: buffer=$buffer other=$other") } } @@ -692,7 +704,7 @@ object SummaryBuilderImpl extends Logging { inputAggBufferOffset: Int) extends TypedImperativeAggregate[Buffer] { - private lazy val row = new UnsafeRow(Buffer.numFields) +// private lazy val row = new UnsafeRow(Buffer.numFields) override def eval(buff: Buffer): InternalRow = { val metrics = requested.map({ @@ -744,7 +756,8 @@ object SummaryBuilderImpl extends Logging { override def deserialize(bytes: Array[Byte]): Buffer = { // Buffer.read(bytes, row) - Buffer.read(bytes, new UnsafeRow(Buffer.numFields)) + assert(Buffer.numFields == 12, Buffer.numFields) + Buffer.read(bytes) } override def withNewMutableAggBufferOffset(newMutableAggBufferOffset: Int): MetricsAggregate = { diff --git a/mllib/src/test/scala/org/apache/spark/ml/stat/SummarizerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/stat/SummarizerSuite.scala index c8d842125cdc..b5df4f0cbc12 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/stat/SummarizerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/stat/SummarizerSuite.scala @@ -71,6 +71,11 @@ class SummarizerSuite extends SparkFunSuite with MLlibTestSparkContext { compare(df.select(metrics("mean").summary(c), mean(c)), Seq(Row(exp.mean), exp.mean)) } + registerTest(s"$name - mean only (direct)") { + val (df, c) = wrapped() + compare(df.select(mean(c)), Seq(exp.mean)) + } + registerTest(s"$name - variance only") { val (df, c) = wrapped() compare(df.select(metrics("variance").summary(c), variance(c)), @@ -89,6 +94,12 @@ class SummarizerSuite extends SparkFunSuite with MLlibTestSparkContext { Seq(Row(exp.numNonZeros), exp.numNonZeros)) } + registerTest(s"$name - numNonZeros only (direct)") { + val (df, c) = wrapped() + compare(df.select(numNonZeros(c)), + Seq(exp.numNonZeros)) + } + registerTest(s"$name - min only") { val (df, c) = wrapped() compare(df.select(metrics("min").summary(c), min(c)), From e9877dc2f08d393f079bdf6fbbf1b9b9abaa21da Mon Sep 17 00:00:00 2001 From: Timothy Hunter Date: Fri, 24 Mar 2017 14:04:32 -0700 Subject: [PATCH 15/26] debugging --- .../org/apache/spark/ml/stat/Summarizer.scala | 23 +++++++++++-------- .../spark/ml/stat/SummarizerSuite.scala | 2 +- 2 files changed, 15 insertions(+), 10 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala b/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala index 57bfa17fa900..ea7fd5db9d09 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala @@ -17,6 +17,7 @@ package org.apache.spark.ml.stat +import breeze.{linalg => la} import breeze.linalg.{DenseVector => BDV, SparseVector => BSV, Vector => BV} import breeze.numerics @@ -418,6 +419,8 @@ object SummaryBuilderImpl extends Logging { gadD(buffer.max), gadD(buffer.min) ) + // TODO: the projection should be passed as an argument. 
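For context on the write()/read() pair being reworked in this patch: the buffer is serialized by projecting an InternalRow of its fields through an UnsafeProjection and keeping the resulting UnsafeRow's backing bytes, and deserialized by pointing a fresh UnsafeRow at those bytes. A minimal sketch of the round trip, assuming a struct schema bufferSchema and a matching InternalRow named ir:

    val projection = UnsafeProjection.create(bufferSchema)
    val bytes: Array[Byte] = projection.apply(ir).getBytes   // serialize
    val row = new UnsafeRow(bufferSchema.length)
    row.pointTo(bytes, bytes.length)                          // a row view over the bytes
    val n = row.getInt(0)                                     // read fields back by ordinal

Both UnsafeProjection and UnsafeRow reuse internal state across calls, which is why later patches in this series allocate them once per aggregate instance rather than sharing them.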
+ val projection = UnsafeProjection.create(bufferSchema) projection.apply(ir).getBytes } @@ -497,7 +500,7 @@ object SummaryBuilderImpl extends Logging { } } - private[this] lazy val projection = UnsafeProjection.create(bufferSchema) +// private[this] lazy val projection = UnsafeProjection.create(bufferSchema) // Returns the array at a given index, or null if the array is null. private def nullableArrayD(row: UnsafeRow, ordinal: Int): Array[Double] = { @@ -635,11 +638,13 @@ object SummaryBuilderImpl extends Logging { val weightSum1 = if (buffer.weightSum == null) null else { buffer.weightSum.clone() } val weightSum2 = if (other.weightSum == null) null else { other.weightSum.clone() } - val weightSum = if (weightSum1 == null) null else { + // This sum is going to be used as a denominator. In order to guarantee that the + // division is well-defined, we add an epsilon to the zero coefficients. + // This is not going to change the value of the resul since the numerator will also be zero. + val weightSum: BV[Double] = if (weightSum1 == null) null else { require(weightSum2 != null, s"buffer=$buffer other=$other") - val arr: Array[Double] = Array.ofDim(buffer.n) - b(arr) :+= b(weightSum1) :- b(weightSum1) - arr + val b1 = b(weightSum1) :+ b(weightSum2) + la.max(b1, Double.MinPositiveValue) } @@ -654,12 +659,12 @@ object SummaryBuilderImpl extends Logging { if (buffer.mean != null) { require(other.mean != null) require(weightSum != null) - b(buffer.mean) :+= b(deltaMean) :* (b(weightSum2) / b(weightSum)) + b(buffer.mean) :+= b(deltaMean) :* (b(weightSum2) :/ weightSum) } if (buffer.m2n != null) { require(other.m2n != null) - val w = (b(weightSum1) :* b(weightSum2)) :/ b(weightSum) + val w = (b(weightSum1) :* b(weightSum2)) :/ weightSum b(buffer.m2n) :+= b(other.m2n) :+ (b(deltaMean) :* b(deltaMean)) :* w } @@ -675,12 +680,12 @@ object SummaryBuilderImpl extends Logging { if (buffer.max != null) { require(other.max != null) - maxInPlace(buffer.max, other.max) + b(buffer.max) := la.max(b(buffer.max), b(other.max)) } if (buffer.min != null) { require(other.min != null) - minInPlace(buffer.min, other.min) + b(buffer.min) := la.min(b(buffer.min), b(other.min)) } if (buffer.nnz != null) { diff --git a/mllib/src/test/scala/org/apache/spark/ml/stat/SummarizerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/stat/SummarizerSuite.scala index b5df4f0cbc12..f145c3dac6f1 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/stat/SummarizerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/stat/SummarizerSuite.scala @@ -233,7 +233,7 @@ class SummarizerSuite extends SparkFunSuite with MLlibTestSparkContext { max = Seq(0.0, 1.0, 2.0), min = Seq(0.0, -1.0, -2.0), normL1 = Seq(0.0, 2.0, 4.0), - normL2 = Seq(0.0, 2.0, 4.0) + normL2 = Seq(0.0, math.sqrt(2.0), math.sqrt(2.0) * 2.0) )) From 3a11d0265ef665a63cd070eeb1ae4ac25bc89908 Mon Sep 17 00:00:00 2001 From: Timothy Hunter Date: Fri, 24 Mar 2017 15:14:06 -0700 Subject: [PATCH 16/26] more tests and debugging --- .../org/apache/spark/ml/stat/Summarizer.scala | 41 +++--- .../spark/ml/stat/SummarizerSuite.scala | 125 +++++++++++++++++- 2 files changed, 135 insertions(+), 31 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala b/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala index ea7fd5db9d09..a8cde2864f34 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala @@ -18,7 +18,7 @@ package org.apache.spark.ml.stat import 
breeze.{linalg => la} -import breeze.linalg.{DenseVector => BDV, SparseVector => BSV, Vector => BV} +import breeze.linalg.{Vector => BV} import breeze.numerics import org.apache.spark.SparkException @@ -217,8 +217,8 @@ object SummaryBuilderImpl extends Logging { case object ComputeM2n extends ComputeMetrics case object ComputeM2 extends ComputeMetrics case object ComputeL1 extends ComputeMetrics - case object ComputeCount extends ComputeMetrics // Always computed - case object ComputeTotalWeightSum extends ComputeMetrics // Always computed + case object ComputeCount extends ComputeMetrics // Always computed -> TODO: remove + case object ComputeTotalWeightSum extends ComputeMetrics // Always computed -> TODO: remove case object ComputeWeightSquareSum extends ComputeMetrics case object ComputeWeightSum extends ComputeMetrics case object ComputeNNZ extends ComputeMetrics @@ -256,6 +256,7 @@ object SummaryBuilderImpl extends Logging { s" max=${v(max)} min=${v(min)})" } } + object Buffer extends Logging { // Recursive function, but the number of cases is really small. def fromMetrics(requested: Seq[ComputeMetrics]): Buffer = { @@ -277,6 +278,15 @@ object SummaryBuilderImpl extends Logging { } } + /** + * (testing only). Makes a buffer with all the metrics enabled. + */ + def allMetrics(): Buffer = { + fromMetrics(Seq(ComputeMean, ComputeM2n, ComputeM2, ComputeL1, ComputeCount, + ComputeTotalWeightSum, ComputeWeightSquareSum, ComputeWeightSum, ComputeNNZ, ComputeMax, + ComputeMin)) + } + val bufferSchema: StructType = { val fields = Seq( "n" -> IntegerType, @@ -443,6 +453,7 @@ object SummaryBuilderImpl extends Logging { require(buffer.mean != null) require(m2n != null) require(weightSum != null) + val denom = totalWeightSum - (totalWeightSquareSum / totalWeightSum) if (denom > 0.0) { val normWs = b(weightSum) :/ totalWeightSum @@ -500,8 +511,6 @@ object SummaryBuilderImpl extends Logging { } } -// private[this] lazy val projection = UnsafeProjection.create(bufferSchema) - // Returns the array at a given index, or null if the array is null. private def nullableArrayD(row: UnsafeRow, ordinal: Int): Array[Double] = { if (row.isNullAt(ordinal)) { @@ -524,24 +533,6 @@ object SummaryBuilderImpl extends Logging { private def bl(x: Array[Long]): BV[Long] = BV.apply(x) - private def maxInPlace(x: Array[Double], y: Array[Double]): Unit = { - var i = 0 - while(i < x.length) { - // Note: do not use conditions, it is wrong when handling NaNs. - x(i) = Math.max(x(i), y(i)) - i += 1 - } - } - - private def minInPlace(x: Array[Double], y: Array[Double]): Unit = { - var i = 0 - while(i < x.length) { - // Note: do not use conditions, it is wrong when handling NaNs. - x(i) = Math.min(x(i), y(i)) - i += 1 - } - } - /** * Sets the content of a buffer based on a single row (initialization). 
* @@ -586,14 +577,14 @@ object SummaryBuilderImpl extends Logging { if (buffer.m2 != null) { buffer.m2 = Array.ofDim(n) v.foreachActive { (index, value) => - buffer.weightSum(index) = w * value * value + buffer.m2(index) = w * value * value } } if (buffer.l1 != null) { buffer.l1 = Array.ofDim(n) v.foreachActive { (index, value) => - buffer.weightSum(index) = w * math.abs(value) + buffer.l1(index) = w * math.abs(value) } } } diff --git a/mllib/src/test/scala/org/apache/spark/ml/stat/SummarizerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/stat/SummarizerSuite.scala index f145c3dac6f1..4bfdb96f5df9 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/stat/SummarizerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/stat/SummarizerSuite.scala @@ -17,9 +17,14 @@ package org.apache.spark.ml.stat +import org.scalatest.exceptions.TestFailedException + import org.apache.spark.{SparkException, SparkFunSuite} import org.apache.spark.ml.linalg.{Vector, Vectors} +import org.apache.spark.ml.stat.SummaryBuilderImpl.Buffer import org.apache.spark.ml.util.TestingUtils._ +import org.apache.spark.mllib.linalg.{Vector => OldVector, Vectors => OldVectors} +import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.{DataFrame, Dataset, Row} import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema @@ -60,15 +65,22 @@ class SummarizerSuite extends SparkFunSuite with MLlibTestSparkContext { case x => throw new Exception(x.toString) } + val s = { + val s2 = new MultivariateOnlineSummarizer + inputVec.foreach(v => s2OldVectors.fromML(v))) + s2 + } + + // Because the Spark context is reset between tests, we cannot hold a reference onto it. def wrapped() = { val df = sc.parallelize(inputVec).map(Tuple1.apply).toDF("features") val c = df.col("features") - df -> c + (df, c) } registerTest(s"$name - mean only") { val (df, c) = wrapped() - compare(df.select(metrics("mean").summary(c), mean(c)), Seq(Row(exp.mean), exp.mean)) + compare(df.select(metrics("mean").summary(c), mean(c)), Seq(Row(exp.mean), s.mean)) } registerTest(s"$name - mean only (direct)") { @@ -79,7 +91,12 @@ class SummarizerSuite extends SparkFunSuite with MLlibTestSparkContext { registerTest(s"$name - variance only") { val (df, c) = wrapped() compare(df.select(metrics("variance").summary(c), variance(c)), - Seq(Row(exp.variance), exp.variance)) + Seq(Row(exp.variance), s.variance)) + } + + registerTest(s"$name - variance only (direct)") { + val (df, c) = wrapped() + compare(df.select(variance(c)), Seq(s.variance)) } registerTest(s"$name - count only") { @@ -88,6 +105,12 @@ class SummarizerSuite extends SparkFunSuite with MLlibTestSparkContext { Seq(Row(exp.count), exp.count)) } + registerTest(s"$name - count only (direct)") { + val (df, c) = wrapped() + compare(df.select(count(c)), + Seq(exp.count)) + } + registerTest(s"$name - numNonZeros only") { val (df, c) = wrapped() compare(df.select(metrics("numNonZeros").summary(c), numNonZeros(c)), @@ -147,6 +170,7 @@ class SummarizerSuite extends SparkFunSuite with MLlibTestSparkContext { val res = row.toSeq println(s"compare: ress=${res.map(_.getClass)}") println(s"compare: row=${row}") + println(s"compare: exp=${exp}") val names = df.schema.fieldNames.zipWithIndex.map { case (n, idx) => s"$n ($idx)" } assert(res.size === exp.size, (res.size, exp.size)) for (((x1, x2), name) <- res.zip(exp).zip(names)) { @@ -156,8 +180,10 @@ class SummarizerSuite extends SparkFunSuite with 
MLlibTestSparkContext { // Compares structured content. private def compareStructures(x1: Any, x2: Any, name: String): Unit = (x1, x2) match { + case (y1: Seq[Double], v1: OldVector) => + compareStructures(y1, v1.toArray.toSeq, name) case (d1: Double, d2: Double) => - assert(Vectors.dense(d1) ~== Vectors.dense(d2) absTol 1e-4, name) + assert2(Vectors.dense(d1) ~== Vectors.dense(d2) absTol 1e-4, name) case (r1: GenericRowWithSchema, r2: Row) => assert(r1.size === r2.size, (r1, r2)) for (((fname, x1), x2) <- r1.schema.fieldNames.zip(r1.toSeq).zip(r2.toSeq)) { @@ -167,7 +193,7 @@ class SummarizerSuite extends SparkFunSuite with MLlibTestSparkContext { assert(r1.size === r2.size, (r1, r2)) for ((x1, x2) <- r1.toSeq.zip(r2.toSeq)) { compareStructures(x1, x2, name) } case (v1: Vector, v2: Vector) => - assert(v1 ~== v2 absTol 1e-4, name) + assert2(v1 ~== v2 absTol 1e-4, name) case (l1: Long, l2: Long) => assert(l1 === l2) case (s1: Seq[_], s2: Seq[_]) => assert(s1.size === s2.size, s"$name ${(s1, s2)}") @@ -179,7 +205,24 @@ class SummarizerSuite extends SparkFunSuite with MLlibTestSparkContext { case _ => throw new Exception(s"$name: ${x1.getClass} ${x2.getClass} $x1 $x2") } + private def assert2(x: => Boolean, hint: String): Unit = { + try { + assert(x, hint) + } catch { + case tfe: TestFailedException => + throw new TestFailedException(Some(s"Failure with hint $hint"), Some(tfe), 1) + } + } + + private def makeBuffer(vecs: Seq[Vector]): Buffer = { + val b = Buffer.allMetrics() + for (v <- vecs) { Buffer.updateInPlace(b, v, 1.0) } + b + } + + private def b(x: Array[Double]): Vector = Vectors.dense(x) + private def l(x: Array[Long]): Vector = b(x.map(_.toDouble)) test("debugging test") { val df = denseData(Nil) @@ -227,7 +270,8 @@ class SummarizerSuite extends SparkFunSuite with MLlibTestSparkContext { testExample("two elements", Seq(Seq(0.0, 1.0, 2.0), Seq(0.0, -1.0, -2.0)), ExpectedMetrics( mean = Seq(0.0, 0.0, 0.0), - variance = Seq(0.0, 1.0, 2.0), + // TODO: I have a doubt about these values, they are not normalized. 
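These values follow from the unbiased (n - 1 normalized) estimator that MultivariateOnlineSummarizer also uses: for two rows the per-coordinate variance is ((x1 - m)^2 + (x2 - m)^2) / (2 - 1). With inputs (0.0, 1.0, 2.0) and (0.0, -1.0, -2.0) the means are (0, 0, 0), giving variances (0, 1 + 1, 4 + 4) = (0, 2, 8). Likewise normL2 is the square root of the unweighted sum of squares per coordinate, i.e. (0, sqrt(2), 2 * sqrt(2)), matching the correction made in the previous patch.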
+ variance = Seq(0.0, 2.0, 8.0), count = 2, numNonZeros = Seq(0, 2, 2), max = Seq(0.0, 1.0, 2.0), @@ -236,6 +280,75 @@ class SummarizerSuite extends SparkFunSuite with MLlibTestSparkContext { normL2 = Seq(0.0, math.sqrt(2.0), math.sqrt(2.0) * 2.0) )) + testExample("dense vector input", + Seq(Seq(-1.0, 0.0, 6.0), Seq(3.0, -3.0, 0.0)), + ExpectedMetrics( + mean = Seq(1.0, -1.5, 3.0), + variance = Seq(8.0, 4.5, 18.0), + count = 2, + numNonZeros = Seq(2, 1, 1), + max = Seq(3.0, 0.0, 6.0), + min = Seq(-1.0, -3, 0.0), + normL1 = Seq(4.0, 4.0, 6.0), + normL2 = Seq(0.0, 0.0, 0.0) + )) + + test("mixing dense and sparse vector input") { + val summarizer = makeBuffer(Seq( + Vectors.sparse(3, Seq((0, -2.0), (1, 2.3))), + Vectors.dense(0.0, -1.0, -3.0), + Vectors.sparse(3, Seq((1, -5.1))), + Vectors.dense(3.8, 0.0, 1.9), + Vectors.dense(1.7, -0.6, 0.0), + Vectors.sparse(3, Seq((1, 1.9), (2, 0.0))))) + + assert(b(Buffer.mean(summarizer)) ~== + Vectors.dense(0.583333333333, -0.416666666666, -0.183333333333) absTol 1E-5, "mean mismatch") + + assert(b(Buffer.min(summarizer)) ~== Vectors.dense(-2.0, -5.1, -3) absTol 1E-5, "min " + + "mismatch") + + assert(b(Buffer.max(summarizer)) ~== Vectors.dense(3.8, 2.3, 1.9) absTol 1E-5, "max mismatch") + + assert(l(Buffer.nnz(summarizer)) ~== Vectors.dense(3, 5, 2) absTol 1E-5, "numNonzeros mismatch") + + assert(b(Buffer.variance(summarizer)) ~== + Vectors.dense(3.857666666666, 7.0456666666666, 2.48166666666666) absTol 1E-5, + "variance mismatch") + + assert(Buffer.totalCount(summarizer) === 6) + } + + + test("merging two summarizers") { + val summarizer1 = makeBuffer(Seq( + Vectors.sparse(3, Seq((0, -2.0), (1, 2.3))), + Vectors.dense(0.0, -1.0, -3.0))) + + val summarizer2 = makeBuffer(Seq( + Vectors.sparse(3, Seq((1, -5.1))), + Vectors.dense(3.8, 0.0, 1.9), + Vectors.dense(1.7, -0.6, 0.0), + Vectors.sparse(3, Seq((1, 1.9), (2, 0.0))))) + + val summarizer = Buffer.mergeBuffers(summarizer1, summarizer2) + + assert(b(Buffer.mean(summarizer)) ~== + Vectors.dense(0.583333333333, -0.416666666666, -0.183333333333) absTol 1E-5, "mean mismatch") + + assert(b(Buffer.min(summarizer)) ~== Vectors.dense(-2.0, -5.1, -3) absTol 1E-5, "min mismatch") + + assert(b(Buffer.max(summarizer)) ~== Vectors.dense(3.8, 2.3, 1.9) absTol 1E-5, "max mismatch") + + assert(l(Buffer.nnz(summarizer)) ~== Vectors.dense(3, 5, 2) absTol 1E-5, "numNonzeros mismatch") + + assert(b(Buffer.variance(summarizer)) ~== + + Vectors.dense(3.857666666666, 7.0456666666666, 2.48166666666666) absTol 1E-5, + "variance mismatch") + + assert(Buffer.totalCount(summarizer) === 6) + } } From 6d26c17d0bd4ab18d564ee7f37916780211702d5 Mon Sep 17 00:00:00 2001 From: Timothy Hunter Date: Fri, 24 Mar 2017 16:12:19 -0700 Subject: [PATCH 17/26] fixed tests --- .../org/apache/spark/ml/stat/Summarizer.scala | 23 +++++++++++-------- .../stat/MultivariateOnlineSummarizer.scala | 20 ++++++++++++++++ .../spark/ml/stat/SummarizerSuite.scala | 19 +++++++++++---- 3 files changed, 48 insertions(+), 14 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala b/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala index a8cde2864f34..d6704799e104 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala @@ -368,7 +368,7 @@ object SummaryBuilderImpl extends Logging { */ @throws[SparkException]("When the buffers are not compatible") def mergeBuffers(buffer: Buffer, other: Buffer): Buffer = { - println(s"mergeBuffers: 
buffer=$buffer other=$buffer") + println(s"mergeBuffers: buffer=$buffer other=$other") if (buffer.n == -1) { // buffer is not initialized. if (other.n == -1) { @@ -624,7 +624,7 @@ object SummaryBuilderImpl extends Logging { // Mandatory scalar values buffer.totalWeightSquareSum += other.totalWeightSquareSum buffer.totalWeightSum += other.totalWeightSum - buffer.totalCount += buffer.totalCount + buffer.totalCount += other.totalCount // Keep the original weight sums. val weightSum1 = if (buffer.weightSum == null) null else { buffer.weightSum.clone() } val weightSum2 = if (other.weightSum == null) null else { other.weightSum.clone() } @@ -634,29 +634,32 @@ object SummaryBuilderImpl extends Logging { // This is not going to change the value of the resul since the numerator will also be zero. val weightSum: BV[Double] = if (weightSum1 == null) null else { require(weightSum2 != null, s"buffer=$buffer other=$other") - val b1 = b(weightSum1) :+ b(weightSum2) - la.max(b1, Double.MinPositiveValue) + val x = b(weightSum1) :+ b(weightSum2) + la.max(x, Double.MinPositiveValue) } // Since the operations are dense, we can directly use BLAS calls here. - val deltaMean: Array[Double] = if (buffer.mean != null) { + val deltaMean: BV[Double] = if (buffer.mean != null) { require(other.mean != null) - val arr: Array[Double] = Array.ofDim(buffer.n) - b(arr) :+= b(other.mean) :- b(buffer.mean) - arr + b(other.mean) :- b(buffer.mean) } else { null } if (buffer.mean != null) { require(other.mean != null) require(weightSum != null) - b(buffer.mean) :+= b(deltaMean) :* (b(weightSum2) :/ weightSum) + b(buffer.mean) :+= deltaMean :* (b(weightSum2) :/ weightSum) } if (buffer.m2n != null) { require(other.m2n != null) val w = (b(weightSum1) :* b(weightSum2)) :/ weightSum - b(buffer.m2n) :+= b(other.m2n) :+ (b(deltaMean) :* b(deltaMean)) :* w + val z = (deltaMean :* deltaMean) :* w + println(s"weightSum1=$weightSum1 weightSum2=$weightSum2 weightSum=$weightSum z=$z") + println(s"mergeInitializedBuffers: buffer.m2n=${b(buffer.m2n)}") + println(s"mergeInitializedBuffers: other.m2n=${b(other.m2n)}") + b(buffer.m2n) :+= b(other.m2n) :+ z + println(s"mergeInitializedBuffers: buffer.m2n_2=${b(buffer.m2n)}") } if (buffer.m2 != null) { diff --git a/mllib/src/main/scala/org/apache/spark/mllib/stat/MultivariateOnlineSummarizer.scala b/mllib/src/main/scala/org/apache/spark/mllib/stat/MultivariateOnlineSummarizer.scala index 7dc0c459ec03..d40d64bf9aa4 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/stat/MultivariateOnlineSummarizer.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/stat/MultivariateOnlineSummarizer.scala @@ -20,6 +20,8 @@ package org.apache.spark.mllib.stat import org.apache.spark.annotation.{DeveloperApi, Since} import org.apache.spark.mllib.linalg.{Vector, Vectors} +// scalastyle:off println + /** * :: DeveloperApi :: * MultivariateOnlineSummarizer implements [[MultivariateStatisticalSummary]] to compute the mean, @@ -57,6 +59,17 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S private var currMax: Array[Double] = _ private var currMin: Array[Double] = _ + override def toString: String = { + def v(x: Array[Double]) = if (x==null) "null" else x.toSeq.mkString("[", " ", "]") + def vl(x: Array[Long]) = if (x==null) "null" else x.toSeq.mkString("[", " ", "]") + + s"MultivariateOnlineSummarizer(n=$n mean=${v(currMean)} m2n=${v(currM2n)} m2=${v(currM2)} " + + s"l1=${v(currL1)}" + + s" totalCount=$totalCnt totalWeightSum=$totalWeightSum" + + s" 
totalWeightSquareSum=$weightSquareSum weightSum=${v(weightSum)} nnz=${vl(nnz)}" + + s" max=${v(currMax)} min=${v(currMin)})" + } + /** * Add a new sample to this summarizer, and update the statistical summary. * @@ -131,6 +144,8 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S */ @Since("1.1.0") def merge(other: MultivariateOnlineSummarizer): this.type = { + println(s"MultivariateOnlineSummarizer:merge: this=$this") + println(s"MultivariateOnlineSummarizer:merge: other=$other") if (this.totalWeightSum != 0.0 && other.totalWeightSum != 0.0) { require(n == other.n, s"Dimensions mismatch when merging with another summarizer. " + s"Expecting $n but got ${other.n}.") @@ -148,7 +163,11 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S // merge mean together currMean(i) += deltaMean * otherNnz / totalNnz // merge m2n together + val z = deltaMean * deltaMean * thisNnz * otherNnz / totalNnz + println(s"i=$i thisNnz=$thisNnz otherNnz=$otherNnz totalNnz=$totalNnz z=$z") + println(s"i=$i currM2n=${currM2n(i)} other.currM2n=${other.currM2n(i)}") currM2n(i) += other.currM2n(i) + deltaMean * deltaMean * thisNnz * otherNnz / totalNnz + println(s"i=$i >> currM2n=${currM2n(i)} other.currM2n=${other.currM2n(i)}") // merge m2 together currM2(i) += other.currM2(i) // merge l1 together @@ -175,6 +194,7 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S this.currMax = other.currMax.clone() this.currMin = other.currMin.clone() } + println(s"MultivariateOnlineSummarizer:merge(2): this=$this") this } diff --git a/mllib/src/test/scala/org/apache/spark/ml/stat/SummarizerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/stat/SummarizerSuite.scala index 4bfdb96f5df9..9a54a386b953 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/stat/SummarizerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/stat/SummarizerSuite.scala @@ -67,7 +67,7 @@ class SummarizerSuite extends SparkFunSuite with MLlibTestSparkContext { val s = { val s2 = new MultivariateOnlineSummarizer - inputVec.foreach(v => s2OldVectors.fromML(v))) + inputVec.foreach(v => s2.add(OldVectors.fromML(v))) s2 } @@ -289,8 +289,8 @@ class SummarizerSuite extends SparkFunSuite with MLlibTestSparkContext { numNonZeros = Seq(2, 1, 1), max = Seq(3.0, 0.0, 6.0), min = Seq(-1.0, -3, 0.0), - normL1 = Seq(4.0, 4.0, 6.0), - normL2 = Seq(0.0, 0.0, 0.0) + normL1 = Seq(4.0, 3.0, 6.0), + normL2 = Seq(math.sqrt(10), 3, 6.0) )) test("mixing dense and sparse vector input") { @@ -324,14 +324,25 @@ class SummarizerSuite extends SparkFunSuite with MLlibTestSparkContext { val summarizer1 = makeBuffer(Seq( Vectors.sparse(3, Seq((0, -2.0), (1, 2.3))), Vectors.dense(0.0, -1.0, -3.0))) +// val s1 = new MultivariateOnlineSummarizer +// Seq( +// Vectors.sparse(3, Seq((0, -2.0), (1, 2.3))), +// Vectors.dense(0.0, -1.0, -3.0)).foreach(v => s1.add(OldVectors.fromML(v))) val summarizer2 = makeBuffer(Seq( Vectors.sparse(3, Seq((1, -5.1))), Vectors.dense(3.8, 0.0, 1.9), Vectors.dense(1.7, -0.6, 0.0), Vectors.sparse(3, Seq((1, 1.9), (2, 0.0))))) +// val s2 = new MultivariateOnlineSummarizer +// Seq( +// Vectors.sparse(3, Seq((1, -5.1))), +// Vectors.dense(3.8, 0.0, 1.9), +// Vectors.dense(1.7, -0.6, 0.0), +// Vectors.sparse(3, Seq((1, 1.9), (2, 0.0)))).foreach(v => s2.add(OldVectors.fromML(v))) val summarizer = Buffer.mergeBuffers(summarizer1, summarizer2) +// val s = s1.merge(s2) assert(b(Buffer.mean(summarizer)) ~== Vectors.dense(0.583333333333, -0.416666666666, -0.183333333333) 
absTol 1E-5, "mean mismatch") @@ -343,7 +354,6 @@ class SummarizerSuite extends SparkFunSuite with MLlibTestSparkContext { assert(l(Buffer.nnz(summarizer)) ~== Vectors.dense(3, 5, 2) absTol 1E-5, "numNonzeros mismatch") assert(b(Buffer.variance(summarizer)) ~== - Vectors.dense(3.857666666666, 7.0456666666666, 2.48166666666666) absTol 1E-5, "variance mismatch") @@ -351,4 +361,5 @@ class SummarizerSuite extends SparkFunSuite with MLlibTestSparkContext { } + } From 35eaeb0d02ae9cc29ae559231fe4858935315477 Mon Sep 17 00:00:00 2001 From: Timothy Hunter Date: Fri, 24 Mar 2017 16:23:15 -0700 Subject: [PATCH 18/26] doc --- .../main/scala/org/apache/spark/ml/stat/Summarizer.scala | 1 + .../apache/spark/sql/catalyst/expressions/Projection.scala | 6 ++++++ 2 files changed, 7 insertions(+) diff --git a/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala b/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala index d6704799e104..536ef85d8a18 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala @@ -393,6 +393,7 @@ object SummaryBuilderImpl extends Logging { * Reads a buffer from a serialized form, using the row object as an assistant. */ def read(bytes: Array[Byte]): Buffer = { + // TODO move this row outside to the aggregate assert(numFields == 12, numFields) val row3 = new UnsafeRow(numFields) row3.pointTo(bytes.clone(), bytes.length) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Projection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Projection.scala index 7c57025f995d..64b94f0a2c10 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Projection.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Projection.scala @@ -101,6 +101,8 @@ case class InterpretedMutableProjection(expressions: Seq[Expression]) extends Mu /** * A projection that returns UnsafeRow. + * + * CAUTION: the returned projection object should *not* be assumed to be thread-safe. */ abstract class UnsafeProjection extends Projection { override def apply(row: InternalRow): UnsafeRow @@ -110,11 +112,15 @@ object UnsafeProjection { /** * Returns an UnsafeProjection for given StructType. + * + * CAUTION: the returned projection object is *not* thread-safe. */ def create(schema: StructType): UnsafeProjection = create(schema.fields.map(_.dataType)) /** * Returns an UnsafeProjection for given Array of DataTypes. + * + * CAUTION: the returned projection object is *not* thread-safe. 
*/ def create(fields: Array[DataType]): UnsafeProjection = { create(fields.zipWithIndex.map(x => BoundReference(x._2, x._1, true))) From 58b17dc3af5ebc5eaeff82559bc3be3a1f64ef38 Mon Sep 17 00:00:00 2001 From: Timothy Hunter Date: Fri, 24 Mar 2017 16:30:40 -0700 Subject: [PATCH 19/26] cleanups --- .../org/apache/spark/ml/stat/Summarizer.scala | 28 ++----------------- .../stat/MultivariateOnlineSummarizer.scala | 8 ------ 2 files changed, 3 insertions(+), 33 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala b/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala index 536ef85d8a18..b6832375135a 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala @@ -22,19 +22,15 @@ import breeze.linalg.{Vector => BV} import breeze.numerics import org.apache.spark.SparkException -import org.apache.spark.annotation.{DeveloperApi, Since} +import org.apache.spark.annotation.Since import org.apache.spark.internal.Logging import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors, VectorUDT} -import org.apache.spark.sql.{Column, Row} +import org.apache.spark.sql.Column import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Expression, UnsafeArrayData, UnsafeProjection, UnsafeRow} -import org.apache.spark.sql.catalyst.expressions.aggregate.{AggregateExpression, Complete, Final, TypedImperativeAggregate} -import org.apache.spark.sql.catalyst.util.GenericArrayData -import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction, UserDefinedFunction} +import org.apache.spark.sql.catalyst.expressions.aggregate.{AggregateExpression, Complete, TypedImperativeAggregate} import org.apache.spark.sql.types._ -// scalastyle:off println - /** * A builder object that provides summary statistics about a given column. @@ -131,9 +127,6 @@ private[ml] class SummaryBuilderImpl( override def summary(column: Column): Column = { val start = SummaryBuilderImpl.Buffer.fromMetrics(requestedCompMetrics) - println(s"summary: requestedMetrics=$requestedMetrics") - println(s"summary: requestedCompMetrics=$requestedCompMetrics") - println(s"summary: start=$start") val agg = SummaryBuilderImpl.MetricsAggregate( requestedMetrics, start, @@ -308,7 +301,6 @@ object SummaryBuilderImpl extends Logging { val numFields = bufferSchema.fields.length def updateInPlace(buffer: Buffer, v: Vector, w: Double): Unit = { - println(s"updateInPlace: buffer=$buffer v=$v w=$w") val startN = buffer.n if (startN == -1) { // The buffer was not initialized, we initialize it with the incoming row. @@ -360,7 +352,6 @@ object SummaryBuilderImpl extends Logging { } } } - println(s"updateInPlace:2: buffer=$buffer v=$v w=$w") } /** @@ -368,7 +359,6 @@ object SummaryBuilderImpl extends Logging { */ @throws[SparkException]("When the buffers are not compatible") def mergeBuffers(buffer: Buffer, other: Buffer): Buffer = { - println(s"mergeBuffers: buffer=$buffer other=$other") if (buffer.n == -1) { // buffer is not initialized. 
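When both sides are initialized, the merge uses the standard pairwise update for means and centered second moments: per coordinate, with w = w1 + w2 and delta = mean2 - mean1,

    mean <- mean1 + delta * w2 / w
    m2n  <- m2n1 + m2n2 + delta * delta * w1 * w2 / w

which is the same update MultivariateOnlineSummarizer.merge performs with the per-coordinate weight sums. The epsilon added to w earlier in this series only guards the division when a coordinate's weight is zero on both sides, in which case the numerator is zero as well.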
if (other.n == -1) { @@ -436,17 +426,14 @@ object SummaryBuilderImpl extends Logging { } def mean(buffer: Buffer): Array[Double] = { - println(s"Buffer:mean: buffer=$buffer") require(buffer.totalWeightSum > 0) require(buffer.mean != null) require(buffer.weightSum != null) val res = b(buffer.mean) :* b(buffer.weightSum) :/ buffer.totalWeightSum - println(s"Buffer:mean: buffer=$buffer res=$res") res.toArray } def variance(buffer: Buffer): Array[Double] = { - println(s"Buffer:variance: buffer=$buffer") import buffer._ require(n >= 0, n) require(totalWeightSum > 0, totalWeightSum) @@ -618,7 +605,6 @@ object SummaryBuilderImpl extends Logging { * Merges other into buffer. */ private def mergeInitializedBuffers(buffer: Buffer, other: Buffer): Unit = { - println(s"mergeInitializedBuffers: buffer=$buffer other=$other") // Each buffer needs to be properly initialized. require(buffer.n > 0 && other.n > 0, (buffer.n, other.n)) require(buffer.n == other.n, (buffer.n, other.n)) @@ -656,11 +642,7 @@ object SummaryBuilderImpl extends Logging { require(other.m2n != null) val w = (b(weightSum1) :* b(weightSum2)) :/ weightSum val z = (deltaMean :* deltaMean) :* w - println(s"weightSum1=$weightSum1 weightSum2=$weightSum2 weightSum=$weightSum z=$z") - println(s"mergeInitializedBuffers: buffer.m2n=${b(buffer.m2n)}") - println(s"mergeInitializedBuffers: other.m2n=${b(other.m2n)}") b(buffer.m2n) :+= b(other.m2n) :+ z - println(s"mergeInitializedBuffers: buffer.m2n_2=${b(buffer.m2n)}") } if (buffer.m2 != null) { @@ -692,7 +674,6 @@ object SummaryBuilderImpl extends Logging { require(other.weightSum != null) b(buffer.weightSum) :+= b(other.weightSum) } - println(s"mergeInitializedBuffers:2: buffer=$buffer other=$other") } } @@ -723,7 +704,6 @@ object SummaryBuilderImpl extends Logging { override def children: Seq[Expression] = child :: Nil override def update(buff: Buffer, row: InternalRow): Buffer = { - println(s"UPDATE: ROW=$row") // Unsafe rows do not play well with UDTs, it seems. // Directly call the deserializer. 
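The reason for going through getStruct here is that VectorUDT stores a vector as a four-field struct (type: byte, size: int, indices: array&lt;int&gt;, values: array&lt;double&gt;), so udt.sqlType.size is 4 and that struct has to be handed back to the UDT's deserialize method to rebuild an ml.linalg.Vector; reading the column directly as a Vector from an UnsafeRow does not work, which is what the comment above alludes to.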
val v = udt.deserialize(row.getStruct(0, udt.sqlType.size)) @@ -748,9 +728,7 @@ object SummaryBuilderImpl extends Logging { override def serialize(buff: Buffer): Array[Byte] = { val x = Buffer.write(buff) - println(s"serialize: ${buff.hashCode()} x=${x.length} buff=$buff") val b2 = deserialize(x) - println(s"serialize: ${buff.hashCode()} b2=$b2") x } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/stat/MultivariateOnlineSummarizer.scala b/mllib/src/main/scala/org/apache/spark/mllib/stat/MultivariateOnlineSummarizer.scala index d40d64bf9aa4..c1a66cae3837 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/stat/MultivariateOnlineSummarizer.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/stat/MultivariateOnlineSummarizer.scala @@ -20,8 +20,6 @@ package org.apache.spark.mllib.stat import org.apache.spark.annotation.{DeveloperApi, Since} import org.apache.spark.mllib.linalg.{Vector, Vectors} -// scalastyle:off println - /** * :: DeveloperApi :: * MultivariateOnlineSummarizer implements [[MultivariateStatisticalSummary]] to compute the mean, @@ -144,8 +142,6 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S */ @Since("1.1.0") def merge(other: MultivariateOnlineSummarizer): this.type = { - println(s"MultivariateOnlineSummarizer:merge: this=$this") - println(s"MultivariateOnlineSummarizer:merge: other=$other") if (this.totalWeightSum != 0.0 && other.totalWeightSum != 0.0) { require(n == other.n, s"Dimensions mismatch when merging with another summarizer. " + s"Expecting $n but got ${other.n}.") @@ -164,10 +160,7 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S currMean(i) += deltaMean * otherNnz / totalNnz // merge m2n together val z = deltaMean * deltaMean * thisNnz * otherNnz / totalNnz - println(s"i=$i thisNnz=$thisNnz otherNnz=$otherNnz totalNnz=$totalNnz z=$z") - println(s"i=$i currM2n=${currM2n(i)} other.currM2n=${other.currM2n(i)}") currM2n(i) += other.currM2n(i) + deltaMean * deltaMean * thisNnz * otherNnz / totalNnz - println(s"i=$i >> currM2n=${currM2n(i)} other.currM2n=${other.currM2n(i)}") // merge m2 together currM2(i) += other.currM2(i) // merge l1 together @@ -194,7 +187,6 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S this.currMax = other.currMax.clone() this.currMin = other.currMin.clone() } - println(s"MultivariateOnlineSummarizer:merge(2): this=$this") this } From 18078c1b82a55ac27ae241796c30f284c705ed42 Mon Sep 17 00:00:00 2001 From: Timothy Hunter Date: Fri, 24 Mar 2017 16:32:11 -0700 Subject: [PATCH 20/26] cleanups --- .../spark/ml/stat/SummarizerSuite.scala | 29 +------------------ 1 file changed, 1 insertion(+), 28 deletions(-) diff --git a/mllib/src/test/scala/org/apache/spark/ml/stat/SummarizerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/stat/SummarizerSuite.scala index 9a54a386b953..781340887e46 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/stat/SummarizerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/stat/SummarizerSuite.scala @@ -26,19 +26,14 @@ import org.apache.spark.ml.util.TestingUtils._ import org.apache.spark.mllib.linalg.{Vector => OldVector, Vectors => OldVectors} import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer import org.apache.spark.mllib.util.MLlibTestSparkContext -import org.apache.spark.sql.{DataFrame, Dataset, Row} +import org.apache.spark.sql.{DataFrame, Row} import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema -// scalastyle:off println - class SummarizerSuite 
extends SparkFunSuite with MLlibTestSparkContext { import testImplicits._ import Summarizer._ - private val seed = 42 - private val eps: Double = 1e-5 - private case class ExpectedMetrics( mean: Seq[Double], variance: Seq[Double], @@ -163,14 +158,9 @@ class SummarizerSuite extends SparkFunSuite with MLlibTestSparkContext { } private def compare(df: DataFrame, exp: Seq[Any]): Unit = { - println(s"compare: df=$df") val coll = df.collect().toSeq - println(s"compare: coll=$coll") val Seq(row) = coll val res = row.toSeq - println(s"compare: ress=${res.map(_.getClass)}") - println(s"compare: row=${row}") - println(s"compare: exp=${exp}") val names = df.schema.fieldNames.zipWithIndex.map { case (n, idx) => s"$n ($idx)" } assert(res.size === exp.size, (res.size, exp.size)) for (((x1, x2), name) <- res.zip(exp).zip(names)) { @@ -226,13 +216,9 @@ class SummarizerSuite extends SparkFunSuite with MLlibTestSparkContext { test("debugging test") { val df = denseData(Nil) - println(s">>> df=${df.collect().toSeq}") val c = df.col("features") - println(s">>>>> c=$c") val c1 = metrics("mean").summary(c) - println(s">>>>> c1=$c1") val res = df.select(c1) - println(s">>>>> res=$res") intercept[SparkException] { compare(res, Seq.empty) } @@ -324,25 +310,14 @@ class SummarizerSuite extends SparkFunSuite with MLlibTestSparkContext { val summarizer1 = makeBuffer(Seq( Vectors.sparse(3, Seq((0, -2.0), (1, 2.3))), Vectors.dense(0.0, -1.0, -3.0))) -// val s1 = new MultivariateOnlineSummarizer -// Seq( -// Vectors.sparse(3, Seq((0, -2.0), (1, 2.3))), -// Vectors.dense(0.0, -1.0, -3.0)).foreach(v => s1.add(OldVectors.fromML(v))) val summarizer2 = makeBuffer(Seq( Vectors.sparse(3, Seq((1, -5.1))), Vectors.dense(3.8, 0.0, 1.9), Vectors.dense(1.7, -0.6, 0.0), Vectors.sparse(3, Seq((1, 1.9), (2, 0.0))))) -// val s2 = new MultivariateOnlineSummarizer -// Seq( -// Vectors.sparse(3, Seq((1, -5.1))), -// Vectors.dense(3.8, 0.0, 1.9), -// Vectors.dense(1.7, -0.6, 0.0), -// Vectors.sparse(3, Seq((1, 1.9), (2, 0.0)))).foreach(v => s2.add(OldVectors.fromML(v))) val summarizer = Buffer.mergeBuffers(summarizer1, summarizer2) -// val s = s1.merge(s2) assert(b(Buffer.mean(summarizer)) ~== Vectors.dense(0.583333333333, -0.416666666666, -0.183333333333) absTol 1E-5, "mean mismatch") @@ -360,6 +335,4 @@ class SummarizerSuite extends SparkFunSuite with MLlibTestSparkContext { assert(Buffer.totalCount(summarizer) === 6) } - - } From ffe5cfe2e5a60f3f6056d8f89f7c50bbedca3fbf Mon Sep 17 00:00:00 2001 From: Timothy Hunter Date: Fri, 24 Mar 2017 16:41:24 -0700 Subject: [PATCH 21/26] Cleanups --- .../org/apache/spark/ml/stat/Summarizer.scala | 46 ++++++++----------- .../spark/ml/stat/SummarizerSuite.scala | 6 +-- 2 files changed, 21 insertions(+), 31 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala b/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala index b6832375135a..b87087168e71 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala @@ -176,15 +176,14 @@ object SummaryBuilderImpl extends Logging { * metrics that need to de computed internally to get the final result. 
*/ private val allMetrics: Seq[(String, Metrics, DataType, Seq[ComputeMetrics])] = Seq( - ("mean", Mean, arrayDType, Seq(ComputeMean, ComputeWeightSum, ComputeTotalWeightSum)), - ("variance", Variance, arrayDType, Seq(ComputeTotalWeightSum, ComputeWeightSum, - ComputeWeightSquareSum, ComputeMean, ComputeM2n)), - ("count", Count, LongType, Seq(ComputeCount)), + ("mean", Mean, arrayDType, Seq(ComputeMean, ComputeWeightSum)), + ("variance", Variance, arrayDType, Seq(ComputeWeightSum, ComputeMean, ComputeM2n)), + ("count", Count, LongType, Seq()), ("numNonZeros", NumNonZeros, arrayLType, Seq(ComputeNNZ)), ("max", Max, arrayDType, Seq(ComputeMax)), ("min", Min, arrayDType, Seq(ComputeMin)), - ("normL2", NormL2, arrayDType, Seq(ComputeTotalWeightSum, ComputeM2)), - ("normL1", NormL1, arrayDType, Seq(ComputeTotalWeightSum, ComputeL1)) + ("normL2", NormL2, arrayDType, Seq(ComputeM2)), + ("normL1", NormL1, arrayDType, Seq(ComputeL1)) ) /** @@ -210,9 +209,6 @@ object SummaryBuilderImpl extends Logging { case object ComputeM2n extends ComputeMetrics case object ComputeM2 extends ComputeMetrics case object ComputeL1 extends ComputeMetrics - case object ComputeCount extends ComputeMetrics // Always computed -> TODO: remove - case object ComputeTotalWeightSum extends ComputeMetrics // Always computed -> TODO: remove - case object ComputeWeightSquareSum extends ComputeMetrics case object ComputeWeightSum extends ComputeMetrics case object ComputeNNZ extends ComputeMetrics case object ComputeMax extends ComputeMetrics @@ -275,8 +271,8 @@ object SummaryBuilderImpl extends Logging { * (testing only). Makes a buffer with all the metrics enabled. */ def allMetrics(): Buffer = { - fromMetrics(Seq(ComputeMean, ComputeM2n, ComputeM2, ComputeL1, ComputeCount, - ComputeTotalWeightSum, ComputeWeightSquareSum, ComputeWeightSum, ComputeNNZ, ComputeMax, + fromMetrics(Seq(ComputeMean, ComputeM2n, ComputeM2, ComputeL1, + ComputeWeightSum, ComputeNNZ, ComputeMax, ComputeMin)) } @@ -382,12 +378,9 @@ object SummaryBuilderImpl extends Logging { /** * Reads a buffer from a serialized form, using the row object as an assistant. */ - def read(bytes: Array[Byte]): Buffer = { - // TODO move this row outside to the aggregate - assert(numFields == 12, numFields) - val row3 = new UnsafeRow(numFields) - row3.pointTo(bytes.clone(), bytes.length) - val row = row3.getStruct(0, numFields) + def read(bytes: Array[Byte], backingRow: UnsafeRow): Buffer = { + backingRow.pointTo(bytes.clone(), bytes.length) + val row = backingRow.getStruct(0, numFields) new Buffer( n = row.getInt(0), mean = nullableArrayD(row, 1), @@ -405,7 +398,7 @@ object SummaryBuilderImpl extends Logging { } - def write(buffer: Buffer): Array[Byte] = { + def write(buffer: Buffer, project: UnsafeProjection): Array[Byte] = { val ir = InternalRow.apply( buffer.n, gadD(buffer.mean), @@ -420,9 +413,7 @@ object SummaryBuilderImpl extends Logging { gadD(buffer.max), gadD(buffer.min) ) - // TODO: the projection should be passed as an argument. - val projection = UnsafeProjection.create(bufferSchema) - projection.apply(ir).getBytes + project.apply(ir).getBytes } def mean(buffer: Buffer): Array[Double] = { @@ -685,7 +676,10 @@ object SummaryBuilderImpl extends Logging { inputAggBufferOffset: Int) extends TypedImperativeAggregate[Buffer] { -// private lazy val row = new UnsafeRow(Buffer.numFields) + // These objects are not thread-safe, allocate them in the aggregator. 
+ private[this] lazy val row = new UnsafeRow(Buffer.numFields) + private[this] lazy val projection = UnsafeProjection.create(Buffer.bufferSchema) + override def eval(buff: Buffer): InternalRow = { val metrics = requested.map({ @@ -727,15 +721,11 @@ object SummaryBuilderImpl extends Logging { override def createAggregationBuffer(): Buffer = startBuffer.copy() override def serialize(buff: Buffer): Array[Byte] = { - val x = Buffer.write(buff) - val b2 = deserialize(x) - x + Buffer.write(buff, projection) } override def deserialize(bytes: Array[Byte]): Buffer = { -// Buffer.read(bytes, row) - assert(Buffer.numFields == 12, Buffer.numFields) - Buffer.read(bytes) + Buffer.read(bytes, row) } override def withNewMutableAggBufferOffset(newMutableAggBufferOffset: Int): MetricsAggregate = { diff --git a/mllib/src/test/scala/org/apache/spark/ml/stat/SummarizerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/stat/SummarizerSuite.scala index 781340887e46..3181886fb08a 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/stat/SummarizerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/stat/SummarizerSuite.scala @@ -54,8 +54,8 @@ class SummarizerSuite extends SparkFunSuite with MLlibTestSparkContext { private def testExample(name: String, input: Seq[Any], exp: ExpectedMetrics): Unit = { def inputVec: Seq[Vector] = input.map { - case x: Array[Double] => Vectors.dense(x) - case x: Seq[Double] => Vectors.dense(x.toArray) + case x: Array[Double @unchecked] => Vectors.dense(x) + case x: Seq[Double @unchecked] => Vectors.dense(x.toArray) case x: Vector => x case x => throw new Exception(x.toString) } @@ -170,7 +170,7 @@ class SummarizerSuite extends SparkFunSuite with MLlibTestSparkContext { // Compares structured content. private def compareStructures(x1: Any, x2: Any, name: String): Unit = (x1, x2) match { - case (y1: Seq[Double], v1: OldVector) => + case (y1: Seq[Double @unchecked], v1: OldVector) => compareStructures(y1, v1.toArray.toSeq, name) case (d1: Double, d2: Double) => assert2(Vectors.dense(d1) ~== Vectors.dense(d2) absTol 1e-4, name) From 41f4be6044588dc1db7a8366282fd8aeb5343483 Mon Sep 17 00:00:00 2001 From: Timothy Hunter Date: Fri, 24 Mar 2017 16:43:14 -0700 Subject: [PATCH 22/26] Cleanups --- .../apache/spark/mllib/stat/MultivariateOnlineSummarizer.scala | 1 - 1 file changed, 1 deletion(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/stat/MultivariateOnlineSummarizer.scala b/mllib/src/main/scala/org/apache/spark/mllib/stat/MultivariateOnlineSummarizer.scala index c1a66cae3837..20e2cfc7ccdf 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/stat/MultivariateOnlineSummarizer.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/stat/MultivariateOnlineSummarizer.scala @@ -159,7 +159,6 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S // merge mean together currMean(i) += deltaMean * otherNnz / totalNnz // merge m2n together - val z = deltaMean * deltaMean * thisNnz * otherNnz / totalNnz currM2n(i) += other.currM2n(i) + deltaMean * deltaMean * thisNnz * otherNnz / totalNnz // merge m2 together currM2(i) += other.currM2(i) From ba200bb1d52d468acebc9fe44f6727c570a35335 Mon Sep 17 00:00:00 2001 From: Timothy Hunter Date: Fri, 24 Mar 2017 16:43:32 -0700 Subject: [PATCH 23/26] Cleanups --- .../mllib/stat/MultivariateOnlineSummarizer.scala | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/stat/MultivariateOnlineSummarizer.scala 
b/mllib/src/main/scala/org/apache/spark/mllib/stat/MultivariateOnlineSummarizer.scala index 20e2cfc7ccdf..7dc0c459ec03 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/stat/MultivariateOnlineSummarizer.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/stat/MultivariateOnlineSummarizer.scala @@ -57,17 +57,6 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S private var currMax: Array[Double] = _ private var currMin: Array[Double] = _ - override def toString: String = { - def v(x: Array[Double]) = if (x==null) "null" else x.toSeq.mkString("[", " ", "]") - def vl(x: Array[Long]) = if (x==null) "null" else x.toSeq.mkString("[", " ", "]") - - s"MultivariateOnlineSummarizer(n=$n mean=${v(currMean)} m2n=${v(currM2n)} m2=${v(currM2)} " + - s"l1=${v(currL1)}" + - s" totalCount=$totalCnt totalWeightSum=$totalWeightSum" + - s" totalWeightSquareSum=$weightSquareSum weightSum=${v(weightSum)} nnz=${vl(nnz)}" + - s" max=${v(currMax)} min=${v(currMin)})" - } - /** * Add a new sample to this summarizer, and update the statistical summary. * From 662f62c5dc13b0b8b21f1c0fa374e4279f724a8e Mon Sep 17 00:00:00 2001 From: Timothy Hunter Date: Mon, 27 Mar 2017 17:12:11 -0700 Subject: [PATCH 24/26] small test to find perf issues --- .../spark/ml/stat/SummarizerSuite.scala | 63 ++++++++++++++++++- 1 file changed, 62 insertions(+), 1 deletion(-) diff --git a/mllib/src/test/scala/org/apache/spark/ml/stat/SummarizerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/stat/SummarizerSuite.scala index 3181886fb08a..f691148c0c3a 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/stat/SummarizerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/stat/SummarizerSuite.scala @@ -24,7 +24,7 @@ import org.apache.spark.ml.linalg.{Vector, Vectors} import org.apache.spark.ml.stat.SummaryBuilderImpl.Buffer import org.apache.spark.ml.util.TestingUtils._ import org.apache.spark.mllib.linalg.{Vector => OldVector, Vectors => OldVectors} -import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer +import org.apache.spark.mllib.stat.{MultivariateOnlineSummarizer, Statistics} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.{DataFrame, Row} import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema @@ -335,4 +335,65 @@ class SummarizerSuite extends SparkFunSuite with MLlibTestSparkContext { assert(Buffer.totalCount(summarizer) === 6) } + // TODO: this test should not be committed. It is here to isolate some performance hotspots. 
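// Editor's sketch, not part of this patch: the RDD baseline that the test below times
// against the DataFrame path, shown on a tiny dataset. Statistics.colStats computes the
// same summary statistics in a single pass and returns a MultivariateStatisticalSummary.
val tiny = sc.parallelize(Seq(OldVectors.dense(1.0), OldVectors.dense(3.0)))
val baseline = Statistics.colStats(tiny)
assert(baseline.count == 2L)
assert(baseline.mean(0) == 2.0)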
+ test("perf test") { + val n = 10000000 + val rdd1 = sc.parallelize(1 to n).map { idx => + OldVectors.dense(idx.toDouble) + } + val trieouts = 10 + rdd1.cache() + rdd1.count() + val rdd2 = sc.parallelize(1 to n).map { idx => + Vectors.dense(idx.toDouble) + } + rdd2.cache() + rdd2.count() + val df = rdd2.map(Tuple1.apply).toDF("features") + df.cache() + df.count() + val x = df.select( + metrics("mean", "variance", "count", "numNonZeros", "max", "min", "normL1", + "normL2").summary($"features")) + val x1 = df.select(metrics("variance").summary($"features")) + + var times_df: List[Long] = Nil + for (_ <- 1 to trieouts) { + val t21 = System.nanoTime() + x.head() + val t22 = System.nanoTime() + times_df ::= (t22 - t21) + } + + var times_rdd: List[Long] = Nil + for (_ <- 1 to trieouts) { + val t21 = System.nanoTime() + Statistics.colStats(rdd1) + val t22 = System.nanoTime() + times_rdd ::= (t22 - t21) + } + + var times_df_variance: List[Long] = Nil + for (_ <- 1 to trieouts) { + val t21 = System.nanoTime() + x1.head() + val t22 = System.nanoTime() + times_df_variance ::= (t22 - t21) + } + + def print(name: String, l: List[Long]): Unit = { + def f(z: Long) = (1e6 * n.toDouble) / z + val min = f(l.max) + val max = f(l.min) + val med = f(l.sorted.drop(l.size / 2).head) + + // scalastyle:off println + println(s"dataframe = [$min ~ $med ~ $max] records / milli") + } + + print("RDD", times_rdd) + print("Dataframes (variance only)", times_df_variance) + print("Dataframes", times_df) + } + } From 96be071b30a896abc1a1e2f0342bd333dbfda7c5 Mon Sep 17 00:00:00 2001 From: Timothy Hunter Date: Thu, 30 Mar 2017 15:10:13 -0700 Subject: [PATCH 25/26] Current speed: Dataframes = [2766.648008567718 ~ 5091.204527768661 ~ 5716.359795809639] records / milli --- .../apache/spark/ml/linalg/VectorUDT.scala | 24 ++++---- .../spark/ml/stat/SummarizerSuite.scala | 55 +++++++++++-------- 2 files changed, 44 insertions(+), 35 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/linalg/VectorUDT.scala b/mllib/src/main/scala/org/apache/spark/ml/linalg/VectorUDT.scala index 917861309c57..37f173bc2046 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/linalg/VectorUDT.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/linalg/VectorUDT.scala @@ -27,17 +27,7 @@ import org.apache.spark.sql.types._ */ private[spark] class VectorUDT extends UserDefinedType[Vector] { - override def sqlType: StructType = { - // type: 0 = sparse, 1 = dense - // We only use "values" for dense vectors, and "size", "indices", and "values" for sparse - // vectors. The "values" field is nullable because we might want to add binary vectors later, - // which uses "size" and "indices", but not "values". - StructType(Seq( - StructField("type", ByteType, nullable = false), - StructField("size", IntegerType, nullable = true), - StructField("indices", ArrayType(IntegerType, containsNull = false), nullable = true), - StructField("values", ArrayType(DoubleType, containsNull = false), nullable = true))) - } + override final def sqlType: StructType = _sqlType override def serialize(obj: Vector): InternalRow = { obj match { @@ -94,4 +84,16 @@ private[spark] class VectorUDT extends UserDefinedType[Vector] { override def typeName: String = "vector" private[spark] override def asNullable: VectorUDT = this + + private[this] val _sqlType = { + // type: 0 = sparse, 1 = dense + // We only use "values" for dense vectors, and "size", "indices", and "values" for sparse + // vectors. 
The "values" field is nullable because we might want to add binary vectors later, + // which uses "size" and "indices", but not "values". + StructType(Seq( + StructField("type", ByteType, nullable = false), + StructField("size", IntegerType, nullable = true), + StructField("indices", ArrayType(IntegerType, containsNull = false), nullable = true), + StructField("values", ArrayType(DoubleType, containsNull = false), nullable = true))) + } } diff --git a/mllib/src/test/scala/org/apache/spark/ml/stat/SummarizerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/stat/SummarizerSuite.scala index f691148c0c3a..b7d0c5635bbc 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/stat/SummarizerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/stat/SummarizerSuite.scala @@ -341,7 +341,7 @@ class SummarizerSuite extends SparkFunSuite with MLlibTestSparkContext { val rdd1 = sc.parallelize(1 to n).map { idx => OldVectors.dense(idx.toDouble) } - val trieouts = 10 + val trieouts = 100 rdd1.cache() rdd1.count() val rdd2 = sc.parallelize(1 to n).map { idx => @@ -357,43 +357,50 @@ class SummarizerSuite extends SparkFunSuite with MLlibTestSparkContext { "normL2").summary($"features")) val x1 = df.select(metrics("variance").summary($"features")) + def print(name: String, l: List[Long]): Unit = { + def f(z: Long) = (1e6 * n.toDouble) / z + val min = f(l.max) + val max = f(l.min) + val med = f(l.sorted.drop(l.size / 2).head) + + // scalastyle:off println + println(s"$name = [$min ~ $med ~ $max] records / milli") + } + + var times_df: List[Long] = Nil for (_ <- 1 to trieouts) { + System.gc() val t21 = System.nanoTime() x.head() val t22 = System.nanoTime() - times_df ::= (t22 - t21) + val dt = t22 - t21 + times_df ::= dt + // scalastyle:off + print("Dataframes", times_df) + // scalastyle:on } var times_rdd: List[Long] = Nil - for (_ <- 1 to trieouts) { - val t21 = System.nanoTime() - Statistics.colStats(rdd1) - val t22 = System.nanoTime() - times_rdd ::= (t22 - t21) - } +// for (_ <- 1 to trieouts) { +// val t21 = System.nanoTime() +// Statistics.colStats(rdd1) +// val t22 = System.nanoTime() +// times_rdd ::= (t22 - t21) +// } var times_df_variance: List[Long] = Nil - for (_ <- 1 to trieouts) { - val t21 = System.nanoTime() - x1.head() - val t22 = System.nanoTime() - times_df_variance ::= (t22 - t21) - } - - def print(name: String, l: List[Long]): Unit = { - def f(z: Long) = (1e6 * n.toDouble) / z - val min = f(l.max) - val max = f(l.min) - val med = f(l.sorted.drop(l.size / 2).head) +// for (_ <- 1 to trieouts) { +// val t21 = System.nanoTime() +// x1.head() +// val t22 = System.nanoTime() +// times_df_variance ::= (t22 - t21) +// } - // scalastyle:off println - println(s"dataframe = [$min ~ $med ~ $max] records / milli") - } + print("Dataframes", times_df) print("RDD", times_rdd) print("Dataframes (variance only)", times_df_variance) - print("Dataframes", times_df) } } From a569dac8998d63adbc9adba1f2eb2f42967533e7 Mon Sep 17 00:00:00 2001 From: Timothy Hunter Date: Thu, 30 Mar 2017 15:46:56 -0700 Subject: [PATCH 26/26] BLAS calls for dense interface --- .../org/apache/spark/ml/stat/Summarizer.scala | 119 +++++++++++++----- 1 file changed, 86 insertions(+), 33 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala b/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala index b87087168e71..23ee455b9a66 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala @@ -313,40 
+313,10 @@ object SummaryBuilderImpl extends Logging { buffer.totalWeightSum += w buffer.totalCount += 1 buffer.totalWeightSquareSum += w * w - // All the fields that we compute on demand: - // TODO: the most common case is dense vectors. In that case we should - // directly use BLAS instructions instead of iterating through a scala iterator. - v.foreachActive { (index, value) => - if (value != 0.0) { - if (buffer.max != null && buffer.max(index) < value) { - buffer.max(index) = value - } - if (buffer.min != null && buffer.min(index) > value) { - buffer.min(index) = value - } - if (buffer.mean != null) { - assert(buffer.weightSum != null) - val prevMean = buffer.mean(index) - val diff = value - prevMean - buffer.mean(index) += w * diff / (buffer.weightSum(index) + w) - if (buffer.m2n != null) { - buffer.m2n(index) += w * (value - buffer.mean(index)) * diff - } - } - if (buffer.m2 != null) { - buffer.m2(index) += w * value * value - } - if (buffer.l1 != null) { - buffer.l1(index) += w * math.abs(value) - } - if (buffer.weightSum != null) { - buffer.weightSum(index) += w - } - if (buffer.nnz != null) { - buffer.nnz(index) += 1 - } - } + v match { + case dv: DenseVector => updateInPlaceDense(buffer, dv, w) + case sv: SparseVector => updateInPlaceSparse(buffer, sv, w) } } @@ -588,10 +558,93 @@ object SummaryBuilderImpl extends Logging { } } } + } + + private def updateInPlaceDense(buffer: Buffer, v: DenseVector, w: Double): Unit = { + val epsi = Double.MinPositiveValue + lazy val value = v.asBreeze + // The mask is zero for all the zero values, and one otherwise. + lazy val mask = numerics.ceil(la.min(numerics.abs(value), epsi)) + lazy val maskWeight = w * mask + + if (buffer.max != null) { + val x = b(buffer.max) + x := la.max(x, value) + } + + if (buffer.min != null) { + val x = b(buffer.min) + x := la.min(x, value) + } + + if (buffer.mean != null) { + assert(buffer.weightSum != null) + val prevMean = b(buffer.mean).copy + val diff = value :- prevMean + // Adding an epsilon to ensure that the denominator is always positive. + // This epsilon is not going to have impact since numerator(i) == 0 => denominator(i) == 0. 
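// Editor's sketch, not part of this patch: what the mask and the epsilon-guarded
// denominator compute, element by element, written with plain Scala arrays. Zero
// entries get mask 0.0 and therefore never move the mean; non-zero entries get 1.0.
val eps = Double.MinPositiveValue            // mirrors epsi above
val xs = Array(0.0, -2.0, 3.5)
val w0 = 2.0
val mask0 = xs.map(x => math.ceil(math.min(math.abs(x), eps)))   // Array(0.0, 1.0, 1.0)
val maskWeight0 = mask0.map(_ * w0)                              // Array(0.0, 2.0, 2.0)
val weightSum0 = Array(0.0, 4.0, 4.0)
// max(weightSum + maskWeight, eps) keeps every denominator strictly positive, so the
// inactive first coordinate contributes 0 / eps = 0 instead of 0 / 0 = NaN.
val denom0 = weightSum0.zip(maskWeight0).map { case (s, mw) => math.max(s + mw, eps) }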
+ val denom = la.max(b(buffer.weightSum) :+ maskWeight, epsi) + b(buffer.mean) :+= (maskWeight :* diff) :/ denom + if (buffer.m2n != null) { + b(buffer.m2n) :+= maskWeight :* ((value :- b(buffer.mean)) :* diff) + } + } + + if (buffer.m2 != null) { + b(buffer.m2) :+= maskWeight :* (value :* value) + } + + if (buffer.l1 != null) { + b(buffer.l1) :+= maskWeight :* numerics.abs(value) + } + + + if (buffer.weightSum != null) { + b(buffer.weightSum) :+= maskWeight + } + if (buffer.nnz != null) { + bl(buffer.nnz) :+= la.convert(maskWeight, Long) + } } + private def updateInPlaceSparse(buffer: Buffer, v: SparseVector, w: Double): Unit = { + v.foreachActive { (index, value) => + if (value != 0.0) { + if (buffer.max != null && buffer.max(index) < value) { + buffer.max(index) = value + } + if (buffer.min != null && buffer.min(index) > value) { + buffer.min(index) = value + } + + if (buffer.mean != null) { + assert(buffer.weightSum != null) + val prevMean = buffer.mean(index) + val diff = value - prevMean + buffer.mean(index) += w * diff / (buffer.weightSum(index) + w) + if (buffer.m2n != null) { + buffer.m2n(index) += w * (value - buffer.mean(index)) * diff + } + } + if (buffer.m2 != null) { + buffer.m2(index) += w * value * value + } + if (buffer.l1 != null) { + buffer.l1(index) += w * math.abs(value) + } + if (buffer.weightSum != null) { + buffer.weightSum(index) += w + } + if (buffer.nnz != null) { + buffer.nnz(index) += 1 + } + } + } + + } + /** * Merges other into buffer. */
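// Editor's sketch, not part of this patch: one coordinate of the weighted online update
// used in updateInPlaceSparse above, written as a standalone loop. After two values,
// 1.0 and 3.0, each with weight 1.0, the running mean is 2.0 and m2n (the unnormalized
// sum of squared deviations) is 2.0, matching the batch result (1-2)^2 + (3-2)^2 = 2.
var mean0 = 0.0
var m2n0 = 0.0
var wSum0 = 0.0
for ((value, w) <- Seq((1.0, 1.0), (3.0, 1.0))) {
  val diff = value - mean0
  mean0 += w * diff / (wSum0 + w)         // update the mean before m2n, as above
  m2n0 += w * (value - mean0) * diff      // uses the freshly updated mean
  wSum0 += w
}
assert(mean0 == 2.0 && m2n0 == 2.0)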