From f3fa6580bca70f3307d70e938ef8531c928d958b Mon Sep 17 00:00:00 2001 From: Timothy Hunter Date: Fri, 3 Mar 2017 10:36:02 -0800 Subject: [PATCH 01/26] work --- .../org/apache/spark/ml/stat/Summarizer.scala | 582 ++++++++++++++++++ .../spark/ml/stat/SummarizerSuite.scala | 61 ++ 2 files changed, 643 insertions(+) create mode 100644 mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala create mode 100644 mllib/src/test/scala/org/apache/spark/ml/stat/SummarizerSuite.scala diff --git a/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala b/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala new file mode 100644 index 000000000000..f94fc10721bb --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala @@ -0,0 +1,582 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml.stat + +import org.apache.spark.annotation.{DeveloperApi, Since} +import org.apache.spark.ml.linalg.{SQLDataTypes, Vector, Vectors} +import org.apache.spark.sql.{Column, Row} +import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction} +import org.apache.spark.sql.types._ + +/** + * A builder object that provides summary statistics about a given column. + * + * Users should not directly create such builders, but instead use one of the methods in + * [[Summarizer]]. + */ +@Since("2.2.0") +abstract class SummaryBuilder { + /** + * Returns an aggregate object that contains the summary of the column with the requested metrics. + * @param column a column that contains Vector object. + * @return an aggregate column that contains the statistics. The exact content of this + * structure is determined during the creation of the builder. It is also provided by + * [[summaryStructure]]. + */ + @Since("2.2.0") + def summary(column: Column): Column + + /** + * The structure of the summary that will be returned by this builder. + * @return the structure of the aggregate data returned by this builder. + */ + @Since("2.2.0") + @DeveloperApi + private[spark] def summaryStructure: StructType +} + +/** + * Tools for vectorized statistics on MLlib Vectors. + * + * The methods in this package provide various statistics for Vectors contained inside DataFrames. + * + * This class lets users pick the statistics they would like to extract for a given column. Here is + * an example in Scala: + * {{{ + * val dataframe = ... 
// Some dataframe containing a feature column + * val allStats = dataframe.select(Summarizer.metrics("min", "max").summary($"features")) + * val Row(min_, max_) = allStats.first() + * }}} + * + * If one wants to get a single metric, shortcuts are also available: + * {{{ + * val meanDF = dataframe.select(Summarizer.mean($"features")) + * val Row(mean_) = meanDF.first() + * }}} + */ +@Since("2.2.0") +object Summarizer { + + import SummaryBuilderImpl._ + + /** + * Given a list of metrics, provides a builder that it turns computes metrics from a column. + * + * See the documentation of [[Summarizer]] for an example. + * + * @param firstMetric the metric being provided + * @param metrics additional metrics that can be provided. + * @return a builder. + * @throws IllegalArgumentException if one of the metric names is not understood. + */ + @Since("2.2.0") + def metrics(firstMetric: String, metrics: String*): SummaryBuilder = { + val (typedMetrics, computeMetrics) = getRelevantMetrics(Seq(firstMetric) ++ metrics) + new SummaryBuilderImpl( + SummaryBuilderImpl.buildUdafForMetrics(computeMetrics), + SummaryBuilderImpl.structureForMetrics(typedMetrics)) + } + + def mean(col: Column): Column = metrics("mean").summary(col) +} + +private[ml] class SummaryBuilderImpl( + udaf: UserDefinedAggregateFunction, + structType: StructType) extends SummaryBuilder { + + override def summary(column: Column): Column = udaf.apply(column) + + override val summaryStructure: StructType = structType +} + +private[ml] +object SummaryBuilderImpl { + + def implementedMetrics: Seq[String] = allMetrics.map(_._1).sorted + + @throws[IllegalArgumentException]("When the list is empty or not a subset of known metrics") + def getRelevantMetrics(requested: Seq[String]): (Seq[Metrics], Seq[ComputeMetrics]) = { + val all = requested.map { req => + val (_, metric, deps) = allMetrics.find(tup => tup._1 == req).getOrElse { + throw new IllegalArgumentException(s"Metric $req cannot be found." + + s" Valid metris are $implementedMetrics") + } + metric -> deps + } + // TODO: do not sort, otherwise the user has to look the schema to see the order that it + // is going to be given in. + val metrics = all.map(_._1).distinct.sortBy(_.toString) + val computeMetrics = all.flatMap(_._2).distinct.sortBy(_.toString) + metrics -> computeMetrics + } + + def structureForMetrics(metrics: Seq[Metrics]): StructType = { + val fields = metrics.map { m => + // All types are vectors, except for one. + val tpe = if (m == Count) { LongType } else { SQLDataTypes.VectorType } + // We know the metric is part of the list. + val (name, _, _) = allMetrics.find(tup => tup._2 == m).get + StructField(name, tpe, nullable = false) + } + StructType(fields) + } + + def buildUdafForMetrics(metrics: Seq[ComputeMetrics]): UserDefinedAggregateFunction = null + + /** + * All the metrics that can be currently computed by Spark for vectors. + * + * This list associates the user name, the internal (typed) name, and the list of computation + * metrics that need to de computed internally to get the final result. 
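+ * For example, the "mean" metric depends on ComputeMean, ComputeWeightSum and
+ * ComputeTotalWeightSum, while "count" only depends on ComputeCount.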
+ */ + private val allMetrics = Seq( + ("mean", Mean, Seq(ComputeMean, ComputeWeightSum, ComputeTotalWeightSum)), + ("variance", Variance, Seq(ComputeTotalWeightSum, ComputeWeightSquareSum, ComputeMean, + ComputeM2n)), + ("count", Count, Seq(ComputeCount)), + ("numNonZeros", NumNonZeros, Seq(ComputeNNZ)), + ("max", Max, Seq(ComputeMax, ComputeTotalWeightSum, ComputeNNZ, ComputeCount)), + ("min", Min, Seq(ComputeMin, ComputeTotalWeightSum, ComputeNNZ, ComputeCount)), + ("normL2", NormL2, Seq(ComputeTotalWeightSum, ComputeM2)), + ("normL1", Min, Seq(ComputeTotalWeightSum, ComputeL1)) + ) + + /** + * The metrics that are currently implemented. + */ + sealed trait Metrics + case object Mean extends Metrics + case object Variance extends Metrics + case object Count extends Metrics + case object NumNonZeros extends Metrics + case object Max extends Metrics + case object Min extends Metrics + case object NormL2 extends Metrics + case object NormL1 extends Metrics + + /** + * The running metrics that are going to be computing. + * + * There is a bipartite graph between the metrics and the computed metrics. + */ + private sealed trait ComputeMetrics + case object ComputeMean extends ComputeMetrics + case object ComputeM2n extends ComputeMetrics + case object ComputeM2 extends ComputeMetrics + case object ComputeL1 extends ComputeMetrics + case object ComputeCount extends ComputeMetrics + case object ComputeTotalWeightSum extends ComputeMetrics + case object ComputeWeightSquareSum extends ComputeMetrics + case object ComputeWeightSum extends ComputeMetrics + case object ComputeNNZ extends ComputeMetrics + case object ComputeMax extends ComputeMetrics + case object ComputeMin extends ComputeMetrics + + // The order in which the metrics will be stored in the buffer. + // This order replicates the order above. + private val metricsWithOrder = Seq(ComputeMean, ComputeM2n, ComputeM2, ComputeL1, ComputeCount, + ComputeTotalWeightSum, ComputeWeightSquareSum, ComputeWeightSum, ComputeNNZ, ComputeMax, + ComputeMin).zipWithIndex + + + private class MetricsUDAF( + requested: Seq[Metrics], + metrics: Seq[ComputeMetrics]) extends UserDefinedAggregateFunction { + + // All of these are the indexes of the value in the temporary buffer, or -1 if these values are + // not requested. + // All these values are unrolled so that we do not pay too steep a penalty for the extra level + // of indirection associated with selectively computing some metrics. + private[this] val computeMean = getIndexFor(ComputeMean) + private[this] val computeM2n = getIndexFor(ComputeM2n) + private[this] val computeM2 = getIndexFor(ComputeM2) + private[this] val computeL1 = getIndexFor(ComputeL1) + private[this] val computeTotalCount = getIndexFor(ComputeCount) + // TODO: why is it not used? + private[this] val computeTotalWeightSum = getIndexFor(ComputeTotalWeightSum) + private[this] val computeWeightSquareSum = getIndexFor(ComputeWeightSquareSum) + private[this] val computeWeightSum = getIndexFor(ComputeWeightSum) + private[this] val computeNNZ = getIndexFor(ComputeNNZ) + private[this] val computeCurrMax = getIndexFor(ComputeMax) + private[this] val computeCurrMin = getIndexFor(ComputeMin) + // Always computed, and put at the end. + private[this] val computeN = metricsWithOrder.size + + override def inputSchema: StructType = { + StructType(Seq(StructField("features", SQLDataTypes.VectorType))) + } + + // The schema for the buffer contains all the structures, but a subset of them may be null. 
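As an illustration of the buffer layout described above, here is a hedged sketch of such a schema for a mean-only summary; the field selection and names are illustrative, not the exact schema built below:

import org.apache.spark.ml.linalg.SQLDataTypes
import org.apache.spark.sql.types._

// Sketch only: one nullable slot per running metric, plus the non-nullable vector size "n".
val exampleBufferSchema: StructType = StructType(Seq(
  StructField("ComputeMean", SQLDataTypes.VectorType, nullable = true),
  StructField("ComputeTotalWeightSum", DoubleType, nullable = true),
  StructField("ComputeCount", LongType, nullable = true),
  StructField("n", IntegerType, nullable = false)))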
+ override def bufferSchema: StructType = { + val fields = metricsWithOrder.map { case (m, _) => + // TODO(thunterdb) this could be merged with the types above so that there is no confusion + // when adding other types later, and with the initialization below. + val tpe = m match { + case ComputeCount => LongType + case ComputeTotalWeightSum => DoubleType + case ComputeWeightSquareSum => DoubleType + case _ => SQLDataTypes.VectorType + } + StructField(m.toString, tpe, nullable = true) + } :+ StructField("n", IntegerType, nullable = false) + StructType(fields) + } + + override def dataType: DataType = { + structureForMetrics(requested) + } + + override def deterministic: Boolean = true + + // The initialization does not do anything because we do not know the size of the vectors yet. + override def initialize(buffer: MutableAggregationBuffer): Unit = { + // Set n to -1 to indicate that we do not know the size yet. + buffer.update(computeN, -1) + } + + override def update(buffer: MutableAggregationBuffer, input: Row): Unit = { + val v = input.getAs[Vector](0) + // TODO: add weights later. + update(buffer, v, 1.0) + } + + override def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = { + val thisUsed = ((computeTotalWeightSum >= 0 && buffer1.getDouble(computeTotalWeightSum) > 0) + || (computeTotalCount >= 0 && buffer1.getLong(computeTotalCount) > 0)) + val otherUsed = ((computeTotalWeightSum >= 0 && buffer2.getDouble(computeTotalWeightSum) > 0) + || (computeTotalCount >= 0 && buffer2.getLong(computeTotalCount) > 0)) + if (thisUsed && otherUsed) { + // Merge onto this one. + val n1 = getN(buffer1) + val n2 = getN(buffer2) + require(n1 == n2, s"Dimensions mismatch between summarizers: expected $n1 but got $n2") + require(n1 > 0, s"Expected the summarizers to be initialized") + // The scalar values. + if (computeTotalCount >= 0) { + buffer1.update(computeTotalCount, + buffer1.getLong(computeTotalCount) + buffer2.getLong(computeTotalCount)) + } + if (computeTotalWeightSum >= 0) { + buffer1.update(computeTotalWeightSum, + buffer1.getDouble(computeTotalWeightSum) + buffer2.getDouble(computeTotalWeightSum)) + } + if (computeWeightSquareSum >= 0) { + buffer1.update(computeWeightSquareSum, + buffer1.getDouble(computeWeightSquareSum) + buffer2.getDouble(computeWeightSquareSum)) + } + // The vector values. + val currMean = exposeDArray(buffer1, computeMean) + val currM2n = exposeDArray(buffer1, computeM2n) + val currM2 = exposeDArray(buffer1, computeM2) + val currL1 = exposeDArray(buffer1, computeL1) + val weightSum = exposeDArray(buffer1, computeWeightSum) + val numNonzeros = exposeDArray(buffer1, computeNNZ) + val currMax = exposeDArray(buffer1, computeCurrMax) + val currMin = exposeDArray(buffer1, computeCurrMin) + + val otherMean = exposeDArray2(buffer2, computeMean) + val otherM2n = exposeDArray2(buffer2, computeM2n) + val otherM2 = exposeDArray2(buffer2, computeM2) + val otherL1 = exposeDArray2(buffer2, computeL1) + val otherWeightSum = exposeDArray2(buffer2, computeWeightSum) + val otherNumNonzeros = exposeDArray2(buffer2, computeNNZ) + val otherMax = exposeDArray2(buffer2, computeCurrMax) + val otherMin = exposeDArray2(buffer2, computeCurrMin) + + + var i = 0 + while (i < n1) { + // This case is written do maximize conformance with the current code. + // For more proper implementation, the differnt loops should be disentangled. + // The case of 0.0 values being used should not happen if the proper selection + // of metrics is done. 
+ // TODO: should it be NaN so that any computation with it will indicate an error, + // instead of failing silently? + // Using exceptions is not great for performance in this loop. + val thisNnz = if (weightSum == null) { 0.0 } else { weightSum(i) } + val otherNnz = if (otherWeightSum == null) { 0.0 } else { otherWeightSum(i) } + val totalNnz = thisNnz + otherNnz + val totalCnnz = if (numNonzeros == null || numNonzeros == null) { 0.0 } else { + numNonzeros(i) + otherNumNonzeros(i) + } + if (totalNnz != 0.0) { + val deltaMean = if (computeMean >= 0) { otherMean(i) - currMean(i) } else { 0.0 } + // merge mean together + if (computeMean >= 0) { + currMean(i) += deltaMean * otherNnz / totalNnz + } + // merge m2n together + if (computeM2n >= 0) { + currM2n(i) += otherM2n(i) + deltaMean * deltaMean * thisNnz * otherNnz / totalNnz + } + // merge m2 together + if (computeM2 >= 0) { + currM2(i) += otherM2(i) + } + // merge l1 together + if (computeL1 >= 0) { + currL1(i) += otherL1(i) + } + // merge max and min + if (computeCurrMax >= 0) { + currMax(i) = math.max(currMax(i), otherMax(i)) + } + if (computeCurrMin >= 0) { + currMin(i) = math.min(currMin(i), otherMin(i)) + } + } + if (computeWeightSum >= 0) { + weightSum(i) = totalNnz + } + if (computeNNZ >= 0) { + numNonzeros(i) = totalCnnz + } + + i += 1 + } + } else if (otherUsed) { + // Copy buffer1 onto buffer2. + // TODO + } + } + + // This function packs the requested values, in order. + override def evaluate(buffer: Row): Row = { + val values = requested.map { + case Mean => evaluateMean(buffer) + case Variance => evaluateVariance(buffer) + case NumNonZeros => evaluateNNZ(buffer) + case Max => evaluateMax(buffer) + case Min => evaluateMin(buffer) + case NormL1 => evaluateNormL1(buffer) + case NormL2 => evaluateNormL2(buffer) + } + Row(values: _*) + } + + private def evaluateMean(buffer: Row): Vector = { + assert(computeMean >= 0) // This is a programming error. + assert(computeWeightSum >= 0) + assert(computeTotalWeightSum >= 0) + val n = getN(buffer) + assert(n >= 0) + val realMean = Array.ofDim[Double](n) + val currMean = exposeDArray2(buffer, computeMean) + val weightSum = exposeDArray2(buffer, computeWeightSum) + val totalWeightSum = buffer.getDouble(computeTotalWeightSum) + // TODO: should this be turned into NaN instead? That would be more consistent than + // an error. + require(totalWeightSum > 0, "Data has zero weight") + var i = 0 + while (i < n) { + realMean(i) = currMean(i) * (weightSum(i) / totalWeightSum) + i += 1 + } + Vectors.dense(realMean) + } + + private def evaluateVariance(buffer: Row): Vector = { + assert(computeTotalWeightSum >= 0) + assert(computeWeightSquareSum >= 0) + assert(computeWeightSum >= 0) + assert(computeM2 >= 0) + assert(computeM2n >= 0) + + val totalWeightSum = buffer.getDouble(computeTotalWeightSum) + require(totalWeightSum > 0, s"Nothing has been added to this summarizer.") + + val n = getN(buffer) + assert(n >= 0) + + val weightSquareSum = buffer.getDouble(computeWeightSquareSum) + val realVariance = Array.ofDim[Double](n) + + val currMean = exposeDArray2(buffer, computeMean) + val currM2n = exposeDArray2(buffer, computeM2n) + val weightSum = exposeDArray2(buffer, computeWeightSum) + val denominator = totalWeightSum - (weightSquareSum / totalWeightSum) + + // Sample variance is computed, if the denominator is less than 0, the variance is just 0. 
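The branch below applies the usual unbiased weighted sample variance. A scalar sketch of the same arithmetic for one coefficient (names are illustrative, not part of the patch), with running mean m, centered second moment m2n, per-coefficient weight wi, total weight w and total squared weight w2:

// denom = w - w2 / w; variance = (m2n + m * m * wi * (w - wi) / w) / denom, or 0.0 if denom <= 0.
def coefficientVariance(m2n: Double, m: Double, wi: Double, w: Double, w2: Double): Double = {
  val denom = w - w2 / w
  if (denom > 0.0) (m2n + m * m * wi * (w - wi) / w) / denom else 0.0
}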
+ if (denominator > 0.0) { + val deltaMean = currMean + var i = 0 + val len = currM2n.length + while (i < len) { + realVariance(i) = (currM2n(i) + deltaMean(i) * deltaMean(i) * weightSum(i) * + (totalWeightSum - weightSum(i)) / totalWeightSum) / denominator + i += 1 + } + } + Vectors.dense(realVariance) + } + + private def evaluateNNZ(buffer: Row): Vector = { + null // TODO + } + + private def evaluateMax(buffer: Row): Vector = { + null // TODO + } + + private def evaluateMin(buffer: Row): Vector = { + null // TODO + } + + private def evaluateNormL1(buffer: Row): Vector = { + null // TODO + } + + private def evaluateNormL2(buffer: Row): Vector = { + null // TODO + } + + private def update(buffer: MutableAggregationBuffer, instance: Vector, weight: Double): Unit = { + val startN = getN(buffer) + if (startN == -1) { + initialize(buffer, instance.size) + } else { + require(startN == instance.size, + s"Trying to insert a vector of size $instance into a buffer that " + + s"has been sized with $startN") + } + // It should be initialized. + assert(getN(buffer) > 0) + // Update the scalars that do not require going through the vector first. + if (computeTotalWeightSum >= 0) { + buffer.update(computeTotalWeightSum, buffer.getDouble(computeTotalWeightSum) + weight) + } + if (computeWeightSquareSum >= 0) { + buffer.update(computeWeightSquareSum, + buffer.getDouble(computeWeightSquareSum) + weight * weight) + } + if (computeTotalCount >= 0) { + buffer.update(computeTotalCount, buffer.getLong(computeTotalCount) + 1) + } + + // All these may be null pointers. + val localCurrMean = exposeDArray(buffer, computeMean) + val localCurrM2n = exposeDArray(buffer, computeM2n) + val localCurrM2 = exposeDArray(buffer, computeM2) + val localCurrL1 = exposeDArray(buffer, computeL1) + val localWeightSum = exposeDArray(buffer, computeWeightSum) + val localNumNonzeros = exposeDArray(buffer, computeNNZ) + val localCurrMax = exposeDArray(buffer, computeCurrMax) + val localCurrMin = exposeDArray(buffer, computeCurrMin) + + // Update the requested vector values. + // All these changes are in place. There is no need to make copies. + instance.foreachActive { (index, value) => + if (value != 0.0) { + if (localCurrMax != null && localCurrMax(index) < value) { + localCurrMax(index) = value + } + if (localCurrMin != null && localCurrMin(index) > value) { + localCurrMin(index) = value + } + + if (localCurrMean != null) { + val prevMean = localCurrMean(index) + val diff = value - prevMean + localCurrMean(index) = prevMean + weight * diff / (localWeightSum(index) + weight) + if (localCurrM2n != null) { + // require: localCurrMean != null. + localCurrM2n(index) += weight * (value - localCurrMean(index)) * diff + } + } + + if (localCurrM2 != null) { + localCurrM2(index) += weight * value * value + } + if (localCurrL1 != null) { + localCurrL1(index) += weight * math.abs(value) + } + if (localWeightSum != null) { + localWeightSum(index) += weight + } + if (localNumNonzeros != null) { + localNumNonzeros(index) += 1 + } + } + } + } + + // the current vector size, or -1 if not available yet. 
+ private def getN(buffer: Row): Int = { + buffer.getInt(computeN) + } + + + private[this] def exposeDArray2(buffer: Row, index: Int): Array[Double] = { + if (index == -1) { + null + } else { + buffer.getAs[Array[Double]](index) + } + } + + private[this] def exposeDArray(buffer: MutableAggregationBuffer, index: Int): Array[Double] = { + if (index == -1) { + null + } else { + buffer.getAs[Array[Double]](index) + } + } + + // Performs the initialization of the requested fields, if required. + private def initialize(buffer: MutableAggregationBuffer, n: Int): Unit = { + require(n > 0, s"Cannot set n to $n: must be positive") + val currentN = buffer.getInt(computeN) + require(currentN == -1 || (currentN == n), + s"The vector size has already been set with $currentN, requested change to $n") + if (currentN == -1) { + // Set all the vectors to the given value. + // This is not a tight loop, so there should be no performance issue in + // making lookups for the values. + buffer.update(computeN, n) + // Only initialize the metricts that we want to compute: + metricsWithOrder.filter(p => metrics.contains(p._1)).foreach { case (m, idx) => + // Special cases for the scalar values. + m match { + case ComputeCount => + buffer.update(idx, 0L) + case ComputeTotalWeightSum => + buffer.update(idx, 0.0) + case ComputeWeightSquareSum => + buffer.update(idx, 0.0) + case _ => + buffer.update(idx, Array.ofDim[Double](n)) + } + } + } + } + + /** + * Returns a fixed index for the compute metric if it is in the list of requested metrics, + * and -1 otherwise. + */ + private def getIndexFor(m: ComputeMetrics): Int = { + if (metrics.contains(m)) { + metricsWithOrder.find(p => p._1 == m).get._2 + } else { + -1 + } + } + + } + +} diff --git a/mllib/src/test/scala/org/apache/spark/ml/stat/SummarizerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/stat/SummarizerSuite.scala new file mode 100644 index 000000000000..685fea8575c9 --- /dev/null +++ b/mllib/src/test/scala/org/apache/spark/ml/stat/SummarizerSuite.scala @@ -0,0 +1,61 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.ml.stat + + +import org.apache.spark.SparkFunSuite +import org.apache.spark.ml.linalg.Vectors +import org.apache.spark.mllib.util.MLlibTestSparkContext +import org.apache.spark.mllib.util.TestingUtils._ +import org.apache.spark.sql.{DataFrame, Dataset, Row} + + +class SummarizerSuite extends SparkFunSuite with MLlibTestSparkContext { + + import testImplicits._ + import Summarizer._ + + private val seed = 42 + private val eps: Double = 1e-5 + + private def denseData(input: Seq[Seq[Double]]): DataFrame = { + val data = input.map(_.toArray).map(Vectors.dense).map(Tuple1.apply) + sc.parallelize(data).toDF("features") + } + + private def compare(df: DataFrame, exp: Seq[Any]): Unit = { + val res = df.first().toSeq + val names = df.schema.fieldNames.zipWithIndex.map { case (n, idx) => s"$n ($idx)" } + assert(res.size === exp.size) + for (((x1, x2), name) <- res.zip(exp).zip(names)) { + (x1, x2) match { + case (d1: Double, d2: Double) => + assert(Vectors.dense(d1) ~== Vectors.dense(d2) absTol 1e-4, name) + } + } + } + + test("basic error handling") { + val df = denseData(Nil) + val c = df.col("features") + val res = df.select(metrics("mean").summary(c), mean(c)) + intercept[IllegalArgumentException] { + compare(res, Seq.empty) + } + } +} From 7539835dad863a6b73d88d79983342f9ddb7fb9d Mon Sep 17 00:00:00 2001 From: Timothy Hunter Date: Mon, 6 Mar 2017 14:38:41 -0800 Subject: [PATCH 02/26] work on the test suite --- .../org/apache/spark/ml/stat/Summarizer.scala | 88 +++++++++++++------ .../spark/ml/stat/SummarizerSuite.scala | 87 +++++++++++++++--- 2 files changed, 137 insertions(+), 38 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala b/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala index f94fc10721bb..1de782af9ca1 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala @@ -17,12 +17,17 @@ package org.apache.spark.ml.stat +import scala.collection.mutable + import org.apache.spark.annotation.{DeveloperApi, Since} +import org.apache.spark.internal.Logging import org.apache.spark.ml.linalg.{SQLDataTypes, Vector, Vectors} import org.apache.spark.sql.{Column, Row} import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction} import org.apache.spark.sql.types._ +// scalastyle:off println + /** * A builder object that provides summary statistics about a given column. * @@ -70,7 +75,7 @@ abstract class SummaryBuilder { * }}} */ @Since("2.2.0") -object Summarizer { +object Summarizer extends Logging { import SummaryBuilderImpl._ @@ -79,6 +84,15 @@ object Summarizer { * * See the documentation of [[Summarizer]] for an example. * + * The following metrics are accepted (case sensitive): + * - mean: a vector that contains the coefficient-wise mean. + * - variance: a vector tha contains the coefficient-wise variance. + * - count: the count of all vectors seen. + * - numNonzeros: a vector with the number of non-zeros for each coefficients + * - max: the maximum for each coefficient. + * - min: the minimum for each coefficient. + * - normL2: the Euclidian norm for each coefficient. + * - normL1: the L1 norm of each coefficient (sum of the absolute values). * @param firstMetric the metric being provided * @param metrics additional metrics that can be provided. * @return a builder. 
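As a usage sketch of the metrics documented above (the dataframe and column names are illustrative and assume import spark.implicits._; this mirrors the class-level example rather than defining new API):

// Illustrative only: gather mean, variance and count in a single aggregation pass.
val stats = dataframe.select(
  Summarizer.metrics("mean", "variance", "count").summary($"features").as("stats"))
stats.select("stats.mean", "stats.variance", "stats.count").show()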
@@ -88,7 +102,7 @@ object Summarizer { def metrics(firstMetric: String, metrics: String*): SummaryBuilder = { val (typedMetrics, computeMetrics) = getRelevantMetrics(Seq(firstMetric) ++ metrics) new SummaryBuilderImpl( - SummaryBuilderImpl.buildUdafForMetrics(computeMetrics), + SummaryBuilderImpl.buildUdafForMetrics(typedMetrics, computeMetrics), SummaryBuilderImpl.structureForMetrics(typedMetrics)) } @@ -105,7 +119,7 @@ private[ml] class SummaryBuilderImpl( } private[ml] -object SummaryBuilderImpl { +object SummaryBuilderImpl extends Logging { def implementedMetrics: Seq[String] = allMetrics.map(_._1).sorted @@ -118,9 +132,9 @@ object SummaryBuilderImpl { } metric -> deps } - // TODO: do not sort, otherwise the user has to look the schema to see the order that it + // Do not sort, otherwise the user has to look the schema to see the order that it // is going to be given in. - val metrics = all.map(_._1).distinct.sortBy(_.toString) + val metrics = all.map(_._1) val computeMetrics = all.flatMap(_._2).distinct.sortBy(_.toString) metrics -> computeMetrics } @@ -136,7 +150,11 @@ object SummaryBuilderImpl { StructType(fields) } - def buildUdafForMetrics(metrics: Seq[ComputeMetrics]): UserDefinedAggregateFunction = null + def buildUdafForMetrics( + metrics: Seq[Metrics], + computeMetrics: Seq[ComputeMetrics]): UserDefinedAggregateFunction = { + new MetricsUDAF(metrics, computeMetrics) + } /** * All the metrics that can be currently computed by Spark for vectors. @@ -145,15 +163,15 @@ object SummaryBuilderImpl { * metrics that need to de computed internally to get the final result. */ private val allMetrics = Seq( - ("mean", Mean, Seq(ComputeMean, ComputeWeightSum, ComputeTotalWeightSum)), + ("mean", Mean, Seq(ComputeMean, ComputeWeightSum, ComputeTotalWeightSum)), // Vector ("variance", Variance, Seq(ComputeTotalWeightSum, ComputeWeightSquareSum, ComputeMean, - ComputeM2n)), - ("count", Count, Seq(ComputeCount)), - ("numNonZeros", NumNonZeros, Seq(ComputeNNZ)), - ("max", Max, Seq(ComputeMax, ComputeTotalWeightSum, ComputeNNZ, ComputeCount)), - ("min", Min, Seq(ComputeMin, ComputeTotalWeightSum, ComputeNNZ, ComputeCount)), - ("normL2", NormL2, Seq(ComputeTotalWeightSum, ComputeM2)), - ("normL1", Min, Seq(ComputeTotalWeightSum, ComputeL1)) + ComputeM2n)), // Vector + ("count", Count, Seq(ComputeCount)), // Long + ("numNonZeros", NumNonZeros, Seq(ComputeNNZ)), // Vector + ("max", Max, Seq(ComputeMax, ComputeTotalWeightSum, ComputeNNZ, ComputeCount)), // Vector + ("min", Min, Seq(ComputeMin, ComputeTotalWeightSum, ComputeNNZ, ComputeCount)), // Vector + ("normL2", NormL2, Seq(ComputeTotalWeightSum, ComputeM2)), // Vector + ("normL1", Min, Seq(ComputeTotalWeightSum, ComputeL1)) // Vector ) /** @@ -174,13 +192,13 @@ object SummaryBuilderImpl { * * There is a bipartite graph between the metrics and the computed metrics. 
*/ - private sealed trait ComputeMetrics + sealed trait ComputeMetrics case object ComputeMean extends ComputeMetrics case object ComputeM2n extends ComputeMetrics case object ComputeM2 extends ComputeMetrics case object ComputeL1 extends ComputeMetrics - case object ComputeCount extends ComputeMetrics - case object ComputeTotalWeightSum extends ComputeMetrics + case object ComputeCount extends ComputeMetrics // Always computed + case object ComputeTotalWeightSum extends ComputeMetrics // Always computed case object ComputeWeightSquareSum extends ComputeMetrics case object ComputeWeightSum extends ComputeMetrics case object ComputeNNZ extends ComputeMetrics @@ -206,9 +224,8 @@ object SummaryBuilderImpl { private[this] val computeM2n = getIndexFor(ComputeM2n) private[this] val computeM2 = getIndexFor(ComputeM2) private[this] val computeL1 = getIndexFor(ComputeL1) - private[this] val computeTotalCount = getIndexFor(ComputeCount) - // TODO: why is it not used? - private[this] val computeTotalWeightSum = getIndexFor(ComputeTotalWeightSum) + private[this] val computeTotalCount = getForcedIndexFor(ComputeCount) // Always compute it. + private[this] val computeTotalWeightSum = getForcedIndexFor(ComputeTotalWeightSum) // Always. private[this] val computeWeightSquareSum = getIndexFor(ComputeWeightSquareSum) private[this] val computeWeightSum = getIndexFor(ComputeWeightSum) private[this] val computeNNZ = getIndexFor(ComputeNNZ) @@ -230,7 +247,7 @@ object SummaryBuilderImpl { case ComputeCount => LongType case ComputeTotalWeightSum => DoubleType case ComputeWeightSquareSum => DoubleType - case _ => SQLDataTypes.VectorType + case _ => ArrayType(DoubleType, containsNull = false) } StructField(m.toString, tpe, nullable = true) } :+ StructField("n", IntegerType, nullable = false) @@ -246,7 +263,11 @@ object SummaryBuilderImpl { // The initialization does not do anything because we do not know the size of the vectors yet. override def initialize(buffer: MutableAggregationBuffer): Unit = { // Set n to -1 to indicate that we do not know the size yet. + println("initialize: size=" + buffer.size + " computeN = " + computeN) + // Thes metrics are always computed. buffer.update(computeN, -1) + buffer.update(computeTotalCount, 0L) + buffer.update(computeTotalWeightSum, 0.0) } override def update(buffer: MutableAggregationBuffer, input: Row): Unit = { @@ -256,10 +277,11 @@ object SummaryBuilderImpl { } override def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = { - val thisUsed = ((computeTotalWeightSum >= 0 && buffer1.getDouble(computeTotalWeightSum) > 0) - || (computeTotalCount >= 0 && buffer1.getLong(computeTotalCount) > 0)) - val otherUsed = ((computeTotalWeightSum >= 0 && buffer2.getDouble(computeTotalWeightSum) > 0) - || (computeTotalCount >= 0 && buffer2.getLong(computeTotalCount) > 0)) + // The metrics that are used to determine if the buffers are used are always computed. + val thisUsed = (buffer1.getDouble(computeTotalWeightSum) > 0.0 + || buffer1.getLong(computeTotalCount) > 0L) + val otherUsed = (buffer2.getDouble(computeTotalWeightSum) > 0.0 + || buffer2.getLong(computeTotalCount) > 0L) if (thisUsed && otherUsed) { // Merge onto this one. 
val n1 = getN(buffer1) @@ -360,6 +382,7 @@ object SummaryBuilderImpl { val values = requested.map { case Mean => evaluateMean(buffer) case Variance => evaluateVariance(buffer) + case Count => evaluateCount(buffer) case NumNonZeros => evaluateNNZ(buffer) case Max => evaluateMax(buffer) case Min => evaluateMin(buffer) @@ -374,7 +397,9 @@ object SummaryBuilderImpl { assert(computeWeightSum >= 0) assert(computeTotalWeightSum >= 0) val n = getN(buffer) - assert(n >= 0) + if (n < 0) { + throw new IllegalArgumentException("No data point has been seen so far") + } val realMean = Array.ofDim[Double](n) val currMean = exposeDArray2(buffer, computeMean) val weightSum = exposeDArray2(buffer, computeWeightSum) @@ -425,6 +450,11 @@ object SummaryBuilderImpl { Vectors.dense(realVariance) } + private def evaluateCount(buffer: Row): Long = { + assert(computeTotalCount >= 0) + buffer.getLong(computeTotalCount) + } + private def evaluateNNZ(buffer: Row): Vector = { null // TODO } @@ -533,7 +563,7 @@ object SummaryBuilderImpl { if (index == -1) { null } else { - buffer.getAs[Array[Double]](index) + buffer.getAs[mutable.WrappedArray[Double]](index).array } } @@ -577,6 +607,10 @@ object SummaryBuilderImpl { } } + private def getForcedIndexFor(m: ComputeMetrics): Int = { + metricsWithOrder.find(p => p._1 == m).get._2 + } + } } diff --git a/mllib/src/test/scala/org/apache/spark/ml/stat/SummarizerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/stat/SummarizerSuite.scala index 685fea8575c9..38e7bcd157d4 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/stat/SummarizerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/stat/SummarizerSuite.scala @@ -17,13 +17,13 @@ package org.apache.spark.ml.stat - -import org.apache.spark.SparkFunSuite -import org.apache.spark.ml.linalg.Vectors +import org.apache.spark.{SparkException, SparkFunSuite} +import org.apache.spark.ml.linalg.{Vector, Vectors} +import org.apache.spark.ml.util.TestingUtils._ import org.apache.spark.mllib.util.MLlibTestSparkContext -import org.apache.spark.mllib.util.TestingUtils._ import org.apache.spark.sql.{DataFrame, Dataset, Row} +// scalastyle:off println class SummarizerSuite extends SparkFunSuite with MLlibTestSparkContext { @@ -33,29 +33,94 @@ class SummarizerSuite extends SparkFunSuite with MLlibTestSparkContext { private val seed = 42 private val eps: Double = 1e-5 + private case class ExpectedMetrics(mean: Vector, min: Vector) + + // The input is expected to be either a sparse vector, a dense vector or an array of doubles + // (which will be converted to a dense vector) + // The expected is the list of all the known metrics. + // + // The tests take an list of input vectors and a list of all the summary values that + // are expected for this input. They currently test against some fixed subset of the + // metrics, but should be made fuzzy in the future. 
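For illustration, a test written against this helper could look like the following sketch (the input vectors and expected values are made up):

// Hypothetical example: two dense vectors with their expected mean and min.
testExample(
  Seq(Array(0.0, 1.0), Array(2.0, 3.0)),
  ExpectedMetrics(mean = Vectors.dense(1.0, 2.0), min = Vectors.dense(0.0, 1.0)))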
+ + private def testExample(input: Seq[Any], exp: ExpectedMetrics): Unit = { + val inputVec: Seq[Vector] = input.map { + case x: Array[Double] => Vectors.dense(x) + case x: Vector => x + case x => throw new Exception(x.toString) + } + val df = sc.parallelize(inputVec).map(Tuple1.apply).toDF("features") + val c = df.col("features") + + compare(df.select(metrics("mean").summary(c), mean(c)), Seq(exp.mean, exp.mean)) + } + private def denseData(input: Seq[Seq[Double]]): DataFrame = { val data = input.map(_.toArray).map(Vectors.dense).map(Tuple1.apply) sc.parallelize(data).toDF("features") } private def compare(df: DataFrame, exp: Seq[Any]): Unit = { - val res = df.first().toSeq + println(s"compare: df=$df") + val coll = df.collect().toSeq + println(s"compare: coll=$coll") + val Seq(row) = coll + val res = row.toSeq + println(s"compare: ress=${res.map(_.getClass)}") val names = df.schema.fieldNames.zipWithIndex.map { case (n, idx) => s"$n ($idx)" } - assert(res.size === exp.size) + assert(res.size === exp.size, (res.size, exp.size)) for (((x1, x2), name) <- res.zip(exp).zip(names)) { - (x1, x2) match { - case (d1: Double, d2: Double) => - assert(Vectors.dense(d1) ~== Vectors.dense(d2) absTol 1e-4, name) - } + compareStructures(x1, x2, name) } } + // Compares structured content. + private def compareStructures(x1: Any, x2: Any, name: String): Unit = (x1, x2) match { + case (d1: Double, d2: Double) => + assert(Vectors.dense(d1) ~== Vectors.dense(d2) absTol 1e-4, name) + case (r1: Row, r2: Row) => + assert(r1.size === r2.size, (r1, r2)) + for ((x1, x2) <- r1.toSeq.zip(r2.toSeq)) { compareStructures(x1, x2, name) } + case (v1: Vector, v2: Vector) => + assert(v1 ~== v2 absTol 1e-4, name) + case (l1: Long, l2: Long) => assert(l1 === l2) + } + test("basic error handling") { val df = denseData(Nil) val c = df.col("features") val res = df.select(metrics("mean").summary(c), mean(c)) - intercept[IllegalArgumentException] { + intercept[SparkException] { compare(res, Seq.empty) } } + + test("no element, working metrics") { + val df = denseData(Nil) + val c = df.col("features") + val res = df.select(metrics("count").summary(c)) + compare(res, Seq(Row(0L))) + } + + test("single element in vector, all metrics") { + testExample(Seq(Array(1.0, 2.0)), ExpectedMetrics( + Vectors.dense(1.0, 2.0), + Vectors.dense(1.0, 2.0) + )) + } + + test("repeated metrics") { + + } + + test("dense vector input") { + + } + + test("sparse vector input") { + + } + + test("merging summarizer when one side has zero mean (SPARK-4355)") { + } } From 673943f334b94e5d1ecd8874cb82bbc875d739c6 Mon Sep 17 00:00:00 2001 From: Timothy Hunter Date: Mon, 6 Mar 2017 16:01:30 -0800 Subject: [PATCH 03/26] last work --- .../org/apache/spark/ml/stat/Summarizer.scala | 51 ++++++++++++------- 1 file changed, 33 insertions(+), 18 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala b/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala index 1de782af9ca1..625e1e2145c9 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala @@ -331,35 +331,38 @@ object SummaryBuilderImpl extends Logging { // instead of failing silently? // Using exceptions is not great for performance in this loop. 
val thisNnz = if (weightSum == null) { 0.0 } else { weightSum(i) } - val otherNnz = if (otherWeightSum == null) { 0.0 } else { otherWeightSum(i) } + val otherNnz = if (otherWeightSum == null) { 0.0 } else { otherWeightSum.getDouble(i) } val totalNnz = thisNnz + otherNnz val totalCnnz = if (numNonzeros == null || numNonzeros == null) { 0.0 } else { - numNonzeros(i) + otherNumNonzeros(i) + numNonzeros(i) + otherNumNonzeros.getDouble(i) } if (totalNnz != 0.0) { - val deltaMean = if (computeMean >= 0) { otherMean(i) - currMean(i) } else { 0.0 } + val deltaMean = if (computeMean >= 0) { + otherMean.getDouble(i) - currMean(i) + } else { 0.0 } // merge mean together if (computeMean >= 0) { currMean(i) += deltaMean * otherNnz / totalNnz } // merge m2n together if (computeM2n >= 0) { - currM2n(i) += otherM2n(i) + deltaMean * deltaMean * thisNnz * otherNnz / totalNnz + val incr = deltaMean * deltaMean * thisNnz * otherNnz / totalNnz + currM2n(i) += otherM2n.getDouble(i) + incr } // merge m2 together if (computeM2 >= 0) { - currM2(i) += otherM2(i) + currM2(i) += otherM2.getDouble(i) } // merge l1 together if (computeL1 >= 0) { - currL1(i) += otherL1(i) + currL1(i) += otherL1.getDouble(i) } // merge max and min if (computeCurrMax >= 0) { - currMax(i) = math.max(currMax(i), otherMax(i)) + currMax(i) = math.max(currMax(i), otherMax.getDouble(i)) } if (computeCurrMin >= 0) { - currMin(i) = math.min(currMin(i), otherMin(i)) + currMin(i) = math.min(currMin(i), otherMin.getDouble(i)) } } if (computeWeightSum >= 0) { @@ -409,7 +412,7 @@ object SummaryBuilderImpl extends Logging { require(totalWeightSum > 0, "Data has zero weight") var i = 0 while (i < n) { - realMean(i) = currMean(i) * (weightSum(i) / totalWeightSum) + realMean(i) = currMean.getDouble(i) * (weightSum.getDouble(i) / totalWeightSum) i += 1 } Vectors.dense(realMean) @@ -442,8 +445,10 @@ object SummaryBuilderImpl extends Logging { var i = 0 val len = currM2n.length while (i < len) { - realVariance(i) = (currM2n(i) + deltaMean(i) * deltaMean(i) * weightSum(i) * - (totalWeightSum - weightSum(i)) / totalWeightSum) / denominator + val m = deltaMean.getDouble(i) + val num = currM2n.getDouble(i) + m * m * weightSum.getDouble(i) + realVariance(i) = (num * + (totalWeightSum - weightSum.getDouble(i)) / totalWeightSum) / denominator i += 1 } } @@ -499,7 +504,7 @@ object SummaryBuilderImpl extends Logging { } // All these may be null pointers. - val localCurrMean = exposeDArray(buffer, computeMean) + val localCurrMean = exposeDArray3(buffer, computeMean) val localCurrM2n = exposeDArray(buffer, computeM2n) val localCurrM2 = exposeDArray(buffer, computeM2) val localCurrL1 = exposeDArray(buffer, computeL1) @@ -520,12 +525,13 @@ object SummaryBuilderImpl extends Logging { } if (localCurrMean != null) { - val prevMean = localCurrMean(index) + val prevMean = localCurrMean.getDouble(index) val diff = value - prevMean - localCurrMean(index) = prevMean + weight * diff / (localWeightSum(index) + weight) + localCurrMean.update(index, prevMean + weight * diff / (localWeightSum(index) + weight)) +// localCurrMean(index) = prevMean + weight * diff / (localWeightSum(index) + weight) if (localCurrM2n != null) { // require: localCurrMean != null. 
- localCurrM2n(index) += weight * (value - localCurrMean(index)) * diff + localCurrM2n(index) += weight * (value - localCurrMean.getDouble(index)) * diff } } @@ -550,12 +556,11 @@ object SummaryBuilderImpl extends Logging { buffer.getInt(computeN) } - - private[this] def exposeDArray2(buffer: Row, index: Int): Array[Double] = { + private[this] def exposeDArray2(buffer: Row, index: Int): Row = { if (index == -1) { null } else { - buffer.getAs[Array[Double]](index) + buffer.getStruct(index) } } @@ -563,10 +568,20 @@ object SummaryBuilderImpl extends Logging { if (index == -1) { null } else { + buffer.getAs[MutableAggregationBuffer](index) buffer.getAs[mutable.WrappedArray[Double]](index).array } } + private[this] def exposeDArray3( + buffer: MutableAggregationBuffer, index: Int): MutableAggregationBuffer = { + if (index == -1) { + null + } else { + buffer.getAs[MutableAggregationBuffer](index) + } + } + // Performs the initialization of the requested fields, if required. private def initialize(buffer: MutableAggregationBuffer, n: Int): Unit = { require(n > 0, s"Cannot set n to $n: must be positive") From 202b672afec127f4e0885cf3a58f4dfc97031fc6 Mon Sep 17 00:00:00 2001 From: Timothy Hunter Date: Mon, 13 Mar 2017 15:48:47 -0700 Subject: [PATCH 04/26] work on using imperative aggregators --- .../org/apache/spark/ml/stat/Summarizer.scala | 133 +++++++++++++++++- 1 file changed, 132 insertions(+), 1 deletion(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala b/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala index 625e1e2145c9..93e0c7429d10 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala @@ -17,11 +17,16 @@ package org.apache.spark.ml.stat -import scala.collection.mutable +import org.apache.spark.SparkException +import scala.collection.mutable import org.apache.spark.annotation.{DeveloperApi, Since} import org.apache.spark.internal.Logging import org.apache.spark.ml.linalg.{SQLDataTypes, Vector, Vectors} +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.{Expression, UnsafeProjection, UnsafeRow} +import org.apache.spark.sql.catalyst.expressions.aggregate.TypedImperativeAggregate +import org.apache.spark.sql.catalyst.util.GenericArrayData import org.apache.spark.sql.{Column, Row} import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction} import org.apache.spark.sql.types._ @@ -211,6 +216,132 @@ object SummaryBuilderImpl extends Logging { ComputeTotalWeightSum, ComputeWeightSquareSum, ComputeWeightSum, ComputeNNZ, ComputeMax, ComputeMin).zipWithIndex + /** + * The buffer that contains all the summary statistics. If the value is null, it is considered + * to be not required. + * + * If it is required but the size of the vectors (n) is not yet know, it is initialized to + * an empty array. + */ + private case class Buffer private ( + var n: Int = -1, + var mean: Array[Double] = null, + var m2n: Array[Double] = null, + var m2: Array[Double] = null, + var l1: Array[Double] = null, + var totalCount: Long = 0, + var totalWeightSum: Double = 0.0, + var weightSquareSum: Array[Double] = null, + var weightSum: Array[Double] = null, + var nnz: Array[Long] = null, + var max: Array[Double] = null, + var min: Array[Double] = null) + + object Buffer { + // Recursive function, but the number of cases is really small. 
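For instance (illustrative only, within this object), requesting the metrics behind a mean would leave every other per-coefficient field null so it is never allocated:

// Sketch: mean and weightSum are marked required (Array.empty), m2n and the rest stay null.
val meanBuffer = Buffer.fromMetrics(Seq(ComputeMean, ComputeWeightSum, ComputeTotalWeightSum))
assert(meanBuffer.mean != null && meanBuffer.weightSum != null && meanBuffer.m2n == null)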
+ def fromMetrics(requested: Seq[ComputeMetrics]): Buffer = { + if (requested.isEmpty) { + new Buffer() + } else { + val b = fromMetrics(requested.tail) + requested.head match { + case ComputeMean => b.copy(mean = Array.empty) + case ComputeM2n => b.copy(m2n = Array.empty) + case ComputeM2 => b.copy(m2 = Array.empty) + case ComputeL1 => b.copy(l1 = Array.empty) + case ComputeWeightSquareSum => b.copy(weightSquareSum = Array.empty) + case ComputeWeightSum => b.copy(weightSum = Array.empty) + case ComputeNNZ => b.copy(nnz = Array.empty) + case ComputeMax => b.copy(max = Array.empty) + case ComputeMin => b.copy(min = Array.empty) + case _ => b // These cases are already being computed + } + } + } + + def bufferSchema: StructType = { + // TODO: there is room for optimization here: we could take as argument the initialized + // start buffer and only attempt to compute the fields we know will be requested. + val fields = metricsWithOrder.map { case (m, _) => + // TODO(thunterdb) this could be merged with the types above so that there is no confusion + // when adding other types later, and with the initialization below. + val tpe = m match { + case ComputeCount => LongType + case ComputeTotalWeightSum => DoubleType + case ComputeWeightSquareSum => DoubleType + case _ => ArrayType(DoubleType, containsNull = false) + } + StructField(m.toString, tpe, nullable = true) + } + val n = StructField("n", IntegerType, nullable = false) + StructType(n +: fields) + } + + def updateInPlace(buffer: Buffer, v: Vector, w: Double): Unit = { + ??? + } + + @throws[SparkException]("When the buffers are not compatible") + def mergeBuffers(buffer: Buffer, other: Buffer): Buffer = { + ??? + } + + + } + + private case class MetricsAggregate( + requested: Seq[Metrics], + startBuffer: Buffer, + child: Expression) + extends TypedImperativeAggregate[Buffer] { + + override def eval(buff: Buffer): Buffer = Buffer + + override def children: Seq[Expression] = child :: Nil + + override def update(buff: Buffer, row: InternalRow): Buffer = { + val v = row.get(0, SQLDataTypes.VectorType).asInstanceOf[Vector] + + val w = row.numFields match { + case 1 => 1.0 + case 2 => row.getDouble(1) + case x => throw new SparkException(s"Expected 1 or 2 fields, got $x fields.") + } + Buffer.updateInPlace(buff, v, w) + buff + } + + override def merge(buff: Buffer, other: Buffer): Buffer = { + Buffer.mergeBuffers(buff, other) + } + + override def nullable: Boolean = false + + // Make a copy of the start buffer so that the current aggregator can be safely copied around. 
+ override def createAggregationBuffer(): Buffer = startBuffer.copy() + + override def serialize(buff: Buffer): Array[Byte] = { + val array = new GenericArrayData(buff.productIterator.toArray) + projection.apply(InternalRow.apply(array)).getBytes + } + + override def deserialize(bytes: Array[Byte]): Buffer = { + val buffer = createAggregationBuffer() + row.pointTo(bytes, bytes.length) + row.getArray(0).foreach(child.dataType, (_, x: Any) => buffer += x) + buffer + } + + + + private lazy val projection = UnsafeProjection.create( + Array[DataType](ArrayType(elementType = child.dataType, containsNull = false))) + private lazy val row = new UnsafeRow(1) + + + } + + private class MetricsUDAF( requested: Seq[Metrics], From a983284cfeddabd017792e3991cf99a7d3ab1e16 Mon Sep 17 00:00:00 2001 From: Timothy Hunter Date: Fri, 17 Mar 2017 17:14:40 -0700 Subject: [PATCH 05/26] more work on summarizer --- .../org/apache/spark/ml/stat/Summarizer.scala | 182 +++++++++++++++++- 1 file changed, 174 insertions(+), 8 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala b/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala index 93e0c7429d10..728faab71b37 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala @@ -17,17 +17,17 @@ package org.apache.spark.ml.stat -import org.apache.spark.SparkException +import breeze.linalg.{DenseVector => BDV, SparseVector => BSV, Vector => BV} -import scala.collection.mutable +import org.apache.spark.SparkException import org.apache.spark.annotation.{DeveloperApi, Since} import org.apache.spark.internal.Logging -import org.apache.spark.ml.linalg.{SQLDataTypes, Vector, Vectors} +import org.apache.spark.ml.linalg.{BLAS, SQLDataTypes, Vector, Vectors} +import org.apache.spark.sql.{Column, Row} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Expression, UnsafeProjection, UnsafeRow} import org.apache.spark.sql.catalyst.expressions.aggregate.TypedImperativeAggregate import org.apache.spark.sql.catalyst.util.GenericArrayData -import org.apache.spark.sql.{Column, Row} import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction} import org.apache.spark.sql.types._ @@ -231,7 +231,7 @@ object SummaryBuilderImpl extends Logging { var l1: Array[Double] = null, var totalCount: Long = 0, var totalWeightSum: Double = 0.0, - var weightSquareSum: Array[Double] = null, + var totalWeightSquareSum: Double = 0.0, var weightSum: Array[Double] = null, var nnz: Array[Long] = null, var max: Array[Double] = null, @@ -249,7 +249,6 @@ object SummaryBuilderImpl extends Logging { case ComputeM2n => b.copy(m2n = Array.empty) case ComputeM2 => b.copy(m2 = Array.empty) case ComputeL1 => b.copy(l1 = Array.empty) - case ComputeWeightSquareSum => b.copy(weightSquareSum = Array.empty) case ComputeWeightSum => b.copy(weightSum = Array.empty) case ComputeNNZ => b.copy(nnz = Array.empty) case ComputeMax => b.copy(max = Array.empty) @@ -278,15 +277,182 @@ object SummaryBuilderImpl extends Logging { } def updateInPlace(buffer: Buffer, v: Vector, w: Double): Unit = { - ??? + val startN = buffer.n + if (startN == -1) { + buffer.n = v.size + } else { + require(startN == v.size, + s"Trying to insert a vector of size $v into a buffer that " + + s"has been sized with $startN") + } + val n = buffer.n + assert(n > 0, n) + // Always update the following fields. 
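The per-coefficient update done further down in this method is the weighted online (Welford-style) update. A scalar sketch (hypothetical helper, not part of the patch), with previously accumulated weight s, running mean m and centered second moment m2n:

//   diff = x - m;  m' = m + w * diff / (s + w);  m2n' = m2n + w * (x - m') * diff
def updatePartial(s: Double, m: Double, m2n: Double, x: Double, w: Double): (Double, Double, Double) = {
  val diff = x - m
  val newMean = m + w * diff / (s + w)
  (s + w, newMean, m2n + w * (x - newMean) * diff)
}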
+ buffer.totalWeightSum += w + buffer.totalCount += 1 + buffer.totalWeightSquareSum += w * w + // All the fields that we compute on demand: + // TODO: the most common case is dense vectors. In that case we should + // directly use BLAS instructions instead of iterating through a scala iterator. + v.foreachActive { (index, value) => + if (value != 0.0) { + if (buffer.max != null && buffer.max(index) < value) { + buffer.max(index) = value + } + if (buffer.min != null && buffer.min(index) > value) { + buffer.min(index) = value + } + + if (buffer.mean != null) { + val prevMean = buffer.mean(index) + val diff = value - prevMean + buffer.mean(index) += w * diff / (buffer.weightSum(index) + w) + if (buffer.m2n != null) { + buffer.m2n(index) += w * (value - buffer.mean(index)) * diff + } + } + if (buffer.m2 != null) { + buffer.m2(index) += w * value * value + } + if (buffer.l1 != null) { + buffer.l1(index) += w * math.abs(value) + } + if (buffer.weightSum != null) { + buffer.weightSum(index) += w + } + if (buffer.nnz != null) { + buffer.nnz(index) += 1 + } + } + } } @throws[SparkException]("When the buffers are not compatible") def mergeBuffers(buffer: Buffer, other: Buffer): Buffer = { - ??? + if (buffer.n == -1) { + // buffer is not initialized. + if (other.n == -1) { + // Both are not initialized. + buffer + } else { + // other is initialized + other + } + } else { + // Buffer is initialized. + if (other.n == -1) { + buffer + } else { + mergeBuffers0(buffer, other) + buffer + } + } + } + + private def axpy(a: Double, x: Array[Double], y: Array[Double]): Unit = { + BLAS.axpy(a, Vectors.dense(x), Vectors.dense(y)) + } + + private def b(x: Array[Double]): BV[Double] = Vectors.dense(x).asBreeze + + private def bl(x: Array[Long]): BV[Long] = BV.apply(x) + + private def maxInPlace(x: Array[Double], y: Array[Double]): Unit = { + var i = 0 + while(i < x.length) { + // Note: do not use conditions, it is wrong when handling NaNs. + x(i) = Math.max(x(i), y(i)) + i += 1 + } + } + + private def minInPlace(x: Array[Double], y: Array[Double]): Unit = { + var i = 0 + while(i < x.length) { + // Note: do not use conditions, it is wrong when handling NaNs. + x(i) = Math.min(x(i), y(i)) + i += 1 + } + } + + + /** + * Merges other into buffer. + */ + private def mergeBuffers0(buffer: Buffer, other: Buffer): Unit = { + // Each buffer needs to be properly initialized. + require(buffer.n > 0 && other.n > 0, (buffer.n, other.n)) + require(buffer.n == other.n, (buffer.n, other.n)) + // Mandatory scalar values + buffer.totalWeightSquareSum += other.totalWeightSquareSum + buffer.totalWeightSum += other.totalWeightSum + buffer.totalCount += buffer.totalCount + // Keep the original weight sums. + val weightSum1 = if (buffer.weightSum == null) null else { buffer.weightSum.clone() } + val weightSum2 = if (other.weightSum == null) null else { other.weightSum.clone() } + + val weightSum = if (weightSum1 == null) null else { + require(weightSum2 != null) + val arr: Array[Double] = Array.ofDim(buffer.n) + b(arr) :+= b(weightSum1) :- b(weightSum1) + arr + } + + + // Since the operations are dense, we can directly use BLAS calls here. 
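A minimal illustration of the in-place breeze operators relied on below (assumes only the breeze dependency already imported at the top of this file; the vectors are made up):

import breeze.linalg.{DenseVector => BDV}

val x = BDV(1.0, 2.0)
val y = BDV(3.0, 4.0)
// In-place element-wise update: y := y + (x :* 2.0), so y becomes DenseVector(5.0, 8.0).
y :+= x :* 2.0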
+ val deltaMean: Array[Double] = if (buffer.mean != null) { + require(other.mean != null) + val arr: Array[Double] = Array.ofDim(buffer.n) + b(arr) :+= b(other.mean) :- b(buffer.mean) + arr + } else { null } + + if (buffer.mean != null) { + require(other.mean != null) + require(weightSum != null) + b(buffer.mean) :+= b(deltaMean) :* (b(weightSum2) / b(weightSum)) + } + + if (buffer.m2n != null) { + require(other.m2n != null) + val w = (b(weightSum1) :* b(weightSum2)) :/ b(weightSum) + b(buffer.m2n) :+= b(other.m2n) :+ (b(deltaMean) :* b(deltaMean)) :* w + } + + if (buffer.m2 != null) { + require(other.m2 != null) + b(buffer.m2) :+= b(other.m2) + } + + if (buffer.l1 != null) { + require(other.l1 != null) + b(buffer.l1) :+= b(other.l1) + } + + if (buffer.max != null) { + require(other.max != null) + maxInPlace(buffer.max, other.max) + } + + if (buffer.min != null) { + require(other.min != null) + minInPlace(buffer.min, other.min) + } + + if (buffer.nnz != null) { + require(other.nnz != null) + bl(buffer.nnz) :+= bl(other.nnz) + } + + if (buffer.weightSum != null) { + require(other.weightSum != null) + b(buffer.weightSum) :+= b(other.weightSum) + } } + + } private case class MetricsAggregate( From 647a4fecb17d478c3c8cd68d40f2a9456eb10c66 Mon Sep 17 00:00:00 2001 From: Timothy Hunter Date: Tue, 21 Mar 2017 10:47:30 -0700 Subject: [PATCH 06/26] work --- .../org/apache/spark/ml/stat/Summarizer.scala | 190 +++++++++++++++--- 1 file changed, 159 insertions(+), 31 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala b/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala index 728faab71b37..abd5fda96e12 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala @@ -18,6 +18,7 @@ package org.apache.spark.ml.stat import breeze.linalg.{DenseVector => BDV, SparseVector => BSV, Vector => BV} +import breeze.numerics import org.apache.spark.SparkException import org.apache.spark.annotation.{DeveloperApi, Since} @@ -28,7 +29,7 @@ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Expression, UnsafeProjection, UnsafeRow} import org.apache.spark.sql.catalyst.expressions.aggregate.TypedImperativeAggregate import org.apache.spark.sql.catalyst.util.GenericArrayData -import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction} +import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction, UserDefinedFunction} import org.apache.spark.sql.types._ // scalastyle:off println @@ -224,18 +225,18 @@ object SummaryBuilderImpl extends Logging { * an empty array. 
*/ private case class Buffer private ( - var n: Int = -1, - var mean: Array[Double] = null, - var m2n: Array[Double] = null, - var m2: Array[Double] = null, - var l1: Array[Double] = null, - var totalCount: Long = 0, - var totalWeightSum: Double = 0.0, - var totalWeightSquareSum: Double = 0.0, - var weightSum: Array[Double] = null, - var nnz: Array[Long] = null, - var max: Array[Double] = null, - var min: Array[Double] = null) + var n: Int = -1, // 0 + var mean: Array[Double] = null, // 1 + var m2n: Array[Double] = null, // 2 + var m2: Array[Double] = null, // 3 + var l1: Array[Double] = null, // 4 + var totalCount: Long = 0, // 5 + var totalWeightSum: Double = 0.0, // 6 + var totalWeightSquareSum: Double = 0.0, // 7 + var weightSum: Array[Double] = null, // 8 + var nnz: Array[Long] = null, // 9 + var max: Array[Double] = null, // 10 + var min: Array[Double] = null) // 11 object Buffer { // Recursive function, but the number of cases is really small. @@ -327,6 +328,9 @@ object SummaryBuilderImpl extends Logging { } } + /** + * Updates 'buffer' with the content of 'other', and returns 'buffer'. + */ @throws[SparkException]("When the buffers are not compatible") def mergeBuffers(buffer: Buffer, other: Buffer): Buffer = { if (buffer.n == -1) { @@ -349,8 +353,120 @@ object SummaryBuilderImpl extends Logging { } } - private def axpy(a: Double, x: Array[Double], y: Array[Double]): Unit = { - BLAS.axpy(a, Vectors.dense(x), Vectors.dense(y)) + /** + * Reads a buffer from a serialized form, using the row object as an assistant. + */ + def read(bytes: Array[Byte], row: UnsafeRow): Buffer = { + row.pointTo(bytes, bytes.length) + new Buffer( + n = row.getInt(0), + mean = nullableArrayD(row, 1), + m2n = nullableArrayD(row, 2), + m2 = nullableArrayD(row, 3), + l1 = nullableArrayD(row, 4), + totalCount = row.getLong(5), + totalWeightSum = row.getDouble(6), + totalWeightSquareSum = row.getDouble(7), + weightSum = nullableArrayD(row, 8), + nnz = nullableArrayL(row, 9), + max = nullableArrayD(row, 10), + min = nullableArrayD(row, 11) + ) + } + + + def write(buffer: Buffer): Array[Byte] = { + val ir = InternalRow.apply( + buffer.n, + buffer.mean, + buffer.m2n, + buffer.m2, + buffer.l1, + buffer.totalCount, + buffer.totalWeightSum, + buffer.totalWeightSquareSum, + buffer.weightSum, + buffer.nnz, + buffer.max, + buffer.min + ) + projection.apply(ir).getBytes + } + + def mean(buffer: Buffer): Array[Double] = { + require(buffer.totalWeightSum > 0) + require(buffer.mean != null) + require(buffer.weightSum != null) + val res = b(buffer.mean) :* b(buffer.weightSum) :/ buffer.totalWeightSum + res.toArray + } + + def variance(buffer: Buffer): Array[Double] = { + import buffer._ + require(n >= 0) + require(totalWeightSum > 0) + require(totalWeightSquareSum > 0) + require(mean != null) + require(m2n != null) + require(weightSum != null) + val denom = totalWeightSum - (totalWeightSquareSum / totalWeightSum) + if (denom > 0.0) { + val normWs = b(weightSum) :/ totalWeightSum + val x = b(mean) :* b(mean) :* b(weightSum) :* (1.0 - normWs) + val res = (b(m2n) :+ x) :/ denom + res.toArray + } else { + Array.ofDim(n) // Return 0.0 instead. 
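+        // For reference, the non-degenerate branch above computes, per coordinate i, with
+        // W = totalWeightSum, W2 = totalWeightSquareSum and ws(i) the weight sum of the
+        // non-zero samples seen at coordinate i:
+        //   denom  = W - W2 / W
+        //   var(i) = (m2n(i) + mean(i) * mean(i) * ws(i) * (1 - ws(i) / W)) / denom
+        // mirroring the unbiased weighted variance of mllib's MultivariateOnlineSummarizer.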
+ } + } + + def totalCount(buffer: Buffer): Long = buffer.totalCount + + def nnz(buffer: Buffer): Array[Long] = { + require(buffer.nnz != null) + buffer.nnz + } + + def max(buffer: Buffer): Array[Double] = { + require(buffer.max != null) + buffer.max + } + + def min(buffer: Buffer): Array[Double] = { + require(buffer.min != null) + buffer.min + } + + def l2(buffer: Buffer): Array[Double] = { + import buffer._ + require(totalWeightSum > 0.0) + require(m2 != null) + numerics.sqrt(b(m2)).toArray + } + + def l1(buffer: Buffer): Array[Double] = { + require(buffer.l1 != null) + buffer.l1 + } + + private[this] lazy val projection = UnsafeProjection.create(bufferSchema) + + // Returns the array at a given index, or null if the array is null. + private def nullableArrayD(row: UnsafeRow, ordinal: Int): Array[Double] = { + if (row.isNullAt(ordinal)) { + null + } else { + row.getArray(ordinal).toDoubleArray + } + } + + // Returns the array at a given index, or null if the array is null. + private def nullableArrayL(row: UnsafeRow, ordinal: Int): Array[Long] = { + if (row.isNullAt(ordinal)) { + null + } else { + row.getArray(ordinal).toLongArray + } } private def b(x: Array[Double]): BV[Double] = Vectors.dense(x).asBreeze @@ -449,19 +565,31 @@ object SummaryBuilderImpl extends Logging { b(buffer.weightSum) :+= b(other.weightSum) } } - - - - } private case class MetricsAggregate( requested: Seq[Metrics], startBuffer: Buffer, - child: Expression) + child: Expression, + mutableAggBufferOffset: Int, + inputAggBufferOffset: Int) extends TypedImperativeAggregate[Buffer] { - override def eval(buff: Buffer): Buffer = Buffer + private lazy val row = new UnsafeRow(1) + + override def eval(buff: Buffer): Row = { + val metrics = requested.map({ + case Mean => Buffer.mean(buff) + case Variance => Buffer.variance(buff) + case Count => Buffer.totalCount(buff) + case NumNonZeros => Buffer.nnz(buff) + case Max => Buffer.max(buff) + case Min => Buffer.min(buff) + case NormL2 => Buffer.l2(buff) + case NormL1 => Buffer.l1(buff) + }) + Row(metrics: _*) + } override def children: Seq[Expression] = child :: Nil @@ -487,24 +615,24 @@ object SummaryBuilderImpl extends Logging { override def createAggregationBuffer(): Buffer = startBuffer.copy() override def serialize(buff: Buffer): Array[Byte] = { - val array = new GenericArrayData(buff.productIterator.toArray) - projection.apply(InternalRow.apply(array)).getBytes + Buffer.write(buff) } override def deserialize(bytes: Array[Byte]): Buffer = { - val buffer = createAggregationBuffer() - row.pointTo(bytes, bytes.length) - row.getArray(0).foreach(child.dataType, (_, x: Any) => buffer += x) - buffer + Buffer.read(bytes, row) } + override def withNewMutableAggBufferOffset(newMutableAggBufferOffset: Int): MetricsAggregate = { + copy(mutableAggBufferOffset = newMutableAggBufferOffset) + } + override def withNewInputAggBufferOffset(newInputAggBufferOffset: Int): MetricsAggregate = { + copy(inputAggBufferOffset = newInputAggBufferOffset) + } - private lazy val projection = UnsafeProjection.create( - Array[DataType](ArrayType(elementType = child.dataType, containsNull = false))) - private lazy val row = new UnsafeRow(1) - + override def dataType: DataType = Buffer.bufferSchema + override def prettyName: String = "aggregate_metrics" } From 3c4bef772a3cbc759e43223af658a357c5ca6bc2 Mon Sep 17 00:00:00 2001 From: Timothy Hunter Date: Tue, 21 Mar 2017 11:54:16 -0700 Subject: [PATCH 07/26] changes --- .../org/apache/spark/ml/stat/Summarizer.scala | 475 +----------------- 
.../spark/ml/stat/SummarizerSuite.scala | 13 + 2 files changed, 36 insertions(+), 452 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala b/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala index abd5fda96e12..018fb4cbc092 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala @@ -24,10 +24,11 @@ import org.apache.spark.SparkException import org.apache.spark.annotation.{DeveloperApi, Since} import org.apache.spark.internal.Logging import org.apache.spark.ml.linalg.{BLAS, SQLDataTypes, Vector, Vectors} +import org.apache.spark.ml.stat.SummaryBuilderImpl.MetricsAggregate import org.apache.spark.sql.{Column, Row} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Expression, UnsafeProjection, UnsafeRow} -import org.apache.spark.sql.catalyst.expressions.aggregate.TypedImperativeAggregate +import org.apache.spark.sql.catalyst.expressions.aggregate.{AggregateExpression, Final, TypedImperativeAggregate} import org.apache.spark.sql.catalyst.util.GenericArrayData import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction, UserDefinedFunction} import org.apache.spark.sql.types._ @@ -46,19 +47,10 @@ abstract class SummaryBuilder { * Returns an aggregate object that contains the summary of the column with the requested metrics. * @param column a column that contains Vector object. * @return an aggregate column that contains the statistics. The exact content of this - * structure is determined during the creation of the builder. It is also provided by - * [[summaryStructure]]. + * structure is determined during the creation of the builder. */ @Since("2.2.0") def summary(column: Column): Column - - /** - * The structure of the summary that will be returned by this builder. - * @return the structure of the aggregate data returned by this builder. - */ - @Since("2.2.0") - @DeveloperApi - private[spark] def summaryStructure: StructType } /** @@ -107,21 +99,26 @@ object Summarizer extends Logging { @Since("2.2.0") def metrics(firstMetric: String, metrics: String*): SummaryBuilder = { val (typedMetrics, computeMetrics) = getRelevantMetrics(Seq(firstMetric) ++ metrics) - new SummaryBuilderImpl( - SummaryBuilderImpl.buildUdafForMetrics(typedMetrics, computeMetrics), - SummaryBuilderImpl.structureForMetrics(typedMetrics)) + new SummaryBuilderImpl(typedMetrics, computeMetrics) } def mean(col: Column): Column = metrics("mean").summary(col) } private[ml] class SummaryBuilderImpl( - udaf: UserDefinedAggregateFunction, - structType: StructType) extends SummaryBuilder { - - override def summary(column: Column): Column = udaf.apply(column) - - override val summaryStructure: StructType = structType + requestedMetrics: Seq[SummaryBuilderImpl.Metrics], + requestedCompMetrics: Seq[SummaryBuilderImpl.ComputeMetrics]) extends SummaryBuilder { + + override def summary(column: Column): Column = { + val start = SummaryBuilderImpl.Buffer.fromMetrics(requestedCompMetrics) + val agg = SummaryBuilderImpl.MetricsAggregate( + requestedMetrics, + start, + column.expr, + mutableAggBufferOffset = 0, + inputAggBufferOffset = 0) + new Column(AggregateExpression(agg, mode = Final, isDistinct = false)) + } } private[ml] @@ -148,7 +145,8 @@ object SummaryBuilderImpl extends Logging { def structureForMetrics(metrics: Seq[Metrics]): StructType = { val fields = metrics.map { m => // All types are vectors, except for one. 
- val tpe = if (m == Count) { LongType } else { SQLDataTypes.VectorType } +// val tpe = if (m == Count) { LongType } else { SQLDataTypes.VectorType } + val tpe = if (m == Count) { LongType } else { ArrayType(DoubleType, containsNull = false) } // We know the metric is part of the list. val (name, _, _) = allMetrics.find(tup => tup._2 == m).get StructField(name, tpe, nullable = false) @@ -156,11 +154,6 @@ object SummaryBuilderImpl extends Logging { StructType(fields) } - def buildUdafForMetrics( - metrics: Seq[Metrics], - computeMetrics: Seq[ComputeMetrics]): UserDefinedAggregateFunction = { - new MetricsUDAF(metrics, computeMetrics) - } /** * All the metrics that can be currently computed by Spark for vectors. @@ -224,7 +217,7 @@ object SummaryBuilderImpl extends Logging { * If it is required but the size of the vectors (n) is not yet know, it is initialized to * an empty array. */ - private case class Buffer private ( + case class Buffer private ( var n: Int = -1, // 0 var mean: Array[Double] = null, // 1 var m2n: Array[Double] = null, // 2 @@ -260,11 +253,7 @@ object SummaryBuilderImpl extends Logging { } def bufferSchema: StructType = { - // TODO: there is room for optimization here: we could take as argument the initialized - // start buffer and only attempt to compute the fields we know will be requested. val fields = metricsWithOrder.map { case (m, _) => - // TODO(thunterdb) this could be merged with the types above so that there is no confusion - // when adding other types later, and with the initialization below. val tpe = m match { case ComputeCount => LongType case ComputeTotalWeightSum => DoubleType @@ -406,13 +395,13 @@ object SummaryBuilderImpl extends Logging { require(n >= 0) require(totalWeightSum > 0) require(totalWeightSquareSum > 0) - require(mean != null) + require(buffer.mean != null) require(m2n != null) require(weightSum != null) val denom = totalWeightSum - (totalWeightSquareSum / totalWeightSum) if (denom > 0.0) { val normWs = b(weightSum) :/ totalWeightSum - val x = b(mean) :* b(mean) :* b(weightSum) :* (1.0 - normWs) + val x = b(buffer.mean) :* b(buffer.mean) :* b(weightSum) :* (- normWs :+ 1.0) val res = (b(m2n) :+ x) :/ denom res.toArray } else { @@ -630,427 +619,9 @@ object SummaryBuilderImpl extends Logging { copy(inputAggBufferOffset = newInputAggBufferOffset) } - override def dataType: DataType = Buffer.bufferSchema + override lazy val dataType: DataType = structureForMetrics(requested) override def prettyName: String = "aggregate_metrics" } - - - private class MetricsUDAF( - requested: Seq[Metrics], - metrics: Seq[ComputeMetrics]) extends UserDefinedAggregateFunction { - - // All of these are the indexes of the value in the temporary buffer, or -1 if these values are - // not requested. - // All these values are unrolled so that we do not pay too steep a penalty for the extra level - // of indirection associated with selectively computing some metrics. - private[this] val computeMean = getIndexFor(ComputeMean) - private[this] val computeM2n = getIndexFor(ComputeM2n) - private[this] val computeM2 = getIndexFor(ComputeM2) - private[this] val computeL1 = getIndexFor(ComputeL1) - private[this] val computeTotalCount = getForcedIndexFor(ComputeCount) // Always compute it. - private[this] val computeTotalWeightSum = getForcedIndexFor(ComputeTotalWeightSum) // Always. 
- private[this] val computeWeightSquareSum = getIndexFor(ComputeWeightSquareSum) - private[this] val computeWeightSum = getIndexFor(ComputeWeightSum) - private[this] val computeNNZ = getIndexFor(ComputeNNZ) - private[this] val computeCurrMax = getIndexFor(ComputeMax) - private[this] val computeCurrMin = getIndexFor(ComputeMin) - // Always computed, and put at the end. - private[this] val computeN = metricsWithOrder.size - - override def inputSchema: StructType = { - StructType(Seq(StructField("features", SQLDataTypes.VectorType))) - } - - // The schema for the buffer contains all the structures, but a subset of them may be null. - override def bufferSchema: StructType = { - val fields = metricsWithOrder.map { case (m, _) => - // TODO(thunterdb) this could be merged with the types above so that there is no confusion - // when adding other types later, and with the initialization below. - val tpe = m match { - case ComputeCount => LongType - case ComputeTotalWeightSum => DoubleType - case ComputeWeightSquareSum => DoubleType - case _ => ArrayType(DoubleType, containsNull = false) - } - StructField(m.toString, tpe, nullable = true) - } :+ StructField("n", IntegerType, nullable = false) - StructType(fields) - } - - override def dataType: DataType = { - structureForMetrics(requested) - } - - override def deterministic: Boolean = true - - // The initialization does not do anything because we do not know the size of the vectors yet. - override def initialize(buffer: MutableAggregationBuffer): Unit = { - // Set n to -1 to indicate that we do not know the size yet. - println("initialize: size=" + buffer.size + " computeN = " + computeN) - // Thes metrics are always computed. - buffer.update(computeN, -1) - buffer.update(computeTotalCount, 0L) - buffer.update(computeTotalWeightSum, 0.0) - } - - override def update(buffer: MutableAggregationBuffer, input: Row): Unit = { - val v = input.getAs[Vector](0) - // TODO: add weights later. - update(buffer, v, 1.0) - } - - override def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = { - // The metrics that are used to determine if the buffers are used are always computed. - val thisUsed = (buffer1.getDouble(computeTotalWeightSum) > 0.0 - || buffer1.getLong(computeTotalCount) > 0L) - val otherUsed = (buffer2.getDouble(computeTotalWeightSum) > 0.0 - || buffer2.getLong(computeTotalCount) > 0L) - if (thisUsed && otherUsed) { - // Merge onto this one. - val n1 = getN(buffer1) - val n2 = getN(buffer2) - require(n1 == n2, s"Dimensions mismatch between summarizers: expected $n1 but got $n2") - require(n1 > 0, s"Expected the summarizers to be initialized") - // The scalar values. - if (computeTotalCount >= 0) { - buffer1.update(computeTotalCount, - buffer1.getLong(computeTotalCount) + buffer2.getLong(computeTotalCount)) - } - if (computeTotalWeightSum >= 0) { - buffer1.update(computeTotalWeightSum, - buffer1.getDouble(computeTotalWeightSum) + buffer2.getDouble(computeTotalWeightSum)) - } - if (computeWeightSquareSum >= 0) { - buffer1.update(computeWeightSquareSum, - buffer1.getDouble(computeWeightSquareSum) + buffer2.getDouble(computeWeightSquareSum)) - } - // The vector values. 
- val currMean = exposeDArray(buffer1, computeMean) - val currM2n = exposeDArray(buffer1, computeM2n) - val currM2 = exposeDArray(buffer1, computeM2) - val currL1 = exposeDArray(buffer1, computeL1) - val weightSum = exposeDArray(buffer1, computeWeightSum) - val numNonzeros = exposeDArray(buffer1, computeNNZ) - val currMax = exposeDArray(buffer1, computeCurrMax) - val currMin = exposeDArray(buffer1, computeCurrMin) - - val otherMean = exposeDArray2(buffer2, computeMean) - val otherM2n = exposeDArray2(buffer2, computeM2n) - val otherM2 = exposeDArray2(buffer2, computeM2) - val otherL1 = exposeDArray2(buffer2, computeL1) - val otherWeightSum = exposeDArray2(buffer2, computeWeightSum) - val otherNumNonzeros = exposeDArray2(buffer2, computeNNZ) - val otherMax = exposeDArray2(buffer2, computeCurrMax) - val otherMin = exposeDArray2(buffer2, computeCurrMin) - - - var i = 0 - while (i < n1) { - // This case is written do maximize conformance with the current code. - // For more proper implementation, the differnt loops should be disentangled. - // The case of 0.0 values being used should not happen if the proper selection - // of metrics is done. - // TODO: should it be NaN so that any computation with it will indicate an error, - // instead of failing silently? - // Using exceptions is not great for performance in this loop. - val thisNnz = if (weightSum == null) { 0.0 } else { weightSum(i) } - val otherNnz = if (otherWeightSum == null) { 0.0 } else { otherWeightSum.getDouble(i) } - val totalNnz = thisNnz + otherNnz - val totalCnnz = if (numNonzeros == null || numNonzeros == null) { 0.0 } else { - numNonzeros(i) + otherNumNonzeros.getDouble(i) - } - if (totalNnz != 0.0) { - val deltaMean = if (computeMean >= 0) { - otherMean.getDouble(i) - currMean(i) - } else { 0.0 } - // merge mean together - if (computeMean >= 0) { - currMean(i) += deltaMean * otherNnz / totalNnz - } - // merge m2n together - if (computeM2n >= 0) { - val incr = deltaMean * deltaMean * thisNnz * otherNnz / totalNnz - currM2n(i) += otherM2n.getDouble(i) + incr - } - // merge m2 together - if (computeM2 >= 0) { - currM2(i) += otherM2.getDouble(i) - } - // merge l1 together - if (computeL1 >= 0) { - currL1(i) += otherL1.getDouble(i) - } - // merge max and min - if (computeCurrMax >= 0) { - currMax(i) = math.max(currMax(i), otherMax.getDouble(i)) - } - if (computeCurrMin >= 0) { - currMin(i) = math.min(currMin(i), otherMin.getDouble(i)) - } - } - if (computeWeightSum >= 0) { - weightSum(i) = totalNnz - } - if (computeNNZ >= 0) { - numNonzeros(i) = totalCnnz - } - - i += 1 - } - } else if (otherUsed) { - // Copy buffer1 onto buffer2. - // TODO - } - } - - // This function packs the requested values, in order. - override def evaluate(buffer: Row): Row = { - val values = requested.map { - case Mean => evaluateMean(buffer) - case Variance => evaluateVariance(buffer) - case Count => evaluateCount(buffer) - case NumNonZeros => evaluateNNZ(buffer) - case Max => evaluateMax(buffer) - case Min => evaluateMin(buffer) - case NormL1 => evaluateNormL1(buffer) - case NormL2 => evaluateNormL2(buffer) - } - Row(values: _*) - } - - private def evaluateMean(buffer: Row): Vector = { - assert(computeMean >= 0) // This is a programming error. 
- assert(computeWeightSum >= 0) - assert(computeTotalWeightSum >= 0) - val n = getN(buffer) - if (n < 0) { - throw new IllegalArgumentException("No data point has been seen so far") - } - val realMean = Array.ofDim[Double](n) - val currMean = exposeDArray2(buffer, computeMean) - val weightSum = exposeDArray2(buffer, computeWeightSum) - val totalWeightSum = buffer.getDouble(computeTotalWeightSum) - // TODO: should this be turned into NaN instead? That would be more consistent than - // an error. - require(totalWeightSum > 0, "Data has zero weight") - var i = 0 - while (i < n) { - realMean(i) = currMean.getDouble(i) * (weightSum.getDouble(i) / totalWeightSum) - i += 1 - } - Vectors.dense(realMean) - } - - private def evaluateVariance(buffer: Row): Vector = { - assert(computeTotalWeightSum >= 0) - assert(computeWeightSquareSum >= 0) - assert(computeWeightSum >= 0) - assert(computeM2 >= 0) - assert(computeM2n >= 0) - - val totalWeightSum = buffer.getDouble(computeTotalWeightSum) - require(totalWeightSum > 0, s"Nothing has been added to this summarizer.") - - val n = getN(buffer) - assert(n >= 0) - - val weightSquareSum = buffer.getDouble(computeWeightSquareSum) - val realVariance = Array.ofDim[Double](n) - - val currMean = exposeDArray2(buffer, computeMean) - val currM2n = exposeDArray2(buffer, computeM2n) - val weightSum = exposeDArray2(buffer, computeWeightSum) - val denominator = totalWeightSum - (weightSquareSum / totalWeightSum) - - // Sample variance is computed, if the denominator is less than 0, the variance is just 0. - if (denominator > 0.0) { - val deltaMean = currMean - var i = 0 - val len = currM2n.length - while (i < len) { - val m = deltaMean.getDouble(i) - val num = currM2n.getDouble(i) + m * m * weightSum.getDouble(i) - realVariance(i) = (num * - (totalWeightSum - weightSum.getDouble(i)) / totalWeightSum) / denominator - i += 1 - } - } - Vectors.dense(realVariance) - } - - private def evaluateCount(buffer: Row): Long = { - assert(computeTotalCount >= 0) - buffer.getLong(computeTotalCount) - } - - private def evaluateNNZ(buffer: Row): Vector = { - null // TODO - } - - private def evaluateMax(buffer: Row): Vector = { - null // TODO - } - - private def evaluateMin(buffer: Row): Vector = { - null // TODO - } - - private def evaluateNormL1(buffer: Row): Vector = { - null // TODO - } - - private def evaluateNormL2(buffer: Row): Vector = { - null // TODO - } - - private def update(buffer: MutableAggregationBuffer, instance: Vector, weight: Double): Unit = { - val startN = getN(buffer) - if (startN == -1) { - initialize(buffer, instance.size) - } else { - require(startN == instance.size, - s"Trying to insert a vector of size $instance into a buffer that " + - s"has been sized with $startN") - } - // It should be initialized. - assert(getN(buffer) > 0) - // Update the scalars that do not require going through the vector first. - if (computeTotalWeightSum >= 0) { - buffer.update(computeTotalWeightSum, buffer.getDouble(computeTotalWeightSum) + weight) - } - if (computeWeightSquareSum >= 0) { - buffer.update(computeWeightSquareSum, - buffer.getDouble(computeWeightSquareSum) + weight * weight) - } - if (computeTotalCount >= 0) { - buffer.update(computeTotalCount, buffer.getLong(computeTotalCount) + 1) - } - - // All these may be null pointers. 
- val localCurrMean = exposeDArray3(buffer, computeMean) - val localCurrM2n = exposeDArray(buffer, computeM2n) - val localCurrM2 = exposeDArray(buffer, computeM2) - val localCurrL1 = exposeDArray(buffer, computeL1) - val localWeightSum = exposeDArray(buffer, computeWeightSum) - val localNumNonzeros = exposeDArray(buffer, computeNNZ) - val localCurrMax = exposeDArray(buffer, computeCurrMax) - val localCurrMin = exposeDArray(buffer, computeCurrMin) - - // Update the requested vector values. - // All these changes are in place. There is no need to make copies. - instance.foreachActive { (index, value) => - if (value != 0.0) { - if (localCurrMax != null && localCurrMax(index) < value) { - localCurrMax(index) = value - } - if (localCurrMin != null && localCurrMin(index) > value) { - localCurrMin(index) = value - } - - if (localCurrMean != null) { - val prevMean = localCurrMean.getDouble(index) - val diff = value - prevMean - localCurrMean.update(index, prevMean + weight * diff / (localWeightSum(index) + weight)) -// localCurrMean(index) = prevMean + weight * diff / (localWeightSum(index) + weight) - if (localCurrM2n != null) { - // require: localCurrMean != null. - localCurrM2n(index) += weight * (value - localCurrMean.getDouble(index)) * diff - } - } - - if (localCurrM2 != null) { - localCurrM2(index) += weight * value * value - } - if (localCurrL1 != null) { - localCurrL1(index) += weight * math.abs(value) - } - if (localWeightSum != null) { - localWeightSum(index) += weight - } - if (localNumNonzeros != null) { - localNumNonzeros(index) += 1 - } - } - } - } - - // the current vector size, or -1 if not available yet. - private def getN(buffer: Row): Int = { - buffer.getInt(computeN) - } - - private[this] def exposeDArray2(buffer: Row, index: Int): Row = { - if (index == -1) { - null - } else { - buffer.getStruct(index) - } - } - - private[this] def exposeDArray(buffer: MutableAggregationBuffer, index: Int): Array[Double] = { - if (index == -1) { - null - } else { - buffer.getAs[MutableAggregationBuffer](index) - buffer.getAs[mutable.WrappedArray[Double]](index).array - } - } - - private[this] def exposeDArray3( - buffer: MutableAggregationBuffer, index: Int): MutableAggregationBuffer = { - if (index == -1) { - null - } else { - buffer.getAs[MutableAggregationBuffer](index) - } - } - - // Performs the initialization of the requested fields, if required. - private def initialize(buffer: MutableAggregationBuffer, n: Int): Unit = { - require(n > 0, s"Cannot set n to $n: must be positive") - val currentN = buffer.getInt(computeN) - require(currentN == -1 || (currentN == n), - s"The vector size has already been set with $currentN, requested change to $n") - if (currentN == -1) { - // Set all the vectors to the given value. - // This is not a tight loop, so there should be no performance issue in - // making lookups for the values. - buffer.update(computeN, n) - // Only initialize the metricts that we want to compute: - metricsWithOrder.filter(p => metrics.contains(p._1)).foreach { case (m, idx) => - // Special cases for the scalar values. - m match { - case ComputeCount => - buffer.update(idx, 0L) - case ComputeTotalWeightSum => - buffer.update(idx, 0.0) - case ComputeWeightSquareSum => - buffer.update(idx, 0.0) - case _ => - buffer.update(idx, Array.ofDim[Double](n)) - } - } - } - } - - /** - * Returns a fixed index for the compute metric if it is in the list of requested metrics, - * and -1 otherwise. 
- */ - private def getIndexFor(m: ComputeMetrics): Int = { - if (metrics.contains(m)) { - metricsWithOrder.find(p => p._1 == m).get._2 - } else { - -1 - } - } - - private def getForcedIndexFor(m: ComputeMetrics): Int = { - metricsWithOrder.find(p => p._1 == m).get._2 - } - - } - } diff --git a/mllib/src/test/scala/org/apache/spark/ml/stat/SummarizerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/stat/SummarizerSuite.scala index 38e7bcd157d4..925b07e34841 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/stat/SummarizerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/stat/SummarizerSuite.scala @@ -86,6 +86,19 @@ class SummarizerSuite extends SparkFunSuite with MLlibTestSparkContext { case (l1: Long, l2: Long) => assert(l1 === l2) } + test("debugging test") { + val df = denseData(Nil) + val c = df.col("features") + println(s">>>>> c=$c") + val c1 = metrics("mean").summary(c) + println(s">>>>> c1=$c1") + val res = df.select(c1) + println(s">>>>> res=$res") + intercept[SparkException] { + compare(res, Seq.empty) + } + } + test("basic error handling") { val df = denseData(Nil) val c = df.col("features") From c3f236c4422031ae818cb6bbec2415b3f1bf7b70 Mon Sep 17 00:00:00 2001 From: Timothy Hunter Date: Tue, 21 Mar 2017 12:03:07 -0700 Subject: [PATCH 08/26] cleanup --- .../org/apache/spark/ml/stat/Summarizer.scala | 36 +++++++++---------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala b/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala index 018fb4cbc092..8ae2eacfd615 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala @@ -187,7 +187,7 @@ object SummaryBuilderImpl extends Logging { case object NormL1 extends Metrics /** - * The running metrics that are going to be computing. + * The running metrics that are going to be computed. * * There is a bipartite graph between the metrics and the computed metrics. */ @@ -204,12 +204,6 @@ object SummaryBuilderImpl extends Logging { case object ComputeMax extends ComputeMetrics case object ComputeMin extends ComputeMetrics - // The order in which the metrics will be stored in the buffer. - // This order replicates the order above. - private val metricsWithOrder = Seq(ComputeMean, ComputeM2n, ComputeM2, ComputeL1, ComputeCount, - ComputeTotalWeightSum, ComputeWeightSquareSum, ComputeWeightSum, ComputeNNZ, ComputeMax, - ComputeMin).zipWithIndex - /** * The buffer that contains all the summary statistics. If the value is null, it is considered * to be not required. 
@@ -253,17 +247,23 @@ object SummaryBuilderImpl extends Logging { } def bufferSchema: StructType = { - val fields = metricsWithOrder.map { case (m, _) => - val tpe = m match { - case ComputeCount => LongType - case ComputeTotalWeightSum => DoubleType - case ComputeWeightSquareSum => DoubleType - case _ => ArrayType(DoubleType, containsNull = false) - } - StructField(m.toString, tpe, nullable = true) - } - val n = StructField("n", IntegerType, nullable = false) - StructType(n +: fields) + val at = ArrayType(DoubleType, containsNull = false) + val al = ArrayType(LongType, containsNull = false) + val fields = Seq( + "n" -> IntegerType, + "mean" -> at, + "m2n" -> at, + "m2" -> at, + "l1" -> at, + "totalCount" -> LongType, + "totalWeightSum" -> DoubleType, + "totalWeightSquareSum" -> DoubleType, + "weightSum" -> at, + "nnz" -> al, + "max" -> at, + "min" -> at + ) + StructType(fields.map { case (name, t) => StructField(name, t, nullable = true)}) } def updateInPlace(buffer: Buffer, v: Vector, w: Double): Unit = { From ef955c00275705f14342f3e4ed970a78f0f3c141 Mon Sep 17 00:00:00 2001 From: Timothy Hunter Date: Tue, 21 Mar 2017 15:42:42 -0700 Subject: [PATCH 09/26] debugging --- .../org/apache/spark/ml/stat/Summarizer.scala | 38 +++++++++++++------ 1 file changed, 27 insertions(+), 11 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala b/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala index 8ae2eacfd615..16c842c3f8cb 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala @@ -27,8 +27,8 @@ import org.apache.spark.ml.linalg.{BLAS, SQLDataTypes, Vector, Vectors} import org.apache.spark.ml.stat.SummaryBuilderImpl.MetricsAggregate import org.apache.spark.sql.{Column, Row} import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.catalyst.expressions.{Expression, UnsafeProjection, UnsafeRow} -import org.apache.spark.sql.catalyst.expressions.aggregate.{AggregateExpression, Final, TypedImperativeAggregate} +import org.apache.spark.sql.catalyst.expressions.{Expression, UnsafeArrayData, UnsafeProjection, UnsafeRow} +import org.apache.spark.sql.catalyst.expressions.aggregate.{AggregateExpression, Complete, Final, TypedImperativeAggregate} import org.apache.spark.sql.catalyst.util.GenericArrayData import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction, UserDefinedFunction} import org.apache.spark.sql.types._ @@ -117,7 +117,7 @@ private[ml] class SummaryBuilderImpl( column.expr, mutableAggBufferOffset = 0, inputAggBufferOffset = 0) - new Column(AggregateExpression(agg, mode = Final, isDistinct = false)) + new Column(AggregateExpression(agg, mode = Complete, isDistinct = false)) } } @@ -367,17 +367,17 @@ object SummaryBuilderImpl extends Logging { def write(buffer: Buffer): Array[Byte] = { val ir = InternalRow.apply( buffer.n, - buffer.mean, - buffer.m2n, - buffer.m2, - buffer.l1, + gadD(buffer.mean), + gadD(buffer.m2n), + gadD(buffer.m2), + gadD(buffer.l1), buffer.totalCount, buffer.totalWeightSum, buffer.totalWeightSquareSum, - buffer.weightSum, - buffer.nnz, - buffer.max, - buffer.min + gadD(buffer.weightSum), + gadL(buffer.nnz), + gadD(buffer.max), + gadD(buffer.min) ) projection.apply(ir).getBytes } @@ -438,6 +438,22 @@ object SummaryBuilderImpl extends Logging { buffer.l1 } + private def gadD(arr: Array[Double]): UnsafeArrayData = { + if (arr == null) { + null + } else { + 
UnsafeArrayData.fromPrimitiveArray(arr) + } + } + + private def gadL(arr: Array[Long]): UnsafeArrayData = { + if (arr == null) { + null + } else { + UnsafeArrayData.fromPrimitiveArray(arr) + } + } + private[this] lazy val projection = UnsafeProjection.create(bufferSchema) // Returns the array at a given index, or null if the array is null. From a04f923913ca1118a61d66bd53b8514af62594d7 Mon Sep 17 00:00:00 2001 From: Timothy Hunter Date: Tue, 21 Mar 2017 16:14:23 -0700 Subject: [PATCH 10/26] work --- .../org/apache/spark/ml/stat/Summarizer.scala | 56 +++++++++++++++++-- 1 file changed, 50 insertions(+), 6 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala b/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala index 16c842c3f8cb..8894dc596e51 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala @@ -24,7 +24,7 @@ import org.apache.spark.SparkException import org.apache.spark.annotation.{DeveloperApi, Since} import org.apache.spark.internal.Logging import org.apache.spark.ml.linalg.{BLAS, SQLDataTypes, Vector, Vectors} -import org.apache.spark.ml.stat.SummaryBuilderImpl.MetricsAggregate +import org.apache.spark.ml.linalg.VectorUDT import org.apache.spark.sql.{Column, Row} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Expression, UnsafeArrayData, UnsafeProjection, UnsafeRow} @@ -269,7 +269,7 @@ object SummaryBuilderImpl extends Logging { def updateInPlace(buffer: Buffer, v: Vector, w: Double): Unit = { val startN = buffer.n if (startN == -1) { - buffer.n = v.size + resizeArraysInPlace(buffer, v.size) } else { require(startN == v.size, s"Trying to insert a vector of size $v into a buffer that " + @@ -345,8 +345,9 @@ object SummaryBuilderImpl extends Logging { /** * Reads a buffer from a serialized form, using the row object as an assistant. */ - def read(bytes: Array[Byte], row: UnsafeRow): Buffer = { - row.pointTo(bytes, bytes.length) + def read(bytes: Array[Byte], row2: UnsafeRow): Buffer = { + row2.pointTo(bytes, bytes.length) + val row = row2.getStruct(0, 12) new Buffer( n = row.getInt(0), mean = nullableArrayD(row, 1), @@ -438,6 +439,40 @@ object SummaryBuilderImpl extends Logging { buffer.l1 } + private def resizeArraysInPlace(buffer: Buffer, n: Int): Unit = { + // It should either be unsized or of the same size. + require(buffer.n == -1 || buffer.n == n, (buffer.n, n)) + if (buffer.n == n) { + return + } + buffer.n = n + // Conditional resize of the non-null arrays + if (buffer.mean != null) { + buffer.mean = Array.ofDim(n) + } + if (buffer.m2n != null) { + buffer.m2n = Array.ofDim(n) + } + if (buffer.m2 != null) { + buffer.m2 = Array.ofDim(n) + } + if (buffer.l1 != null) { + buffer.l1 = Array.ofDim(n) + } + if (buffer.weightSum != null) { + buffer.weightSum = Array.ofDim(n) + } + if (buffer.nnz != null) { + buffer.nnz = Array.ofDim(n) + } + if (buffer.max != null) { + buffer.max = Array.ofDim(n) + } + if (buffer.min != null) { + buffer.min = Array.ofDim(n) + } + } + private def gadD(arr: Array[Double]): UnsafeArrayData = { if (arr == null) { null @@ -599,7 +634,10 @@ object SummaryBuilderImpl extends Logging { override def children: Seq[Expression] = child :: Nil override def update(buff: Buffer, row: InternalRow): Buffer = { - val v = row.get(0, SQLDataTypes.VectorType).asInstanceOf[Vector] + println(s"UPDATE: ROW=$row") + // Unsafe rows do not play well with UDTs, it seems. 
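+      // (VectorUDT.sqlType is a struct with four fields: type, size, indices, values; so
+      // getStruct(0, udt.sqlType.size) extracts that nested struct for the deserializer.)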
+ // Directly call the deserializer. + val v = udt.deserialize(row.getStruct(0, udt.sqlType.size)) val w = row.numFields match { case 1 => 1.0 @@ -620,7 +658,11 @@ object SummaryBuilderImpl extends Logging { override def createAggregationBuffer(): Buffer = startBuffer.copy() override def serialize(buff: Buffer): Array[Byte] = { - Buffer.write(buff) + val x = Buffer.write(buff) + println(s"serialize: buff=$buff") + val b2 = deserialize(x) + println(s"serialize: b2=$b2") + x } override def deserialize(bytes: Array[Byte]): Buffer = { @@ -640,4 +682,6 @@ object SummaryBuilderImpl extends Logging { override def prettyName: String = "aggregate_metrics" } + private[this] val udt = new VectorUDT + } From 201eb7712054967cd5093d3a908f4ebbd73f30a8 Mon Sep 17 00:00:00 2001 From: Timothy Hunter Date: Wed, 22 Mar 2017 14:19:57 -0700 Subject: [PATCH 11/26] debug --- .../test/scala/org/apache/spark/ml/stat/SummarizerSuite.scala | 1 + 1 file changed, 1 insertion(+) diff --git a/mllib/src/test/scala/org/apache/spark/ml/stat/SummarizerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/stat/SummarizerSuite.scala index 925b07e34841..746357924a25 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/stat/SummarizerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/stat/SummarizerSuite.scala @@ -88,6 +88,7 @@ class SummarizerSuite extends SparkFunSuite with MLlibTestSparkContext { test("debugging test") { val df = denseData(Nil) + println(s">>> df=${df.collect().toSeq}") val c = df.col("features") println(s">>>>> c=$c") val c1 = metrics("mean").summary(c) From f4dec88a49d0a20e1b328617fd721633fd8c201a Mon Sep 17 00:00:00 2001 From: Timothy Hunter Date: Thu, 23 Mar 2017 11:27:19 -0700 Subject: [PATCH 12/26] trying to debug serialization issue --- .../org/apache/spark/ml/stat/Summarizer.scala | 105 ++++++++++++------ .../spark/ml/stat/SummarizerSuite.scala | 80 +++++++++++-- .../expressions/aggregate/interfaces.scala | 6 + 3 files changed, 148 insertions(+), 43 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala b/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala index 8894dc596e51..10b9eafcbae6 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala @@ -35,6 +35,7 @@ import org.apache.spark.sql.types._ // scalastyle:off println + /** * A builder object that provides summary statistics about a given column. 
* @@ -102,7 +103,26 @@ object Summarizer extends Logging { new SummaryBuilderImpl(typedMetrics, computeMetrics) } - def mean(col: Column): Column = metrics("mean").summary(col) + def mean(col: Column): Column = getSingleMetric(col, "mean") + + def variance(col: Column): Column = getSingleMetric(col, "variance") + + def count(col: Column): Column = getSingleMetric(col, "count") + + def numNonZeros(col: Column): Column = getSingleMetric(col, "numNonZeros") + + def max(col: Column): Column = getSingleMetric(col, "max") + + def min(col: Column): Column = getSingleMetric(col, "min") + + def normL1(col: Column): Column = getSingleMetric(col, "normL1") + + def normL2(col: Column): Column = getSingleMetric(col, "normL2") + + private def getSingleMetric(col: Column, metric: String): Column = { + val c1 = metrics(metric).summary(col) + c1.getField(metric).as(s"$metric($col)") + } } private[ml] class SummaryBuilderImpl( @@ -111,6 +131,9 @@ private[ml] class SummaryBuilderImpl( override def summary(column: Column): Column = { val start = SummaryBuilderImpl.Buffer.fromMetrics(requestedCompMetrics) + println(s"summary: requestedMetrics=$requestedMetrics") + println(s"summary: requestedCompMetrics=$requestedCompMetrics") + println(s"summary: start=$start") val agg = SummaryBuilderImpl.MetricsAggregate( requestedMetrics, start, @@ -129,9 +152,9 @@ object SummaryBuilderImpl extends Logging { @throws[IllegalArgumentException]("When the list is empty or not a subset of known metrics") def getRelevantMetrics(requested: Seq[String]): (Seq[Metrics], Seq[ComputeMetrics]) = { val all = requested.map { req => - val (_, metric, deps) = allMetrics.find(tup => tup._1 == req).getOrElse { + val (_, metric, _, deps) = allMetrics.find(tup => tup._1 == req).getOrElse { throw new IllegalArgumentException(s"Metric $req cannot be found." + - s" Valid metris are $implementedMetrics") + s" Valid metrics are $implementedMetrics") } metric -> deps } @@ -143,17 +166,15 @@ object SummaryBuilderImpl extends Logging { } def structureForMetrics(metrics: Seq[Metrics]): StructType = { - val fields = metrics.map { m => - // All types are vectors, except for one. -// val tpe = if (m == Count) { LongType } else { SQLDataTypes.VectorType } - val tpe = if (m == Count) { LongType } else { ArrayType(DoubleType, containsNull = false) } - // We know the metric is part of the list. - val (name, _, _) = allMetrics.find(tup => tup._2 == m).get - StructField(name, tpe, nullable = false) + val dct = allMetrics.map { case (n, m, dt, _) => m -> (n, dt) }.toMap + val fields = metrics.map(dct.apply).map { case (n, dt) => + StructField(n, dt, nullable = false) } StructType(fields) } + private val arrayDType = ArrayType(DoubleType, containsNull = false) + private val arrayLType = ArrayType(LongType, containsNull = false) /** * All the metrics that can be currently computed by Spark for vectors. @@ -161,16 +182,16 @@ object SummaryBuilderImpl extends Logging { * This list associates the user name, the internal (typed) name, and the list of computation * metrics that need to de computed internally to get the final result. 
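 * Each entry now also carries the SQL DataType of the metric's output column (LongType for
 * count, array types for the vector-valued metrics); structureForMetrics uses it to build
 * the result schema.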
*/ - private val allMetrics = Seq( - ("mean", Mean, Seq(ComputeMean, ComputeWeightSum, ComputeTotalWeightSum)), // Vector - ("variance", Variance, Seq(ComputeTotalWeightSum, ComputeWeightSquareSum, ComputeMean, - ComputeM2n)), // Vector - ("count", Count, Seq(ComputeCount)), // Long - ("numNonZeros", NumNonZeros, Seq(ComputeNNZ)), // Vector - ("max", Max, Seq(ComputeMax, ComputeTotalWeightSum, ComputeNNZ, ComputeCount)), // Vector - ("min", Min, Seq(ComputeMin, ComputeTotalWeightSum, ComputeNNZ, ComputeCount)), // Vector - ("normL2", NormL2, Seq(ComputeTotalWeightSum, ComputeM2)), // Vector - ("normL1", Min, Seq(ComputeTotalWeightSum, ComputeL1)) // Vector + private val allMetrics: Seq[(String, Metrics, DataType, Seq[ComputeMetrics])] = Seq( + ("mean", Mean, arrayDType, Seq(ComputeMean, ComputeWeightSum, ComputeTotalWeightSum)), + ("variance", Variance, arrayDType, Seq(ComputeTotalWeightSum, ComputeWeightSum, + ComputeWeightSquareSum, ComputeMean, ComputeM2n)), + ("count", Count, LongType, Seq(ComputeCount)), + ("numNonZeros", NumNonZeros, arrayLType, Seq(ComputeNNZ)), + ("max", Max, arrayDType, Seq(ComputeMax)), + ("min", Min, arrayDType, Seq(ComputeMin)), + ("normL2", NormL2, arrayDType, Seq(ComputeTotalWeightSum, ComputeM2)), + ("normL1", Min, arrayDType, Seq(ComputeTotalWeightSum, ComputeL1)) ) /** @@ -223,8 +244,18 @@ object SummaryBuilderImpl extends Logging { var weightSum: Array[Double] = null, // 8 var nnz: Array[Long] = null, // 9 var max: Array[Double] = null, // 10 - var min: Array[Double] = null) // 11 - + var min: Array[Double] = null // 11 + ) { + override def toString: String = { + def v(x: Array[Double]) = if (x==null) "null" else x.toSeq.mkString("[", " ", "]") + def vl(x: Array[Long]) = if (x==null) "null" else x.toSeq.mkString("[", " ", "]") + + s"Buffer(n=$n mean=${v(mean)} m2n=${v(m2n)} m2=${v(m2)} l1=${v(l1)}" + + s" totalCount=$totalCount totalWeightSum=$totalWeightSum" + + s" totalWeightSquareSum=$totalWeightSquareSum weightSum=${v(weightSum)} nnz=${vl(nnz)}" + + s" max=${v(max)} min=${v(min)})" + } + } object Buffer { // Recursive function, but the number of cases is really small. 
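    // For example (illustrative): fromMetrics(Seq(ComputeMean, ComputeWeightSum,
    // ComputeTotalWeightSum)) is expected to return a Buffer whose mean and weightSum slots are
    // non-null (sized once the first vector fixes n) while all other vector slots stay null.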
def fromMetrics(requested: Seq[ComputeMetrics]): Buffer = { @@ -294,6 +325,7 @@ object SummaryBuilderImpl extends Logging { } if (buffer.mean != null) { + assert(buffer.weightSum != null) val prevMean = buffer.mean(index) val diff = value - prevMean buffer.mean(index) += w * diff / (buffer.weightSum(index) + w) @@ -548,7 +580,7 @@ object SummaryBuilderImpl extends Logging { val weightSum2 = if (other.weightSum == null) null else { other.weightSum.clone() } val weightSum = if (weightSum1 == null) null else { - require(weightSum2 != null) + require(weightSum2 != null, s"buffer=$buffer other=$other") val arr: Array[Double] = Array.ofDim(buffer.n) b(arr) :+= b(weightSum1) :- b(weightSum1) arr @@ -615,20 +647,20 @@ object SummaryBuilderImpl extends Logging { inputAggBufferOffset: Int) extends TypedImperativeAggregate[Buffer] { - private lazy val row = new UnsafeRow(1) +// private lazy val row = new UnsafeRow(1) - override def eval(buff: Buffer): Row = { + override def eval(buff: Buffer): InternalRow = { val metrics = requested.map({ - case Mean => Buffer.mean(buff) - case Variance => Buffer.variance(buff) + case Mean => UnsafeArrayData.fromPrimitiveArray(Buffer.mean(buff)) + case Variance => UnsafeArrayData.fromPrimitiveArray(Buffer.variance(buff)) case Count => Buffer.totalCount(buff) - case NumNonZeros => Buffer.nnz(buff) - case Max => Buffer.max(buff) - case Min => Buffer.min(buff) - case NormL2 => Buffer.l2(buff) - case NormL1 => Buffer.l1(buff) + case NumNonZeros => UnsafeArrayData.fromPrimitiveArray(Buffer.nnz(buff)) + case Max => UnsafeArrayData.fromPrimitiveArray(Buffer.max(buff)) + case Min => UnsafeArrayData.fromPrimitiveArray(Buffer.min(buff)) + case NormL2 => UnsafeArrayData.fromPrimitiveArray(Buffer.l2(buff)) + case NormL1 => UnsafeArrayData.fromPrimitiveArray(Buffer.l1(buff)) }) - Row(metrics: _*) + InternalRow.apply(metrics: _*) } override def children: Seq[Expression] = child :: Nil @@ -659,14 +691,15 @@ object SummaryBuilderImpl extends Logging { override def serialize(buff: Buffer): Array[Byte] = { val x = Buffer.write(buff) - println(s"serialize: buff=$buff") + println(s"serialize: ${buff.hashCode()} x=${x.length} buff=$buff") val b2 = deserialize(x) - println(s"serialize: b2=$b2") + println(s"serialize: ${buff.hashCode()} b2=$b2") x } override def deserialize(bytes: Array[Byte]): Buffer = { - Buffer.read(bytes, row) +// Buffer.read(bytes, row) + Buffer.read(bytes, new UnsafeRow(1)) } override def withNewMutableAggBufferOffset(newMutableAggBufferOffset: Int): MetricsAggregate = { diff --git a/mllib/src/test/scala/org/apache/spark/ml/stat/SummarizerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/stat/SummarizerSuite.scala index 746357924a25..0479117498ff 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/stat/SummarizerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/stat/SummarizerSuite.scala @@ -22,6 +22,7 @@ import org.apache.spark.ml.linalg.{Vector, Vectors} import org.apache.spark.ml.util.TestingUtils._ import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.{DataFrame, Dataset, Row} +import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema // scalastyle:off println @@ -33,7 +34,15 @@ class SummarizerSuite extends SparkFunSuite with MLlibTestSparkContext { private val seed = 42 private val eps: Double = 1e-5 - private case class ExpectedMetrics(mean: Vector, min: Vector) + private case class ExpectedMetrics( + mean: Seq[Double], + variance: Seq[Double], + count: Long, + numNonZeros: Seq[Long], + max: 
Seq[Double], + min: Seq[Double], + normL2: Seq[Double], + normL1: Seq[Double]) // The input is expected to be either a sparse vector, a dense vector or an array of doubles // (which will be converted to a dense vector) @@ -46,13 +55,49 @@ class SummarizerSuite extends SparkFunSuite with MLlibTestSparkContext { private def testExample(input: Seq[Any], exp: ExpectedMetrics): Unit = { val inputVec: Seq[Vector] = input.map { case x: Array[Double] => Vectors.dense(x) + case x: Seq[Double] => Vectors.dense(x.toArray) case x: Vector => x case x => throw new Exception(x.toString) } val df = sc.parallelize(inputVec).map(Tuple1.apply).toDF("features") val c = df.col("features") - compare(df.select(metrics("mean").summary(c), mean(c)), Seq(exp.mean, exp.mean)) + compare(df.select(metrics("mean").summary(c), mean(c)), Seq(Row(exp.mean), exp.mean)) + + compare(df.select(metrics("variance").summary(c), variance(c)), + Seq(Row(exp.variance), exp.variance)) + + compare(df.select(metrics("count").summary(c), count(c)), + Seq(Row(exp.count), exp.count)) + + compare(df.select(metrics("numNonZeros").summary(c), numNonZeros(c)), + Seq(Row(exp.numNonZeros), exp.numNonZeros)) + + println("STARTING MIN") + + compare(df.select(metrics("min").summary(c), min(c)), + Seq(Row(exp.min), exp.min)) + + println("STARTING MAX") + + compare(df.select(metrics("max").summary(c), max(c)), + Seq(Row(exp.max), exp.max)) + + println("STARTING NORML1") + + compare(df.select(metrics("normL1").summary(c), normL1(c)), + Seq(Row(exp.normL1), exp.normL1)) + + println("STARTING NORML2") + + compare(df.select(metrics("normL2").summary(c), normL2(c)), + Seq(Row(exp.normL2), exp.normL2)) + + compare(df.select( + metrics("mean", "variance", "count", "numNonZeros").summary(c), + mean(c), variance(c), count(c), numNonZeros(c)), + Seq(Row(exp.mean, exp.variance, exp.count, exp.numNonZeros), + exp.mean, exp.variance, exp.count, exp.numNonZeros)) } private def denseData(input: Seq[Seq[Double]]): DataFrame = { @@ -67,6 +112,7 @@ class SummarizerSuite extends SparkFunSuite with MLlibTestSparkContext { val Seq(row) = coll val res = row.toSeq println(s"compare: ress=${res.map(_.getClass)}") + println(s"compare: row=${row}") val names = df.schema.fieldNames.zipWithIndex.map { case (n, idx) => s"$n ($idx)" } assert(res.size === exp.size, (res.size, exp.size)) for (((x1, x2), name) <- res.zip(exp).zip(names)) { @@ -78,12 +124,25 @@ class SummarizerSuite extends SparkFunSuite with MLlibTestSparkContext { private def compareStructures(x1: Any, x2: Any, name: String): Unit = (x1, x2) match { case (d1: Double, d2: Double) => assert(Vectors.dense(d1) ~== Vectors.dense(d2) absTol 1e-4, name) + case (r1: GenericRowWithSchema, r2: Row) => + assert(r1.size === r2.size, (r1, r2)) + for (((fname, x1), x2) <- r1.schema.fieldNames.zip(r1.toSeq).zip(r2.toSeq)) { + compareStructures(x1, x2, s"$name.$fname") + } case (r1: Row, r2: Row) => assert(r1.size === r2.size, (r1, r2)) for ((x1, x2) <- r1.toSeq.zip(r2.toSeq)) { compareStructures(x1, x2, name) } case (v1: Vector, v2: Vector) => assert(v1 ~== v2 absTol 1e-4, name) case (l1: Long, l2: Long) => assert(l1 === l2) + case (s1: Seq[_], s2: Seq[_]) => + assert(s1.size === s2.size, s"$name ${(s1, s2)}") + for (((x1, idx), x2) <- s1.zipWithIndex.zip(s2)) { + compareStructures(x1, x2, s"$name.$idx") + } + case (arr1: Array[_], arr2: Array[_]) => + assert(arr1.toSeq === arr2.toSeq) + case _ => throw new Exception(s"$name: ${x1.getClass} ${x2.getClass} $x1 $x2") } test("debugging test") { @@ -112,14 +171,21 @@ class 
SummarizerSuite extends SparkFunSuite with MLlibTestSparkContext { test("no element, working metrics") { val df = denseData(Nil) val c = df.col("features") - val res = df.select(metrics("count").summary(c)) - compare(res, Seq(Row(0L))) + val res = df.select(metrics("count").summary(c), count(c)) + compare(res, Seq(Row(0L), 0L)) } test("single element in vector, all metrics") { - testExample(Seq(Array(1.0, 2.0)), ExpectedMetrics( - Vectors.dense(1.0, 2.0), - Vectors.dense(1.0, 2.0) + val x = Seq(1.0, 2.0) + testExample(Seq(x), ExpectedMetrics( + mean = x, + variance = Seq(0.0, 0.0), + count = 1, + numNonZeros = Seq(1, 1), + max = x, + min = x, + normL1 = x, + normL2 = x )) } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/interfaces.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/interfaces.scala index 80c25d0b0fb7..3c4c9690f378 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/interfaces.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/interfaces.scala @@ -495,6 +495,12 @@ abstract class TypedImperativeAggregate[T] extends ImperativeAggregate { * Generates the final aggregation result value for current key group with the aggregation buffer * object. * + * Developer note: the only return types accepted by Spark are: + * - primitive types + * - InternalRow and subclasses + * - ArrayData + * - MapData + * * @param buffer aggregation buffer object. * @return The aggregation result of current key group */ From 4af0f47d326ef91d7cf9ccaf6a45ee3f904b191f Mon Sep 17 00:00:00 2001 From: Timothy Hunter Date: Thu, 23 Mar 2017 16:16:10 -0700 Subject: [PATCH 13/26] better tests --- .../org/apache/spark/ml/stat/Summarizer.scala | 151 ++++++++++++------ .../spark/ml/stat/SummarizerSuite.scala | 118 ++++++++------ 2 files changed, 169 insertions(+), 100 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala b/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala index 10b9eafcbae6..330a98090901 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala @@ -23,8 +23,7 @@ import breeze.numerics import org.apache.spark.SparkException import org.apache.spark.annotation.{DeveloperApi, Since} import org.apache.spark.internal.Logging -import org.apache.spark.ml.linalg.{BLAS, SQLDataTypes, Vector, Vectors} -import org.apache.spark.ml.linalg.VectorUDT +import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors, VectorUDT} import org.apache.spark.sql.{Column, Row} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Expression, UnsafeArrayData, UnsafeProjection, UnsafeRow} @@ -191,7 +190,7 @@ object SummaryBuilderImpl extends Logging { ("max", Max, arrayDType, Seq(ComputeMax)), ("min", Min, arrayDType, Seq(ComputeMin)), ("normL2", NormL2, arrayDType, Seq(ComputeTotalWeightSum, ComputeM2)), - ("normL1", Min, arrayDType, Seq(ComputeTotalWeightSum, ComputeL1)) + ("normL1", NormL1, arrayDType, Seq(ComputeTotalWeightSum, ComputeL1)) ) /** @@ -277,30 +276,32 @@ object SummaryBuilderImpl extends Logging { } } - def bufferSchema: StructType = { - val at = ArrayType(DoubleType, containsNull = false) - val al = ArrayType(LongType, containsNull = false) + val bufferSchema: StructType = { val fields = Seq( "n" -> IntegerType, - "mean" -> at, - "m2n" -> at, - "m2" -> at, 
- "l1" -> at, + "mean" -> arrayDType, + "m2n" -> arrayDType, + "m2" -> arrayDType, + "l1" -> arrayDType, "totalCount" -> LongType, "totalWeightSum" -> DoubleType, "totalWeightSquareSum" -> DoubleType, - "weightSum" -> at, - "nnz" -> al, - "max" -> at, - "min" -> at + "weightSum" -> arrayDType, + "nnz" -> arrayLType, + "max" -> arrayDType, + "min" -> arrayDType ) StructType(fields.map { case (name, t) => StructField(name, t, nullable = true)}) } + val numFields = bufferSchema.fields.length + def updateInPlace(buffer: Buffer, v: Vector, w: Double): Unit = { val startN = buffer.n if (startN == -1) { - resizeArraysInPlace(buffer, v.size) + // The buffer was not initialized, we initialize it with the incoming row. + fillBufferWithRow(buffer, v, w) + return } else { require(startN == v.size, s"Trying to insert a vector of size $v into a buffer that " + @@ -368,7 +369,7 @@ object SummaryBuilderImpl extends Logging { if (other.n == -1) { buffer } else { - mergeBuffers0(buffer, other) + mergeInitializedBuffers(buffer, other) buffer } } @@ -379,7 +380,7 @@ object SummaryBuilderImpl extends Logging { */ def read(bytes: Array[Byte], row2: UnsafeRow): Buffer = { row2.pointTo(bytes, bytes.length) - val row = row2.getStruct(0, 12) + val row = row2.getStruct(0, numFields) new Buffer( n = row.getInt(0), mean = nullableArrayD(row, 1), @@ -471,39 +472,6 @@ object SummaryBuilderImpl extends Logging { buffer.l1 } - private def resizeArraysInPlace(buffer: Buffer, n: Int): Unit = { - // It should either be unsized or of the same size. - require(buffer.n == -1 || buffer.n == n, (buffer.n, n)) - if (buffer.n == n) { - return - } - buffer.n = n - // Conditional resize of the non-null arrays - if (buffer.mean != null) { - buffer.mean = Array.ofDim(n) - } - if (buffer.m2n != null) { - buffer.m2n = Array.ofDim(n) - } - if (buffer.m2 != null) { - buffer.m2 = Array.ofDim(n) - } - if (buffer.l1 != null) { - buffer.l1 = Array.ofDim(n) - } - if (buffer.weightSum != null) { - buffer.weightSum = Array.ofDim(n) - } - if (buffer.nnz != null) { - buffer.nnz = Array.ofDim(n) - } - if (buffer.max != null) { - buffer.max = Array.ofDim(n) - } - if (buffer.min != null) { - buffer.min = Array.ofDim(n) - } - } private def gadD(arr: Array[Double]): UnsafeArrayData = { if (arr == null) { @@ -563,11 +531,88 @@ object SummaryBuilderImpl extends Logging { } } + /** + * Sets the content of a buffer based on a single row (initialization). + * + * The buffer must be uninitialized first. + */ + private def fillBufferWithRow(buffer: Buffer, v: Vector, w: Double): Unit = { + require(buffer.n == -1, (buffer.n, buffer)) + buffer.n = v.size + buffer.totalCount = 1L + buffer.totalWeightSum = w + buffer.totalWeightSquareSum = w * w + + val arr = v.toArray + if (buffer.mean != null) { + buffer.mean = arr.clone() + } + if (buffer.m2n != null) { + buffer.m2n = Array.ofDim(buffer.n) + } + if (buffer.max != null) { + buffer.max = arr.clone() + } + if (buffer.min != null) { + buffer.min = arr.clone() + } + + // The rest of these operations have efficient bulk versions. 
+ v match { + case dv: DenseVector => + if (buffer.m2 != null) { + buffer.m2 = Array.ofDim(buffer.n) + b(buffer.m2) := w * (b(arr) :* b(arr)) + } + if (buffer.l1 != null) { + buffer.l1 = Array.ofDim(buffer.n) + b(buffer.l1) := numerics.abs(b(arr)) + } + + case sv: SparseVector => + if (buffer.m2 != null) { + buffer.m2 = Array.ofDim(buffer.n) + v.foreachActive { (index, value) => + buffer.weightSum(index) = w * value * value + } + } + + if (buffer.l1 != null) { + buffer.l1 = Array.ofDim(buffer.n) + v.foreachActive { (index, value) => + buffer.weightSum(index) = w * math.abs(value) + } + } + + + // In the case of the weightSum and NNZ, we also have to account for the value of + // the elements. + if (buffer.weightSum != null) { + buffer.weightSum = Array.ofDim(buffer.n) + v.foreachActive { (index, value) => + if (value != 0.0) { + buffer.weightSum(index) = w + } + } + } + + if (buffer.nnz != null) { + buffer.nnz = Array.ofDim(buffer.n) + v.foreachActive { (index, value) => + if (value != 0.0) { + buffer.nnz(index) = 1L + } + } + } + + } + } + /** * Merges other into buffer. */ - private def mergeBuffers0(buffer: Buffer, other: Buffer): Unit = { + private def mergeInitializedBuffers(buffer: Buffer, other: Buffer): Unit = { // Each buffer needs to be properly initialized. require(buffer.n > 0 && other.n > 0, (buffer.n, other.n)) require(buffer.n == other.n, (buffer.n, other.n)) @@ -647,7 +692,7 @@ object SummaryBuilderImpl extends Logging { inputAggBufferOffset: Int) extends TypedImperativeAggregate[Buffer] { -// private lazy val row = new UnsafeRow(1) + private lazy val row = new UnsafeRow(Buffer.numFields) override def eval(buff: Buffer): InternalRow = { val metrics = requested.map({ @@ -699,7 +744,7 @@ object SummaryBuilderImpl extends Logging { override def deserialize(bytes: Array[Byte]): Buffer = { // Buffer.read(bytes, row) - Buffer.read(bytes, new UnsafeRow(1)) + Buffer.read(bytes, new UnsafeRow(Buffer.numFields)) } override def withNewMutableAggBufferOffset(newMutableAggBufferOffset: Int): MetricsAggregate = { diff --git a/mllib/src/test/scala/org/apache/spark/ml/stat/SummarizerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/stat/SummarizerSuite.scala index 0479117498ff..c8d842125cdc 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/stat/SummarizerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/stat/SummarizerSuite.scala @@ -52,52 +52,75 @@ class SummarizerSuite extends SparkFunSuite with MLlibTestSparkContext { // are expected for this input. They currently test against some fixed subset of the // metrics, but should be made fuzzy in the future. 
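The refactor below turns testExample into a generator of individually named tests: registerTest, inherited from ScalaTest's FunSuite (via SparkFunSuite), registers a test while the suite is being constructed, so each metric gets its own test case and a failure in one comparison no longer masks the others. A minimal sketch of the pattern, with hypothetical names:

    // checkMean and computeMean are placeholders for illustration only.
    def checkMean(name: String, expected: Double): Unit = {
      registerTest(s"$name - mean only") {
        // build the input lazily, inside the running test
        assert(computeMean(name) === expected)
      }
    }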
- private def testExample(input: Seq[Any], exp: ExpectedMetrics): Unit = { - val inputVec: Seq[Vector] = input.map { + private def testExample(name: String, input: Seq[Any], exp: ExpectedMetrics): Unit = { + def inputVec: Seq[Vector] = input.map { case x: Array[Double] => Vectors.dense(x) case x: Seq[Double] => Vectors.dense(x.toArray) case x: Vector => x case x => throw new Exception(x.toString) } - val df = sc.parallelize(inputVec).map(Tuple1.apply).toDF("features") - val c = df.col("features") - - compare(df.select(metrics("mean").summary(c), mean(c)), Seq(Row(exp.mean), exp.mean)) - - compare(df.select(metrics("variance").summary(c), variance(c)), - Seq(Row(exp.variance), exp.variance)) - compare(df.select(metrics("count").summary(c), count(c)), - Seq(Row(exp.count), exp.count)) - - compare(df.select(metrics("numNonZeros").summary(c), numNonZeros(c)), - Seq(Row(exp.numNonZeros), exp.numNonZeros)) + def wrapped() = { + val df = sc.parallelize(inputVec).map(Tuple1.apply).toDF("features") + val c = df.col("features") + df -> c + } - println("STARTING MIN") + registerTest(s"$name - mean only") { + val (df, c) = wrapped() + compare(df.select(metrics("mean").summary(c), mean(c)), Seq(Row(exp.mean), exp.mean)) + } - compare(df.select(metrics("min").summary(c), min(c)), - Seq(Row(exp.min), exp.min)) + registerTest(s"$name - variance only") { + val (df, c) = wrapped() + compare(df.select(metrics("variance").summary(c), variance(c)), + Seq(Row(exp.variance), exp.variance)) + } - println("STARTING MAX") + registerTest(s"$name - count only") { + val (df, c) = wrapped() + compare(df.select(metrics("count").summary(c), count(c)), + Seq(Row(exp.count), exp.count)) + } - compare(df.select(metrics("max").summary(c), max(c)), - Seq(Row(exp.max), exp.max)) + registerTest(s"$name - numNonZeros only") { + val (df, c) = wrapped() + compare(df.select(metrics("numNonZeros").summary(c), numNonZeros(c)), + Seq(Row(exp.numNonZeros), exp.numNonZeros)) + } - println("STARTING NORML1") + registerTest(s"$name - min only") { + val (df, c) = wrapped() + compare(df.select(metrics("min").summary(c), min(c)), + Seq(Row(exp.min), exp.min)) + } - compare(df.select(metrics("normL1").summary(c), normL1(c)), - Seq(Row(exp.normL1), exp.normL1)) + registerTest(s"$name - max only") { + val (df, c) = wrapped() + compare(df.select(metrics("max").summary(c), max(c)), + Seq(Row(exp.max), exp.max)) + } - println("STARTING NORML2") + registerTest(s"$name - normL1 only") { + val (df, c) = wrapped() + compare(df.select(metrics("normL1").summary(c), normL1(c)), + Seq(Row(exp.normL1), exp.normL1)) + } - compare(df.select(metrics("normL2").summary(c), normL2(c)), - Seq(Row(exp.normL2), exp.normL2)) + registerTest(s"$name - normL2 only") { + val (df, c) = wrapped() + compare(df.select(metrics("normL2").summary(c), normL2(c)), + Seq(Row(exp.normL2), exp.normL2)) + } - compare(df.select( - metrics("mean", "variance", "count", "numNonZeros").summary(c), - mean(c), variance(c), count(c), numNonZeros(c)), - Seq(Row(exp.mean, exp.variance, exp.count, exp.numNonZeros), - exp.mean, exp.variance, exp.count, exp.numNonZeros)) + registerTest(s"$name - all metrics at once") { + val (df, c) = wrapped() + compare(df.select( + metrics("mean", "variance", "count", "numNonZeros").summary(c), + mean(c), variance(c), count(c), numNonZeros(c)), + Seq(Row(exp.mean, exp.variance, exp.count, exp.numNonZeros), + exp.mean, exp.variance, exp.count, exp.numNonZeros)) + } } private def denseData(input: Seq[Seq[Double]]): DataFrame = { @@ -145,6 +168,8 @@ class 
SummarizerSuite extends SparkFunSuite with MLlibTestSparkContext { case _ => throw new Exception(s"$name: ${x1.getClass} ${x2.getClass} $x1 $x2") } + + test("debugging test") { val df = denseData(Nil) println(s">>> df=${df.collect().toSeq}") @@ -175,13 +200,13 @@ class SummarizerSuite extends SparkFunSuite with MLlibTestSparkContext { compare(res, Seq(Row(0L), 0L)) } - test("single element in vector, all metrics") { - val x = Seq(1.0, 2.0) - testExample(Seq(x), ExpectedMetrics( + { + val x = Seq(0.0, 1.0, 2.0) + testExample("single element", Seq(x), ExpectedMetrics( mean = x, - variance = Seq(0.0, 0.0), + variance = Seq(0.0, 0.0, 0.0), count = 1, - numNonZeros = Seq(1, 1), + numNonZeros = Seq(0, 1, 1), max = x, min = x, normL1 = x, @@ -189,18 +214,17 @@ class SummarizerSuite extends SparkFunSuite with MLlibTestSparkContext { )) } - test("repeated metrics") { + testExample("two elements", Seq(Seq(0.0, 1.0, 2.0), Seq(0.0, -1.0, -2.0)), ExpectedMetrics( + mean = Seq(0.0, 0.0, 0.0), + variance = Seq(0.0, 1.0, 2.0), + count = 2, + numNonZeros = Seq(0, 2, 2), + max = Seq(0.0, 1.0, 2.0), + min = Seq(0.0, -1.0, -2.0), + normL1 = Seq(0.0, 2.0, 4.0), + normL2 = Seq(0.0, 2.0, 4.0) + )) - } - test("dense vector input") { - } - - test("sparse vector input") { - - } - - test("merging summarizer when one side has zero mean (SPARK-4355)") { - } } From 9f29030f75089884156bdc4ee634857b3730114d Mon Sep 17 00:00:00 2001 From: Timothy Hunter Date: Thu, 23 Mar 2017 17:12:28 -0700 Subject: [PATCH 14/26] changes --- .../org/apache/spark/ml/stat/Summarizer.scala | 75 +++++++++++-------- .../spark/ml/stat/SummarizerSuite.scala | 11 +++ 2 files changed, 55 insertions(+), 31 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala b/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala index 330a98090901..57bfa17fa900 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala @@ -255,7 +255,7 @@ object SummaryBuilderImpl extends Logging { s" max=${v(max)} min=${v(min)})" } } - object Buffer { + object Buffer extends Logging { // Recursive function, but the number of cases is really small. def fromMetrics(requested: Seq[ComputeMetrics]): Buffer = { if (requested.isEmpty) { @@ -297,6 +297,7 @@ object SummaryBuilderImpl extends Logging { val numFields = bufferSchema.fields.length def updateInPlace(buffer: Buffer, v: Vector, w: Double): Unit = { + println(s"updateInPlace: buffer=$buffer v=$v w=$w") val startN = buffer.n if (startN == -1) { // The buffer was not initialized, we initialize it with the incoming row. @@ -348,6 +349,7 @@ object SummaryBuilderImpl extends Logging { } } } + println(s"updateInPlace:2: buffer=$buffer v=$v w=$w") } /** @@ -355,6 +357,7 @@ object SummaryBuilderImpl extends Logging { */ @throws[SparkException]("When the buffers are not compatible") def mergeBuffers(buffer: Buffer, other: Buffer): Buffer = { + println(s"mergeBuffers: buffer=$buffer other=$buffer") if (buffer.n == -1) { // buffer is not initialized. if (other.n == -1) { @@ -378,9 +381,11 @@ object SummaryBuilderImpl extends Logging { /** * Reads a buffer from a serialized form, using the row object as an assistant. 
*/ - def read(bytes: Array[Byte], row2: UnsafeRow): Buffer = { - row2.pointTo(bytes, bytes.length) - val row = row2.getStruct(0, numFields) + def read(bytes: Array[Byte]): Buffer = { + assert(numFields == 12, numFields) + val row3 = new UnsafeRow(numFields) + row3.pointTo(bytes.clone(), bytes.length) + val row = row3.getStruct(0, numFields) new Buffer( n = row.getInt(0), mean = nullableArrayD(row, 1), @@ -417,18 +422,21 @@ object SummaryBuilderImpl extends Logging { } def mean(buffer: Buffer): Array[Double] = { + println(s"Buffer:mean: buffer=$buffer") require(buffer.totalWeightSum > 0) require(buffer.mean != null) require(buffer.weightSum != null) val res = b(buffer.mean) :* b(buffer.weightSum) :/ buffer.totalWeightSum + println(s"Buffer:mean: buffer=$buffer res=$res") res.toArray } def variance(buffer: Buffer): Array[Double] = { + println(s"Buffer:variance: buffer=$buffer") import buffer._ - require(n >= 0) - require(totalWeightSum > 0) - require(totalWeightSquareSum > 0) + require(n >= 0, n) + require(totalWeightSum > 0, totalWeightSum) + require(totalWeightSquareSum > 0, totalWeightSquareSum) require(buffer.mean != null) require(m2n != null) require(weightSum != null) @@ -538,17 +546,19 @@ object SummaryBuilderImpl extends Logging { */ private def fillBufferWithRow(buffer: Buffer, v: Vector, w: Double): Unit = { require(buffer.n == -1, (buffer.n, buffer)) - buffer.n = v.size + val n = v.size + buffer.n = n buffer.totalCount = 1L buffer.totalWeightSum = w buffer.totalWeightSquareSum = w * w val arr = v.toArray + assert(arr.length == n, (arr.toSeq, n)) if (buffer.mean != null) { buffer.mean = arr.clone() } if (buffer.m2n != null) { - buffer.m2n = Array.ofDim(buffer.n) + buffer.m2n = Array.ofDim(n) } if (buffer.max != null) { buffer.max = arr.clone() @@ -561,51 +571,51 @@ object SummaryBuilderImpl extends Logging { v match { case dv: DenseVector => if (buffer.m2 != null) { - buffer.m2 = Array.ofDim(buffer.n) + buffer.m2 = Array.ofDim(n) b(buffer.m2) := w * (b(arr) :* b(arr)) } if (buffer.l1 != null) { - buffer.l1 = Array.ofDim(buffer.n) + buffer.l1 = Array.ofDim(n) b(buffer.l1) := numerics.abs(b(arr)) } case sv: SparseVector => if (buffer.m2 != null) { - buffer.m2 = Array.ofDim(buffer.n) + buffer.m2 = Array.ofDim(n) v.foreachActive { (index, value) => buffer.weightSum(index) = w * value * value } } if (buffer.l1 != null) { - buffer.l1 = Array.ofDim(buffer.n) + buffer.l1 = Array.ofDim(n) v.foreachActive { (index, value) => buffer.weightSum(index) = w * math.abs(value) } } + } - - // In the case of the weightSum and NNZ, we also have to account for the value of - // the elements. - if (buffer.weightSum != null) { - buffer.weightSum = Array.ofDim(buffer.n) - v.foreachActive { (index, value) => - if (value != 0.0) { - buffer.weightSum(index) = w - } + // In the case of the weightSum and NNZ, we also have to account for the value of + // the elements. + // TODO It would be nice to vectorize these operations too. + if (buffer.weightSum != null) { + buffer.weightSum = Array.ofDim(n) + v.foreachActive { (index, value) => + if (value != 0.0) { + buffer.weightSum(index) = w } } + } - if (buffer.nnz != null) { - buffer.nnz = Array.ofDim(buffer.n) - v.foreachActive { (index, value) => - if (value != 0.0) { - buffer.nnz(index) = 1L - } + if (buffer.nnz != null) { + buffer.nnz = Array.ofDim(n) + v.foreachActive { (index, value) => + if (value != 0.0) { + buffer.nnz(index) = 1L } } - } + } @@ -613,6 +623,7 @@ object SummaryBuilderImpl extends Logging { * Merges other into buffer. 
*/ private def mergeInitializedBuffers(buffer: Buffer, other: Buffer): Unit = { + println(s"mergeInitializedBuffers: buffer=$buffer other=$other") // Each buffer needs to be properly initialized. require(buffer.n > 0 && other.n > 0, (buffer.n, other.n)) require(buffer.n == other.n, (buffer.n, other.n)) @@ -681,6 +692,7 @@ object SummaryBuilderImpl extends Logging { require(other.weightSum != null) b(buffer.weightSum) :+= b(other.weightSum) } + println(s"mergeInitializedBuffers:2: buffer=$buffer other=$other") } } @@ -692,7 +704,7 @@ object SummaryBuilderImpl extends Logging { inputAggBufferOffset: Int) extends TypedImperativeAggregate[Buffer] { - private lazy val row = new UnsafeRow(Buffer.numFields) +// private lazy val row = new UnsafeRow(Buffer.numFields) override def eval(buff: Buffer): InternalRow = { val metrics = requested.map({ @@ -744,7 +756,8 @@ object SummaryBuilderImpl extends Logging { override def deserialize(bytes: Array[Byte]): Buffer = { // Buffer.read(bytes, row) - Buffer.read(bytes, new UnsafeRow(Buffer.numFields)) + assert(Buffer.numFields == 12, Buffer.numFields) + Buffer.read(bytes) } override def withNewMutableAggBufferOffset(newMutableAggBufferOffset: Int): MetricsAggregate = { diff --git a/mllib/src/test/scala/org/apache/spark/ml/stat/SummarizerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/stat/SummarizerSuite.scala index c8d842125cdc..b5df4f0cbc12 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/stat/SummarizerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/stat/SummarizerSuite.scala @@ -71,6 +71,11 @@ class SummarizerSuite extends SparkFunSuite with MLlibTestSparkContext { compare(df.select(metrics("mean").summary(c), mean(c)), Seq(Row(exp.mean), exp.mean)) } + registerTest(s"$name - mean only (direct)") { + val (df, c) = wrapped() + compare(df.select(mean(c)), Seq(exp.mean)) + } + registerTest(s"$name - variance only") { val (df, c) = wrapped() compare(df.select(metrics("variance").summary(c), variance(c)), @@ -89,6 +94,12 @@ class SummarizerSuite extends SparkFunSuite with MLlibTestSparkContext { Seq(Row(exp.numNonZeros), exp.numNonZeros)) } + registerTest(s"$name - numNonZeros only (direct)") { + val (df, c) = wrapped() + compare(df.select(numNonZeros(c)), + Seq(exp.numNonZeros)) + } + registerTest(s"$name - min only") { val (df, c) = wrapped() compare(df.select(metrics("min").summary(c), min(c)), From e9877dc2f08d393f079bdf6fbbf1b9b9abaa21da Mon Sep 17 00:00:00 2001 From: Timothy Hunter Date: Fri, 24 Mar 2017 14:04:32 -0700 Subject: [PATCH 15/26] debugging --- .../org/apache/spark/ml/stat/Summarizer.scala | 23 +++++++++++-------- .../spark/ml/stat/SummarizerSuite.scala | 2 +- 2 files changed, 15 insertions(+), 10 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala b/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala index 57bfa17fa900..ea7fd5db9d09 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala @@ -17,6 +17,7 @@ package org.apache.spark.ml.stat +import breeze.{linalg => la} import breeze.linalg.{DenseVector => BDV, SparseVector => BSV, Vector => BV} import breeze.numerics @@ -418,6 +419,8 @@ object SummaryBuilderImpl extends Logging { gadD(buffer.max), gadD(buffer.min) ) + // TODO: the projection should be passed as an argument. 
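For context on the write()/read() pair being reworked in this patch: the buffer is serialized by projecting an InternalRow of its fields through an UnsafeProjection and keeping the resulting UnsafeRow's backing bytes, and deserialized by pointing a fresh UnsafeRow at those bytes. A minimal sketch of the round trip, assuming a struct schema bufferSchema and a matching InternalRow named ir:

    val projection = UnsafeProjection.create(bufferSchema)
    val bytes: Array[Byte] = projection.apply(ir).getBytes   // serialize
    val row = new UnsafeRow(bufferSchema.length)
    row.pointTo(bytes, bytes.length)                          // a row view over the bytes
    val n = row.getInt(0)                                     // read fields back by ordinal

Both UnsafeProjection and UnsafeRow reuse internal state across calls, which is why later patches in this series allocate them once per aggregate instance rather than sharing them.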
+ val projection = UnsafeProjection.create(bufferSchema) projection.apply(ir).getBytes } @@ -497,7 +500,7 @@ object SummaryBuilderImpl extends Logging { } } - private[this] lazy val projection = UnsafeProjection.create(bufferSchema) +// private[this] lazy val projection = UnsafeProjection.create(bufferSchema) // Returns the array at a given index, or null if the array is null. private def nullableArrayD(row: UnsafeRow, ordinal: Int): Array[Double] = { @@ -635,11 +638,13 @@ object SummaryBuilderImpl extends Logging { val weightSum1 = if (buffer.weightSum == null) null else { buffer.weightSum.clone() } val weightSum2 = if (other.weightSum == null) null else { other.weightSum.clone() } - val weightSum = if (weightSum1 == null) null else { + // This sum is going to be used as a denominator. In order to guarantee that the + // division is well-defined, we add an epsilon to the zero coefficients. + // This is not going to change the value of the resul since the numerator will also be zero. + val weightSum: BV[Double] = if (weightSum1 == null) null else { require(weightSum2 != null, s"buffer=$buffer other=$other") - val arr: Array[Double] = Array.ofDim(buffer.n) - b(arr) :+= b(weightSum1) :- b(weightSum1) - arr + val b1 = b(weightSum1) :+ b(weightSum2) + la.max(b1, Double.MinPositiveValue) } @@ -654,12 +659,12 @@ object SummaryBuilderImpl extends Logging { if (buffer.mean != null) { require(other.mean != null) require(weightSum != null) - b(buffer.mean) :+= b(deltaMean) :* (b(weightSum2) / b(weightSum)) + b(buffer.mean) :+= b(deltaMean) :* (b(weightSum2) :/ weightSum) } if (buffer.m2n != null) { require(other.m2n != null) - val w = (b(weightSum1) :* b(weightSum2)) :/ b(weightSum) + val w = (b(weightSum1) :* b(weightSum2)) :/ weightSum b(buffer.m2n) :+= b(other.m2n) :+ (b(deltaMean) :* b(deltaMean)) :* w } @@ -675,12 +680,12 @@ object SummaryBuilderImpl extends Logging { if (buffer.max != null) { require(other.max != null) - maxInPlace(buffer.max, other.max) + b(buffer.max) := la.max(b(buffer.max), b(other.max)) } if (buffer.min != null) { require(other.min != null) - minInPlace(buffer.min, other.min) + b(buffer.min) := la.min(b(buffer.min), b(other.min)) } if (buffer.nnz != null) { diff --git a/mllib/src/test/scala/org/apache/spark/ml/stat/SummarizerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/stat/SummarizerSuite.scala index b5df4f0cbc12..f145c3dac6f1 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/stat/SummarizerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/stat/SummarizerSuite.scala @@ -233,7 +233,7 @@ class SummarizerSuite extends SparkFunSuite with MLlibTestSparkContext { max = Seq(0.0, 1.0, 2.0), min = Seq(0.0, -1.0, -2.0), normL1 = Seq(0.0, 2.0, 4.0), - normL2 = Seq(0.0, 2.0, 4.0) + normL2 = Seq(0.0, math.sqrt(2.0), math.sqrt(2.0) * 2.0) )) From 3a11d0265ef665a63cd070eeb1ae4ac25bc89908 Mon Sep 17 00:00:00 2001 From: Timothy Hunter Date: Fri, 24 Mar 2017 15:14:06 -0700 Subject: [PATCH 16/26] more tests and debugging --- .../org/apache/spark/ml/stat/Summarizer.scala | 41 +++--- .../spark/ml/stat/SummarizerSuite.scala | 125 +++++++++++++++++- 2 files changed, 135 insertions(+), 31 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala b/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala index ea7fd5db9d09..a8cde2864f34 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala @@ -18,7 +18,7 @@ package org.apache.spark.ml.stat import 
breeze.{linalg => la} -import breeze.linalg.{DenseVector => BDV, SparseVector => BSV, Vector => BV} +import breeze.linalg.{Vector => BV} import breeze.numerics import org.apache.spark.SparkException @@ -217,8 +217,8 @@ object SummaryBuilderImpl extends Logging { case object ComputeM2n extends ComputeMetrics case object ComputeM2 extends ComputeMetrics case object ComputeL1 extends ComputeMetrics - case object ComputeCount extends ComputeMetrics // Always computed - case object ComputeTotalWeightSum extends ComputeMetrics // Always computed + case object ComputeCount extends ComputeMetrics // Always computed -> TODO: remove + case object ComputeTotalWeightSum extends ComputeMetrics // Always computed -> TODO: remove case object ComputeWeightSquareSum extends ComputeMetrics case object ComputeWeightSum extends ComputeMetrics case object ComputeNNZ extends ComputeMetrics @@ -256,6 +256,7 @@ object SummaryBuilderImpl extends Logging { s" max=${v(max)} min=${v(min)})" } } + object Buffer extends Logging { // Recursive function, but the number of cases is really small. def fromMetrics(requested: Seq[ComputeMetrics]): Buffer = { @@ -277,6 +278,15 @@ object SummaryBuilderImpl extends Logging { } } + /** + * (testing only). Makes a buffer with all the metrics enabled. + */ + def allMetrics(): Buffer = { + fromMetrics(Seq(ComputeMean, ComputeM2n, ComputeM2, ComputeL1, ComputeCount, + ComputeTotalWeightSum, ComputeWeightSquareSum, ComputeWeightSum, ComputeNNZ, ComputeMax, + ComputeMin)) + } + val bufferSchema: StructType = { val fields = Seq( "n" -> IntegerType, @@ -443,6 +453,7 @@ object SummaryBuilderImpl extends Logging { require(buffer.mean != null) require(m2n != null) require(weightSum != null) + val denom = totalWeightSum - (totalWeightSquareSum / totalWeightSum) if (denom > 0.0) { val normWs = b(weightSum) :/ totalWeightSum @@ -500,8 +511,6 @@ object SummaryBuilderImpl extends Logging { } } -// private[this] lazy val projection = UnsafeProjection.create(bufferSchema) - // Returns the array at a given index, or null if the array is null. private def nullableArrayD(row: UnsafeRow, ordinal: Int): Array[Double] = { if (row.isNullAt(ordinal)) { @@ -524,24 +533,6 @@ object SummaryBuilderImpl extends Logging { private def bl(x: Array[Long]): BV[Long] = BV.apply(x) - private def maxInPlace(x: Array[Double], y: Array[Double]): Unit = { - var i = 0 - while(i < x.length) { - // Note: do not use conditions, it is wrong when handling NaNs. - x(i) = Math.max(x(i), y(i)) - i += 1 - } - } - - private def minInPlace(x: Array[Double], y: Array[Double]): Unit = { - var i = 0 - while(i < x.length) { - // Note: do not use conditions, it is wrong when handling NaNs. - x(i) = Math.min(x(i), y(i)) - i += 1 - } - } - /** * Sets the content of a buffer based on a single row (initialization). 
* @@ -586,14 +577,14 @@ object SummaryBuilderImpl extends Logging { if (buffer.m2 != null) { buffer.m2 = Array.ofDim(n) v.foreachActive { (index, value) => - buffer.weightSum(index) = w * value * value + buffer.m2(index) = w * value * value } } if (buffer.l1 != null) { buffer.l1 = Array.ofDim(n) v.foreachActive { (index, value) => - buffer.weightSum(index) = w * math.abs(value) + buffer.l1(index) = w * math.abs(value) } } } diff --git a/mllib/src/test/scala/org/apache/spark/ml/stat/SummarizerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/stat/SummarizerSuite.scala index f145c3dac6f1..4bfdb96f5df9 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/stat/SummarizerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/stat/SummarizerSuite.scala @@ -17,9 +17,14 @@ package org.apache.spark.ml.stat +import org.scalatest.exceptions.TestFailedException + import org.apache.spark.{SparkException, SparkFunSuite} import org.apache.spark.ml.linalg.{Vector, Vectors} +import org.apache.spark.ml.stat.SummaryBuilderImpl.Buffer import org.apache.spark.ml.util.TestingUtils._ +import org.apache.spark.mllib.linalg.{Vector => OldVector, Vectors => OldVectors} +import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.{DataFrame, Dataset, Row} import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema @@ -60,15 +65,22 @@ class SummarizerSuite extends SparkFunSuite with MLlibTestSparkContext { case x => throw new Exception(x.toString) } + val s = { + val s2 = new MultivariateOnlineSummarizer + inputVec.foreach(v => s2OldVectors.fromML(v))) + s2 + } + + // Because the Spark context is reset between tests, we cannot hold a reference onto it. def wrapped() = { val df = sc.parallelize(inputVec).map(Tuple1.apply).toDF("features") val c = df.col("features") - df -> c + (df, c) } registerTest(s"$name - mean only") { val (df, c) = wrapped() - compare(df.select(metrics("mean").summary(c), mean(c)), Seq(Row(exp.mean), exp.mean)) + compare(df.select(metrics("mean").summary(c), mean(c)), Seq(Row(exp.mean), s.mean)) } registerTest(s"$name - mean only (direct)") { @@ -79,7 +91,12 @@ class SummarizerSuite extends SparkFunSuite with MLlibTestSparkContext { registerTest(s"$name - variance only") { val (df, c) = wrapped() compare(df.select(metrics("variance").summary(c), variance(c)), - Seq(Row(exp.variance), exp.variance)) + Seq(Row(exp.variance), s.variance)) + } + + registerTest(s"$name - variance only (direct)") { + val (df, c) = wrapped() + compare(df.select(variance(c)), Seq(s.variance)) } registerTest(s"$name - count only") { @@ -88,6 +105,12 @@ class SummarizerSuite extends SparkFunSuite with MLlibTestSparkContext { Seq(Row(exp.count), exp.count)) } + registerTest(s"$name - count only (direct)") { + val (df, c) = wrapped() + compare(df.select(count(c)), + Seq(exp.count)) + } + registerTest(s"$name - numNonZeros only") { val (df, c) = wrapped() compare(df.select(metrics("numNonZeros").summary(c), numNonZeros(c)), @@ -147,6 +170,7 @@ class SummarizerSuite extends SparkFunSuite with MLlibTestSparkContext { val res = row.toSeq println(s"compare: ress=${res.map(_.getClass)}") println(s"compare: row=${row}") + println(s"compare: exp=${exp}") val names = df.schema.fieldNames.zipWithIndex.map { case (n, idx) => s"$n ($idx)" } assert(res.size === exp.size, (res.size, exp.size)) for (((x1, x2), name) <- res.zip(exp).zip(names)) { @@ -156,8 +180,10 @@ class SummarizerSuite extends SparkFunSuite with 
MLlibTestSparkContext { // Compares structured content. private def compareStructures(x1: Any, x2: Any, name: String): Unit = (x1, x2) match { + case (y1: Seq[Double], v1: OldVector) => + compareStructures(y1, v1.toArray.toSeq, name) case (d1: Double, d2: Double) => - assert(Vectors.dense(d1) ~== Vectors.dense(d2) absTol 1e-4, name) + assert2(Vectors.dense(d1) ~== Vectors.dense(d2) absTol 1e-4, name) case (r1: GenericRowWithSchema, r2: Row) => assert(r1.size === r2.size, (r1, r2)) for (((fname, x1), x2) <- r1.schema.fieldNames.zip(r1.toSeq).zip(r2.toSeq)) { @@ -167,7 +193,7 @@ class SummarizerSuite extends SparkFunSuite with MLlibTestSparkContext { assert(r1.size === r2.size, (r1, r2)) for ((x1, x2) <- r1.toSeq.zip(r2.toSeq)) { compareStructures(x1, x2, name) } case (v1: Vector, v2: Vector) => - assert(v1 ~== v2 absTol 1e-4, name) + assert2(v1 ~== v2 absTol 1e-4, name) case (l1: Long, l2: Long) => assert(l1 === l2) case (s1: Seq[_], s2: Seq[_]) => assert(s1.size === s2.size, s"$name ${(s1, s2)}") @@ -179,7 +205,24 @@ class SummarizerSuite extends SparkFunSuite with MLlibTestSparkContext { case _ => throw new Exception(s"$name: ${x1.getClass} ${x2.getClass} $x1 $x2") } + private def assert2(x: => Boolean, hint: String): Unit = { + try { + assert(x, hint) + } catch { + case tfe: TestFailedException => + throw new TestFailedException(Some(s"Failure with hint $hint"), Some(tfe), 1) + } + } + + private def makeBuffer(vecs: Seq[Vector]): Buffer = { + val b = Buffer.allMetrics() + for (v <- vecs) { Buffer.updateInPlace(b, v, 1.0) } + b + } + + private def b(x: Array[Double]): Vector = Vectors.dense(x) + private def l(x: Array[Long]): Vector = b(x.map(_.toDouble)) test("debugging test") { val df = denseData(Nil) @@ -227,7 +270,8 @@ class SummarizerSuite extends SparkFunSuite with MLlibTestSparkContext { testExample("two elements", Seq(Seq(0.0, 1.0, 2.0), Seq(0.0, -1.0, -2.0)), ExpectedMetrics( mean = Seq(0.0, 0.0, 0.0), - variance = Seq(0.0, 1.0, 2.0), + // TODO: I have a doubt about these values, they are not normalized. 
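These values follow from the unbiased (n - 1 normalized) estimator that MultivariateOnlineSummarizer also uses: for two rows the per-coordinate variance is ((x1 - m)^2 + (x2 - m)^2) / (2 - 1). With inputs (0.0, 1.0, 2.0) and (0.0, -1.0, -2.0) the means are (0, 0, 0), giving variances (0, 1 + 1, 4 + 4) = (0, 2, 8). Likewise normL2 is the square root of the unweighted sum of squares per coordinate, i.e. (0, sqrt(2), 2 * sqrt(2)), matching the correction made in the previous patch.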
+ variance = Seq(0.0, 2.0, 8.0), count = 2, numNonZeros = Seq(0, 2, 2), max = Seq(0.0, 1.0, 2.0), @@ -236,6 +280,75 @@ class SummarizerSuite extends SparkFunSuite with MLlibTestSparkContext { normL2 = Seq(0.0, math.sqrt(2.0), math.sqrt(2.0) * 2.0) )) + testExample("dense vector input", + Seq(Seq(-1.0, 0.0, 6.0), Seq(3.0, -3.0, 0.0)), + ExpectedMetrics( + mean = Seq(1.0, -1.5, 3.0), + variance = Seq(8.0, 4.5, 18.0), + count = 2, + numNonZeros = Seq(2, 1, 1), + max = Seq(3.0, 0.0, 6.0), + min = Seq(-1.0, -3, 0.0), + normL1 = Seq(4.0, 4.0, 6.0), + normL2 = Seq(0.0, 0.0, 0.0) + )) + + test("mixing dense and sparse vector input") { + val summarizer = makeBuffer(Seq( + Vectors.sparse(3, Seq((0, -2.0), (1, 2.3))), + Vectors.dense(0.0, -1.0, -3.0), + Vectors.sparse(3, Seq((1, -5.1))), + Vectors.dense(3.8, 0.0, 1.9), + Vectors.dense(1.7, -0.6, 0.0), + Vectors.sparse(3, Seq((1, 1.9), (2, 0.0))))) + + assert(b(Buffer.mean(summarizer)) ~== + Vectors.dense(0.583333333333, -0.416666666666, -0.183333333333) absTol 1E-5, "mean mismatch") + + assert(b(Buffer.min(summarizer)) ~== Vectors.dense(-2.0, -5.1, -3) absTol 1E-5, "min " + + "mismatch") + + assert(b(Buffer.max(summarizer)) ~== Vectors.dense(3.8, 2.3, 1.9) absTol 1E-5, "max mismatch") + + assert(l(Buffer.nnz(summarizer)) ~== Vectors.dense(3, 5, 2) absTol 1E-5, "numNonzeros mismatch") + + assert(b(Buffer.variance(summarizer)) ~== + Vectors.dense(3.857666666666, 7.0456666666666, 2.48166666666666) absTol 1E-5, + "variance mismatch") + + assert(Buffer.totalCount(summarizer) === 6) + } + + + test("merging two summarizers") { + val summarizer1 = makeBuffer(Seq( + Vectors.sparse(3, Seq((0, -2.0), (1, 2.3))), + Vectors.dense(0.0, -1.0, -3.0))) + + val summarizer2 = makeBuffer(Seq( + Vectors.sparse(3, Seq((1, -5.1))), + Vectors.dense(3.8, 0.0, 1.9), + Vectors.dense(1.7, -0.6, 0.0), + Vectors.sparse(3, Seq((1, 1.9), (2, 0.0))))) + + val summarizer = Buffer.mergeBuffers(summarizer1, summarizer2) + + assert(b(Buffer.mean(summarizer)) ~== + Vectors.dense(0.583333333333, -0.416666666666, -0.183333333333) absTol 1E-5, "mean mismatch") + + assert(b(Buffer.min(summarizer)) ~== Vectors.dense(-2.0, -5.1, -3) absTol 1E-5, "min mismatch") + + assert(b(Buffer.max(summarizer)) ~== Vectors.dense(3.8, 2.3, 1.9) absTol 1E-5, "max mismatch") + + assert(l(Buffer.nnz(summarizer)) ~== Vectors.dense(3, 5, 2) absTol 1E-5, "numNonzeros mismatch") + + assert(b(Buffer.variance(summarizer)) ~== + + Vectors.dense(3.857666666666, 7.0456666666666, 2.48166666666666) absTol 1E-5, + "variance mismatch") + + assert(Buffer.totalCount(summarizer) === 6) + } } From 6d26c17d0bd4ab18d564ee7f37916780211702d5 Mon Sep 17 00:00:00 2001 From: Timothy Hunter Date: Fri, 24 Mar 2017 16:12:19 -0700 Subject: [PATCH 17/26] fixed tests --- .../org/apache/spark/ml/stat/Summarizer.scala | 23 +++++++++++-------- .../stat/MultivariateOnlineSummarizer.scala | 20 ++++++++++++++++ .../spark/ml/stat/SummarizerSuite.scala | 19 +++++++++++---- 3 files changed, 48 insertions(+), 14 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala b/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala index a8cde2864f34..d6704799e104 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala @@ -368,7 +368,7 @@ object SummaryBuilderImpl extends Logging { */ @throws[SparkException]("When the buffers are not compatible") def mergeBuffers(buffer: Buffer, other: Buffer): Buffer = { - println(s"mergeBuffers: 
buffer=$buffer other=$buffer") + println(s"mergeBuffers: buffer=$buffer other=$other") if (buffer.n == -1) { // buffer is not initialized. if (other.n == -1) { @@ -624,7 +624,7 @@ object SummaryBuilderImpl extends Logging { // Mandatory scalar values buffer.totalWeightSquareSum += other.totalWeightSquareSum buffer.totalWeightSum += other.totalWeightSum - buffer.totalCount += buffer.totalCount + buffer.totalCount += other.totalCount // Keep the original weight sums. val weightSum1 = if (buffer.weightSum == null) null else { buffer.weightSum.clone() } val weightSum2 = if (other.weightSum == null) null else { other.weightSum.clone() } @@ -634,29 +634,32 @@ object SummaryBuilderImpl extends Logging { // This is not going to change the value of the resul since the numerator will also be zero. val weightSum: BV[Double] = if (weightSum1 == null) null else { require(weightSum2 != null, s"buffer=$buffer other=$other") - val b1 = b(weightSum1) :+ b(weightSum2) - la.max(b1, Double.MinPositiveValue) + val x = b(weightSum1) :+ b(weightSum2) + la.max(x, Double.MinPositiveValue) } // Since the operations are dense, we can directly use BLAS calls here. - val deltaMean: Array[Double] = if (buffer.mean != null) { + val deltaMean: BV[Double] = if (buffer.mean != null) { require(other.mean != null) - val arr: Array[Double] = Array.ofDim(buffer.n) - b(arr) :+= b(other.mean) :- b(buffer.mean) - arr + b(other.mean) :- b(buffer.mean) } else { null } if (buffer.mean != null) { require(other.mean != null) require(weightSum != null) - b(buffer.mean) :+= b(deltaMean) :* (b(weightSum2) :/ weightSum) + b(buffer.mean) :+= deltaMean :* (b(weightSum2) :/ weightSum) } if (buffer.m2n != null) { require(other.m2n != null) val w = (b(weightSum1) :* b(weightSum2)) :/ weightSum - b(buffer.m2n) :+= b(other.m2n) :+ (b(deltaMean) :* b(deltaMean)) :* w + val z = (deltaMean :* deltaMean) :* w + println(s"weightSum1=$weightSum1 weightSum2=$weightSum2 weightSum=$weightSum z=$z") + println(s"mergeInitializedBuffers: buffer.m2n=${b(buffer.m2n)}") + println(s"mergeInitializedBuffers: other.m2n=${b(other.m2n)}") + b(buffer.m2n) :+= b(other.m2n) :+ z + println(s"mergeInitializedBuffers: buffer.m2n_2=${b(buffer.m2n)}") } if (buffer.m2 != null) { diff --git a/mllib/src/main/scala/org/apache/spark/mllib/stat/MultivariateOnlineSummarizer.scala b/mllib/src/main/scala/org/apache/spark/mllib/stat/MultivariateOnlineSummarizer.scala index 7dc0c459ec03..d40d64bf9aa4 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/stat/MultivariateOnlineSummarizer.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/stat/MultivariateOnlineSummarizer.scala @@ -20,6 +20,8 @@ package org.apache.spark.mllib.stat import org.apache.spark.annotation.{DeveloperApi, Since} import org.apache.spark.mllib.linalg.{Vector, Vectors} +// scalastyle:off println + /** * :: DeveloperApi :: * MultivariateOnlineSummarizer implements [[MultivariateStatisticalSummary]] to compute the mean, @@ -57,6 +59,17 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S private var currMax: Array[Double] = _ private var currMin: Array[Double] = _ + override def toString: String = { + def v(x: Array[Double]) = if (x==null) "null" else x.toSeq.mkString("[", " ", "]") + def vl(x: Array[Long]) = if (x==null) "null" else x.toSeq.mkString("[", " ", "]") + + s"MultivariateOnlineSummarizer(n=$n mean=${v(currMean)} m2n=${v(currM2n)} m2=${v(currM2)} " + + s"l1=${v(currL1)}" + + s" totalCount=$totalCnt totalWeightSum=$totalWeightSum" + + s" 
totalWeightSquareSum=$weightSquareSum weightSum=${v(weightSum)} nnz=${vl(nnz)}" + + s" max=${v(currMax)} min=${v(currMin)})" + } + /** * Add a new sample to this summarizer, and update the statistical summary. * @@ -131,6 +144,8 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S */ @Since("1.1.0") def merge(other: MultivariateOnlineSummarizer): this.type = { + println(s"MultivariateOnlineSummarizer:merge: this=$this") + println(s"MultivariateOnlineSummarizer:merge: other=$other") if (this.totalWeightSum != 0.0 && other.totalWeightSum != 0.0) { require(n == other.n, s"Dimensions mismatch when merging with another summarizer. " + s"Expecting $n but got ${other.n}.") @@ -148,7 +163,11 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S // merge mean together currMean(i) += deltaMean * otherNnz / totalNnz // merge m2n together + val z = deltaMean * deltaMean * thisNnz * otherNnz / totalNnz + println(s"i=$i thisNnz=$thisNnz otherNnz=$otherNnz totalNnz=$totalNnz z=$z") + println(s"i=$i currM2n=${currM2n(i)} other.currM2n=${other.currM2n(i)}") currM2n(i) += other.currM2n(i) + deltaMean * deltaMean * thisNnz * otherNnz / totalNnz + println(s"i=$i >> currM2n=${currM2n(i)} other.currM2n=${other.currM2n(i)}") // merge m2 together currM2(i) += other.currM2(i) // merge l1 together @@ -175,6 +194,7 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S this.currMax = other.currMax.clone() this.currMin = other.currMin.clone() } + println(s"MultivariateOnlineSummarizer:merge(2): this=$this") this } diff --git a/mllib/src/test/scala/org/apache/spark/ml/stat/SummarizerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/stat/SummarizerSuite.scala index 4bfdb96f5df9..9a54a386b953 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/stat/SummarizerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/stat/SummarizerSuite.scala @@ -67,7 +67,7 @@ class SummarizerSuite extends SparkFunSuite with MLlibTestSparkContext { val s = { val s2 = new MultivariateOnlineSummarizer - inputVec.foreach(v => s2OldVectors.fromML(v))) + inputVec.foreach(v => s2.add(OldVectors.fromML(v))) s2 } @@ -289,8 +289,8 @@ class SummarizerSuite extends SparkFunSuite with MLlibTestSparkContext { numNonZeros = Seq(2, 1, 1), max = Seq(3.0, 0.0, 6.0), min = Seq(-1.0, -3, 0.0), - normL1 = Seq(4.0, 4.0, 6.0), - normL2 = Seq(0.0, 0.0, 0.0) + normL1 = Seq(4.0, 3.0, 6.0), + normL2 = Seq(math.sqrt(10), 3, 6.0) )) test("mixing dense and sparse vector input") { @@ -324,14 +324,25 @@ class SummarizerSuite extends SparkFunSuite with MLlibTestSparkContext { val summarizer1 = makeBuffer(Seq( Vectors.sparse(3, Seq((0, -2.0), (1, 2.3))), Vectors.dense(0.0, -1.0, -3.0))) +// val s1 = new MultivariateOnlineSummarizer +// Seq( +// Vectors.sparse(3, Seq((0, -2.0), (1, 2.3))), +// Vectors.dense(0.0, -1.0, -3.0)).foreach(v => s1.add(OldVectors.fromML(v))) val summarizer2 = makeBuffer(Seq( Vectors.sparse(3, Seq((1, -5.1))), Vectors.dense(3.8, 0.0, 1.9), Vectors.dense(1.7, -0.6, 0.0), Vectors.sparse(3, Seq((1, 1.9), (2, 0.0))))) +// val s2 = new MultivariateOnlineSummarizer +// Seq( +// Vectors.sparse(3, Seq((1, -5.1))), +// Vectors.dense(3.8, 0.0, 1.9), +// Vectors.dense(1.7, -0.6, 0.0), +// Vectors.sparse(3, Seq((1, 1.9), (2, 0.0)))).foreach(v => s2.add(OldVectors.fromML(v))) val summarizer = Buffer.mergeBuffers(summarizer1, summarizer2) +// val s = s1.merge(s2) assert(b(Buffer.mean(summarizer)) ~== Vectors.dense(0.583333333333, -0.416666666666, -0.183333333333) 
absTol 1E-5, "mean mismatch") @@ -343,7 +354,6 @@ class SummarizerSuite extends SparkFunSuite with MLlibTestSparkContext { assert(l(Buffer.nnz(summarizer)) ~== Vectors.dense(3, 5, 2) absTol 1E-5, "numNonzeros mismatch") assert(b(Buffer.variance(summarizer)) ~== - Vectors.dense(3.857666666666, 7.0456666666666, 2.48166666666666) absTol 1E-5, "variance mismatch") @@ -351,4 +361,5 @@ class SummarizerSuite extends SparkFunSuite with MLlibTestSparkContext { } + } From 35eaeb0d02ae9cc29ae559231fe4858935315477 Mon Sep 17 00:00:00 2001 From: Timothy Hunter Date: Fri, 24 Mar 2017 16:23:15 -0700 Subject: [PATCH 18/26] doc --- .../main/scala/org/apache/spark/ml/stat/Summarizer.scala | 1 + .../apache/spark/sql/catalyst/expressions/Projection.scala | 6 ++++++ 2 files changed, 7 insertions(+) diff --git a/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala b/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala index d6704799e104..536ef85d8a18 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala @@ -393,6 +393,7 @@ object SummaryBuilderImpl extends Logging { * Reads a buffer from a serialized form, using the row object as an assistant. */ def read(bytes: Array[Byte]): Buffer = { + // TODO move this row outside to the aggregate assert(numFields == 12, numFields) val row3 = new UnsafeRow(numFields) row3.pointTo(bytes.clone(), bytes.length) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Projection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Projection.scala index 7c57025f995d..64b94f0a2c10 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Projection.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Projection.scala @@ -101,6 +101,8 @@ case class InterpretedMutableProjection(expressions: Seq[Expression]) extends Mu /** * A projection that returns UnsafeRow. + * + * CAUTION: the returned projection object should *not* be assumed to be thread-safe. */ abstract class UnsafeProjection extends Projection { override def apply(row: InternalRow): UnsafeRow @@ -110,11 +112,15 @@ object UnsafeProjection { /** * Returns an UnsafeProjection for given StructType. + * + * CAUTION: the returned projection object is *not* thread-safe. */ def create(schema: StructType): UnsafeProjection = create(schema.fields.map(_.dataType)) /** * Returns an UnsafeProjection for given Array of DataTypes. + * + * CAUTION: the returned projection object is *not* thread-safe. 
*/ def create(fields: Array[DataType]): UnsafeProjection = { create(fields.zipWithIndex.map(x => BoundReference(x._2, x._1, true))) From 58b17dc3af5ebc5eaeff82559bc3be3a1f64ef38 Mon Sep 17 00:00:00 2001 From: Timothy Hunter Date: Fri, 24 Mar 2017 16:30:40 -0700 Subject: [PATCH 19/26] cleanups --- .../org/apache/spark/ml/stat/Summarizer.scala | 28 ++----------------- .../stat/MultivariateOnlineSummarizer.scala | 8 ------ 2 files changed, 3 insertions(+), 33 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala b/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala index 536ef85d8a18..b6832375135a 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala @@ -22,19 +22,15 @@ import breeze.linalg.{Vector => BV} import breeze.numerics import org.apache.spark.SparkException -import org.apache.spark.annotation.{DeveloperApi, Since} +import org.apache.spark.annotation.Since import org.apache.spark.internal.Logging import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors, VectorUDT} -import org.apache.spark.sql.{Column, Row} +import org.apache.spark.sql.Column import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Expression, UnsafeArrayData, UnsafeProjection, UnsafeRow} -import org.apache.spark.sql.catalyst.expressions.aggregate.{AggregateExpression, Complete, Final, TypedImperativeAggregate} -import org.apache.spark.sql.catalyst.util.GenericArrayData -import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction, UserDefinedFunction} +import org.apache.spark.sql.catalyst.expressions.aggregate.{AggregateExpression, Complete, TypedImperativeAggregate} import org.apache.spark.sql.types._ -// scalastyle:off println - /** * A builder object that provides summary statistics about a given column. @@ -131,9 +127,6 @@ private[ml] class SummaryBuilderImpl( override def summary(column: Column): Column = { val start = SummaryBuilderImpl.Buffer.fromMetrics(requestedCompMetrics) - println(s"summary: requestedMetrics=$requestedMetrics") - println(s"summary: requestedCompMetrics=$requestedCompMetrics") - println(s"summary: start=$start") val agg = SummaryBuilderImpl.MetricsAggregate( requestedMetrics, start, @@ -308,7 +301,6 @@ object SummaryBuilderImpl extends Logging { val numFields = bufferSchema.fields.length def updateInPlace(buffer: Buffer, v: Vector, w: Double): Unit = { - println(s"updateInPlace: buffer=$buffer v=$v w=$w") val startN = buffer.n if (startN == -1) { // The buffer was not initialized, we initialize it with the incoming row. @@ -360,7 +352,6 @@ object SummaryBuilderImpl extends Logging { } } } - println(s"updateInPlace:2: buffer=$buffer v=$v w=$w") } /** @@ -368,7 +359,6 @@ object SummaryBuilderImpl extends Logging { */ @throws[SparkException]("When the buffers are not compatible") def mergeBuffers(buffer: Buffer, other: Buffer): Buffer = { - println(s"mergeBuffers: buffer=$buffer other=$other") if (buffer.n == -1) { // buffer is not initialized. 
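When both sides are initialized, the merge uses the standard pairwise update for means and centered second moments: per coordinate, with w = w1 + w2 and delta = mean2 - mean1,

    mean <- mean1 + delta * w2 / w
    m2n  <- m2n1 + m2n2 + delta * delta * w1 * w2 / w

which is the same update MultivariateOnlineSummarizer.merge performs with the per-coordinate weight sums. The epsilon added to w earlier in this series only guards the division when a coordinate's weight is zero on both sides, in which case the numerator is zero as well.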
if (other.n == -1) { @@ -436,17 +426,14 @@ object SummaryBuilderImpl extends Logging { } def mean(buffer: Buffer): Array[Double] = { - println(s"Buffer:mean: buffer=$buffer") require(buffer.totalWeightSum > 0) require(buffer.mean != null) require(buffer.weightSum != null) val res = b(buffer.mean) :* b(buffer.weightSum) :/ buffer.totalWeightSum - println(s"Buffer:mean: buffer=$buffer res=$res") res.toArray } def variance(buffer: Buffer): Array[Double] = { - println(s"Buffer:variance: buffer=$buffer") import buffer._ require(n >= 0, n) require(totalWeightSum > 0, totalWeightSum) @@ -618,7 +605,6 @@ object SummaryBuilderImpl extends Logging { * Merges other into buffer. */ private def mergeInitializedBuffers(buffer: Buffer, other: Buffer): Unit = { - println(s"mergeInitializedBuffers: buffer=$buffer other=$other") // Each buffer needs to be properly initialized. require(buffer.n > 0 && other.n > 0, (buffer.n, other.n)) require(buffer.n == other.n, (buffer.n, other.n)) @@ -656,11 +642,7 @@ object SummaryBuilderImpl extends Logging { require(other.m2n != null) val w = (b(weightSum1) :* b(weightSum2)) :/ weightSum val z = (deltaMean :* deltaMean) :* w - println(s"weightSum1=$weightSum1 weightSum2=$weightSum2 weightSum=$weightSum z=$z") - println(s"mergeInitializedBuffers: buffer.m2n=${b(buffer.m2n)}") - println(s"mergeInitializedBuffers: other.m2n=${b(other.m2n)}") b(buffer.m2n) :+= b(other.m2n) :+ z - println(s"mergeInitializedBuffers: buffer.m2n_2=${b(buffer.m2n)}") } if (buffer.m2 != null) { @@ -692,7 +674,6 @@ object SummaryBuilderImpl extends Logging { require(other.weightSum != null) b(buffer.weightSum) :+= b(other.weightSum) } - println(s"mergeInitializedBuffers:2: buffer=$buffer other=$other") } } @@ -723,7 +704,6 @@ object SummaryBuilderImpl extends Logging { override def children: Seq[Expression] = child :: Nil override def update(buff: Buffer, row: InternalRow): Buffer = { - println(s"UPDATE: ROW=$row") // Unsafe rows do not play well with UDTs, it seems. // Directly call the deserializer. 
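The reason for going through getStruct here is that VectorUDT stores a vector as a four-field struct (type: byte, size: int, indices: array&lt;int&gt;, values: array&lt;double&gt;), so udt.sqlType.size is 4 and that struct has to be handed back to the UDT's deserialize method to rebuild an ml.linalg.Vector; reading the column directly as a Vector from an UnsafeRow does not work, which is what the comment above alludes to.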
val v = udt.deserialize(row.getStruct(0, udt.sqlType.size)) @@ -748,9 +728,7 @@ object SummaryBuilderImpl extends Logging { override def serialize(buff: Buffer): Array[Byte] = { val x = Buffer.write(buff) - println(s"serialize: ${buff.hashCode()} x=${x.length} buff=$buff") val b2 = deserialize(x) - println(s"serialize: ${buff.hashCode()} b2=$b2") x } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/stat/MultivariateOnlineSummarizer.scala b/mllib/src/main/scala/org/apache/spark/mllib/stat/MultivariateOnlineSummarizer.scala index d40d64bf9aa4..c1a66cae3837 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/stat/MultivariateOnlineSummarizer.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/stat/MultivariateOnlineSummarizer.scala @@ -20,8 +20,6 @@ package org.apache.spark.mllib.stat import org.apache.spark.annotation.{DeveloperApi, Since} import org.apache.spark.mllib.linalg.{Vector, Vectors} -// scalastyle:off println - /** * :: DeveloperApi :: * MultivariateOnlineSummarizer implements [[MultivariateStatisticalSummary]] to compute the mean, @@ -144,8 +142,6 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S */ @Since("1.1.0") def merge(other: MultivariateOnlineSummarizer): this.type = { - println(s"MultivariateOnlineSummarizer:merge: this=$this") - println(s"MultivariateOnlineSummarizer:merge: other=$other") if (this.totalWeightSum != 0.0 && other.totalWeightSum != 0.0) { require(n == other.n, s"Dimensions mismatch when merging with another summarizer. " + s"Expecting $n but got ${other.n}.") @@ -164,10 +160,7 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S currMean(i) += deltaMean * otherNnz / totalNnz // merge m2n together val z = deltaMean * deltaMean * thisNnz * otherNnz / totalNnz - println(s"i=$i thisNnz=$thisNnz otherNnz=$otherNnz totalNnz=$totalNnz z=$z") - println(s"i=$i currM2n=${currM2n(i)} other.currM2n=${other.currM2n(i)}") currM2n(i) += other.currM2n(i) + deltaMean * deltaMean * thisNnz * otherNnz / totalNnz - println(s"i=$i >> currM2n=${currM2n(i)} other.currM2n=${other.currM2n(i)}") // merge m2 together currM2(i) += other.currM2(i) // merge l1 together @@ -194,7 +187,6 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S this.currMax = other.currMax.clone() this.currMin = other.currMin.clone() } - println(s"MultivariateOnlineSummarizer:merge(2): this=$this") this } From 18078c1b82a55ac27ae241796c30f284c705ed42 Mon Sep 17 00:00:00 2001 From: Timothy Hunter Date: Fri, 24 Mar 2017 16:32:11 -0700 Subject: [PATCH 20/26] cleanups --- .../spark/ml/stat/SummarizerSuite.scala | 29 +------------------ 1 file changed, 1 insertion(+), 28 deletions(-) diff --git a/mllib/src/test/scala/org/apache/spark/ml/stat/SummarizerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/stat/SummarizerSuite.scala index 9a54a386b953..781340887e46 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/stat/SummarizerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/stat/SummarizerSuite.scala @@ -26,19 +26,14 @@ import org.apache.spark.ml.util.TestingUtils._ import org.apache.spark.mllib.linalg.{Vector => OldVector, Vectors => OldVectors} import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer import org.apache.spark.mllib.util.MLlibTestSparkContext -import org.apache.spark.sql.{DataFrame, Dataset, Row} +import org.apache.spark.sql.{DataFrame, Row} import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema -// scalastyle:off println - class SummarizerSuite 
extends SparkFunSuite with MLlibTestSparkContext { import testImplicits._ import Summarizer._ - private val seed = 42 - private val eps: Double = 1e-5 - private case class ExpectedMetrics( mean: Seq[Double], variance: Seq[Double], @@ -163,14 +158,9 @@ class SummarizerSuite extends SparkFunSuite with MLlibTestSparkContext { } private def compare(df: DataFrame, exp: Seq[Any]): Unit = { - println(s"compare: df=$df") val coll = df.collect().toSeq - println(s"compare: coll=$coll") val Seq(row) = coll val res = row.toSeq - println(s"compare: ress=${res.map(_.getClass)}") - println(s"compare: row=${row}") - println(s"compare: exp=${exp}") val names = df.schema.fieldNames.zipWithIndex.map { case (n, idx) => s"$n ($idx)" } assert(res.size === exp.size, (res.size, exp.size)) for (((x1, x2), name) <- res.zip(exp).zip(names)) { @@ -226,13 +216,9 @@ class SummarizerSuite extends SparkFunSuite with MLlibTestSparkContext { test("debugging test") { val df = denseData(Nil) - println(s">>> df=${df.collect().toSeq}") val c = df.col("features") - println(s">>>>> c=$c") val c1 = metrics("mean").summary(c) - println(s">>>>> c1=$c1") val res = df.select(c1) - println(s">>>>> res=$res") intercept[SparkException] { compare(res, Seq.empty) } @@ -324,25 +310,14 @@ class SummarizerSuite extends SparkFunSuite with MLlibTestSparkContext { val summarizer1 = makeBuffer(Seq( Vectors.sparse(3, Seq((0, -2.0), (1, 2.3))), Vectors.dense(0.0, -1.0, -3.0))) -// val s1 = new MultivariateOnlineSummarizer -// Seq( -// Vectors.sparse(3, Seq((0, -2.0), (1, 2.3))), -// Vectors.dense(0.0, -1.0, -3.0)).foreach(v => s1.add(OldVectors.fromML(v))) val summarizer2 = makeBuffer(Seq( Vectors.sparse(3, Seq((1, -5.1))), Vectors.dense(3.8, 0.0, 1.9), Vectors.dense(1.7, -0.6, 0.0), Vectors.sparse(3, Seq((1, 1.9), (2, 0.0))))) -// val s2 = new MultivariateOnlineSummarizer -// Seq( -// Vectors.sparse(3, Seq((1, -5.1))), -// Vectors.dense(3.8, 0.0, 1.9), -// Vectors.dense(1.7, -0.6, 0.0), -// Vectors.sparse(3, Seq((1, 1.9), (2, 0.0)))).foreach(v => s2.add(OldVectors.fromML(v))) val summarizer = Buffer.mergeBuffers(summarizer1, summarizer2) -// val s = s1.merge(s2) assert(b(Buffer.mean(summarizer)) ~== Vectors.dense(0.583333333333, -0.416666666666, -0.183333333333) absTol 1E-5, "mean mismatch") @@ -360,6 +335,4 @@ class SummarizerSuite extends SparkFunSuite with MLlibTestSparkContext { assert(Buffer.totalCount(summarizer) === 6) } - - } From ffe5cfe2e5a60f3f6056d8f89f7c50bbedca3fbf Mon Sep 17 00:00:00 2001 From: Timothy Hunter Date: Fri, 24 Mar 2017 16:41:24 -0700 Subject: [PATCH 21/26] Cleanups --- .../org/apache/spark/ml/stat/Summarizer.scala | 46 ++++++++----------- .../spark/ml/stat/SummarizerSuite.scala | 6 +-- 2 files changed, 21 insertions(+), 31 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala b/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala index b6832375135a..b87087168e71 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala @@ -176,15 +176,14 @@ object SummaryBuilderImpl extends Logging { * metrics that need to de computed internally to get the final result. 
*/ private val allMetrics: Seq[(String, Metrics, DataType, Seq[ComputeMetrics])] = Seq( - ("mean", Mean, arrayDType, Seq(ComputeMean, ComputeWeightSum, ComputeTotalWeightSum)), - ("variance", Variance, arrayDType, Seq(ComputeTotalWeightSum, ComputeWeightSum, - ComputeWeightSquareSum, ComputeMean, ComputeM2n)), - ("count", Count, LongType, Seq(ComputeCount)), + ("mean", Mean, arrayDType, Seq(ComputeMean, ComputeWeightSum)), + ("variance", Variance, arrayDType, Seq(ComputeWeightSum, ComputeMean, ComputeM2n)), + ("count", Count, LongType, Seq()), ("numNonZeros", NumNonZeros, arrayLType, Seq(ComputeNNZ)), ("max", Max, arrayDType, Seq(ComputeMax)), ("min", Min, arrayDType, Seq(ComputeMin)), - ("normL2", NormL2, arrayDType, Seq(ComputeTotalWeightSum, ComputeM2)), - ("normL1", NormL1, arrayDType, Seq(ComputeTotalWeightSum, ComputeL1)) + ("normL2", NormL2, arrayDType, Seq(ComputeM2)), + ("normL1", NormL1, arrayDType, Seq(ComputeL1)) ) /** @@ -210,9 +209,6 @@ object SummaryBuilderImpl extends Logging { case object ComputeM2n extends ComputeMetrics case object ComputeM2 extends ComputeMetrics case object ComputeL1 extends ComputeMetrics - case object ComputeCount extends ComputeMetrics // Always computed -> TODO: remove - case object ComputeTotalWeightSum extends ComputeMetrics // Always computed -> TODO: remove - case object ComputeWeightSquareSum extends ComputeMetrics case object ComputeWeightSum extends ComputeMetrics case object ComputeNNZ extends ComputeMetrics case object ComputeMax extends ComputeMetrics @@ -275,8 +271,8 @@ object SummaryBuilderImpl extends Logging { * (testing only). Makes a buffer with all the metrics enabled. */ def allMetrics(): Buffer = { - fromMetrics(Seq(ComputeMean, ComputeM2n, ComputeM2, ComputeL1, ComputeCount, - ComputeTotalWeightSum, ComputeWeightSquareSum, ComputeWeightSum, ComputeNNZ, ComputeMax, + fromMetrics(Seq(ComputeMean, ComputeM2n, ComputeM2, ComputeL1, + ComputeWeightSum, ComputeNNZ, ComputeMax, ComputeMin)) } @@ -382,12 +378,9 @@ object SummaryBuilderImpl extends Logging { /** * Reads a buffer from a serialized form, using the row object as an assistant. */ - def read(bytes: Array[Byte]): Buffer = { - // TODO move this row outside to the aggregate - assert(numFields == 12, numFields) - val row3 = new UnsafeRow(numFields) - row3.pointTo(bytes.clone(), bytes.length) - val row = row3.getStruct(0, numFields) + def read(bytes: Array[Byte], backingRow: UnsafeRow): Buffer = { + backingRow.pointTo(bytes.clone(), bytes.length) + val row = backingRow.getStruct(0, numFields) new Buffer( n = row.getInt(0), mean = nullableArrayD(row, 1), @@ -405,7 +398,7 @@ object SummaryBuilderImpl extends Logging { } - def write(buffer: Buffer): Array[Byte] = { + def write(buffer: Buffer, project: UnsafeProjection): Array[Byte] = { val ir = InternalRow.apply( buffer.n, gadD(buffer.mean), @@ -420,9 +413,7 @@ object SummaryBuilderImpl extends Logging { gadD(buffer.max), gadD(buffer.min) ) - // TODO: the projection should be passed as an argument. - val projection = UnsafeProjection.create(bufferSchema) - projection.apply(ir).getBytes + project.apply(ir).getBytes } def mean(buffer: Buffer): Array[Double] = { @@ -685,7 +676,10 @@ object SummaryBuilderImpl extends Logging { inputAggBufferOffset: Int) extends TypedImperativeAggregate[Buffer] { -// private lazy val row = new UnsafeRow(Buffer.numFields) + // These objects are not thread-safe, allocate them in the aggregator. 
+ private[this] lazy val row = new UnsafeRow(Buffer.numFields) + private[this] lazy val projection = UnsafeProjection.create(Buffer.bufferSchema) + override def eval(buff: Buffer): InternalRow = { val metrics = requested.map({ @@ -727,15 +721,11 @@ object SummaryBuilderImpl extends Logging { override def createAggregationBuffer(): Buffer = startBuffer.copy() override def serialize(buff: Buffer): Array[Byte] = { - val x = Buffer.write(buff) - val b2 = deserialize(x) - x + Buffer.write(buff, projection) } override def deserialize(bytes: Array[Byte]): Buffer = { -// Buffer.read(bytes, row) - assert(Buffer.numFields == 12, Buffer.numFields) - Buffer.read(bytes) + Buffer.read(bytes, row) } override def withNewMutableAggBufferOffset(newMutableAggBufferOffset: Int): MetricsAggregate = { diff --git a/mllib/src/test/scala/org/apache/spark/ml/stat/SummarizerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/stat/SummarizerSuite.scala index 781340887e46..3181886fb08a 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/stat/SummarizerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/stat/SummarizerSuite.scala @@ -54,8 +54,8 @@ class SummarizerSuite extends SparkFunSuite with MLlibTestSparkContext { private def testExample(name: String, input: Seq[Any], exp: ExpectedMetrics): Unit = { def inputVec: Seq[Vector] = input.map { - case x: Array[Double] => Vectors.dense(x) - case x: Seq[Double] => Vectors.dense(x.toArray) + case x: Array[Double @unchecked] => Vectors.dense(x) + case x: Seq[Double @unchecked] => Vectors.dense(x.toArray) case x: Vector => x case x => throw new Exception(x.toString) } @@ -170,7 +170,7 @@ class SummarizerSuite extends SparkFunSuite with MLlibTestSparkContext { // Compares structured content. private def compareStructures(x1: Any, x2: Any, name: String): Unit = (x1, x2) match { - case (y1: Seq[Double], v1: OldVector) => + case (y1: Seq[Double @unchecked], v1: OldVector) => compareStructures(y1, v1.toArray.toSeq, name) case (d1: Double, d2: Double) => assert2(Vectors.dense(d1) ~== Vectors.dense(d2) absTol 1e-4, name) From 41f4be6044588dc1db7a8366282fd8aeb5343483 Mon Sep 17 00:00:00 2001 From: Timothy Hunter Date: Fri, 24 Mar 2017 16:43:14 -0700 Subject: [PATCH 22/26] Cleanups --- .../apache/spark/mllib/stat/MultivariateOnlineSummarizer.scala | 1 - 1 file changed, 1 deletion(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/stat/MultivariateOnlineSummarizer.scala b/mllib/src/main/scala/org/apache/spark/mllib/stat/MultivariateOnlineSummarizer.scala index c1a66cae3837..20e2cfc7ccdf 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/stat/MultivariateOnlineSummarizer.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/stat/MultivariateOnlineSummarizer.scala @@ -159,7 +159,6 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S // merge mean together currMean(i) += deltaMean * otherNnz / totalNnz // merge m2n together - val z = deltaMean * deltaMean * thisNnz * otherNnz / totalNnz currM2n(i) += other.currM2n(i) + deltaMean * deltaMean * thisNnz * otherNnz / totalNnz // merge m2 together currM2(i) += other.currM2(i) From ba200bb1d52d468acebc9fe44f6727c570a35335 Mon Sep 17 00:00:00 2001 From: Timothy Hunter Date: Fri, 24 Mar 2017 16:43:32 -0700 Subject: [PATCH 23/26] Cleanups --- .../mllib/stat/MultivariateOnlineSummarizer.scala | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/stat/MultivariateOnlineSummarizer.scala 
b/mllib/src/main/scala/org/apache/spark/mllib/stat/MultivariateOnlineSummarizer.scala index 20e2cfc7ccdf..7dc0c459ec03 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/stat/MultivariateOnlineSummarizer.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/stat/MultivariateOnlineSummarizer.scala @@ -57,17 +57,6 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S private var currMax: Array[Double] = _ private var currMin: Array[Double] = _ - override def toString: String = { - def v(x: Array[Double]) = if (x==null) "null" else x.toSeq.mkString("[", " ", "]") - def vl(x: Array[Long]) = if (x==null) "null" else x.toSeq.mkString("[", " ", "]") - - s"MultivariateOnlineSummarizer(n=$n mean=${v(currMean)} m2n=${v(currM2n)} m2=${v(currM2)} " + - s"l1=${v(currL1)}" + - s" totalCount=$totalCnt totalWeightSum=$totalWeightSum" + - s" totalWeightSquareSum=$weightSquareSum weightSum=${v(weightSum)} nnz=${vl(nnz)}" + - s" max=${v(currMax)} min=${v(currMin)})" - } - /** * Add a new sample to this summarizer, and update the statistical summary. * From 662f62c5dc13b0b8b21f1c0fa374e4279f724a8e Mon Sep 17 00:00:00 2001 From: Timothy Hunter Date: Mon, 27 Mar 2017 17:12:11 -0700 Subject: [PATCH 24/26] small test to find perf issues --- .../spark/ml/stat/SummarizerSuite.scala | 63 ++++++++++++++++++- 1 file changed, 62 insertions(+), 1 deletion(-) diff --git a/mllib/src/test/scala/org/apache/spark/ml/stat/SummarizerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/stat/SummarizerSuite.scala index 3181886fb08a..f691148c0c3a 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/stat/SummarizerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/stat/SummarizerSuite.scala @@ -24,7 +24,7 @@ import org.apache.spark.ml.linalg.{Vector, Vectors} import org.apache.spark.ml.stat.SummaryBuilderImpl.Buffer import org.apache.spark.ml.util.TestingUtils._ import org.apache.spark.mllib.linalg.{Vector => OldVector, Vectors => OldVectors} -import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer +import org.apache.spark.mllib.stat.{MultivariateOnlineSummarizer, Statistics} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.{DataFrame, Row} import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema @@ -335,4 +335,65 @@ class SummarizerSuite extends SparkFunSuite with MLlibTestSparkContext { assert(Buffer.totalCount(summarizer) === 6) } + // TODO: this test should not be committed. It is here to isolate some performance hotspots. 
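// Editor's sketch, not part of this patch: the RDD baseline that the test below times
// against the DataFrame path, shown on a tiny dataset. Statistics.colStats computes the
// same summary statistics in a single pass and returns a MultivariateStatisticalSummary.
val tiny = sc.parallelize(Seq(OldVectors.dense(1.0), OldVectors.dense(3.0)))
val baseline = Statistics.colStats(tiny)
assert(baseline.count == 2L)
assert(baseline.mean(0) == 2.0)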
+ test("perf test") { + val n = 10000000 + val rdd1 = sc.parallelize(1 to n).map { idx => + OldVectors.dense(idx.toDouble) + } + val trieouts = 10 + rdd1.cache() + rdd1.count() + val rdd2 = sc.parallelize(1 to n).map { idx => + Vectors.dense(idx.toDouble) + } + rdd2.cache() + rdd2.count() + val df = rdd2.map(Tuple1.apply).toDF("features") + df.cache() + df.count() + val x = df.select( + metrics("mean", "variance", "count", "numNonZeros", "max", "min", "normL1", + "normL2").summary($"features")) + val x1 = df.select(metrics("variance").summary($"features")) + + var times_df: List[Long] = Nil + for (_ <- 1 to trieouts) { + val t21 = System.nanoTime() + x.head() + val t22 = System.nanoTime() + times_df ::= (t22 - t21) + } + + var times_rdd: List[Long] = Nil + for (_ <- 1 to trieouts) { + val t21 = System.nanoTime() + Statistics.colStats(rdd1) + val t22 = System.nanoTime() + times_rdd ::= (t22 - t21) + } + + var times_df_variance: List[Long] = Nil + for (_ <- 1 to trieouts) { + val t21 = System.nanoTime() + x1.head() + val t22 = System.nanoTime() + times_df_variance ::= (t22 - t21) + } + + def print(name: String, l: List[Long]): Unit = { + def f(z: Long) = (1e6 * n.toDouble) / z + val min = f(l.max) + val max = f(l.min) + val med = f(l.sorted.drop(l.size / 2).head) + + // scalastyle:off println + println(s"dataframe = [$min ~ $med ~ $max] records / milli") + } + + print("RDD", times_rdd) + print("Dataframes (variance only)", times_df_variance) + print("Dataframes", times_df) + } + } From 96be071b30a896abc1a1e2f0342bd333dbfda7c5 Mon Sep 17 00:00:00 2001 From: Timothy Hunter Date: Thu, 30 Mar 2017 15:10:13 -0700 Subject: [PATCH 25/26] Current speed: Dataframes = [2766.648008567718 ~ 5091.204527768661 ~ 5716.359795809639] records / milli --- .../apache/spark/ml/linalg/VectorUDT.scala | 24 ++++---- .../spark/ml/stat/SummarizerSuite.scala | 55 +++++++++++-------- 2 files changed, 44 insertions(+), 35 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/linalg/VectorUDT.scala b/mllib/src/main/scala/org/apache/spark/ml/linalg/VectorUDT.scala index 917861309c57..37f173bc2046 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/linalg/VectorUDT.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/linalg/VectorUDT.scala @@ -27,17 +27,7 @@ import org.apache.spark.sql.types._ */ private[spark] class VectorUDT extends UserDefinedType[Vector] { - override def sqlType: StructType = { - // type: 0 = sparse, 1 = dense - // We only use "values" for dense vectors, and "size", "indices", and "values" for sparse - // vectors. The "values" field is nullable because we might want to add binary vectors later, - // which uses "size" and "indices", but not "values". - StructType(Seq( - StructField("type", ByteType, nullable = false), - StructField("size", IntegerType, nullable = true), - StructField("indices", ArrayType(IntegerType, containsNull = false), nullable = true), - StructField("values", ArrayType(DoubleType, containsNull = false), nullable = true))) - } + override final def sqlType: StructType = _sqlType override def serialize(obj: Vector): InternalRow = { obj match { @@ -94,4 +84,16 @@ private[spark] class VectorUDT extends UserDefinedType[Vector] { override def typeName: String = "vector" private[spark] override def asNullable: VectorUDT = this + + private[this] val _sqlType = { + // type: 0 = sparse, 1 = dense + // We only use "values" for dense vectors, and "size", "indices", and "values" for sparse + // vectors. 
The "values" field is nullable because we might want to add binary vectors later, + // which uses "size" and "indices", but not "values". + StructType(Seq( + StructField("type", ByteType, nullable = false), + StructField("size", IntegerType, nullable = true), + StructField("indices", ArrayType(IntegerType, containsNull = false), nullable = true), + StructField("values", ArrayType(DoubleType, containsNull = false), nullable = true))) + } } diff --git a/mllib/src/test/scala/org/apache/spark/ml/stat/SummarizerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/stat/SummarizerSuite.scala index f691148c0c3a..b7d0c5635bbc 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/stat/SummarizerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/stat/SummarizerSuite.scala @@ -341,7 +341,7 @@ class SummarizerSuite extends SparkFunSuite with MLlibTestSparkContext { val rdd1 = sc.parallelize(1 to n).map { idx => OldVectors.dense(idx.toDouble) } - val trieouts = 10 + val trieouts = 100 rdd1.cache() rdd1.count() val rdd2 = sc.parallelize(1 to n).map { idx => @@ -357,43 +357,50 @@ class SummarizerSuite extends SparkFunSuite with MLlibTestSparkContext { "normL2").summary($"features")) val x1 = df.select(metrics("variance").summary($"features")) + def print(name: String, l: List[Long]): Unit = { + def f(z: Long) = (1e6 * n.toDouble) / z + val min = f(l.max) + val max = f(l.min) + val med = f(l.sorted.drop(l.size / 2).head) + + // scalastyle:off println + println(s"$name = [$min ~ $med ~ $max] records / milli") + } + + var times_df: List[Long] = Nil for (_ <- 1 to trieouts) { + System.gc() val t21 = System.nanoTime() x.head() val t22 = System.nanoTime() - times_df ::= (t22 - t21) + val dt = t22 - t21 + times_df ::= dt + // scalastyle:off + print("Dataframes", times_df) + // scalastyle:on } var times_rdd: List[Long] = Nil - for (_ <- 1 to trieouts) { - val t21 = System.nanoTime() - Statistics.colStats(rdd1) - val t22 = System.nanoTime() - times_rdd ::= (t22 - t21) - } +// for (_ <- 1 to trieouts) { +// val t21 = System.nanoTime() +// Statistics.colStats(rdd1) +// val t22 = System.nanoTime() +// times_rdd ::= (t22 - t21) +// } var times_df_variance: List[Long] = Nil - for (_ <- 1 to trieouts) { - val t21 = System.nanoTime() - x1.head() - val t22 = System.nanoTime() - times_df_variance ::= (t22 - t21) - } - - def print(name: String, l: List[Long]): Unit = { - def f(z: Long) = (1e6 * n.toDouble) / z - val min = f(l.max) - val max = f(l.min) - val med = f(l.sorted.drop(l.size / 2).head) +// for (_ <- 1 to trieouts) { +// val t21 = System.nanoTime() +// x1.head() +// val t22 = System.nanoTime() +// times_df_variance ::= (t22 - t21) +// } - // scalastyle:off println - println(s"dataframe = [$min ~ $med ~ $max] records / milli") - } + print("Dataframes", times_df) print("RDD", times_rdd) print("Dataframes (variance only)", times_df_variance) - print("Dataframes", times_df) } } From a569dac8998d63adbc9adba1f2eb2f42967533e7 Mon Sep 17 00:00:00 2001 From: Timothy Hunter Date: Thu, 30 Mar 2017 15:46:56 -0700 Subject: [PATCH 26/26] BLAS calls for dense interface --- .../org/apache/spark/ml/stat/Summarizer.scala | 119 +++++++++++++----- 1 file changed, 86 insertions(+), 33 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala b/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala index b87087168e71..23ee455b9a66 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala @@ -313,40 
+313,10 @@ object SummaryBuilderImpl extends Logging { buffer.totalWeightSum += w buffer.totalCount += 1 buffer.totalWeightSquareSum += w * w - // All the fields that we compute on demand: - // TODO: the most common case is dense vectors. In that case we should - // directly use BLAS instructions instead of iterating through a scala iterator. - v.foreachActive { (index, value) => - if (value != 0.0) { - if (buffer.max != null && buffer.max(index) < value) { - buffer.max(index) = value - } - if (buffer.min != null && buffer.min(index) > value) { - buffer.min(index) = value - } - if (buffer.mean != null) { - assert(buffer.weightSum != null) - val prevMean = buffer.mean(index) - val diff = value - prevMean - buffer.mean(index) += w * diff / (buffer.weightSum(index) + w) - if (buffer.m2n != null) { - buffer.m2n(index) += w * (value - buffer.mean(index)) * diff - } - } - if (buffer.m2 != null) { - buffer.m2(index) += w * value * value - } - if (buffer.l1 != null) { - buffer.l1(index) += w * math.abs(value) - } - if (buffer.weightSum != null) { - buffer.weightSum(index) += w - } - if (buffer.nnz != null) { - buffer.nnz(index) += 1 - } - } + v match { + case dv: DenseVector => updateInPlaceDense(buffer, dv, w) + case sv: SparseVector => updateInPlaceSparse(buffer, sv, w) } } @@ -588,10 +558,93 @@ object SummaryBuilderImpl extends Logging { } } } + } + + private def updateInPlaceDense(buffer: Buffer, v: DenseVector, w: Double): Unit = { + val epsi = Double.MinPositiveValue + lazy val value = v.asBreeze + // The mask is zero for all the zero values, and one otherwise. + lazy val mask = numerics.ceil(la.min(numerics.abs(value), epsi)) + lazy val maskWeight = w * mask + + if (buffer.max != null) { + val x = b(buffer.max) + x := la.max(x, value) + } + + if (buffer.min != null) { + val x = b(buffer.min) + x := la.min(x, value) + } + + if (buffer.mean != null) { + assert(buffer.weightSum != null) + val prevMean = b(buffer.mean).copy + val diff = value :- prevMean + // Adding an epsilon to ensure that the denominator is always positive. + // This epsilon is not going to have impact since numerator(i) == 0 => denominator(i) == 0. 
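// Editor's sketch, not part of this patch: what the mask and the epsilon-guarded
// denominator compute, element by element, written with plain Scala arrays. Zero
// entries get mask 0.0 and therefore never move the mean; non-zero entries get 1.0.
val eps = Double.MinPositiveValue            // mirrors epsi above
val xs = Array(0.0, -2.0, 3.5)
val w0 = 2.0
val mask0 = xs.map(x => math.ceil(math.min(math.abs(x), eps)))   // Array(0.0, 1.0, 1.0)
val maskWeight0 = mask0.map(_ * w0)                              // Array(0.0, 2.0, 2.0)
val weightSum0 = Array(0.0, 4.0, 4.0)
// max(weightSum + maskWeight, eps) keeps every denominator strictly positive, so the
// inactive first coordinate contributes 0 / eps = 0 instead of 0 / 0 = NaN.
val denom0 = weightSum0.zip(maskWeight0).map { case (s, mw) => math.max(s + mw, eps) }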
+ val denom = la.max(b(buffer.weightSum) :+ maskWeight, epsi) + b(buffer.mean) :+= (maskWeight :* diff) :/ denom + if (buffer.m2n != null) { + b(buffer.m2n) :+= maskWeight :* ((value :- b(buffer.mean)) :* diff) + } + } + + if (buffer.m2 != null) { + b(buffer.m2) :+= maskWeight :* (value :* value) + } + + if (buffer.l1 != null) { + b(buffer.l1) :+= maskWeight :* numerics.abs(value) + } + + + if (buffer.weightSum != null) { + b(buffer.weightSum) :+= maskWeight + } + if (buffer.nnz != null) { + bl(buffer.nnz) :+= la.convert(maskWeight, Long) + } } + private def updateInPlaceSparse(buffer: Buffer, v: SparseVector, w: Double): Unit = { + v.foreachActive { (index, value) => + if (value != 0.0) { + if (buffer.max != null && buffer.max(index) < value) { + buffer.max(index) = value + } + if (buffer.min != null && buffer.min(index) > value) { + buffer.min(index) = value + } + + if (buffer.mean != null) { + assert(buffer.weightSum != null) + val prevMean = buffer.mean(index) + val diff = value - prevMean + buffer.mean(index) += w * diff / (buffer.weightSum(index) + w) + if (buffer.m2n != null) { + buffer.m2n(index) += w * (value - buffer.mean(index)) * diff + } + } + if (buffer.m2 != null) { + buffer.m2(index) += w * value * value + } + if (buffer.l1 != null) { + buffer.l1(index) += w * math.abs(value) + } + if (buffer.weightSum != null) { + buffer.weightSum(index) += w + } + if (buffer.nnz != null) { + buffer.nnz(index) += 1 + } + } + } + + } + /** * Merges other into buffer. */
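// Editor's sketch, not part of this patch: one coordinate of the weighted online update
// used in updateInPlaceSparse above, written as a standalone loop. After two values,
// 1.0 and 3.0, each with weight 1.0, the running mean is 2.0 and m2n (the unnormalized
// sum of squared deviations) is 2.0, matching the batch result (1-2)^2 + (3-2)^2 = 2.
var mean0 = 0.0
var m2n0 = 0.0
var wSum0 = 0.0
for ((value, w) <- Seq((1.0, 1.0), (3.0, 1.0))) {
  val diff = value - mean0
  mean0 += w * diff / (wSum0 + w)         // update the mean before m2n, as above
  m2n0 += w * (value - mean0) * diff      // uses the freshly updated mean
  wSum0 += w
}
assert(mean0 == 2.0 && m2n0 == 2.0)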