From 4dee9e1a94d963e9aa174df45a0ad9b52c3d8f24 Mon Sep 17 00:00:00 2001 From: Travis Galoppo Date: Tue, 6 Jan 2015 22:45:43 -0500 Subject: [PATCH 1/7] SPARK-5018 MultivariateGaussian.scala - Made class public and exposed public methods leveraging MLlib vectors and matrices. Added logpdf method providing log-density calculation. MultivariateGaussianSuite.scala - Tests are now performed through the public methods. --- .../stat/impl/MultivariateGaussian.scala | 52 ++++++++++++++++--- .../stat/impl/MultivariateGaussianSuite.scala | 31 ++++++----- 2 files changed, 61 insertions(+), 22 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/stat/impl/MultivariateGaussian.scala b/mllib/src/main/scala/org/apache/spark/mllib/stat/impl/MultivariateGaussian.scala index bc7f6c5197ac7..f291b575c47a0 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/stat/impl/MultivariateGaussian.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/stat/impl/MultivariateGaussian.scala @@ -19,6 +19,7 @@ package org.apache.spark.mllib.stat.impl import breeze.linalg.{DenseVector => DBV, DenseMatrix => DBM, diag, max, eigSym} +import org.apache.spark.mllib.linalg.{ Vectors, Vector, Matrices, Matrix } import org.apache.spark.mllib.util.MLUtils /** @@ -30,22 +31,61 @@ import org.apache.spark.mllib.util.MLUtils * @param mu The mean vector of the distribution * @param sigma The covariance matrix of the distribution */ -private[mllib] class MultivariateGaussian( +class MultivariateGaussian private[mllib] ( val mu: DBV[Double], val sigma: DBM[Double]) extends Serializable { + /** + * Public constructor + * + * @param mu The mean vector of the distribution + * @param sigma The covariance matrix of the distribution + */ + def this(mu: Vector, sigma: Matrix) = { + this(mu.toBreeze.toDenseVector, sigma.toBreeze.toDenseMatrix) + } + /** * Compute distribution dependent constants: * rootSigmaInv = D^(-1/2) * U, where sigma = U * D * U.t - * u = (2*pi)^(-k/2) * det(sigma)^(-1/2) + * u = 
log((2*pi)^(-k/2) * det(sigma)^(-1/2)) */ private val (rootSigmaInv: DBM[Double], u: Double) = calculateCovarianceConstants + // Public methods use MLlib vectors/matrices + + /** Return the mean vector for this distribution */ + def getMean(): Vector = { + Vectors.fromBreeze(mu) + } + + /** Return the covariance matrix for this distribution */ + def getCovariance(): Matrix = { + Matrices.fromBreeze(sigma) + } + /** Returns density of this multivariate Gaussian at given point, x */ - def pdf(x: DBV[Double]): Double = { + def pdf(x: Vector): Double = { + pdf(x.toBreeze.toDenseVector) + } + + /** Returns the log-density of this multivariate Gaussian at given point, x */ + def logpdf(x: Vector): Double = { + logpdf(x.toBreeze.toDenseVector) + } + + // private methods use Breeze vectors/matrices + + /** Returns density of this multivariate Gaussian at given point, x */ + private[mllib] def pdf(x: DBV[Double]): Double = { + math.exp(logpdf(x)) + } + + /** Returns the log-density of this multivariate Gaussian at given point, x */ + private[mllib] def logpdf(x: DBV[Double]): Double = { val delta = x - mu val v = rootSigmaInv * delta - u * math.exp(v.t * v * -0.5) + u + v.t * v * -0.5 } /** @@ -54,7 +94,7 @@ private[mllib] class MultivariateGaussian( * where k is length of the mean vector. 
* * We here compute distribution-fixed parts - * (2*pi)^(-k/2) * det(sigma)^(-1/2) + * log((2*pi)^(-k/2) * det(sigma)^(-1/2)) * and * D^(-1/2) * U, where sigma = U * D * U.t * @@ -91,7 +131,7 @@ private[mllib] class MultivariateGaussian( // by inverting the square root of all non-zero values val pinvS = diag(new DBV(d.map(v => if (v > tol) math.sqrt(1.0 / v) else 0.0).toArray)) - (pinvS * u, math.pow(2.0 * math.Pi, -mu.length / 2.0) * math.pow(pdetSigma, -0.5)) + (pinvS * u, math.log(math.pow(2.0 * math.Pi, -mu.length / 2.0) * math.pow(pdetSigma, -0.5))) } catch { case uex: UnsupportedOperationException => throw new IllegalArgumentException("Covariance matrix has no non-zero singular values") diff --git a/mllib/src/test/scala/org/apache/spark/mllib/stat/impl/MultivariateGaussianSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/stat/impl/MultivariateGaussianSuite.scala index d58f2587e55aa..172f95fb95bc9 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/stat/impl/MultivariateGaussianSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/stat/impl/MultivariateGaussianSuite.scala @@ -19,50 +19,49 @@ package org.apache.spark.mllib.stat.impl import org.scalatest.FunSuite -import breeze.linalg.{ DenseVector => BDV, DenseMatrix => BDM } - +import org.apache.spark.mllib.linalg.{ Vectors, Vector, Matrices, Matrix } import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ class MultivariateGaussianSuite extends FunSuite with MLlibTestSparkContext { test("univariate") { - val x1 = new BDV(Array(0.0)) - val x2 = new BDV(Array(1.5)) + val x1 = Vectors.dense(0.0) + val x2 = Vectors.dense(1.5) - val mu = new BDV(Array(0.0)) - val sigma1 = new BDM(1, 1, Array(1.0)) + val mu = Vectors.dense(0.0) + val sigma1 = Matrices.dense(1, 1, Array(1.0)) val dist1 = new MultivariateGaussian(mu, sigma1) assert(dist1.pdf(x1) ~== 0.39894 absTol 1E-5) assert(dist1.pdf(x2) ~== 0.12952 absTol 1E-5) - val sigma2 = new BDM(1, 1, 
Array(4.0)) + val sigma2 = Matrices.dense(1, 1, Array(4.0)) val dist2 = new MultivariateGaussian(mu, sigma2) assert(dist2.pdf(x1) ~== 0.19947 absTol 1E-5) assert(dist2.pdf(x2) ~== 0.15057 absTol 1E-5) } test("multivariate") { - val x1 = new BDV(Array(0.0, 0.0)) - val x2 = new BDV(Array(1.0, 1.0)) + val x1 = Vectors.dense(0.0, 0.0) + val x2 = Vectors.dense(1.0, 1.0) - val mu = new BDV(Array(0.0, 0.0)) - val sigma1 = new BDM(2, 2, Array(1.0, 0.0, 0.0, 1.0)) + val mu = Vectors.dense(0.0, 0.0) + val sigma1 = Matrices.dense(2, 2, Array(1.0, 0.0, 0.0, 1.0)) val dist1 = new MultivariateGaussian(mu, sigma1) assert(dist1.pdf(x1) ~== 0.15915 absTol 1E-5) assert(dist1.pdf(x2) ~== 0.05855 absTol 1E-5) - val sigma2 = new BDM(2, 2, Array(4.0, -1.0, -1.0, 2.0)) + val sigma2 = Matrices.dense(2, 2, Array(4.0, -1.0, -1.0, 2.0)) val dist2 = new MultivariateGaussian(mu, sigma2) assert(dist2.pdf(x1) ~== 0.060155 absTol 1E-5) assert(dist2.pdf(x2) ~== 0.033971 absTol 1E-5) } test("multivariate degenerate") { - val x1 = new BDV(Array(0.0, 0.0)) - val x2 = new BDV(Array(1.0, 1.0)) + val x1 = Vectors.dense(0.0, 0.0) + val x2 = Vectors.dense(1.0, 1.0) - val mu = new BDV(Array(0.0, 0.0)) - val sigma = new BDM(2, 2, Array(1.0, 1.0, 1.0, 1.0)) + val mu = Vectors.dense(0.0, 0.0) + val sigma = Matrices.dense(2, 2, Array(1.0, 1.0, 1.0, 1.0)) val dist = new MultivariateGaussian(mu, sigma) assert(dist.pdf(x1) ~== 0.11254 absTol 1E-5) assert(dist.pdf(x2) ~== 0.068259 absTol 1E-5) From 0943dc43b887a2626d6e6ef6cf0f8ac48ebc7f0f Mon Sep 17 00:00:00 2001 From: Travis Galoppo Date: Tue, 6 Jan 2015 22:51:10 -0500 Subject: [PATCH 2/7] SPARK-5018 --- .../apache/spark/mllib/stat/impl/MultivariateGaussian.scala | 4 ---- .../spark/mllib/stat/impl/MultivariateGaussianSuite.scala | 2 +- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/stat/impl/MultivariateGaussian.scala b/mllib/src/main/scala/org/apache/spark/mllib/stat/impl/MultivariateGaussian.scala index 
f291b575c47a0..b7dbd9367ad74 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/stat/impl/MultivariateGaussian.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/stat/impl/MultivariateGaussian.scala @@ -52,8 +52,6 @@ class MultivariateGaussian private[mllib] ( */ private val (rootSigmaInv: DBM[Double], u: Double) = calculateCovarianceConstants - // Public methods use MLlib vectors/matrices - /** Return the mean vector for this distribution */ def getMean(): Vector = { Vectors.fromBreeze(mu) @@ -74,8 +72,6 @@ class MultivariateGaussian private[mllib] ( logpdf(x.toBreeze.toDenseVector) } - // private methods use Breeze vectors/matrices - /** Returns density of this multivariate Gaussian at given point, x */ private[mllib] def pdf(x: DBV[Double]): Double = { math.exp(logpdf(x)) diff --git a/mllib/src/test/scala/org/apache/spark/mllib/stat/impl/MultivariateGaussianSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/stat/impl/MultivariateGaussianSuite.scala index 172f95fb95bc9..fb900dceb9019 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/stat/impl/MultivariateGaussianSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/stat/impl/MultivariateGaussianSuite.scala @@ -19,7 +19,7 @@ package org.apache.spark.mllib.stat.impl import org.scalatest.FunSuite -import org.apache.spark.mllib.linalg.{ Vectors, Vector, Matrices, Matrix } +import org.apache.spark.mllib.linalg.{ Vectors, Matrices } import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ From 8c353810670690b3814b298523ecfbcfacc5b0f0 Mon Sep 17 00:00:00 2001 From: Travis Galoppo Date: Wed, 7 Jan 2015 19:47:04 -0500 Subject: [PATCH 3/7] Fixed accessor methods to match member variable names. 
Modified calculations to avoid log(pow(x,y)) calculations --- .../stat/impl/MultivariateGaussian.scala | 38 ++++++++++--------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/stat/impl/MultivariateGaussian.scala b/mllib/src/main/scala/org/apache/spark/mllib/stat/impl/MultivariateGaussian.scala index b7dbd9367ad74..88b80a3ed6d5d 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/stat/impl/MultivariateGaussian.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/stat/impl/MultivariateGaussian.scala @@ -35,6 +35,14 @@ class MultivariateGaussian private[mllib] ( val mu: DBV[Double], val sigma: DBM[Double]) extends Serializable { + if (sigma.cols != sigma.rows) { + throw new IllegalArgumentException("Covariance matrix must be square"); + } + + if (mu.length != sigma.cols) { + throw new IllegalArgumentException("Mean vector length must match covariance matrix size") + } + /** * Public constructor * @@ -47,20 +55,16 @@ class MultivariateGaussian private[mllib] ( /** * Compute distribution dependent constants: - * rootSigmaInv = D^(-1/2) * U, where sigma = U * D * U.t - * u = log((2*pi)^(-k/2) * det(sigma)^(-1/2)) + * rootSigmaInv = D^(-1/2)^ * U, where sigma = U * D * U.t + * u = log((2*pi)^(-k/2)^ * det(sigma)^(-1/2)^) */ private val (rootSigmaInv: DBM[Double], u: Double) = calculateCovarianceConstants - /** Return the mean vector for this distribution */ - def getMean(): Vector = { - Vectors.fromBreeze(mu) - } + /** Return the mean vector (mu) for this distribution */ + def getMu: Vector = Vectors.fromBreeze(mu) - /** Return the covariance matrix for this distribution */ - def getCovariance(): Matrix = { - Matrices.fromBreeze(sigma) - } + /** Return the covariance matrix (sigma) for this distribution */ + def getSigma: Matrix = Matrices.fromBreeze(sigma) /** Returns density of this multivariate Gaussian at given point, x */ def pdf(x: Vector): Double = { @@ -86,13 +90,13 @@ class MultivariateGaussian 
private[mllib] ( /** * Calculate distribution dependent components used for the density function: - * pdf(x) = (2*pi)^(-k/2) * det(sigma)^(-1/2) * exp( (-1/2) * (x-mu).t * inv(sigma) * (x-mu) ) + * pdf(x) = (2*pi)^(-k/2)^ * det(sigma)^(-1/2)^ * exp((-1/2) * (x-mu).t * inv(sigma) * (x-mu)) * where k is length of the mean vector. * * We here compute distribution-fixed parts - * log((2*pi)^(-k/2) * det(sigma)^(-1/2)) + * log((2*pi)^(-k/2)^ * det(sigma)^(-1/2)^) * and - * D^(-1/2) * U, where sigma = U * D * U.t + * D^(-1/2)^ * U, where sigma = U * D * U.t * * Both the determinant and the inverse can be computed from the singular value decomposition * of sigma. Noting that covariance matrices are always symmetric and positive semi-definite, @@ -101,11 +105,11 @@ class MultivariateGaussian private[mllib] ( * * sigma = U * D * U.t * inv(Sigma) = U * inv(D) * U.t - * = (D^{-1/2} * U).t * (D^{-1/2} * U) + * = (D^{-1/2}^ * U).t * (D^{-1/2}^ * U) * * and thus * - * -0.5 * (x-mu).t * inv(Sigma) * (x-mu) = -0.5 * norm(D^{-1/2} * U * (x-mu))^2 + * -0.5 * (x-mu).t * inv(Sigma) * (x-mu) = -0.5 * norm(D^{-1/2}^ * U * (x-mu))^2^ * * To guard against singular covariance matrices, this method computes both the * pseudo-determinant and the pseudo-inverse (Moore-Penrose). 
Singular values are considered @@ -121,13 +125,13 @@ class MultivariateGaussian private[mllib] ( try { // pseudo-determinant is product of all non-zero singular values - val pdetSigma = d.activeValuesIterator.filter(_ > tol).reduce(_ * _) + val logPseudoDetSigma = d.activeValuesIterator.filter(_ > tol).map(math.log(_)).reduce(_ + _) // calculate the root-pseudo-inverse of the diagonal matrix of singular values // by inverting the square root of all non-zero values val pinvS = diag(new DBV(d.map(v => if (v > tol) math.sqrt(1.0 / v) else 0.0).toArray)) - (pinvS * u, math.log(math.pow(2.0 * math.Pi, -mu.length / 2.0) * math.pow(pdetSigma, -0.5))) + (pinvS * u, (-mu.length / 2.0) * math.log(2.0 * math.Pi) + -0.5 * logPseudoDetSigma) } catch { case uex: UnsupportedOperationException => throw new IllegalArgumentException("Covariance matrix has no non-zero singular values") From 91a5fae036f668e79422d6003ba62fc973d003a8 Mon Sep 17 00:00:00 2001 From: Travis Galoppo Date: Wed, 7 Jan 2015 20:22:42 -0500 Subject: [PATCH 4/7] Rearranged equation for part of density function --- .../apache/spark/mllib/stat/impl/MultivariateGaussian.scala | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/stat/impl/MultivariateGaussian.scala b/mllib/src/main/scala/org/apache/spark/mllib/stat/impl/MultivariateGaussian.scala index 88b80a3ed6d5d..be098122106b1 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/stat/impl/MultivariateGaussian.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/stat/impl/MultivariateGaussian.scala @@ -59,7 +59,7 @@ class MultivariateGaussian private[mllib] ( * u = log((2*pi)^(-k/2)^ * det(sigma)^(-1/2)^) */ private val (rootSigmaInv: DBM[Double], u: Double) = calculateCovarianceConstants - + /** Return the mean vector (mu) for this distribution */ def getMu: Vector = Vectors.fromBreeze(mu) @@ -124,14 +124,14 @@ class MultivariateGaussian private[mllib] ( val tol = MLUtils.EPSILON * max(d) * 
d.length try { - // pseudo-determinant is product of all non-zero singular values + // log(pseudo-determinant) is sum of the logs of all non-zero singular values val logPseudoDetSigma = d.activeValuesIterator.filter(_ > tol).map(math.log(_)).reduce(_ + _) // calculate the root-pseudo-inverse of the diagonal matrix of singular values // by inverting the square root of all non-zero values val pinvS = diag(new DBV(d.map(v => if (v > tol) math.sqrt(1.0 / v) else 0.0).toArray)) - (pinvS * u, (-mu.length / 2.0) * math.log(2.0 * math.Pi) + -0.5 * logPseudoDetSigma) + (pinvS * u, -0.5 * (mu.length * math.log(2.0 * math.Pi) + logPseudoDetSigma)) } catch { case uex: UnsupportedOperationException => throw new IllegalArgumentException("Covariance matrix has no non-zero singular values") From 9fa3bb74b0834449d9ccb5ecf54b95d1ecc33db2 Mon Sep 17 00:00:00 2001 From: Travis Galoppo Date: Thu, 8 Jan 2015 15:42:36 -0500 Subject: [PATCH 5/7] Style improvements --- .../spark/mllib/stat/impl/MultivariateGaussian.scala | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/stat/impl/MultivariateGaussian.scala b/mllib/src/main/scala/org/apache/spark/mllib/stat/impl/MultivariateGaussian.scala index be098122106b1..118f08c25dcea 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/stat/impl/MultivariateGaussian.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/stat/impl/MultivariateGaussian.scala @@ -35,13 +35,8 @@ class MultivariateGaussian private[mllib] ( val mu: DBV[Double], val sigma: DBM[Double]) extends Serializable { - if (sigma.cols != sigma.rows) { - throw new IllegalArgumentException("Covariance matrix must be square"); - } - - if (mu.length != sigma.cols) { - throw new IllegalArgumentException("Mean vector length must match covariance matrix size") - } + require(sigma.cols == sigma.rows, "Covariance matrix must be square") + require(mu.length == sigma.cols, "Mean vector length must match covariance 
matrix size") /** * Public constructor @@ -125,7 +120,7 @@ class MultivariateGaussian private[mllib] ( try { // log(pseudo-determinant) is sum of the logs of all non-zero singular values - val logPseudoDetSigma = d.activeValuesIterator.filter(_ > tol).map(math.log(_)).reduce(_ + _) + val logPseudoDetSigma = d.activeValuesIterator.filter(_ > tol).map(math.log).sum // calculate the root-pseudo-inverse of the diagonal matrix of singular values // by inverting the square root of all non-zero values From e30a100c2e5f3fd6af7f7e83b48315c44437147f Mon Sep 17 00:00:00 2001 From: Travis Galoppo Date: Fri, 9 Jan 2015 14:36:57 -0500 Subject: [PATCH 6/7] Made mu, sigma private[mllib] members of MultivariateGaussian Moved MultivariateGaussian (and test suite) from stat.impl to stat.distribution (required updates in GaussianMixture{EM,Model}.scala) Marked MultivariateGaussian as @DeveloperApi Fixed style error --- .../spark/mllib/clustering/GaussianMixtureEM.scala | 2 +- .../spark/mllib/clustering/GaussianMixtureModel.scala | 2 +- .../{impl => distribution}/MultivariateGaussian.scala | 10 ++++++---- .../MultivariateGaussianSuite.scala | 2 +- 4 files changed, 9 insertions(+), 7 deletions(-) rename mllib/src/main/scala/org/apache/spark/mllib/stat/{impl => distribution}/MultivariateGaussian.scala (94%) rename mllib/src/test/scala/org/apache/spark/mllib/stat/{impl => distribution}/MultivariateGaussianSuite.scala (98%) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureEM.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureEM.scala index bdf984aee4dae..e029a30678287 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureEM.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureEM.scala @@ -22,7 +22,7 @@ import scala.collection.mutable.IndexedSeq import breeze.linalg.{DenseVector => BreezeVector, DenseMatrix => BreezeMatrix, diag, Transpose} import org.apache.spark.rdd.RDD 
import org.apache.spark.mllib.linalg.{Matrices, Vector, Vectors} -import org.apache.spark.mllib.stat.impl.MultivariateGaussian +import org.apache.spark.mllib.stat.distribution.MultivariateGaussian import org.apache.spark.mllib.util.MLUtils /** diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala index b461ea4f0f06e..416cad080c408 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala @@ -21,7 +21,7 @@ import breeze.linalg.{DenseVector => BreezeVector} import org.apache.spark.rdd.RDD import org.apache.spark.mllib.linalg.{Matrix, Vector} -import org.apache.spark.mllib.stat.impl.MultivariateGaussian +import org.apache.spark.mllib.stat.distribution.MultivariateGaussian import org.apache.spark.mllib.util.MLUtils /** diff --git a/mllib/src/main/scala/org/apache/spark/mllib/stat/impl/MultivariateGaussian.scala b/mllib/src/main/scala/org/apache/spark/mllib/stat/distribution/MultivariateGaussian.scala similarity index 94% rename from mllib/src/main/scala/org/apache/spark/mllib/stat/impl/MultivariateGaussian.scala rename to mllib/src/main/scala/org/apache/spark/mllib/stat/distribution/MultivariateGaussian.scala index 118f08c25dcea..fa6d755b9defb 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/stat/impl/MultivariateGaussian.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/stat/distribution/MultivariateGaussian.scala @@ -15,12 +15,13 @@ * limitations under the License. 
*/ -package org.apache.spark.mllib.stat.impl +package org.apache.spark.mllib.stat.distribution import breeze.linalg.{DenseVector => DBV, DenseMatrix => DBM, diag, max, eigSym} -import org.apache.spark.mllib.linalg.{ Vectors, Vector, Matrices, Matrix } +import org.apache.spark.mllib.linalg.{Vectors, Vector, Matrices, Matrix} import org.apache.spark.mllib.util.MLUtils +import org.apache.spark.annotation.DeveloperApi; /** * This class provides basic functionality for a Multivariate Gaussian (Normal) Distribution. In @@ -31,9 +32,10 @@ import org.apache.spark.mllib.util.MLUtils * @param mu The mean vector of the distribution * @param sigma The covariance matrix of the distribution */ +@DeveloperApi class MultivariateGaussian private[mllib] ( - val mu: DBV[Double], - val sigma: DBM[Double]) extends Serializable { + private[mllib] val mu: DBV[Double], + private[mllib] val sigma: DBM[Double]) extends Serializable { require(sigma.cols == sigma.rows, "Covariance matrix must be square") require(mu.length == sigma.cols, "Mean vector length must match covariance matrix size") diff --git a/mllib/src/test/scala/org/apache/spark/mllib/stat/impl/MultivariateGaussianSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/stat/distribution/MultivariateGaussianSuite.scala similarity index 98% rename from mllib/src/test/scala/org/apache/spark/mllib/stat/impl/MultivariateGaussianSuite.scala rename to mllib/src/test/scala/org/apache/spark/mllib/stat/distribution/MultivariateGaussianSuite.scala index fb900dceb9019..fac2498e4dcb3 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/stat/impl/MultivariateGaussianSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/stat/distribution/MultivariateGaussianSuite.scala @@ -15,7 +15,7 @@ * limitations under the License. 
*/ -package org.apache.spark.mllib.stat.impl +package org.apache.spark.mllib.stat.distribution import org.scalatest.FunSuite From 2b1558727af2a28c7c3aa675fbdb040cd17fee01 Mon Sep 17 00:00:00 2001 From: Travis Galoppo Date: Sat, 10 Jan 2015 20:10:28 -0500 Subject: [PATCH 7/7] Style correction --- .../org/apache/spark/mllib/clustering/GaussianMixtureEM.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureEM.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureEM.scala index 7c96808a36bd0..d8e134619411b 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureEM.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureEM.scala @@ -17,6 +17,8 @@ package org.apache.spark.mllib.clustering +import scala.collection.mutable.IndexedSeq + import breeze.linalg.{DenseVector => BreezeVector, DenseMatrix => BreezeMatrix, diag, Transpose} import org.apache.spark.mllib.linalg.{Matrices, Vector, Vectors, DenseVector, DenseMatrix, BLAS} @@ -25,8 +27,6 @@ import org.apache.spark.mllib.util.MLUtils import org.apache.spark.rdd.RDD import org.apache.spark.util.Utils -import scala.collection.mutable.IndexedSeq - /** * This class performs expectation maximization for multivariate Gaussian * Mixture Models (GMMs). A GMM represents a composite distribution of