@@ -21,7 +21,6 @@ import breeze.linalg.{Vector => BV, DenseVector => BDV}
 import org.apache.spark.mllib.linalg.{Vector, Vectors}
 import org.apache.spark.mllib.util.MLUtils._
 import org.apache.spark.rdd.RDD
-import breeze.linalg._
 
 /**
  * Extra functions available on RDDs of [[org.apache.spark.mllib.linalg.Vector Vector]] through an
@@ -30,30 +29,6 @@ import breeze.linalg._
  */
 class VectorRDDFunctions(self: RDD[Vector]) extends Serializable {
 
-  /**
-   * Compute the mean of each `Vector` in the RDD.
-   */
-  def rowMeans(): RDD[Double] = {
-    self.map(x => x.toArray.sum / x.size)
-  }
-
-  /**
-   * Compute the norm-2 of each `Vector` in the RDD.
-   */
-  def rowNorm2(): RDD[Double] = {
-    self.map(x => math.sqrt(x.toArray.map(x => x * x).sum))
-  }
-
-  /**
-   * Compute the standard deviation of each `Vector` in the RDD.
-   */
-  def rowSDs(): RDD[Double] = {
-    val means = self.rowMeans()
-    self.zip(means)
-      .map { case (x, m) => x.toBreeze - m }
-      .map { x => math.sqrt(x.toArray.map(x => x * x).sum / x.size) }
-  }
-
   /**
    * Compute the mean of each column in the RDD.
    */
@@ -137,11 +112,6 @@ class VectorRDDFunctions(self: RDD[Vector]) extends Serializable {
    */
   def minOption(cmp: (Vector, Vector) => Boolean) = maxMinOption(!cmp(_, _))
 
-  /**
-   * Filter the vectors whose standard deviation is not zero.
-   */
-  def rowShrink(): RDD[Vector] = self.zip(self.rowSDs()).filter(_._2 != 0.0).map(_._1)
-
   /**
    * Filter each column of the RDD whose standard deviation is not zero.
    */
@@ -163,34 +133,66 @@ class VectorRDDFunctions(self: RDD[Vector]) extends Serializable {
     }
   }
 
-  def parallelMeanAndVar(size: Int): (Vector, Vector, Double, Vector, Vector, Vector) = {
-    val statistics = self.map(_.toBreeze).aggregate((BV.zeros[Double](size), BV.zeros[Double](size), 0.0, BV.zeros[Double](size), BV.fill(size){Double.MinValue}, BV.fill(size){Double.MaxValue}))(
+  /**
+   * Compute full column-wise statistics for the RDD, including
+   * {{{
+   *   Mean:             Vector,
+   *   Variance:         Vector,
+   *   Count:            Double,
+   *   Non-zero count:   Vector,
+   *   Maximum elements: Vector,
+   *   Minimum elements: Vector.
+   * }}}
+   * with the vector size given as the input parameter.
+   */
+  def statistics(size: Int): (Vector, Vector, Double, Vector, Vector, Vector) = {
+    val results = self.map(_.toBreeze).aggregate((
+      BV.zeros[Double](size),
+      BV.zeros[Double](size),
+      0.0,
+      BV.zeros[Double](size),
+      BV.fill(size){Double.MinValue},
+      BV.fill(size){Double.MaxValue}))(
       seqOp = (c, v) => (c, v) match {
-        case ((prevMean, prevM2n, cnt, nnz, maxVec, minVec), currData) =>
+        case ((prevMean, prevM2n, cnt, nnzVec, maxVec, minVec), currData) =>
           val currMean = ((prevMean :* cnt) + currData) :/ (cnt + 1.0)
-          val nonZeroCnt = Vectors.sparse(size, currData.activeKeysIterator.toSeq.map(x => (x, 1.0))).toBreeze
+          val nonZeroCnt = Vectors
+            .sparse(size, currData.activeKeysIterator.toSeq.map(x => (x, 1.0))).toBreeze
           currData.activeIterator.foreach { case (id, value) =>
             if (maxVec(id) < value) maxVec(id) = value
             if (minVec(id) > value) minVec(id) = value
           }
-          (currMean, prevM2n + ((currData - prevMean) :* (currData - currMean)), cnt + 1.0, nnz + nonZeroCnt, maxVec, minVec)
+          (currMean,
+            prevM2n + ((currData - prevMean) :* (currData - currMean)),
+            cnt + 1.0,
+            nnzVec + nonZeroCnt,
+            maxVec,
+            minVec)
       },
       combOp = (lhs, rhs) => (lhs, rhs) match {
-        case ((lhsMean, lhsM2n, lhsCnt, lhsNNZ, lhsMax, lhsMin), (rhsMean, rhsM2n, rhsCnt, rhsNNZ, rhsMax, rhsMin)) =>
-          val totalCnt = lhsCnt + rhsCnt
-          val totalMean = (lhsMean :* lhsCnt) + (rhsMean :* rhsCnt) :/ totalCnt
-          val deltaMean = rhsMean - lhsMean
-          val totalM2n = lhsM2n + rhsM2n + (((deltaMean :* deltaMean) :* (lhsCnt * rhsCnt)) :/ totalCnt)
-          rhsMax.activeIterator.foreach { case (id, value) =>
-            if (lhsMax(id) < value) lhsMax(id) = value
-          }
-          rhsMin.activeIterator.foreach { case (id, value) =>
-            if (lhsMin(id) > value) lhsMin(id) = value
-          }
-          (totalMean, totalM2n, totalCnt, lhsNNZ + rhsNNZ, lhsMax, lhsMin)
+        case (
+          (lhsMean, lhsM2n, lhsCnt, lhsNNZ, lhsMax, lhsMin),
+          (rhsMean, rhsM2n, rhsCnt, rhsNNZ, rhsMax, rhsMin)) =>
+          val totalCnt = lhsCnt + rhsCnt
+          val totalMean = (lhsMean :* lhsCnt) + (rhsMean :* rhsCnt) :/ totalCnt
+          val deltaMean = rhsMean - lhsMean
+          val totalM2n =
+            lhsM2n + rhsM2n + (((deltaMean :* deltaMean) :* (lhsCnt * rhsCnt)) :/ totalCnt)
+          rhsMax.activeIterator.foreach { case (id, value) =>
+            if (lhsMax(id) < value) lhsMax(id) = value
+          }
+          rhsMin.activeIterator.foreach { case (id, value) =>
+            if (lhsMin(id) > value) lhsMin(id) = value
+          }
+          (totalMean, totalM2n, totalCnt, lhsNNZ + rhsNNZ, lhsMax, lhsMin)
       }
     )
 
-    (Vectors.fromBreeze(statistics._1), Vectors.fromBreeze(statistics._2 :/ statistics._3), statistics._3, Vectors.fromBreeze(statistics._4), Vectors.fromBreeze(statistics._5), Vectors.fromBreeze(statistics._6))
+    (Vectors.fromBreeze(results._1),
+      Vectors.fromBreeze(results._2 :/ results._3),
+      results._3,
+      Vectors.fromBreeze(results._4),
+      Vectors.fromBreeze(results._5),
+      Vectors.fromBreeze(results._6))
   }
 }
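
For clarity, here is a minimal usage sketch of the new `statistics` method introduced above. It is an assumption-laden sketch, not part of the patch: it constructs the wrapper directly rather than relying on the implicit conversion mentioned in the class ScalaDoc, and it assumes a running SparkContext named `sc`. The commented values follow from the per-column mean and the returned variance, which is M2n divided by the count (i.e. the population variance).

    // Sketch only: assumes a live SparkContext `sc` and that VectorRDDFunctions
    // is accessible from the calling code; the wrapper is built directly here.
    import org.apache.spark.mllib.linalg.Vectors

    val data = sc.parallelize(Seq(
      Vectors.dense(1.0, 0.0, 3.0),
      Vectors.dense(2.0, 4.0, 0.0)))

    // Result order matches the method signature:
    // (mean, variance, count, non-zero count, max, min), all column-wise.
    val (mean, variance, count, nnz, max, min) = new VectorRDDFunctions(data).statistics(3)

    // mean     = [1.5, 2.0, 1.5]
    // variance = [0.25, 4.0, 2.25]   (M2n / count, i.e. population variance)
    // count    = 2.0

The combOp merges per-partition results with the usual pairwise mean/M2n update (the `deltaMean` correction term), so the statistics should come out the same regardless of how the RDD is partitioned.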