@@ -157,58 +157,6 @@ object MLUtils {
157157 dataStr.saveAsTextFile(dir)
158158 }
159159
/**
 * Utility function to compute mean and standard deviation on a given dataset.
 *
 * All sums are gathered in a single `aggregate` pass over `data`; the per-column
 * standard deviation is the population form, sqrt(E[x^2] - E[x]^2), i.e. it divides
 * by the number of examples rather than by (number of examples - 1).
 *
 * @param data - input data set whose statistics are computed
 * @param numFeatures - number of features
 * @param numExamples - number of examples in input dataset
 *
 * @return (yMean, xColMean, xColSd) - Tuple consisting of
 *     yMean - mean of the labels
 *     xColMean - Row vector with mean for every column (or feature) of the input data
 *     xColSd - Row vector standard deviation for every column (or feature) of the input data.
 *
 * @throws IllegalArgumentException if `data` is empty or if the number of aggregated
 *     examples differs from `numExamples`
 */
def computeStats(
    data: RDD[LabeledPoint],
    numFeatures: Int,
    numExamples: Long): (Double, Vector, Vector) = {
  val brzData = data.map { case LabeledPoint(label, features) =>
    (label, features.toBreeze)
  }
  // Accumulator layout: (example count, sum of labels, per-column sum, per-column sum of squares).
  val aggStats = brzData.aggregate(
    (0L, 0.0, BDV.zeros[Double](numFeatures), BDV.zeros[Double](numFeatures))
  )(
    seqOp = (c, v) => (c, v) match {
      case ((n, sumLabel, sum, sumSq), (label, features)) =>
        // Only active entries are visited; the accumulators are updated in place.
        features.activeIterator.foreach { case (i, x) =>
          sumSq(i) += x * x
        }
        (n + 1L, sumLabel + label, sum += features, sumSq)
    },
    combOp = (c1, c2) => (c1, c2) match {
      case ((n1, sumLabel1, sum1, sumSq1), (n2, sumLabel2, sum2, sumSq2)) =>
        (n1 + n2, sumLabel1 + sumLabel2, sum1 += sum2, sumSq1 += sumSq2)
    }
  )
  val (nl, sumLabel, sum, sumSq) = aggStats

  require(nl > 0, " Input data is empty.")
  require(nl == numExamples,
    s"Number of aggregated examples ($nl) does not match numExamples ($numExamples).")

  val n = nl.toDouble
  val yMean = sumLabel / n
  val mean = sum / n
  val std = new Array[Double](sum.length)
  var i = 0
  while (i < numFeatures) {
    // E[x^2] - E[x]^2 is the variance; rounding can push it slightly below zero for
    // (near-)constant columns, so clamp before taking the square root to avoid NaN.
    val variance = sumSq(i) / n - mean(i) * mean(i)
    std(i) = math.sqrt(math.max(variance, 0.0))
    i += 1
  }

  (yMean, Vectors.fromBreeze(mean), Vectors.dense(std))
}
211-
212160 /**
213161 * Returns the squared Euclidean distance between two vectors. The following formula will be used
214162 * if it does not introduce too much numerical error:
0 commit comments