Skip to content

Commit d31d488

Browse files
committed
address comments
1 parent bb4539b commit d31d488

File tree

1 file changed

+34
-37
lines changed

1 file changed

+34
-37
lines changed

mllib/src/main/scala/org/apache/spark/mllib/clustering/DistanceMeasure.scala

Lines changed: 34 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -36,13 +36,14 @@ private[spark] abstract class DistanceMeasure extends Serializable {
3636
/**
3737
* Statistics used in triangle inequality to obtain useful bounds to find closest centers.
3838
*
39-
* @return The upper triangular part of a symmetric matrix containing statistics, matrix(i)(j)
40-
* represents:
41-
* 1, a lower bound r of the center i, if i==j. If distance between point x and center i
42-
* is less than f(r), then center i is the closest center to point x.
43-
* 2, a lower bound r=matrix(i)(j) to help avoiding unnecessary distance computation.
44-
* Given point x, let i be current closest center, and d be current best distance,
45-
* if d < f(r), then we no longer need to compute the distance to center j.
39+
* @return The packed upper triangular part of a symmetric matrix containing statistics,
40+
* matrix(i,j) represents:
41+
* 1, if i != j: a bound r = matrix(i,j) to help avoid unnecessary distance
42+
* computation. Given point x, let i be current closest center, and d be current best
43+
* distance, if d < f(r), then we no longer need to compute the distance to center j;
44+
* 2, if i == j: a bound r = matrix(i,i) = min_k{matrix(i,k)|k!=i}. If distance
45+
* between point x and center i is less than f(r), then center i is the closest center
46+
* to point x.
4647
*/
4748
def computeStatistics(centers: Array[VectorWithNorm]): Array[Double] = {
4849
val k = centers.length
@@ -261,14 +262,15 @@ private[spark] class EuclideanDistanceMeasure extends DistanceMeasure {
261262
* @see <a href="https://www.aaai.org/Papers/ICML/2003/ICML03-022.pdf">Charles Elkan,
262263
* Using the Triangle Inequality to Accelerate k-Means</a>
263264
*
264-
* @return One element used in statistics matrix to make matrix(i)(j) represents:
265-
* 1, squared radii of the center i, if i==j. If distance between point x and center i
266-
* is less than the radius of center i, then center i is the closest center to point x.
267-
* For Euclidean distance, radius of center i is half of the distance between center i
268-
* and its closest center;
269-
* 2, a lower bound r=matrix(i)(j) to help avoiding unnecessary distance computation.
270-
* Given point x, let i be current closest center, and d be current best squared
271-
* distance, if d < r, then we no longer need to compute the distance to center j.
265+
* @return One element used in statistics matrix to make matrix(i,j) represents:
266+
* 1, if i != j: a bound r = matrix(i,j) to help avoid unnecessary distance
267+
* computation. Given point x, let i be current closest center, and d be current best
268+
* squared distance, if d < r, then we no longer need to compute the distance to center
269+
* j. matrix(i,j) equals the square of half the Euclidean distance between centers i
270+
* and j;
271+
* 2, if i == j: a bound r = matrix(i,i) = min_k{matrix(i,k)|k!=i}. If squared
272+
* distance between point x and center i is less than r, then center i is the closest
273+
* center to point x.
272274
*/
273275
override def computeStatistics(distance: Double): Double = {
274276
0.25 * distance * distance
@@ -282,9 +284,7 @@ private[spark] class EuclideanDistanceMeasure extends DistanceMeasure {
282284
statistics: Array[Double],
283285
point: VectorWithNorm): (Int, Double) = {
284286
var bestDistance = EuclideanDistanceMeasure.fastSquaredDistance(centers(0), point)
285-
if (bestDistance < statistics(0)) {
286-
return (0, bestDistance)
287-
}
287+
if (bestDistance < statistics(0)) return (0, bestDistance)
288288

289289
val k = centers.length
290290
var bestIndex = 0
@@ -300,9 +300,8 @@ private[spark] class EuclideanDistanceMeasure extends DistanceMeasure {
300300
if (statistics(index1) < bestDistance) {
301301
val d = EuclideanDistanceMeasure.fastSquaredDistance(center, point)
302302
val index2 = indexUpperTriangular(k, i, i)
303-
if (d < statistics(index2)) {
304-
return (i, d)
305-
} else if (d < bestDistance) {
303+
if (d < statistics(index2)) return (i, d)
304+
if (d < bestDistance) {
306305
bestDistance = d
307306
bestIndex = i
308307
}
@@ -398,16 +397,17 @@ private[spark] class CosineDistanceMeasure extends DistanceMeasure {
398397
/**
399398
* Statistics used in triangle inequality to obtain useful bounds to find closest centers.
400399
*
401-
* @return One element used in statistics matrix to make matrix(i)(j) represents:
402-
* 1, squared radii of the center i, if i==j. If distance between point x and center i
403-
* is less than the radius of center i, then center i is the closest center to point x.
404-
* For Cosine distance, it is similar to Euclidean distance. However, here radian/angle
405-
* is used instead of Cosine distance: for center c, finding its closest center,
406-
* computing the radian/angle between them, halving it, and converting it back to Cosine
407-
* distance at the end.
408-
* 2, a lower bound r=matrix(i)(j) to help avoiding unnecessary distance computation.
409-
* Given point x, let i be current closest center, and d be current best squared
410-
* distance, if d < r, then we no longer need to compute the distance to center j.
400+
* @return One element used in statistics matrix to make matrix(i,j) represents:
401+
* 1, if i != j: a bound r = matrix(i,j) to help avoid unnecessary distance
402+
* computation. Given point x, let i be current closest center, and d be current best
403+
* Cosine distance, if d < r, then we no longer need to compute the distance to center
404+
* j. For Cosine distance, it is similar to Euclidean distance. However, radian/angle
405+
* is used instead of Cosine distance to compute matrix(i,j): for centers i and j,
406+
* compute the radian/angle between them, halve it, and convert it back to Cosine
407+
* distance at the end;
408+
* 2, if i == j: a bound r = matrix(i,i) = min_k{matrix(i,k)|k!=i}. If Cosine
409+
* distance between point x and center i is less than r, then center i is the closest
410+
* center to point x.
411411
*/
412412
override def computeStatistics(distance: Double): Double = {
413413
// d = 1 - cos(x)
@@ -423,9 +423,7 @@ private[spark] class CosineDistanceMeasure extends DistanceMeasure {
423423
statistics: Array[Double],
424424
point: VectorWithNorm): (Int, Double) = {
425425
var bestDistance = distance(centers(0), point)
426-
if (bestDistance < statistics(0)) {
427-
return (0, bestDistance)
428-
}
426+
if (bestDistance < statistics(0)) return (0, bestDistance)
429427

430428
val k = centers.length
431429
var bestIndex = 0
@@ -436,9 +434,8 @@ private[spark] class CosineDistanceMeasure extends DistanceMeasure {
436434
val center = centers(i)
437435
val d = distance(center, point)
438436
val index2 = indexUpperTriangular(k, i, i)
439-
if (d < statistics(index2)) {
440-
return (i, d)
441-
} else if (d < bestDistance) {
437+
if (d < statistics(index2)) return (i, d)
438+
if (d < bestDistance) {
442439
bestDistance = d
443440
bestIndex = i
444441
}

0 commit comments

Comments
 (0)