@@ -36,13 +36,14 @@ private[spark] abstract class DistanceMeasure extends Serializable {
3636 /**
3737 * Statistics used in triangle inequality to obtain useful bounds to find closest centers.
3838 *
39- * @return The upper triangular part of a symmetric matrix containing statistics, matrix(i)(j)
40- * represents:
41- * 1, a lower bound r of the center i, if i==j. If distance between point x and center i
42- * is less than f(r), then center i is the closest center to point x.
43- * 2, a lower bound r=matrix(i)(j) to help avoiding unnecessary distance computation.
44- * Given point x, let i be current closest center, and d be current best distance,
45- * if d < f(r), then we no longer need to compute the distance to center j.
39+ * @return The packed upper triangular part of a symmetric matrix containing statistics,
40+ * matrix(i,j) represents:
41+ * 1, if i != j: a bound r = matrix(i,j) to help avoid unnecessary distance
42+ * computation. Given point x, let i be current closest center, and d be current best
43+ * distance, if d < f(r), then we no longer need to compute the distance to center j;
44+ * 2, if i == j: a bound r = matrix(i,i) = min_k{matrix(i,k)|k!=i}. If distance
45+ * between point x and center i is less than f(r), then center i is the closest center
46+ * to point x.
4647 */
4748 def computeStatistics (centers : Array [VectorWithNorm ]): Array [Double ] = {
4849 val k = centers.length
@@ -261,14 +262,15 @@ private[spark] class EuclideanDistanceMeasure extends DistanceMeasure {
261262 * @see <a href="https://www.aaai.org/Papers/ICML/2003/ICML03-022.pdf">Charles Elkan,
262263 * Using the Triangle Inequality to Accelerate k-Means</a>
263264 *
264- * @return One element used in statistics matrix to make matrix(i)(j) represents:
265- * 1, squared radii of the center i, if i==j. If distance between point x and center i
266- * is less than the radius of center i, then center i is the closest center to point x.
267- * For Euclidean distance, radius of center i is half of the distance between center i
268- * and its closest center;
269- * 2, a lower bound r=matrix(i)(j) to help avoiding unnecessary distance computation.
270- * Given point x, let i be current closest center, and d be current best squared
271- * distance, if d < r, then we no longer need to compute the distance to center j.
265+ * @return One element used in statistics matrix to make matrix(i,j) represents:
266+ * 1, if i != j: a bound r = matrix(i,j) to help avoid unnecessary distance
267+ * computation. Given point x, let i be current closest center, and d be current best
268+ * squared distance, if d < r, then we no longer need to compute the distance to center
269+ * j. matrix(i,j) equals the square of half the Euclidean distance between centers i
270+ * and j;
271+ * 2, if i == j: a bound r = matrix(i,i) = min_k{matrix(i,k)|k!=i}. If squared
272+ * distance between point x and center i is less than r, then center i is the closest
273+ * center to point x.
272274 */
273275 override def computeStatistics (distance : Double ): Double = {
274276 0.25 * distance * distance
@@ -282,9 +284,7 @@ private[spark] class EuclideanDistanceMeasure extends DistanceMeasure {
282284 statistics : Array [Double ],
283285 point : VectorWithNorm ): (Int , Double ) = {
284286 var bestDistance = EuclideanDistanceMeasure .fastSquaredDistance(centers(0 ), point)
285- if (bestDistance < statistics(0 )) {
286- return (0 , bestDistance)
287- }
287+ if (bestDistance < statistics(0 )) return (0 , bestDistance)
288288
289289 val k = centers.length
290290 var bestIndex = 0
@@ -300,9 +300,8 @@ private[spark] class EuclideanDistanceMeasure extends DistanceMeasure {
300300 if (statistics(index1) < bestDistance) {
301301 val d = EuclideanDistanceMeasure .fastSquaredDistance(center, point)
302302 val index2 = indexUpperTriangular(k, i, i)
303- if (d < statistics(index2)) {
304- return (i, d)
305- } else if (d < bestDistance) {
303+ if (d < statistics(index2)) return (i, d)
304+ if (d < bestDistance) {
306305 bestDistance = d
307306 bestIndex = i
308307 }
@@ -398,16 +397,17 @@ private[spark] class CosineDistanceMeasure extends DistanceMeasure {
398397 /**
399398 * Statistics used in triangle inequality to obtain useful bounds to find closest centers.
400399 *
401- * @return One element used in statistics matrix to make matrix(i)(j) represents:
402- * 1, squared radii of the center i, if i==j. If distance between point x and center i
403- * is less than the radius of center i, then center i is the closest center to point x.
404- * For Cosine distance, it is similar to Euclidean distance. However, here radian/angle
405- * is used instead of Cosine distance: for center c, finding its closest center,
406- * computing the radian/angle between them, halving it, and converting it back to Cosine
407- * distance at the end.
408- * 2, a lower bound r=matrix(i)(j) to help avoiding unnecessary distance computation.
409- * Given point x, let i be current closest center, and d be current best squared
410- * distance, if d < r, then we no longer need to compute the distance to center j.
400+ * @return One element used in statistics matrix to make matrix(i,j) represents:
401+ * 1, if i != j: a bound r = matrix(i,j) to help avoid unnecessary distance
402+ * computation. Given point x, let i be current closest center, and d be current best
403+ * Cosine distance, if d < r, then we no longer need to compute the distance to center
404+ * j. For Cosine distance, it is similar to Euclidean distance. However, radian/angle
405+ * is used instead of Cosine distance to compute matrix(i,j): for centers i and j,
406+ * compute the radian/angle between them, halve it, and convert it back to Cosine
407+ * distance at the end;
408+ * 2, if i == j: a bound r = matrix(i,i) = min_k{matrix(i,k)|k!=i}. If Cosine
409+ * distance between point x and center i is less than r, then center i is the closest
410+ * center to point x.
411411 */
412412 override def computeStatistics (distance : Double ): Double = {
413413 // d = 1 - cos(x)
@@ -423,9 +423,7 @@ private[spark] class CosineDistanceMeasure extends DistanceMeasure {
423423 statistics : Array [Double ],
424424 point : VectorWithNorm ): (Int , Double ) = {
425425 var bestDistance = distance(centers(0 ), point)
426- if (bestDistance < statistics(0 )) {
427- return (0 , bestDistance)
428- }
426+ if (bestDistance < statistics(0 )) return (0 , bestDistance)
429427
430428 val k = centers.length
431429 var bestIndex = 0
@@ -436,9 +434,8 @@ private[spark] class CosineDistanceMeasure extends DistanceMeasure {
436434 val center = centers(i)
437435 val d = distance(center, point)
438436 val index2 = indexUpperTriangular(k, i, i)
439- if (d < statistics(index2)) {
440- return (i, d)
441- } else if (d < bestDistance) {
437+ if (d < statistics(index2)) return (i, d)
438+ if (d < bestDistance) {
442439 bestDistance = d
443440 bestIndex = i
444441 }
0 commit comments