Skip to content

Commit 2f82c84

Browse files
hhbyyh authored and mengxr committed
[SPARK-5186] [MLLIB] Vector.equals and Vector.hashCode are very inefficient
JIRA Issue: https://issues.apache.org/jira/browse/SPARK-5186

Currently SparseVector uses the equals inherited from Vector, which creates a full-size array even for a sparse vector. This pull request contains a specialized equals implementation that improves both time and space usage.

The implementation is consistent with the original behavior. In particular, it preserves equality comparison between SparseVector and DenseVector.

Author: Yuhao Yang <[email protected]>
Author: Yuhao Yang <[email protected]>

Closes apache#3997 from hhbyyh/master and squashes the following commits:

0d9d130 [Yuhao Yang] function name change and ut update
93f0d46 [Yuhao Yang] unify sparse vs dense vectors
985e160 [Yuhao Yang] improve locality for equals
bdf8789 [Yuhao Yang] improve equals and rewrite hashCode for Vector
a6952c3 [Yuhao Yang] fix scala style for comments
50abef3 [Yuhao Yang] fix ut for sparse vector with explicit 0
f41b135 [Yuhao Yang] iterative equals for sparse vector
5741144 [Yuhao Yang] Specialized equals for SparseVector
1 parent d181c2a commit 2f82c84

File tree

2 files changed

+70
-3
lines changed

2 files changed

+70
-3
lines changed

mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala

Lines changed: 52 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -50,13 +50,35 @@ sealed trait Vector extends Serializable {
5050

5151
override def equals(other: Any): Boolean = {
5252
other match {
53-
case v: Vector =>
54-
util.Arrays.equals(this.toArray, v.toArray)
53+
case v2: Vector => {
54+
if (this.size != v2.size) return false
55+
(this, v2) match {
56+
case (s1: SparseVector, s2: SparseVector) =>
57+
Vectors.equals(s1.indices, s1.values, s2.indices, s2.values)
58+
case (s1: SparseVector, d1: DenseVector) =>
59+
Vectors.equals(s1.indices, s1.values, 0 until d1.size, d1.values)
60+
case (d1: DenseVector, s1: SparseVector) =>
61+
Vectors.equals(0 until d1.size, d1.values, s1.indices, s1.values)
62+
case (_, _) => util.Arrays.equals(this.toArray, v2.toArray)
63+
}
64+
}
5565
case _ => false
5666
}
5767
}
5868

59-
override def hashCode(): Int = util.Arrays.hashCode(this.toArray)
69+
override def hashCode(): Int = {
70+
var result: Int = size + 31
71+
this.foreachActive { case (index, value) =>
72+
// ignore explicit 0 for comparison between sparse and dense
73+
if (value != 0) {
74+
result = 31 * result + index
75+
// refer to {@link java.util.Arrays.hashCode} for hash algorithm
76+
val bits = java.lang.Double.doubleToLongBits(value)
77+
result = 31 * result + (bits ^ (bits >>> 32)).toInt
78+
}
79+
}
80+
return result
81+
}
6082

6183
/**
6284
* Converts the instance to a breeze vector.
@@ -392,6 +414,33 @@ object Vectors {
392414
}
393415
squaredDistance
394416
}
417+
418+
/**
419+
* Checks equality between two vectors given as (indices, values) pairs, skipping entries whose value is 0.
420+
*/
421+
private[mllib] def equals(
422+
v1Indices: IndexedSeq[Int],
423+
v1Values: Array[Double],
424+
v2Indices: IndexedSeq[Int],
425+
v2Values: Array[Double]): Boolean = {
426+
val v1Size = v1Values.size
427+
val v2Size = v2Values.size
428+
var k1 = 0
429+
var k2 = 0
430+
var allEqual = true
431+
while (allEqual) {
432+
while (k1 < v1Size && v1Values(k1) == 0) k1 += 1
433+
while (k2 < v2Size && v2Values(k2) == 0) k2 += 1
434+
435+
if (k1 >= v1Size || k2 >= v2Size) {
436+
return k1 >= v1Size && k2 >= v2Size // check end alignment
437+
}
438+
allEqual = v1Indices(k1) == v2Indices(k2) && v1Values(k1) == v2Values(k2)
439+
k1 += 1
440+
k2 += 1
441+
}
442+
allEqual
443+
}
395444
}
396445

397446
/**

mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala

Lines changed: 18 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -89,6 +89,24 @@ class VectorsSuite extends FunSuite {
8989
}
9090
}
9191

92+
test("vectors equals with explicit 0") {
93+
val dv1 = Vectors.dense(Array(0, 0.9, 0, 0.8, 0))
94+
val sv1 = Vectors.sparse(5, Array(1, 3), Array(0.9, 0.8))
95+
val sv2 = Vectors.sparse(5, Array(0, 1, 2, 3, 4), Array(0, 0.9, 0, 0.8, 0))
96+
97+
val vectors = Seq(dv1, sv1, sv2)
98+
for (v <- vectors; u <- vectors) {
99+
assert(v === u)
100+
assert(v.## === u.##)
101+
}
102+
103+
val another = Vectors.sparse(5, Array(0, 1, 3), Array(0, 0.9, 0.2))
104+
for (v <- vectors) {
105+
assert(v != another)
106+
assert(v.## != another.##)
107+
}
108+
}
109+
92110
test("indexing dense vectors") {
93111
val vec = Vectors.dense(1.0, 2.0, 3.0, 4.0)
94112
assert(vec(0) === 1.0)

0 commit comments

Comments
 (0)