From 04677afaeb214c814df84677a9462b763ad7fca2 Mon Sep 17 00:00:00 2001 From: George Date: Sun, 10 May 2015 23:30:29 -0700 Subject: [PATCH 01/18] initial work on adding argmax to Vector and SparseVector --- .../apache/spark/mllib/linalg/Vectors.scala | 29 +++++++++++++++---- .../spark/mllib/linalg/VectorsSuite.scala | 3 ++ 2 files changed, 27 insertions(+), 5 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala index f6bcdf83cd337..436ab63ed4669 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala @@ -150,6 +150,12 @@ sealed trait Vector extends Serializable { toDense } } + + /** + * Find the index of a maximal element. Returns the first maximal element in case of a tie. + * Returns -1 if vector has length 0. + */ + def argmax: Int } /** @@ -588,11 +594,7 @@ class DenseVector(val values: Array[Double]) extends Vector { new SparseVector(size, ii, vv) } - /** - * Find the index of a maximal element. Returns the first maximal element in case of a tie. - * Returns -1 if vector has length 0. - */ - private[spark] def argmax: Int = { + def argmax: Int = { if (size == 0) { -1 } else { @@ -717,6 +719,23 @@ class SparseVector( new SparseVector(size, ii, vv) } } + + override def argmax: Int = { + if (size == 0) { + -1 + } else { + var maxIdx = 0 + var maxValue = values(0) + var i = 1 + foreachActive{ (i, v) => + if(v > maxValue) { + maxIdx = i + maxValue = v + } + } + maxIdx + } + } } object SparseVector { diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala index 24755e9ff46fc..1bc87892e886d 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala @@ -42,6 +42,7 @@ class VectorsSuite extends FunSuite { val vec = Vectors.dense(arr).asInstanceOf[DenseVector] assert(vec.size === arr.length) assert(vec.values.eq(arr)) + vec.argmax } test("sparse vector construction") { @@ -56,6 +57,8 @@ class VectorsSuite extends FunSuite { assert(vec.size === n) assert(vec.indices === indices) assert(vec.values === values) + val vec2 = Vectors.sparse(5,Array(0,3),values).asInstanceOf[SparseVector] + vec2.foreachActive( (i, v) => println(i,v)) } test("dense to array") { From 3cffed45fa6265c0e258adb0efcfe99a8e06f3ae Mon Sep 17 00:00:00 2001 From: George Date: Mon, 11 May 2015 22:37:35 -0700 Subject: [PATCH 02/18] Adding unit tests for argmax functions for Dense and Sparse vectors --- .../apache/spark/mllib/linalg/Vectors.scala | 15 ++++++++---- .../spark/mllib/linalg/VectorsSuite.scala | 23 ++++++++++++++++--- 2 files changed, 31 insertions(+), 7 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala index 436ab63ed4669..4e76b3b1db19d 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala @@ -724,15 +724,22 @@ class SparseVector( if (size == 0) { -1 } else { - var maxIdx = 0 + var maxIdx = indices(0) var maxValue = values(0) - var i = 1 - foreachActive{ (i, v) => - if(v > maxValue) { + + foreachActive { (i, v) => + if(values(i) > maxValue){ maxIdx = i maxValue = v } } +// while(i < this.indices.size){ +// if(values(i) > maxValue){ 
+// maxIdx = indices(i) +// maxValue = values(i) +// } +// i += 1 +// } maxIdx } } diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala index 1bc87892e886d..b118856176a72 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala @@ -42,7 +42,6 @@ class VectorsSuite extends FunSuite { val vec = Vectors.dense(arr).asInstanceOf[DenseVector] assert(vec.size === arr.length) assert(vec.values.eq(arr)) - vec.argmax } test("sparse vector construction") { @@ -57,8 +56,6 @@ class VectorsSuite extends FunSuite { assert(vec.size === n) assert(vec.indices === indices) assert(vec.values === values) - val vec2 = Vectors.sparse(5,Array(0,3),values).asInstanceOf[SparseVector] - vec2.foreachActive( (i, v) => println(i,v)) } test("dense to array") { @@ -66,11 +63,31 @@ class VectorsSuite extends FunSuite { assert(vec.toArray.eq(arr)) } + test("dense argmax"){ + val vec = Vectors.dense(Array.empty[Double]).asInstanceOf[DenseVector] + val noMax = vec.argmax + assert(noMax === -1) + + val vec2 = Vectors.dense(arr).asInstanceOf[DenseVector] + val max = vec2.argmax + assert(max === 3) + } + test("sparse to array") { val vec = Vectors.sparse(n, indices, values).asInstanceOf[SparseVector] assert(vec.toArray === arr) } + test("sparse argmax"){ + val vec = Vectors.sparse(0,Array.empty[Int],Array.empty[Double]).asInstanceOf[SparseVector] + val noMax = vec.argmax + assert(noMax === -1) + + val vec2 = Vectors.sparse(n,indices,values).asInstanceOf[SparseVector] + val max = vec2.argmax + assert(max === 3) + } + test("vector equals") { val dv1 = Vectors.dense(arr.clone()) val dv2 = Vectors.dense(arr.clone()) From df9538a51d02e35ea12a0759726975ecb1a89981 Mon Sep 17 00:00:00 2001 From: George Date: Mon, 11 May 2015 23:00:09 -0700 Subject: [PATCH 03/18] Added argmax to sparse vector and added unit test --- .../scala/org/apache/spark/mllib/linalg/Vectors.scala | 9 +-------- .../org/apache/spark/mllib/linalg/VectorsSuite.scala | 4 ++++ 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala index 4e76b3b1db19d..dce1420793169 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala @@ -728,18 +728,11 @@ class SparseVector( var maxValue = values(0) foreachActive { (i, v) => - if(values(i) > maxValue){ + if(v > maxValue){ maxIdx = i maxValue = v } } -// while(i < this.indices.size){ -// if(values(i) > maxValue){ -// maxIdx = indices(i) -// maxValue = values(i) -// } -// i += 1 -// } maxIdx } } diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala index b118856176a72..7d35186df62cd 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala @@ -86,6 +86,10 @@ class VectorsSuite extends FunSuite { val vec2 = Vectors.sparse(n,indices,values).asInstanceOf[SparseVector] val max = vec2.argmax assert(max === 3) + + val vec3 = Vectors.sparse(5,Array(1,3,4),Array(1.0,.5,.7)) + val max2 = vec3.argmax + assert(max2 === 1) } test("vector equals") { From eeda560f3f921ac9d22932e5caa08d572ce5503d Mon Sep 17 00:00:00 2001 
From: George Date: Thu, 14 May 2015 21:58:38 -0700 Subject: [PATCH 04/18] Fixing SparseVector argmax function to ignore zero values while doing the calculation. --- .../scala/org/apache/spark/mllib/linalg/Vectors.scala | 2 +- .../org/apache/spark/mllib/linalg/VectorsSuite.scala | 9 +++++++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala index dce1420793169..c9a7dd0b7ecc7 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala @@ -728,7 +728,7 @@ class SparseVector( var maxValue = values(0) foreachActive { (i, v) => - if(v > maxValue){ + if(v != 0.0 && v > maxValue){ maxIdx = i maxValue = v } diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala index 7d35186df62cd..24d614b0eb973 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala @@ -71,6 +71,10 @@ class VectorsSuite extends FunSuite { val vec2 = Vectors.dense(arr).asInstanceOf[DenseVector] val max = vec2.argmax assert(max === 3) + + val vec3 = Vectors.dense(Array(-1.0, 0.0, -2.0, 1.0)).asInstanceOf[DenseVector] + val max2 = vec3.argmax + assert(max === 3) } test("sparse to array") { @@ -87,9 +91,10 @@ class VectorsSuite extends FunSuite { val max = vec2.argmax assert(max === 3) - val vec3 = Vectors.sparse(5,Array(1,3,4),Array(1.0,.5,.7)) + // check for case that sparse vector is created with a zero value in it by mistake + val vec3 = Vectors.sparse(5,Array(0, 2, 4),Array(-1.0, 0.0, -.7)) val max2 = vec3.argmax - assert(max2 === 1) + assert(max2 === 4) } test("vector equals") { From af179812bf76c33d2b5a7cded3c4b20280a772d2 Mon Sep 17 00:00:00 2001 From: dittmarg Date: Thu, 21 May 2015 22:02:51 -0700 Subject: [PATCH 05/18] Initial work fixing bug that was made clear in pr --- .../main/scala/org/apache/spark/mllib/linalg/Vectors.scala | 7 ++++--- .../scala/org/apache/spark/mllib/linalg/VectorsSuite.scala | 6 +++--- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala index c9a7dd0b7ecc7..7c26779536a91 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala @@ -724,11 +724,12 @@ class SparseVector( if (size == 0) { -1 } else { - var maxIdx = indices(0) - var maxValue = values(0) + + var maxIdx = 0 + var maxValue = if(indices(0) != 0) 0 else values(0) foreachActive { (i, v) => - if(v != 0.0 && v > maxValue){ + if(v > maxValue){ maxIdx = i maxValue = v } diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala index 24d614b0eb973..d1c75e4edb70c 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala @@ -91,10 +91,10 @@ class VectorsSuite extends FunSuite { val max = vec2.argmax assert(max === 3) - // check for case that sparse vector is created with a zero value in it by mistake - val vec3 = Vectors.sparse(5,Array(0, 2, 4),Array(-1.0, 0.0, -.7)) + // check for case that sparse 
vector is created with only negative vaues {0.0,0.0,-1.0,0.0,-0.7} + val vec3 = Vectors.sparse(5,Array(2, 4),Array(-1.0,-.7)) val max2 = vec3.argmax - assert(max2 === 4) + assert(max2 === 0) } test("vector equals") { From f21dcce52442718cbf8c495826ea38e54e7591cf Mon Sep 17 00:00:00 2001 From: George Dittmar Date: Mon, 25 May 2015 16:07:06 -0700 Subject: [PATCH 06/18] commit --- .../apache/spark/mllib/linalg/Vectors.scala | 31 +++++++++++++++++-- .../spark/mllib/linalg/VectorsSuite.scala | 19 ++++++++++-- 2 files changed, 44 insertions(+), 6 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala index 7c26779536a91..d5990413173b5 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala @@ -594,7 +594,7 @@ class DenseVector(val values: Array[Double]) extends Vector { new SparseVector(size, ii, vv) } - def argmax: Int = { + override def argmax: Int = { if (size == 0) { -1 } else { @@ -726,17 +726,42 @@ class SparseVector( } else { var maxIdx = 0 - var maxValue = if(indices(0) != 0) 0 else values(0) + var maxValue = if(indices(0) != 0) 0.0 else values(0) foreachActive { (i, v) => - if(v > maxValue){ + if (v > maxValue) { maxIdx = i maxValue = v } } + + // look for inactive values incase all active node values are negative + if(size != values.size && maxValue < 0){ + maxIdx = calcInactiveIdx(indices(0)) + maxValue = 0 + } maxIdx } } + + /** + * Calculates the first instance of an inactive node in a sparse vector and returns the Idx + * of the element. + * @param idx starting index of computation + * @return index of first inactive node or -1 if it cannot find one + */ + private[SparseVector] def calcInactiveIdx(idx: Int): Int ={ + if(idx < size){ + if(!indices.contains(idx)){ + idx + }else{ + calcInactiveIdx(idx+1) + } + }else{ + -1 + } + } + } object SparseVector { diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala index d1c75e4edb70c..7a86c670b9c71 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala @@ -91,10 +91,23 @@ class VectorsSuite extends FunSuite { val max = vec2.argmax assert(max === 3) - // check for case that sparse vector is created with only negative vaues {0.0,0.0,-1.0,0.0,-0.7} - val vec3 = Vectors.sparse(5,Array(2, 4),Array(-1.0,-.7)) + val vec3 = Vectors.sparse(5,Array(2, 4),Array(1.0,-.7)) val max2 = vec3.argmax - assert(max2 === 0) + assert(max2 === 2) + + // check for case that sparse vector is created with only negative vaues {0.0, 0.0,-1.0, -0.7, 0.0} + val vec4 = Vectors.sparse(5,Array(2, 3),Array(-1.0,-.7)) + val max3 = vec4.argmax + assert(max3 === 0) + + // check for case that sparse vector is created with only negative vaues {-1.0, 0.0, -0.7, 0.0, 0.0} + val vec5 = Vectors.sparse(5,Array(0, 3),Array(-1.0,-.7)) + val max4 = vec5.argmax + assert(max4 === 1) + + val vec6 = Vectors.sparse(5,Array(0, 1, 2),Array(-1.0, -.025, -.7)) + val max5 = vec6.argmax + assert(max5 === 3) } test("vector equals") { From b1f059f9379d3da8b16514f775cef00151518ae5 Mon Sep 17 00:00:00 2001 From: George Dittmar Date: Thu, 28 May 2015 21:01:21 -0700 Subject: [PATCH 07/18] Added comment before we start arg max calculation. 
Updated unit tests to cover corner cases --- .../apache/spark/mllib/linalg/Vectors.scala | 24 +++++++++---------- .../spark/mllib/linalg/VectorsSuite.scala | 4 ++-- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala index d5990413173b5..c46c6f886e7c9 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala @@ -725,8 +725,9 @@ class SparseVector( -1 } else { - var maxIdx = 0 - var maxValue = if(indices(0) != 0) 0.0 else values(0) + //grab first active index and value by default + var maxIdx = indices(0) + var maxValue = values(0) foreachActive { (i, v) => if (v > maxValue) { @@ -736,8 +737,8 @@ class SparseVector( } // look for inactive values incase all active node values are negative - if(size != values.size && maxValue < 0){ - maxIdx = calcInactiveIdx(indices(0)) + if(size != values.size && maxValue <= 0){ + maxIdx = calcInactiveIdx(0) maxValue = 0 } maxIdx @@ -748,20 +749,19 @@ class SparseVector( * Calculates the first instance of an inactive node in a sparse vector and returns the Idx * of the element. * @param idx starting index of computation - * @return index of first inactive node or -1 if it cannot find one + * @return index of first inactive node */ - private[SparseVector] def calcInactiveIdx(idx: Int): Int ={ - if(idx < size){ - if(!indices.contains(idx)){ + private[SparseVector] def calcInactiveIdx(idx: Int): Int = { + if (idx < size) { + if (!indices.contains(idx)) { idx - }else{ - calcInactiveIdx(idx+1) + } else { + calcInactiveIdx(idx + 1) } - }else{ + } else { -1 } } - } object SparseVector { diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala index 7a86c670b9c71..118855baecae0 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala @@ -105,9 +105,9 @@ class VectorsSuite extends FunSuite { val max4 = vec5.argmax assert(max4 === 1) - val vec6 = Vectors.sparse(5,Array(0, 1, 2),Array(-1.0, -.025, -.7)) + val vec6 = Vectors.sparse(2,Array(0, 1),Array(-1.0, 0.0)) val max5 = vec6.argmax - assert(max5 === 3) + assert(max5 === 1) } test("vector equals") { From 3ee87118472ae68c18444612ffbaba9cd4eaa080 Mon Sep 17 00:00:00 2001 From: George Dittmar Date: Sun, 31 May 2015 23:41:05 -0700 Subject: [PATCH 08/18] Fixing corner case issue with zeros in the active values of the sparse vector. 
Updated unit tests --- .../org/apache/spark/mllib/linalg/Vectors.scala | 14 +++++++++----- .../apache/spark/mllib/linalg/VectorsSuite.scala | 15 ++++++++++----- 2 files changed, 19 insertions(+), 10 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala index c46c6f886e7c9..f5cf732506181 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala @@ -725,7 +725,6 @@ class SparseVector( -1 } else { - //grab first active index and value by default var maxIdx = indices(0) var maxValue = values(0) @@ -736,9 +735,14 @@ class SparseVector( } } - // look for inactive values incase all active node values are negative + // look for inactive values in case all active node values are negative if(size != values.size && maxValue <= 0){ - maxIdx = calcInactiveIdx(0) + val firstInactiveIdx = calcFirstInactiveIdx(0) + if(maxValue == 0){ + if(firstInactiveIdx >= maxIdx) maxIdx else maxIdx = firstInactiveIdx + }else{ + maxIdx = firstInactiveIdx + } maxValue = 0 } maxIdx @@ -751,12 +755,12 @@ class SparseVector( * @param idx starting index of computation * @return index of first inactive node */ - private[SparseVector] def calcInactiveIdx(idx: Int): Int = { + private[SparseVector] def calcFirstInactiveIdx(idx: Int): Int = { if (idx < size) { if (!indices.contains(idx)) { idx } else { - calcInactiveIdx(idx + 1) + calcFirstInactiveIdx(idx + 1) } } else { -1 diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala index 118855baecae0..d4dd7f2e0d3d6 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala @@ -95,19 +95,24 @@ class VectorsSuite extends FunSuite { val max2 = vec3.argmax assert(max2 === 2) - // check for case that sparse vector is created with only negative vaues {0.0, 0.0,-1.0, -0.7, 0.0} - val vec4 = Vectors.sparse(5,Array(2, 3),Array(-1.0,-.7)) + // check for case that sparse vector is created with only negative values {0.0, 0.0,-1.0, -0.7, 0.0} + val vec4 = Vectors.sparse(5,Array(0, 1, 2, 3),Array(0.0, 0.0, -1.0,-.7)) val max3 = vec4.argmax assert(max3 === 0) - // check for case that sparse vector is created with only negative vaues {-1.0, 0.0, -0.7, 0.0, 0.0} - val vec5 = Vectors.sparse(5,Array(0, 3),Array(-1.0,-.7)) + val vec5 = Vectors.sparse(11,Array(0, 3, 10),Array(-1.0,-.7,0.0)) val max4 = vec5.argmax assert(max4 === 1) - val vec6 = Vectors.sparse(2,Array(0, 1),Array(-1.0, 0.0)) + val vec6 = Vectors.sparse(5,Array(0, 1, 3),Array(-1.0, 0.0, -.7)) val max5 = vec6.argmax assert(max5 === 1) + + // test that converting the sparse vector to another sparse vector then calling argmax still works right + var vec8 = Vectors.sparse(5,Array(0, 1),Array(0.0, -1.0)) + vec8 = vec8.toSparse + val max7 = vec8.argmax + assert(max7 === 0) } test("vector equals") { From ee1a85adff9738cf24a0cbe9c4ec124c07780564 Mon Sep 17 00:00:00 2001 From: George Dittmar Date: Sun, 31 May 2015 23:48:06 -0700 Subject: [PATCH 09/18] Cleaning up unit tests a bit and modifying a few cases --- .../scala/org/apache/spark/mllib/linalg/VectorsSuite.scala | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala 
b/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala index d4dd7f2e0d3d6..7ba423fc5582e 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala @@ -96,7 +96,7 @@ class VectorsSuite extends FunSuite { assert(max2 === 2) // check for case that sparse vector is created with only negative values {0.0, 0.0,-1.0, -0.7, 0.0} - val vec4 = Vectors.sparse(5,Array(0, 1, 2, 3),Array(0.0, 0.0, -1.0,-.7)) + val vec4 = Vectors.sparse(5,Array(2, 3),Array(-1.0,-.7)) val max3 = vec4.argmax assert(max3 === 0) @@ -108,9 +108,7 @@ class VectorsSuite extends FunSuite { val max5 = vec6.argmax assert(max5 === 1) - // test that converting the sparse vector to another sparse vector then calling argmax still works right - var vec8 = Vectors.sparse(5,Array(0, 1),Array(0.0, -1.0)) - vec8 = vec8.toSparse + var vec8 = Vectors.sparse(5,Array(1, 2),Array(0.0, -1.0)) val max7 = vec8.argmax assert(max7 === 0) } From d5b5423522a1753b99dacd0597d8a929f37e5646 Mon Sep 17 00:00:00 2001 From: George Dittmar Date: Mon, 8 Jun 2015 21:33:36 -0700 Subject: [PATCH 10/18] Fixing code style and updating if logic on when to check for zero values --- .../org/apache/spark/mllib/linalg/Vectors.scala | 6 ++---- .../apache/spark/mllib/linalg/VectorsSuite.scala | 14 +++++++++----- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala index f5cf732506181..ab57a7bf0aa6a 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala @@ -736,11 +736,9 @@ class SparseVector( } // look for inactive values in case all active node values are negative - if(size != values.size && maxValue <= 0){ + if (size != values.size && maxValue <= 0){ val firstInactiveIdx = calcFirstInactiveIdx(0) - if(maxValue == 0){ - if(firstInactiveIdx >= maxIdx) maxIdx else maxIdx = firstInactiveIdx - }else{ + if (!(maxValue == 0 && firstInactiveIdx >= maxIdx)){ maxIdx = firstInactiveIdx } maxValue = 0 diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala index 7ba423fc5582e..eee4f69ca8ba9 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala @@ -91,22 +91,26 @@ class VectorsSuite extends FunSuite { val max = vec2.argmax assert(max === 3) - val vec3 = Vectors.sparse(5,Array(2, 4),Array(1.0,-.7)) + val vec3 = Vectors.sparse(5,Array(2, 3, 4),Array(1.0, 0.0, -.7)) val max2 = vec3.argmax assert(max2 === 2) // check for case that sparse vector is created with only negative values {0.0, 0.0,-1.0, -0.7, 0.0} - val vec4 = Vectors.sparse(5,Array(2, 3),Array(-1.0,-.7)) + val vec4 = Vectors.sparse(5,Array(2, 3),Array(-1.0, -.7)) val max3 = vec4.argmax assert(max3 === 0) - val vec5 = Vectors.sparse(11,Array(0, 3, 10),Array(-1.0,-.7,0.0)) + val vec5 = Vectors.sparse(11,Array(0, 3, 10),Array(-1.0, -.7, 0.0)) val max4 = vec5.argmax assert(max4 === 1) - val vec6 = Vectors.sparse(5,Array(0, 1, 3),Array(-1.0, 0.0, -.7)) + val vec6 = Vectors.sparse(11,Array(0, 1, 2),Array(-1.0, -.7, 0.0)) val max5 = vec6.argmax - assert(max5 === 1) + assert(max5 === 2) + + val vec7 = Vectors.sparse(5,Array(0, 1, 3),Array(-1.0, 0.0, -.7)) + val max6 = vec7.argmax + 
assert(max6 === 1) var vec8 = Vectors.sparse(5,Array(1, 2),Array(0.0, -1.0)) val max7 = vec8.argmax From ac53c553ec19449f909d33427f685303f1a720bc Mon Sep 17 00:00:00 2001 From: George Dittmar Date: Mon, 8 Jun 2015 21:36:16 -0700 Subject: [PATCH 11/18] changing dense vector argmax unit test to be one line call vs 2 --- .../scala/org/apache/spark/mllib/linalg/VectorsSuite.scala | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala index eee4f69ca8ba9..0eb20d8f3f269 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala @@ -65,8 +65,7 @@ class VectorsSuite extends FunSuite { test("dense argmax"){ val vec = Vectors.dense(Array.empty[Double]).asInstanceOf[DenseVector] - val noMax = vec.argmax - assert(noMax === -1) + assert(vec.argmax === -1) val vec2 = Vectors.dense(arr).asInstanceOf[DenseVector] val max = vec2.argmax From aa330e383d924a3e18463ea597f153589bc66944 Mon Sep 17 00:00:00 2001 From: George Dittmar Date: Mon, 8 Jun 2015 21:39:12 -0700 Subject: [PATCH 12/18] Fixing some last if else spacing issues --- .../main/scala/org/apache/spark/mllib/linalg/Vectors.scala | 4 ++-- .../scala/org/apache/spark/mllib/linalg/VectorsSuite.scala | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala index ab57a7bf0aa6a..2c8891cef93ab 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala @@ -736,9 +736,9 @@ class SparseVector( } // look for inactive values in case all active node values are negative - if (size != values.size && maxValue <= 0){ + if (size != values.size && maxValue <= 0) { val firstInactiveIdx = calcFirstInactiveIdx(0) - if (!(maxValue == 0 && firstInactiveIdx >= maxIdx)){ + if (!(maxValue == 0 && firstInactiveIdx >= maxIdx)) { maxIdx = firstInactiveIdx } maxValue = 0 diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala index 0eb20d8f3f269..dced71a63a3ac 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala @@ -63,7 +63,7 @@ class VectorsSuite extends FunSuite { assert(vec.toArray.eq(arr)) } - test("dense argmax"){ + test("dense argmax") { val vec = Vectors.dense(Array.empty[Double]).asInstanceOf[DenseVector] assert(vec.argmax === -1) @@ -81,7 +81,7 @@ class VectorsSuite extends FunSuite { assert(vec.toArray === arr) } - test("sparse argmax"){ + test("sparse argmax") { val vec = Vectors.sparse(0,Array.empty[Int],Array.empty[Double]).asInstanceOf[SparseVector] val noMax = vec.argmax assert(noMax === -1) From f2eba2f6b2917178d61dd0bd9d6eb02854059708 Mon Sep 17 00:00:00 2001 From: George Dittmar Date: Mon, 8 Jun 2015 21:43:23 -0700 Subject: [PATCH 13/18] Cleaning up unit tests to be fewer lines --- .../spark/mllib/linalg/VectorsSuite.scala | 30 +++++++------------ 1 file changed, 10 insertions(+), 20 deletions(-) diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala index 
dced71a63a3ac..be7059ceac870 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala @@ -68,12 +68,10 @@ class VectorsSuite extends FunSuite { assert(vec.argmax === -1) val vec2 = Vectors.dense(arr).asInstanceOf[DenseVector] - val max = vec2.argmax - assert(max === 3) + assert(vec2.argmax === 3) val vec3 = Vectors.dense(Array(-1.0, 0.0, -2.0, 1.0)).asInstanceOf[DenseVector] - val max2 = vec3.argmax - assert(max === 3) + assert(vec3.argmax === 3) } test("sparse to array") { @@ -83,37 +81,29 @@ class VectorsSuite extends FunSuite { test("sparse argmax") { val vec = Vectors.sparse(0,Array.empty[Int],Array.empty[Double]).asInstanceOf[SparseVector] - val noMax = vec.argmax - assert(noMax === -1) + assert(vec.argmax === -1) val vec2 = Vectors.sparse(n,indices,values).asInstanceOf[SparseVector] - val max = vec2.argmax - assert(max === 3) + assert(vec2.argmax === 3) val vec3 = Vectors.sparse(5,Array(2, 3, 4),Array(1.0, 0.0, -.7)) - val max2 = vec3.argmax - assert(max2 === 2) + assert(vec3.argmax === 2) // check for case that sparse vector is created with only negative values {0.0, 0.0,-1.0, -0.7, 0.0} val vec4 = Vectors.sparse(5,Array(2, 3),Array(-1.0, -.7)) - val max3 = vec4.argmax - assert(max3 === 0) + assert(vec4.argmax === 0) val vec5 = Vectors.sparse(11,Array(0, 3, 10),Array(-1.0, -.7, 0.0)) - val max4 = vec5.argmax - assert(max4 === 1) + assert(vec5.argmax === 1) val vec6 = Vectors.sparse(11,Array(0, 1, 2),Array(-1.0, -.7, 0.0)) - val max5 = vec6.argmax - assert(max5 === 2) + assert(vec6.argmax === 2) val vec7 = Vectors.sparse(5,Array(0, 1, 3),Array(-1.0, 0.0, -.7)) - val max6 = vec7.argmax - assert(max6 === 1) + assert(vec7.argmax === 1) var vec8 = Vectors.sparse(5,Array(1, 2),Array(0.0, -1.0)) - val max7 = vec8.argmax - assert(max7 === 0) + assert(vec8.argmax === 0) } test("vector equals") { From b22af46c026f2b984118df9e8a8c757afe1c62fb Mon Sep 17 00:00:00 2001 From: George Dittmar Date: Tue, 9 Jun 2015 20:23:37 -0700 Subject: [PATCH 14/18] Fixing spaces between commas in unit test --- .../apache/spark/mllib/linalg/VectorsSuite.scala | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala index be7059ceac870..837e5bcd278f3 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala @@ -80,29 +80,29 @@ class VectorsSuite extends FunSuite { } test("sparse argmax") { - val vec = Vectors.sparse(0,Array.empty[Int],Array.empty[Double]).asInstanceOf[SparseVector] + val vec = Vectors.sparse(0, Array.empty[Int], Array.empty[Double]).asInstanceOf[SparseVector] assert(vec.argmax === -1) - val vec2 = Vectors.sparse(n,indices,values).asInstanceOf[SparseVector] + val vec2 = Vectors.sparse(n, indices, values).asInstanceOf[SparseVector] assert(vec2.argmax === 3) - val vec3 = Vectors.sparse(5,Array(2, 3, 4),Array(1.0, 0.0, -.7)) + val vec3 = Vectors.sparse(5, Array(2, 3, 4), Array(1.0, 0.0, -.7)) assert(vec3.argmax === 2) // check for case that sparse vector is created with only negative values {0.0, 0.0,-1.0, -0.7, 0.0} - val vec4 = Vectors.sparse(5,Array(2, 3),Array(-1.0, -.7)) + val vec4 = Vectors.sparse(5, Array(2, 3), Array(-1.0, -.7)) assert(vec4.argmax === 0) - val vec5 = Vectors.sparse(11,Array(0, 3, 10),Array(-1.0, -.7, 0.0)) + val 
vec5 = Vectors.sparse(11, Array(0, 3, 10), Array(-1.0, -.7, 0.0)) assert(vec5.argmax === 1) - val vec6 = Vectors.sparse(11,Array(0, 1, 2),Array(-1.0, -.7, 0.0)) + val vec6 = Vectors.sparse(11, Array(0, 1, 2), Array(-1.0, -.7, 0.0)) assert(vec6.argmax === 2) - val vec7 = Vectors.sparse(5,Array(0, 1, 3),Array(-1.0, 0.0, -.7)) + val vec7 = Vectors.sparse(5, Array(0, 1, 3), Array(-1.0, 0.0, -.7)) assert(vec7.argmax === 1) - var vec8 = Vectors.sparse(5,Array(1, 2),Array(0.0, -1.0)) + val vec8 = Vectors.sparse(5, Array(1, 2), Array(0.0, -1.0)) assert(vec8.argmax === 0) } From 42341fb9e109ddf77e949c8453de2a30c9e4e71f Mon Sep 17 00:00:00 2001 From: George Dittmar Date: Wed, 8 Jul 2015 23:28:26 -0700 Subject: [PATCH 15/18] refactoring arg max check to better handle zero values --- .../apache/spark/mllib/linalg/Vectors.scala | 30 +++---------------- 1 file changed, 4 insertions(+), 26 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala index 2c8891cef93ab..e4ba9a243737d 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala @@ -724,7 +724,6 @@ class SparseVector( if (size == 0) { -1 } else { - var maxIdx = indices(0) var maxValue = values(0) @@ -735,33 +734,12 @@ class SparseVector( } } - // look for inactive values in case all active node values are negative - if (size != values.size && maxValue <= 0) { - val firstInactiveIdx = calcFirstInactiveIdx(0) - if (!(maxValue == 0 && firstInactiveIdx >= maxIdx)) { - maxIdx = firstInactiveIdx - } - maxValue = 0 + var k = 0 + while (k < indices.length && indices(k) == k && values(k) != 0.0) { + k += 1 } - maxIdx - } - } - /** - * Calculates the first instance of an inactive node in a sparse vector and returns the Idx - * of the element. 
- * @param idx starting index of computation - * @return index of first inactive node - */ - private[SparseVector] def calcFirstInactiveIdx(idx: Int): Int = { - if (idx < size) { - if (!indices.contains(idx)) { - idx - } else { - calcFirstInactiveIdx(idx + 1) - } - } else { - -1 + if (maxValue <= 0.0 || k >= maxIdx) k else maxIdx } } } From 5fd9380573859b23a18ffe65d9ee3335a792d7ba Mon Sep 17 00:00:00 2001 From: George Dittmar Date: Wed, 8 Jul 2015 23:41:23 -0700 Subject: [PATCH 16/18] fixing style check error --- .../scala/org/apache/spark/mllib/linalg/VectorsSuite.scala | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala index 837e5bcd278f3..c3f407cb56e00 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala @@ -89,7 +89,8 @@ class VectorsSuite extends FunSuite { val vec3 = Vectors.sparse(5, Array(2, 3, 4), Array(1.0, 0.0, -.7)) assert(vec3.argmax === 2) - // check for case that sparse vector is created with only negative values {0.0, 0.0,-1.0, -0.7, 0.0} + // check for case that sparse vector is created with + // only negative values {0.0, 0.0,-1.0, -0.7, 0.0} val vec4 = Vectors.sparse(5, Array(2, 3), Array(-1.0, -.7)) assert(vec4.argmax === 0) From 2ea6a55c59a8acf745616dabd980e245419b3181 Mon Sep 17 00:00:00 2001 From: George Dittmar Date: Tue, 14 Jul 2015 23:02:49 -0700 Subject: [PATCH 17/18] Added MimaExcludes for Vectors.argmax --- project/MimaExcludes.scala | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala index 4e4e810ec36e3..905c9a0035fae 100644 --- a/project/MimaExcludes.scala +++ b/project/MimaExcludes.scala @@ -95,6 +95,10 @@ object MimaExcludes { "org.apache.spark.api.r.StringRRDD.this"), ProblemFilters.exclude[MissingMethodProblem]( "org.apache.spark.api.r.BaseRRDD.this") + ) ++ Seq( + // SPARK-7422 add argmax for sparse vectors + ProblemFilters.exclude[MissingMethodProblem]( + "org.apache.spark.mllib.linalg.Vector.argmax") ) case v if v.startsWith("1.4") => From 127dec5d7aa536f70891dfce10d0fb56a3b57723 Mon Sep 17 00:00:00 2001 From: Xiangrui Meng Date: Thu, 16 Jul 2015 21:49:34 -0700 Subject: [PATCH 18/18] update argmax impl --- .../apache/spark/mllib/linalg/Vectors.scala | 36 +++++++++++++++---- 1 file changed, 29 insertions(+), 7 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala index 68c933752a959..9067b3ba9a7bb 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala @@ -724,22 +724,44 @@ class SparseVector( if (size == 0) { -1 } else { + // Find the max active entry. var maxIdx = indices(0) var maxValue = values(0) - - foreachActive { (i, v) => + var maxJ = 0 + var j = 1 + val na = numActives + while (j < na) { + val v = values(j) if (v > maxValue) { - maxIdx = i maxValue = v + maxIdx = indices(j) + maxJ = j } + j += 1 } - var k = 0 - while (k < indices.length && indices(k) == k && values(k) != 0.0) { - k += 1 + // If the max active entry is nonpositive and there exists inactive ones, find the first zero. + if (maxValue <= 0.0 && na < size) { + if (maxValue == 0.0) { + // If there exists an inactive entry before maxIdx, find it and return its index. 
+ if (maxJ < maxIdx) { + var k = 0 + while (k < maxJ && indices(k) == k) { + k += 1 + } + maxIdx = k + } + } else { + // If the max active value is negative, find and return the first inactive index. + var k = 0 + while (k < na && indices(k) == k) { + k += 1 + } + maxIdx = k + } } - if (maxValue <= 0.0 || k >= maxIdx) k else maxIdx + maxIdx } } }
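
A minimal usage sketch of the behaviour the series converges on (not part of any patch above; it assumes the mllib Vectors factory plus the argmax method these patches add). It exercises only the contract that the final implementation and the VectorsSuite tests agree on: ties resolve to the first maximal index, inactive entries of a sparse vector count as implicit zeros, and an empty vector returns -1. The object name ArgmaxSketch is a throwaway chosen for illustration.

    import org.apache.spark.mllib.linalg.Vectors

    object ArgmaxSketch {
      def main(args: Array[String]): Unit = {
        // Dense vectors: the first maximal element wins ties; an empty vector gives -1.
        assert(Vectors.dense(1.0, 3.0, 3.0, -2.0).argmax == 1)
        assert(Vectors.dense(Array.empty[Double]).argmax == -1)

        // Sparse vectors: inactive entries are implicit zeros, so when every active
        // value is negative the result is the index of the first inactive entry.
        assert(Vectors.sparse(5, Array(2, 3), Array(-1.0, -0.7)).argmax == 0)
        assert(Vectors.sparse(5, Array(0, 3), Array(-1.0, -0.7)).argmax == 1)
      }
    }

Both sparse cases correspond to corner cases covered in the VectorsSuite diffs above, where the maximum value of the logical vector is a zero that is never stored explicitly.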