From b85806c190f82bf9946e9e1f581d22b3c8afec19 Mon Sep 17 00:00:00 2001 From: Funes Date: Tue, 6 May 2014 11:49:52 +0800 Subject: [PATCH 1/5] Bug fix of sparse vector conversion --- .../src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala index 7cdf6bd56acd9..46f30edc9425c 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala @@ -136,7 +136,7 @@ object Vectors { new DenseVector(v.toArray) // Can't use underlying array directly, so make a new one } case v: BSV[Double] => - new SparseVector(v.length, v.index, v.data) + new SparseVector(v.length, v.index.slice(0, v.used), v.data.slice(0, v.used)) case v: BV[_] => sys.error("Unsupported Breeze vector type: " + v.getClass.getName) } From 64e719843929a75208f18dbca599bb263449fb86 Mon Sep 17 00:00:00 2001 From: Funes Date: Tue, 6 May 2014 17:26:15 +0800 Subject: [PATCH 2/5] Copy data only when necessary --- .../main/scala/org/apache/spark/mllib/linalg/Vectors.scala | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala index 46f30edc9425c..b883c040dc8c7 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala @@ -136,7 +136,12 @@ object Vectors { new DenseVector(v.toArray) // Can't use underlying array directly, so make a new one } case v: BSV[Double] => - new SparseVector(v.length, v.index.slice(0, v.used), v.data.slice(0, v.used)) + if (v.index.length == v.used) { + new SparseVector(v.length, v.index, v.data) + } + else { + new SparseVector(v.length, v.index.slice(0, v.used), v.data.slice(0, v.used)) + } case v: BV[_] => sys.error("Unsupported Breeze vector type: " + v.getClass.getName) } From d129a66c99c489f8a74c97d33cab09207fbd58c0 Mon Sep 17 00:00:00 2001 From: Funes Date: Wed, 7 May 2014 11:48:51 +0800 Subject: [PATCH 3/5] Add test for sparse breeze by vector builder --- .../org/apache/spark/mllib/linalg/Vectors.scala | 3 +-- .../mllib/linalg/BreezeVectorConversionSuite.scala | 14 +++++++++++++- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala index b883c040dc8c7..84d223908c1f6 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala @@ -138,8 +138,7 @@ object Vectors { case v: BSV[Double] => if (v.index.length == v.used) { new SparseVector(v.length, v.index, v.data) - } - else { + } else { new SparseVector(v.length, v.index.slice(0, v.used), v.data.slice(0, v.used)) } case v: BV[_] => diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/BreezeVectorConversionSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/BreezeVectorConversionSuite.scala index aacaa300849aa..2056160b7c8e3 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/linalg/BreezeVectorConversionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/BreezeVectorConversionSuite.scala @@ -19,7 +19,7 @@ package org.apache.spark.mllib.linalg import org.scalatest.FunSuite -import breeze.linalg.{DenseVector => BDV, SparseVector => BSV} +import breeze.linalg.{DenseVector => BDV, SparseVector => BSV, VectorBuilder => BVB} /** * Test Breeze vector conversions. @@ -55,4 +55,16 @@ class BreezeVectorConversionSuite extends FunSuite { assert(vec.indices.eq(indices), "should not copy data") assert(vec.values.eq(values), "should not copy data") } + + test("sparse breeze by vector builder to vector") { + val builder = new BVB[Double](n) + for (i <- 0 until indices.length) { + builder.add(indices(i), values(i)) + } + val breeze = builder.toSparseVector + val vec = Vectors.fromBreeze(breeze).asInstanceOf[SparseVector] + assert(vec.size === n) + assert(vec.indices === indices) + assert(vec.values === values) + } } From 75dced3efeb8b26a3793124e71d9c7343608c9c9 Mon Sep 17 00:00:00 2001 From: Funes Date: Thu, 8 May 2014 14:07:23 +0800 Subject: [PATCH 4/5] update test case --- .../mllib/linalg/BreezeVectorConversionSuite.scala | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/BreezeVectorConversionSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/BreezeVectorConversionSuite.scala index 2056160b7c8e3..6eaec4fa21f58 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/linalg/BreezeVectorConversionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/BreezeVectorConversionSuite.scala @@ -56,15 +56,12 @@ class BreezeVectorConversionSuite extends FunSuite { assert(vec.values.eq(values), "should not copy data") } - test("sparse breeze by vector builder to vector") { - val builder = new BVB[Double](n) - for (i <- 0 until indices.length) { - builder.add(indices(i), values(i)) - } - val breeze = builder.toSparseVector + test("sparse breeze with partially-used arrays to vector") { + val activeSize = 3 + val breeze = new BSV[Double](indices, values, activeSize, n) val vec = Vectors.fromBreeze(breeze).asInstanceOf[SparseVector] assert(vec.size === n) - assert(vec.indices === indices) - assert(vec.values === values) + assert(vec.indices === indices.slice(0, activeSize)) + assert(vec.values === values.slice(0, activeSize)) } } From edb2b9de3a29d632a8634b49f14d10a4825a4983 Mon Sep 17 00:00:00 2001 From: funes Date: Thu, 8 May 2014 23:18:41 +0800 Subject: [PATCH 5/5] remove unused import --- .../apache/spark/mllib/linalg/BreezeVectorConversionSuite.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/BreezeVectorConversionSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/BreezeVectorConversionSuite.scala index 6eaec4fa21f58..8abdac72902c6 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/linalg/BreezeVectorConversionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/BreezeVectorConversionSuite.scala @@ -19,7 +19,7 @@ package org.apache.spark.mllib.linalg import org.scalatest.FunSuite -import breeze.linalg.{DenseVector => BDV, SparseVector => BSV, VectorBuilder => BVB} +import breeze.linalg.{DenseVector => BDV, SparseVector => BSV} /** * Test Breeze vector conversions.