Skip to content

Commit da54179

Browse files
committed
add toSparse, toDense, numActives, numNonzeros, and compressed to Vector
1 parent 52ccf1d commit da54179

File tree

2 files changed

+136
-0
lines changed

2 files changed

+136
-0
lines changed

mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,39 @@ sealed trait Vector extends Serializable {
116116
* with type `Double`.
117117
*/
118118
private[spark] def foreachActive(f: (Int, Double) => Unit)
119+
120+
/**
121+
* Number of active entries. Inactive entries are all zeros, while active entries could be zero.
122+
*/
123+
def numActives: Int // abstract here; DenseVector returns `size`, SparseVector returns `values.length`
124+
125+
/**
126+
* Number of nonzero elements. This scans all active values and counts the nonzeros.
127+
*/
128+
def numNonzeros: Int // abstract here; both implementations in this file walk `values` and count entries != 0.0
129+
130+
/**
131+
* Converts this vector to a sparse vector with all explicit zeros removed.
132+
*/
133+
def toSparse: SparseVector // abstract here; both implementations in this file keep only the entries that are nonzero
134+
135+
/**
136+
* Converts this vector to a dense vector.
137+
*/
138+
def toDense: DenseVector = new DenseVector(this.toArray) // hands the array from toArray straight to DenseVector, which keeps it as `values`; no extra copy is made on this line
139+
140+
/**
141+
* Returns a vector in either dense or sparse format, whichever uses less storage.
142+
*/
143+
def compressed: Vector = {
144+
val nnz = numNonzeros // counted once up front; only the nonzero count matters for the storage comparison
145+
// A dense vector needs 8 * size + 8 bytes, while a sparse vector needs 12 * nnz + 20 bytes. Sparse is smaller when 12 * nnz + 20 < 8 * size + 8, which simplifies to 1.5 * (nnz + 1.0) < size.
146+
if (1.5 * (nnz + 1.0) < size) { // strict inequality: on a tie the dense format is chosen
147+
toSparse
148+
} else {
149+
toDense
150+
}
151+
}
119152
}
120153

121154
/**
@@ -525,6 +558,34 @@ class DenseVector(val values: Array[Double]) extends Vector {
525558
}
526559
result
527560
}
561+
562+
override def numActives: Int = size // all `size` entries count as active, zeros included
563+
564+
override def numNonzeros: Int = { // counts the entries of `values` that differ from 0.0
565+
// same as values.count(_ != 0.0) but faster
566+
var nnz = 0 // running count of nonzero entries
567+
values.foreach { v =>
568+
if (v != 0.0) {
569+
nnz += 1
570+
}
571+
}
572+
nnz // the accumulated count is the result
573+
}
574+
575+
override def toSparse: SparseVector = { // builds a SparseVector holding only the nonzero entries of this vector
576+
val nnz = numNonzeros // first pass: count nonzeros so both arrays can be allocated at their exact size
577+
val ii = new Array[Int](nnz) // indices of the nonzero entries
578+
val vv = new Array[Double](nnz) // values of the nonzero entries, parallel to ii
579+
var k = 0 // next free slot in ii / vv
580+
foreachActive { (i, v) => // second pass: copy each nonzero (index, value) pair
581+
if (v != 0.0) { // same nonzero test as numNonzeros, so exactly nnz slots get filled
582+
ii(k) = i
583+
vv(k) = v
584+
k += 1
585+
}
586+
}
587+
new SparseVector(size, ii, vv) // logical size is unchanged; only the storage differs
588+
}
528589
}
529590

530591
object DenseVector {
@@ -602,6 +663,37 @@ class SparseVector(
602663
}
603664
result
604665
}
666+
667+
override def numActives: Int = values.length // one active entry per stored value, explicitly stored zeros included
668+
669+
override def numNonzeros: Int = { // counts the stored values that differ from 0.0; explicitly stored zeros are not counted
670+
var nnz = 0 // running count of nonzero stored values
671+
values.foreach { v =>
672+
if (v != 0.0) {
673+
nnz += 1
674+
}
675+
}
676+
nnz // the accumulated count is the result
677+
}
678+
679+
override def toSparse: SparseVector = { // returns a sparse vector with no explicitly stored zeros
680+
val nnz = numNonzeros // number of stored values that are nonzero
681+
if (nnz == numActives) { // every stored value is nonzero, so there is nothing to strip
682+
this // reuse this instance instead of allocating a copy
683+
} else {
684+
val ii = new Array[Int](nnz) // indices of the nonzero entries
685+
val vv = new Array[Double](nnz) // values of the nonzero entries, parallel to ii
686+
var k = 0 // next free slot in ii / vv
687+
foreachActive { (i, v) => // copy each active (index, value) pair whose value is nonzero
688+
if (v != 0.0) { // same nonzero test as numNonzeros, so exactly nnz slots get filled
689+
ii(k) = i
690+
vv(k) = v
691+
k += 1
692+
}
693+
}
694+
new SparseVector(size, ii, vv) // same logical size, explicit zeros dropped
695+
}
696+
}
605697
}
606698

607699
object SparseVector {

mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -270,4 +270,48 @@ class VectorsSuite extends FunSuite {
270270
assert(Vectors.norm(sv, 3.7) ~== math.pow(sv.toArray.foldLeft(0.0)((a, v) =>
271271
a + math.pow(math.abs(v), 3.7)), 1.0 / 3.7) relTol 1E-8)
272272
}
273+
274+
test("Vector numActive and numNonzeros") { // checks both counters on a dense and a sparse vector that each contain zeros
275+
val dv = Vectors.dense(0.0, 2.0, 3.0, 0.0)
276+
assert(dv.numActives === 4) // dense: all four entries are active, including the two zeros
277+
assert(dv.numNonzeros === 2) // only 2.0 and 3.0 are nonzero
278+
279+
val sv = Vectors.sparse(4, Array(0, 1, 2), Array(0.0, 2.0, 3.0)) // index 0 holds an explicitly stored zero
280+
assert(sv.numActives === 3) // the explicit zero still counts as active
281+
assert(sv.numNonzeros === 2) // but it is not counted as a nonzero
282+
}
283+
284+
test("Vector toSparse and toDense") { // round-trips a dense and a sparse vector through both conversions
285+
val dv0 = Vectors.dense(0.0, 2.0, 3.0, 0.0)
286+
assert(dv0.toDense === dv0)
287+
val dv0s = dv0.toSparse
288+
assert(dv0s.numActives === 2) // both zeros were dropped, leaving 2.0 and 3.0
289+
assert(dv0s === dv0) // the sparse result still compares equal to the dense original
290+
291+
val sv0 = Vectors.sparse(4, Array(0, 1, 2), Array(0.0, 2.0, 3.0)) // index 0 holds an explicitly stored zero
292+
assert(sv0.toDense === sv0)
293+
val sv0s = sv0.toSparse
294+
assert(sv0s.numActives === 2) // the explicit zero at index 0 was removed
295+
assert(sv0s === sv0) // removing the explicit zero does not change equality
296+
}
297+
298+
test("Vector.compressed") { // each case sits on one side of the rule 1.5 * (nnz + 1.0) < size used by compressed
299+
val dv0 = Vectors.dense(1.0, 2.0, 3.0, 0.0) // nnz = 3, size = 4: 1.5 * 4 = 6 is not < 4, so dense is expected
300+
val dv0c = dv0.compressed.asInstanceOf[DenseVector] // the cast doubles as a check of the chosen format
301+
assert(dv0c === dv0)
302+
303+
val dv1 = Vectors.dense(0.0, 2.0, 0.0, 0.0) // nnz = 1, size = 4: 1.5 * 2 = 3 < 4, so sparse is expected
304+
val dv1c = dv1.compressed.asInstanceOf[SparseVector]
305+
assert(dv1 === dv1c)
306+
assert(dv1c.numActives === 1) // only the single nonzero is stored
307+
308+
val sv0 = Vectors.sparse(4, Array(1, 2), Array(2.0, 0.0)) // nnz = 1 with one explicit zero: stays sparse
309+
val sv0c = sv0.compressed.asInstanceOf[SparseVector]
310+
assert(sv0 === sv0c)
311+
assert(sv0c.numActives === 1) // the explicit zero was dropped during compression
312+
313+
val sv1 = Vectors.sparse(4, Array(0, 1, 2), Array(1.0, 2.0, 3.0)) // nnz = 3, size = 4: 6 is not < 4, so dense is expected
314+
val sv1c = sv1.compressed.asInstanceOf[DenseVector]
315+
assert(sv1 === sv1c)
316+
}
273317
}

0 commit comments

Comments
 (0)