Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
93 changes: 93 additions & 0 deletions mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,40 @@ sealed trait Vector extends Serializable {
* with type `Double`.
*/
private[spark] def foreachActive(f: (Int, Double) => Unit)

/**
* Number of active entries. An "active entry" is an element which is explicitly stored,
* regardless of its value. Note that inactive entries have value 0.
*/
def numActives: Int

/**
* Number of nonzero elements. This scans all active values and counts the nonzeros.
*/
def numNonzeros: Int

/**
* Converts this vector to a sparse vector with all explicit zeros removed.
*/
def toSparse: SparseVector

/**
* Converts this vector to a dense vector.
*/
// Default implementation: wraps the array returned by `toArray` in a new DenseVector.
def toDense: DenseVector = new DenseVector(this.toArray)

/**
* Returns a vector in either dense or sparse format, whichever uses less storage.
*/
def compressed: Vector = {
  // Storage estimate: dense takes 8 * size + 8 bytes, sparse takes 12 * nnz + 20 bytes.
  // Sparse is smaller when 12 * nnz + 20 < 8 * size + 8, i.e. 1.5 * (nnz + 1.0) < size.
  val nonzeroCount = numNonzeros
  val sparseIsSmaller = 1.5 * (nonzeroCount + 1.0) < size
  if (sparseIsSmaller) toSparse else toDense
}
}

/**
Expand Down Expand Up @@ -525,6 +559,34 @@ class DenseVector(val values: Array[Double]) extends Vector {
}
result
}

// A dense vector reports all `size` entries as active.
override def numActives: Int = size

override def numNonzeros: Int = {
// same as values.count(_ != 0.0) but faster
var nnz = 0
values.foreach { v =>
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is foreach as fast as a while loop?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

They should have similar performance.

if (v != 0.0) {
nnz += 1
}
}
nnz
}

override def toSparse: SparseVector = {
  // Size the output arrays exactly: one slot per nonzero value.
  val nonzeroCount = numNonzeros
  val keptIndices = new Array[Int](nonzeroCount)
  val keptValues = new Array[Double](nonzeroCount)
  var cursor = 0
  // Copy each nonzero (index, value) pair in the order foreachActive reports them.
  foreachActive { (index, value) =>
    if (value != 0.0) {
      keptIndices(cursor) = index
      keptValues(cursor) = value
      cursor += 1
    }
  }
  new SparseVector(size, keptIndices, keptValues)
}
}

object DenseVector {
Expand Down Expand Up @@ -602,6 +664,37 @@ class SparseVector(
}
result
}

// Every stored value counts as active, including explicitly stored zeros.
override def numActives: Int = values.length

override def numNonzeros: Int = {
  // Counts the stored values that differ from 0.0; explicitly stored zeros are skipped.
  val stored = values.length
  var count = 0
  var pos = 0
  while (pos < stored) {
    if (values(pos) != 0.0) {
      count += 1
    }
    pos += 1
  }
  count
}

override def toSparse: SparseVector = {
  val nonzeroCount = numNonzeros
  if (nonzeroCount != numActives) {
    // At least one stored entry is an explicit zero: rebuild without those entries.
    val keptIndices = new Array[Int](nonzeroCount)
    val keptValues = new Array[Double](nonzeroCount)
    var cursor = 0
    foreachActive { (index, value) =>
      if (value != 0.0) {
        keptIndices(cursor) = index
        keptValues(cursor) = value
        cursor += 1
      }
    }
    new SparseVector(size, keptIndices, keptValues)
  } else {
    // Every stored entry is already nonzero, so this instance is returned unchanged.
    this
  }
}
}

object SparseVector {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -270,4 +270,48 @@ class VectorsSuite extends FunSuite {
assert(Vectors.norm(sv, 3.7) ~== math.pow(sv.toArray.foldLeft(0.0)((a, v) =>
a + math.pow(math.abs(v), 3.7)), 1.0 / 3.7) relTol 1E-8)
}

test("Vector numActive and numNonzeros") {
  // Dense input: all four entries are active, two of them are nonzero.
  val dense = Vectors.dense(0.0, 2.0, 3.0, 0.0)
  assert(dense.numActives === 4)
  assert(dense.numNonzeros === 2)

  // Sparse input: three entries are stored (one is an explicit zero), two are nonzero.
  val sparse = Vectors.sparse(4, Array(0, 1, 2), Array(0.0, 2.0, 3.0))
  assert(sparse.numActives === 3)
  assert(sparse.numNonzeros === 2)
}

test("Vector toSparse and toDense") {
  // Starting from a dense vector with two zeros.
  val denseInput = Vectors.dense(0.0, 2.0, 3.0, 0.0)
  assert(denseInput.toDense === denseInput)
  val denseAsSparse = denseInput.toSparse
  // Only the two nonzero entries remain stored after conversion.
  assert(denseAsSparse.numActives === 2)
  assert(denseAsSparse === denseInput)

  // Starting from a sparse vector that stores one explicit zero.
  val sparseInput = Vectors.sparse(4, Array(0, 1, 2), Array(0.0, 2.0, 3.0))
  assert(sparseInput.toDense === sparseInput)
  val sparseCompacted = sparseInput.toSparse
  // The explicit zero is dropped, leaving two stored entries.
  assert(sparseCompacted.numActives === 2)
  assert(sparseCompacted === sparseInput)
}

test("Vector.compressed") {
  // Each cast doubles as a check of the representation that `compressed` picked:
  // a wrong choice fails the test with a ClassCastException.

  // Dense input, three nonzeros out of four: expected to stay dense.
  val mostlyFullDense = Vectors.dense(1.0, 2.0, 3.0, 0.0)
  val mostlyFullDenseResult = mostlyFullDense.compressed.asInstanceOf[DenseVector]
  assert(mostlyFullDenseResult === mostlyFullDense)

  // Dense input, one nonzero out of four: expected to become sparse.
  val mostlyEmptyDense = Vectors.dense(0.0, 2.0, 0.0, 0.0)
  val mostlyEmptyDenseResult = mostlyEmptyDense.compressed.asInstanceOf[SparseVector]
  assert(mostlyEmptyDense === mostlyEmptyDenseResult)
  assert(mostlyEmptyDenseResult.numActives === 1)

  // Sparse input storing an explicit zero: expected to stay sparse with the zero removed.
  val sparseWithZero = Vectors.sparse(4, Array(1, 2), Array(2.0, 0.0))
  val sparseWithZeroResult = sparseWithZero.compressed.asInstanceOf[SparseVector]
  assert(sparseWithZero === sparseWithZeroResult)
  assert(sparseWithZeroResult.numActives === 1)

  // Sparse input, three nonzeros out of four: expected to become dense.
  val mostlyFullSparse = Vectors.sparse(4, Array(0, 1, 2), Array(1.0, 2.0, 3.0))
  val mostlyFullSparseResult = mostlyFullSparse.compressed.asInstanceOf[DenseVector]
  assert(mostlyFullSparse === mostlyFullSparseResult)
}
}
12 changes: 12 additions & 0 deletions project/MimaExcludes.scala
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,18 @@ object MimaExcludes {
// SPARK-7090 Introduce LDAOptimizer to LDA to further improve extensibility
ProblemFilters.exclude[MissingClassProblem](
"org.apache.spark.mllib.clustering.LDA$EMOptimizer")
) ++ Seq(
// SPARK-6756 add toSparse, toDense, numActives, numNonzeros, and compressed to Vector
ProblemFilters.exclude[MissingMethodProblem](
"org.apache.spark.mllib.linalg.Vector.compressed"),
ProblemFilters.exclude[MissingMethodProblem](
"org.apache.spark.mllib.linalg.Vector.toDense"),
ProblemFilters.exclude[MissingMethodProblem](
"org.apache.spark.mllib.linalg.Vector.numNonzeros"),
ProblemFilters.exclude[MissingMethodProblem](
"org.apache.spark.mllib.linalg.Vector.toSparse"),
ProblemFilters.exclude[MissingMethodProblem](
"org.apache.spark.mllib.linalg.Vector.numActives")
)

case v if v.startsWith("1.3") =>
Expand Down