Skip to content

Commit dbbd391

Browse files
committed
[SPARK-20687] mllib.Matrices.fromBreeze may cause crash when converting breeze CSCMatrix
In an operation of two A, B CSCMatrices the resulting C matrix may have some extra 0s in rowIndices and data which are created for performance improvement by Breeze. This causes problems on converting back to mllib.Matrix because it relies on rowIndices and data being coherent with colPtrs. Therefore it is necessary to truncate rowIndices and data to the active number of elements hold by the C matrix, before creating a Spark's SparseMatrix.
1 parent 62d78a2 commit dbbd391

File tree

2 files changed

+20
-11
lines changed

2 files changed

+20
-11
lines changed

mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -992,7 +992,20 @@ object Matrices {
992992
new DenseMatrix(dm.rows, dm.cols, dm.data, dm.isTranspose)
993993
case sm: BSM[Double] =>
994994
// There is no isTranspose flag for sparse matrices in Breeze
995-
new SparseMatrix(sm.rows, sm.cols, sm.colPtrs, sm.rowIndices, sm.data)
995+
996+
// Some Breeze CSCMatrices may have extra trailing zeros in
997+
// .rowIndices and .data, which are added after some matrix
998+
// operations for efficiency.
999+
//
1000+
// Therefore the last element of sm.colPtrs would no longer be
1001+
// coherent with the size of sm.rowIndices and sm.data
1002+
// despite sm being a valid CSCMatrix.
1003+
// We need to truncate both arrays (rowIndices, data)
1004+
// to the real size of the vector sm.activeSize to allow valid conversion
1005+
1006+
val truncRowIndices = sm.rowIndices.slice(0, sm.activeSize)
1007+
val truncData = sm.data.slice(0, sm.activeSize)
1008+
new SparseMatrix(sm.rows, sm.cols, sm.colPtrs, truncRowIndices, truncData)
9961009
case _ =>
9971010
throw new UnsupportedOperationException(
9981011
s"Do not support conversion from type ${breeze.getClass.getName}.")

mllib/src/test/scala/org/apache/spark/mllib/linalg/MatricesSuite.scala

Lines changed: 6 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -46,25 +46,21 @@ class MatricesSuite extends SparkFunSuite {
4646
}
4747
}
4848

49-
test("breeze conversion bug") {
49+
test("Test Breeze Conversion Bug - SPARK-20687") {
5050
// (2, 0, 0)
5151
// (2, 0, 0)
5252
val mat1Brz = Matrices.sparse(2, 3, Array(0, 2, 2, 2), Array(0, 1), Array(2, 2)).asBreeze
5353
// (2, 1E-15, 1E-15)
54-
// (2, 1E-15, 1E-15
54+
// (2, 1E-15, 1E-15)
5555
val mat2Brz = Matrices.sparse(2, 3, Array(0, 2, 4, 6), Array(0, 0, 0, 1, 1, 1), Array(2, 1E-15, 1E-15, 2, 1E-15, 1E-15)).asBreeze
56-
// The following shouldn't break
57-
val t01 = mat1Brz - mat1Brz
58-
val t02 = mat2Brz - mat2Brz
59-
val t02Brz = Matrices.fromBreeze(t02)
60-
val t01Brz = Matrices.fromBreeze(t01)
61-
6256
val t1Brz = mat1Brz - mat2Brz
6357
val t2Brz = mat2Brz - mat1Brz
64-
// The following ones should break
58+
// The following operations raise exceptions on un-patch Matrices.fromBreeze
6559
val t1 = Matrices.fromBreeze(t1Brz)
6660
val t2 = Matrices.fromBreeze(t2Brz)
67-
61+
// t1 == t1Brz && t2 == t2Brz
62+
assert((t1.asBreeze - t1Brz).iterator.map((x) => math.abs(x._2)).sum < 1E-15)
63+
assert((t2.asBreeze - t2Brz).iterator.map((x) => math.abs(x._2)).sum < 1E-15)
6864
}
6965

7066
test("sparse matrix construction") {

0 commit comments

Comments
 (0)