From 6248d0e85bdb69ac64fd664049b38819cb021daf Mon Sep 17 00:00:00 2001
From: MechCoder
Date: Wed, 5 Aug 2015 18:38:36 +0530
Subject: [PATCH 01/13] [SPARK-6227] [MLlib] [PySpark] Implement PySpark wrappers for SVD

---
 python/pyspark/mllib/linalg/distributed.py | 64 ++++++++++++++++++++++
 1 file changed, 64 insertions(+)

diff --git a/python/pyspark/mllib/linalg/distributed.py b/python/pyspark/mllib/linalg/distributed.py
index ea4f27cf4ffe..612442e59a73 100644
--- a/python/pyspark/mllib/linalg/distributed.py
+++ b/python/pyspark/mllib/linalg/distributed.py
@@ -304,6 +304,70 @@ def tallSkinnyQR(self, computeQ=False):
 
         return QRDecomposition(Q, R)
 
+    def computeSVD(self, k, computeU=False, rCond=1e-9):
+        """
+        Computes the singular value decomposition of the RowMatrix.
+
+        The given row matrix A of dimension (m X n) is decomposed into U * s * V^T where
+
+        * U: (m X k) (left singular vectors) is a RowMatrix whose columns are the
+          eigenvectors of (A X A')
+        * s: DenseVector consisting of the square roots of the eigenvalues (singular values)
+          in descending order.
+        * V: (n X k) (right singular vectors) is a Matrix whose columns are the
+          eigenvectors of (A' X A)
+
+        For more specific details on implementation, please refer the scala documentation.
+
+        :param k: Set the number of singular values to keep.
+        :param computeU: Whether of not to compute U. If set to be True, then U is computed
+                         by A * V * s^-1
+        :param rCond: Reciprocal condition number. All singular values smaller than
+                      rCond * s[0] are treated as zero, where s[0] is the largest
+                      singular value.
+        :returns: SingularValueDecomposition object
+
+        >>> data = [(3, 1, 1), (-1, 3, 1)]
+        >>> rm = RowMatrix(sc.parallelize(data))
+        >>> svd_model = rm.computeSVD(2, True)
+        >>> svd_model.U.rows.collect()
+        [DenseVector([-0.7071, 0.7071]), DenseVector([-0.7071, -0.7071])]
+        >>> svd_model.s
+        DenseVector([3.4641, 3.1623])
+        >>> svd_model.V
+        DenseMatrix(3, 2, [-0.4082, -0.8165, -0.4082, 0.8944, -0.4472, 0.0], 0)
+        """
+        j_model = self._java_matrix_wrapper.call("computeSVD", int(k), computeU, float(rCond))
+        return SingularValueDecomposition(j_model)
+
+
+class SingularValueDecomposition(JavaModelWrapper):
+    """Wrapper around the SingularValueDecomposition Java case class"""
+
+    @property
+    def U(self):
+        """
+        Returns a RowMatrix whose columns are the left singular vectors of the
+        SingularValueDecomposition if computeU was set to be True.
+        """
+        u = self.call("U")
+        if u is not None:
+            return RowMatrix(u)
+
+    @property
+    def s(self):
+        """Returns a DenseVector with singular values in descending order."""
+        return self.call("s")
+
+    @property
+    def V(self):
+        """
+        Returns a DenseMatrix whose columns are the right singular vectors of the
+        SingularValueDecomposition.
+        """
+        return self.call("V")
+
+
 class IndexedRow(object):
     """
    .. 
note:: Experimental From 921d5b6ea7d74032dcf1b9e7c41db22a090de547 Mon Sep 17 00:00:00 2001 From: MechCoder Date: Wed, 5 Aug 2015 23:25:13 +0530 Subject: [PATCH 02/13] Add PCA Wrappers --- python/pyspark/mllib/linalg/distributed.py | 33 +++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) diff --git a/python/pyspark/mllib/linalg/distributed.py b/python/pyspark/mllib/linalg/distributed.py index 612442e59a73..0ba58460ace5 100644 --- a/python/pyspark/mllib/linalg/distributed.py +++ b/python/pyspark/mllib/linalg/distributed.py @@ -340,9 +340,40 @@ def computeSVD(self, k, computeU=False, rCond=1e-9): j_model = self._java_matrix_wrapper.call("computeSVD", int(k), computeU, float(rCond)) return SingularValueDecomposition(j_model) + def computePrincipalComponents(self, k): + """ + Computes the k principal components of the given row matrix + + :param k: Number of principal components to keep. + :returns: DenseMatrix + + >>> data = sc.parallelize([[1, 2, 3], [2, 4, 5], [3, 6, 1]]) + >>> rm = RowMatrix(data) + + >>> # Returns the two principal components of rm + >>> pca = rm.computePrincipalComponents(2) + >>> pca + DenseMatrix(3, 2, [-0.349, -0.6981, 0.6252, -0.2796, -0.5592, -0.7805], 0) + + >>> # Transform into new dimensions with the greatest variance. + >>> rm.multiply(pca).rows.collect() # doctest: +NORMALIZE_WHITESPACE + [DenseVector([0.1305, -3.7394]), DenseVector([-0.3642, -6.6983]), \ + DenseVector([-4.6102, -4.9745])] + """ + return self._java_matrix_wrapper.call("computePrincipalComponents", k) + + def multiply(self, matrix): + """ + Multiplies the given row matrix with another matrix. + + :param matrix: Matrix to multiply with. + :returns: RowMatrix + """ + return RowMatrix(self._java_matrix_wrapper.call("multiply", matrix)) + class SingularValueDecomposition(JavaModelWrapper): - """Wrapper around the SingularValueDecomposition Java case class""" + """Wrapper around the SingularValueDecomposition scala case class""" @property def U(self): From 5558fa9bffca7d81b0f35152512cd3692455ec99 Mon Sep 17 00:00:00 2001 From: MechCoder Date: Wed, 5 Aug 2015 23:48:06 +0530 Subject: [PATCH 03/13] Added docs --- docs/mllib-dimensionality-reduction.md | 44 ++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/docs/mllib-dimensionality-reduction.md b/docs/mllib-dimensionality-reduction.md index cceddce9f79a..b0261aa1db21 100644 --- a/docs/mllib-dimensionality-reduction.md +++ b/docs/mllib-dimensionality-reduction.md @@ -83,6 +83,26 @@ Applications](quick-start.html#self-contained-applications) section of the Spark quick-start guide. Be sure to also include *spark-mllib* to your build file as a dependency. + +
+{% highlight scala %} +from pyspark.mllib.linalg import Matrix +from pyspark.mllib.linalg.distributed import RowMatrix +from numpy.random import RandomState + +# Generate random data with 50 samples and 30 features. +rng = RandomState(0) +mat = RowMatrix(sc.parallelize(rng.randn(50, 30))) + +# Compute the top 20 singular values and corresponding singular vectors. +svd = mat.computeSVD(20, computeU=True) +u = svd.U # The U factor is a RowMatrix. +s = svd.s # The singular values are stored in a local dense vector. +V = svd.V # The V factor is a local dense matrix. +{% endhighlight %} + +The same code applies to `IndexedRowMatrix` if `U` is defined as an +`IndexedRowMatrix`.
@@ -124,6 +144,30 @@ Refer to the [`RowMatrix` Java docs](api/java/org/apache/spark/mllib/linalg/dist {% include_example java/org/apache/spark/examples/mllib/JavaPCAExample.java %} + + +
+ +The following code demonstrates how to compute principal components on a `RowMatrix` +and use them to project the vectors into a low-dimensional space. + +{% highlight python %} +from pyspark.mllib.linalg import Matrix +from pyspark.mllib.linalg.distributed import RowMatrix +from numpy.random import RandomState + +# Generate random data with 50 samples and 30 features. +rng = RandomState(0) +mat = sc.parallelize(rng.randn(50, 30)) +rm = RowMatrix(mat) + +# Compute the top 10 principal components. +pc = rm.computePrincipalComponents(10) # Principal components are stored in a local dense matrix. + +# Project the rows to the linear space spanned by the top 10 principal components. +projected = rm.multiply(pc) +{% endhighlight %} +
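
Taken together, patches 01-03 already support an end-to-end session. The sketch below is illustrative only and not part of the patch series: it assumes a PySpark shell (which provides the SparkContext `sc`) running with these patches applied, and it reuses the toy data from the doctests above.

    from pyspark.mllib.linalg.distributed import RowMatrix

    rm = RowMatrix(sc.parallelize([(3, 1, 1), (-1, 3, 1)]))

    # Truncated SVD from patch 01: keep the top k = 2 singular values;
    # computeU=True additionally materializes the distributed U factor.
    svd = rm.computeSVD(2, computeU=True)
    print(svd.s)                 # DenseVector([3.4641, 3.1623])
    print(svd.V)                 # local DenseMatrix of right singular vectors
    print(svd.U.rows.collect())  # rows of the distributed U factor

    # PCA from patch 02: top 2 principal components, then project the
    # rows onto them with multiply().
    mat = RowMatrix(sc.parallelize([[1, 2, 3], [2, 4, 5], [3, 6, 1]]))
    pc = mat.computePrincipalComponents(2)  # local DenseMatrix
    projected = mat.multiply(pc)            # RowMatrix in the new basis
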
From 50ed70011d47521a4a20e415afd87736f357876e Mon Sep 17 00:00:00 2001 From: MechCoder Date: Thu, 6 Aug 2015 00:31:08 +0530 Subject: [PATCH 04/13] Add support for multiply and computeSVD in IRM --- docs/mllib-dimensionality-reduction.md | 6 +- python/pyspark/mllib/linalg/distributed.py | 72 ++++++++++++++++++++-- 2 files changed, 70 insertions(+), 8 deletions(-) diff --git a/docs/mllib-dimensionality-reduction.md b/docs/mllib-dimensionality-reduction.md index b0261aa1db21..190edf9c9122 100644 --- a/docs/mllib-dimensionality-reduction.md +++ b/docs/mllib-dimensionality-reduction.md @@ -85,7 +85,7 @@ a dependency.
-{% highlight scala %}
+{% highlight python %}
 from pyspark.mllib.linalg import Matrix
 from pyspark.mllib.linalg.distributed import RowMatrix
 from numpy.random import RandomState
@@ -161,8 +161,8 @@ rng = RandomState(0)
 mat = sc.parallelize(rng.randn(50, 30))
 rm = RowMatrix(mat)
 
-# Compute the top 10 principal components.
-pc = rm.computePrincipalComponents(10) # Principal components are stored in a local dense matrix.
+# Compute the top 10 principal components stored in a local dense matrix.
+pc = rm.computePrincipalComponents(10)
 
 # Project the rows to the linear space spanned by the top 10 principal components.
 projected = rm.multiply(pc)
diff --git a/python/pyspark/mllib/linalg/distributed.py b/python/pyspark/mllib/linalg/distributed.py
index 0ba58460ace5..4c152ec04c0b 100644
--- a/python/pyspark/mllib/linalg/distributed.py
+++ b/python/pyspark/mllib/linalg/distributed.py
@@ -28,7 +28,7 @@
 
 from pyspark import RDD, since
 from pyspark.mllib.common import callMLlibFunc, JavaModelWrapper
-from pyspark.mllib.linalg import _convert_to_vector, Matrix, QRDecomposition
+from pyspark.mllib.linalg import _convert_to_vector, DenseMatrix, Matrix, QRDecomposition
 from pyspark.mllib.stat import MultivariateStatisticalSummary
 from pyspark.storagelevel import StorageLevel
 
@@ -364,12 +364,18 @@ def computePrincipalComponents(self, k):
 
     def multiply(self, matrix):
         """
-        Multiplies the given row matrix with another matrix.
+        Multiplies the given RowMatrix with another matrix.
 
         :param matrix: Matrix to multiply with.
         :returns: RowMatrix
+
+        >>> rm = RowMatrix(sc.parallelize([[0, 1], [2, 3]]))
+        >>> rm.multiply(DenseMatrix(2, 2, [0, 2, 1, 3])).rows.collect()
+        [DenseVector([2.0, 3.0]), DenseVector([6.0, 11.0])]
         """
-        return RowMatrix(self._java_matrix_wrapper.call("multiply", matrix))
+        # TODO: Only DenseMatrices are supported on the Scala side.
+        j_model = self._java_matrix_wrapper.call("multiply", matrix)
+        return RowMatrix(j_model)
 
 
 class SingularValueDecomposition(JavaModelWrapper):
@@ -378,12 +384,18 @@ class SingularValueDecomposition(JavaModelWrapper):
 
     @property
     def U(self):
         """
-        Returns a RowMatrix whose columns are the left singular vectors of the
+        Returns a distributed matrix whose columns are the left singular vectors of the
         SingularValueDecomposition if computeU was set to be True.
         """
         u = self.call("U")
         if u is not None:
-            return RowMatrix(u)
+            mat_name = u.getClass().getSimpleName()
+            if mat_name == "RowMatrix":
+                return RowMatrix(u)
+            elif mat_name == "IndexedRowMatrix":
+                return IndexedRowMatrix(u)
+            else:
+                raise TypeError("Expected RowMatrix/IndexedRowMatrix got %s" % mat_name)
 
     @property
     def s(self):
@@ -628,6 +640,56 @@ def toBlockMatrix(self, rowsPerBlock=1024, colsPerBlock=1024):
                                                            colsPerBlock)
         return BlockMatrix(java_block_matrix, rowsPerBlock, colsPerBlock)
 
+    def computeSVD(self, k, computeU=False, rCond=1e-9):
+        """
+        Computes the singular value decomposition of the IndexedRowMatrix.
+
+        The given row matrix A of dimension (m X n) is decomposed into U * s * V^T where
+
+        * U: (m X k) (left singular vectors) is an IndexedRowMatrix whose columns are the
+          eigenvectors of (A X A')
+        * s: DenseVector consisting of the square roots of the eigenvalues (singular values)
+          in descending order.
+        * V: (n X k) (right singular vectors) is a Matrix whose columns are the
+          eigenvectors of (A' X A)
+
+        For more specific details on implementation, please refer the scala documentation.
+
+        :param k: Set the number of singular values to keep.
+        :param computeU: Whether of not to compute U. 
If set to be True, then U is computed + by A * V * s^-1 + :param rCond: Reciprocal condition number. All singular values smaller than + rCond * s[0] are treated as zero, where s[0] is the largest + singular value. + :returns: SingularValueDecomposition object + + >>> data = [(0, (3, 1, 1)), (1, (-1, 3, 1))] + >>> irm = IndexedRowMatrix(sc.parallelize(data)) + >>> svd_model = irm.computeSVD(2, True) + >>> svd_model.U.rows.collect() # doctest: +NORMALIZE_WHITESPACE + [IndexedRow(0, [-0.707106781187,0.707106781187]),\ + IndexedRow(1, [-0.707106781187,-0.707106781187])] + >>> svd_model.s + DenseVector([3.4641, 3.1623]) + >>> svd_model.V + DenseMatrix(3, 2, [-0.4082, -0.8165, -0.4082, 0.8944, -0.4472, 0.0], 0) + """ + j_model = self._java_matrix_wrapper.call("computeSVD", int(k), computeU, float(rCond)) + return SingularValueDecomposition(j_model) + + def multiply(self, matrix): + """ + Multiplies the given IndexedRowMatrix with another matrix. + + :param matrix: Matrix to multiply with. + :returns: IndexedRowMatrix + + >>> data = IndexedRowMatrix(sc.parallelize([(0, (0, 1)), (1, (2, 3))])) + >>> data.multiply(DenseMatrix(2, 2, [0, 2, 1, 3])).rows.collect() + [IndexedRow(0, [2.0,3.0]), IndexedRow(1, [6.0,11.0])] + """ + return IndexedRowMatrix(self._java_matrix_wrapper.call("multiply", matrix)) + class MatrixEntry(object): """ From 59f53d51cda6f9744fd2eed3cd633f1fecf11063 Mon Sep 17 00:00:00 2001 From: MechCoder Date: Thu, 6 Aug 2015 02:26:56 +0530 Subject: [PATCH 05/13] Added tests --- python/pyspark/mllib/tests.py | 63 +++++++++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) diff --git a/python/pyspark/mllib/tests.py b/python/pyspark/mllib/tests.py index 74cf7bb8eaf9..3219619bdc17 100644 --- a/python/pyspark/mllib/tests.py +++ b/python/pyspark/mllib/tests.py @@ -23,6 +23,7 @@ import sys import tempfile import array as pyarray +from math import sqrt from time import time, sleep from shutil import rmtree @@ -53,6 +54,7 @@ from pyspark.mllib.clustering import StreamingKMeans, StreamingKMeansModel from pyspark.mllib.linalg import Vector, SparseVector, DenseVector, VectorUDT, _convert_to_vector,\ DenseMatrix, SparseMatrix, Vectors, Matrices, MatrixUDT +from pyspark.mllib.linalg.distributed import RowMatrix from pyspark.mllib.classification import StreamingLogisticRegressionWithSGD from pyspark.mllib.recommendation import Rating from pyspark.mllib.regression import LabeledPoint, StreamingLinearRegressionWithSGD @@ -1608,6 +1610,67 @@ def test_binary_term_freqs(self): ": expected " + str(expected[i]) + ", got " + str(output[i])) +class DimensionalityReductionTests(MLlibTestCase): + + denseData = [ + Vectors.dense([0.0, 1.0, 2.0]), + Vectors.dense([3.0, 4.0, 5.0]), + Vectors.dense([6.0, 7.0, 8.0]), + Vectors.dense([9.0, 0.0, 1.0]) + ] + sparseData = [ + Vectors.sparse(3, [(1, 1.0), (2, 2.0)]), + Vectors.sparse(3, [(0, 3.0), (1, 4.0), (2, 5.0)]), + Vectors.sparse(3, [(0, 6.0), (1, 7.0), (2, 8.0)]), + Vectors.sparse(3, [(0, 9.0), (2, 1.0)]) + ] + + def assertEqualUpToSign(self, vecA, vecB): + eq1 = vecA - vecB + eq2 = vecA + vecB + self.assertTrue(sum(abs(eq1)) < 1e-6 or sum(abs(eq2)) < 1e-6) + + def test_svd(self): + denseMat = RowMatrix(self.sc.parallelize(self.denseData)) + sparseMat = RowMatrix(self.sc.parallelize(self.sparseData)) + m = 4 + n = 3 + for mat in [denseMat, sparseMat]: + for k in range(1, 4): + rm = mat.computeSVD(k, computeU=True) + self.assertEqual(rm.s.size, k) + self.assertEqual(rm.U.numRows(), m) + self.assertEqual(rm.U.numCols(), k) + self.assertEqual(rm.V.numRows, 
n) + self.assertEqual(rm.V.numCols, k) + + # Test that U returned is None if computeU is set to False. + self.assertEqual(mat.computeSVD(1).U, None) + + # Test that low rank matrices cannot have number of singular values + # greater than a limit. + rm = RowMatrix(self.sc.parallelize(tile([1, 2, 3], (3, 1)))) + self.assertEqual(rm.computeSVD(3, False, 1e-6).s.size, 1) + + def test_pca(self): + expected_pcs = array([ + [0.0, 1.0, 0.0], + [sqrt(2.0) / 2.0, 0.0, sqrt(2.0) / 2.0], + [sqrt(2.0) / 2.0, 0.0, -sqrt(2.0) / 2.0] + ]) + n = 3 + denseMat = RowMatrix(self.sc.parallelize(self.denseData)) + sparseMat = RowMatrix(self.sc.parallelize(self.sparseData)) + for mat in [denseMat, sparseMat]: + for k in range(1, 4): + pcs = mat.computePrincipalComponents(k) + self.assertEqual(pcs.numRows, n) + self.assertEqual(pcs.numCols, k) + + # We can just test the updated principal component for equality. + self.assertEqualUpToSign(pcs.toArray()[:, k - 1], expected_pcs[:, k - 1]) + + if __name__ == "__main__": from pyspark.mllib.tests import * if not _have_scipy: From 70a871d189d92c41e477e6f426e12a9fbb549ce7 Mon Sep 17 00:00:00 2001 From: MechCoder Date: Mon, 10 Aug 2015 13:15:46 +0530 Subject: [PATCH 06/13] minor changes to doc --- docs/mllib-dimensionality-reduction.md | 6 +- python/pyspark/mllib/linalg/distributed.py | 80 ++++++++++++---------- 2 files changed, 47 insertions(+), 39 deletions(-) diff --git a/docs/mllib-dimensionality-reduction.md b/docs/mllib-dimensionality-reduction.md index 190edf9c9122..b1b11610ddff 100644 --- a/docs/mllib-dimensionality-reduction.md +++ b/docs/mllib-dimensionality-reduction.md @@ -86,7 +86,6 @@ a dependency.
{% highlight python %}
-from pyspark.mllib.linalg import Matrix
 from pyspark.mllib.linalg.distributed import RowMatrix
 from numpy.random import RandomState
 
@@ -152,14 +151,13 @@ The following code demonstrates how to compute principal components on a `RowMat
 and use them to project the vectors into a low-dimensional space.
 
 {% highlight python %}
-from pyspark.mllib.linalg import Matrix
 from pyspark.mllib.linalg.distributed import RowMatrix
 from numpy.random import RandomState
 
 # Generate random data with 50 samples and 30 features.
 rng = RandomState(0)
-mat = sc.parallelize(rng.randn(50, 30))
-rm = RowMatrix(mat)
+data = sc.parallelize(rng.randn(50, 30))
+mat = RowMatrix(data)
 
 # Compute the top 10 principal components stored in a local dense matrix.
 pc = rm.computePrincipalComponents(10)
diff --git a/python/pyspark/mllib/linalg/distributed.py b/python/pyspark/mllib/linalg/distributed.py
index 4c152ec04c0b..dd87aada2878 100644
--- a/python/pyspark/mllib/linalg/distributed.py
+++ b/python/pyspark/mllib/linalg/distributed.py
@@ -308,23 +308,25 @@ def computeSVD(self, k, computeU=False, rCond=1e-9):
         """
         Computes the singular value decomposition of the RowMatrix.
 
-        The given row matrix A of dimension (m X n) is decomposed into U * s * V^T where
+        The given row matrix A of dimension (m X n) is decomposed into
+        U * s * V^T where
 
-        * U: (m X k) (left singular vectors) is a RowMatrix whose columns are the
-          eigenvectors of (A X A')
-        * s: DenseVector consisting of the square roots of the eigenvalues (singular values)
-          in descending order.
-        * V: (n X k) (right singular vectors) is a Matrix whose columns are the
-          eigenvectors of (A' X A)
+        * U: (m X k) (left singular vectors) is a RowMatrix whose
+          columns are the eigenvectors of (A X A')
+        * s: DenseVector consisting of the square roots of the
+          eigenvalues (singular values) in descending order.
+        * V: (n X k) (right singular vectors) is a Matrix whose columns
+          are the eigenvectors of (A' X A)
 
-        For more specific details on implementation, please refer the scala documentation.
+        For more specific details on implementation, please refer
+        the scala documentation.
 
         :param k: Set the number of singular values to keep.
-        :param computeU: Whether of not to compute U. If set to be True, then U is computed
-                         by A * V * s^-1
-        :param rCond: Reciprocal condition number. All singular values smaller than
-                      rCond * s[0] are treated as zero, where s[0] is the largest
-                      singular value.
+        :param computeU: Whether or not to compute U. If set to be
+                         True, then U is computed by A * V * s^-1
+        :param rCond: Reciprocal condition number. All singular values
+                      smaller than rCond * s[0] are treated as zero
+                      where s[0] is the largest singular value.
         :returns: SingularValueDecomposition object
 
         >>> data = [(3, 1, 1), (-1, 3, 1)]
@@ -337,7 +339,8 @@ def computeSVD(self, k, computeU=False, rCond=1e-9):
         >>> svd_model.V
         DenseMatrix(3, 2, [-0.4082, -0.8165, -0.4082, 0.8944, -0.4472, 0.0], 0)
         """
-        j_model = self._java_matrix_wrapper.call("computeSVD", int(k), computeU, float(rCond))
+        j_model = self._java_matrix_wrapper.call(
+            "computeSVD", int(k), bool(computeU), float(rCond))
         return SingularValueDecomposition(j_model)
 
     def computePrincipalComponents(self, k):
@@ -384,8 +387,9 @@ class SingularValueDecomposition(JavaModelWrapper):
 
     @property
     def U(self):
         """
-        Returns a distributed matrix whose columns are the left singular vectors of the
-        SingularValueDecomposition if computeU was set to be True.
+        Returns a distributed matrix whose columns are the left
+        singular vectors of the SingularValueDecomposition if
+        computeU was set to be True.
         """
         u = self.call("U")
         if u is not None:
@@ -399,14 +403,17 @@ def U(self):
 
     @property
     def s(self):
-        """Returns a DenseVector with singular values in descending order."""
+        """
+        Returns a DenseVector with singular values in
+        descending order.
+        """
         return self.call("s")
 
     @property
     def V(self):
         """
-        Returns a DenseMatrix whose columns are the right singular vectors of the
-        SingularValueDecomposition.
+        Returns a DenseMatrix whose columns are the right singular
+        vectors of the SingularValueDecomposition.
         """
         return self.call("V")
 
@@ -644,23 +651,25 @@ def computeSVD(self, k, computeU=False, rCond=1e-9):
         """
         Computes the singular value decomposition of the IndexedRowMatrix.
 
-        The given row matrix A of dimension (m X n) is decomposed into U * s * V^T where
+        The given row matrix A of dimension (m X n) is decomposed into
+        U * s * V^T where
 
-        * U: (m X k) (left singular vectors) is an IndexedRowMatrix whose columns are the
-          eigenvectors of (A X A')
-        * s: DenseVector consisting of the square roots of the eigenvalues (singular values)
-          in descending order.
-        * V: (n X k) (right singular vectors) is a Matrix whose columns are the
-          eigenvectors of (A' X A)
+        * U: (m X k) (left singular vectors) is an IndexedRowMatrix
+          whose columns are the eigenvectors of (A X A')
+        * s: DenseVector consisting of the square roots of the
+          eigenvalues (singular values) in descending order.
+        * V: (n X k) (right singular vectors) is a Matrix whose columns
+          are the eigenvectors of (A' X A)
 
-        For more specific details on implementation, please refer the scala documentation.
+        For more specific details on implementation, please refer
+        the scala documentation.
 
         :param k: Set the number of singular values to keep.
-        :param computeU: Whether of not to compute U. If set to be True, then U is computed
-                         by A * V * s^-1
-        :param rCond: Reciprocal condition number. All singular values smaller than
-                      rCond * s[0] are treated as zero, where s[0] is the largest
-                      singular value.
+        :param computeU: Whether or not to compute U. If set to be
+                         True, then U is computed by A * V * s^-1
+        :param rCond: Reciprocal condition number. All singular values
+                      smaller than rCond * s[0] are treated as zero
+                      where s[0] is the largest singular value.
         :returns: SingularValueDecomposition object
 
         >>> data = [(0, (3, 1, 1)), (1, (-1, 3, 1))]
@@ -674,7 +683,8 @@ def computeSVD(self, k, computeU=False, rCond=1e-9):
         >>> svd_model.V
         DenseMatrix(3, 2, [-0.4082, -0.8165, -0.4082, 0.8944, -0.4472, 0.0], 0)
         """
-        j_model = self._java_matrix_wrapper.call("computeSVD", int(k), computeU, float(rCond))
+        j_model = self._java_matrix_wrapper.call(
+            "computeSVD", int(k), bool(computeU), float(rCond))
         return SingularValueDecomposition(j_model)
 
     def multiply(self, matrix):
@@ -684,8 +694,8 @@ def multiply(self, matrix):
         """
         Multiplies the given IndexedRowMatrix with another matrix.
 
         :param matrix: Matrix to multiply with.
        
:returns: IndexedRowMatrix - >>> data = IndexedRowMatrix(sc.parallelize([(0, (0, 1)), (1, (2, 3))])) - >>> data.multiply(DenseMatrix(2, 2, [0, 2, 1, 3])).rows.collect() + >>> mat = IndexedRowMatrix(sc.parallelize([(0, (0, 1)), (1, (2, 3))])) + >>> mat.multiply(DenseMatrix(2, 2, [0, 2, 1, 3])).rows.collect() [IndexedRow(0, [2.0,3.0]), IndexedRow(1, [6.0,11.0])] """ return IndexedRowMatrix(self._java_matrix_wrapper.call("multiply", matrix)) From 0bc6a3caf25c5c5a3cfcc661432890aff2293584 Mon Sep 17 00:00:00 2001 From: MechCoder Date: Thu, 26 May 2016 21:27:09 -0400 Subject: [PATCH 07/13] Add check for DenseMatrix --- python/pyspark/mllib/linalg/distributed.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/python/pyspark/mllib/linalg/distributed.py b/python/pyspark/mllib/linalg/distributed.py index dd87aada2878..4a2b53c73179 100644 --- a/python/pyspark/mllib/linalg/distributed.py +++ b/python/pyspark/mllib/linalg/distributed.py @@ -303,7 +303,6 @@ def tallSkinnyQR(self, computeQ=False): R = decomp.call("R") return QRDecomposition(Q, R) - def computeSVD(self, k, computeU=False, rCond=1e-9): """ Computes the singular value decomposition of the RowMatrix. @@ -376,7 +375,9 @@ def multiply(self, matrix): >>> rm.multiply(DenseMatrix(2, 2, [0, 2, 1, 3])).rows.collect() [DenseVector([2.0, 3.0]), DenseVector([6.0, 11.0])] """ - # TODO: Only DenseMatrices are supported on the Scala side. + if not isinstance(matrix, DenseMatrix): + raise ValueError("Only multiplication with DenseMatrix " + "is supported.") j_model = self._java_matrix_wrapper.call("multiply", matrix) return RowMatrix(j_model) @@ -698,6 +699,9 @@ def multiply(self, matrix): >>> mat.multiply(DenseMatrix(2, 2, [0, 2, 1, 3])).rows.collect() [IndexedRow(0, [2.0,3.0]), IndexedRow(1, [6.0,11.0])] """ + if not isinstance(matrix, DenseMatrix): + raise ValueError("Only multiplication with DenseMatrix " + "is supported.") return IndexedRowMatrix(self._java_matrix_wrapper.call("multiply", matrix)) From 3ba24114c0281502c93b82e2d781bd1cdc8c5aa1 Mon Sep 17 00:00:00 2001 From: Nick Pentreath Date: Wed, 12 Apr 2017 11:36:52 +0200 Subject: [PATCH 08/13] Update examples, add Python examples, doc and since tags --- docs/mllib-dimensionality-reduction.md | 45 ++-------------- .../spark/examples/mllib/JavaPCAExample.java | 27 ++++++---- .../spark/examples/mllib/JavaSVDExample.java | 27 +++++----- .../python/mllib/pca_rowmatrix_example.py | 46 ++++++++++++++++ examples/src/main/python/mllib/svd_example.py | 48 +++++++++++++++++ .../mllib/PCAOnRowMatrixExample.scala | 4 +- .../spark/examples/mllib/SVDExample.scala | 11 ++-- python/pyspark/mllib/linalg/distributed.py | 54 ++++++++++++------- 8 files changed, 172 insertions(+), 90 deletions(-) create mode 100644 examples/src/main/python/mllib/pca_rowmatrix_example.py create mode 100644 examples/src/main/python/mllib/svd_example.py diff --git a/docs/mllib-dimensionality-reduction.md b/docs/mllib-dimensionality-reduction.md index 7a492734a344..a72680d52a26 100644 --- a/docs/mllib-dimensionality-reduction.md +++ b/docs/mllib-dimensionality-reduction.md @@ -76,29 +76,11 @@ Refer to the [`SingularValueDecomposition` Java docs](api/java/org/apache/spark/ The same code applies to `IndexedRowMatrix` if `U` is defined as an `IndexedRowMatrix`. - -In order to run the above application, follow the instructions -provided in the [Self-Contained -Applications](quick-start.html#self-contained-applications) section of the Spark -quick-start guide. 
Be sure to also include *spark-mllib* to your build file as -a dependency. -
-{% highlight python %} -from pyspark.mllib.linalg.distributed import RowMatrix -from numpy.random import RandomState - -# Generate random data with 50 samples and 30 features. -rng = RandomState(0) -mat = RowMatrix(sc.parallelize(rng.randn(50, 30))) +Refer to the [`SingularValueDecomposition` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.linalg.distributed.SingularValueDecomposition) for details on the API. -# Compute the top 20 singular values and corresponding singular vectors. -svd = mat.computeSVD(20, computeU=True) -u = svd.U # The U factor is a RowMatrix. -s = svd.s # The singular values are stored in a local dense vector. -V = svd.V # The V factor is a local dense matrix. -{% endhighlight %} +{% include_example python/mllib/svd_example.py %} The same code applies to `IndexedRowMatrix` if `U` is defined as an `IndexedRowMatrix`. @@ -137,7 +119,6 @@ Refer to the [`PCA` Scala docs](api/scala/index.html#org.apache.spark.mllib.feat The following code demonstrates how to compute principal components on a `RowMatrix` and use them to project the vectors into a low-dimensional space. -The number of columns should be small, e.g, less than 1000. Refer to the [`RowMatrix` Java docs](api/java/org/apache/spark/mllib/linalg/distributed/RowMatrix.html) for details on the API. @@ -150,27 +131,9 @@ Refer to the [`RowMatrix` Java docs](api/java/org/apache/spark/mllib/linalg/dist The following code demonstrates how to compute principal components on a `RowMatrix` and use them to project the vectors into a low-dimensional space. -{% highlight python %} -from pyspark.mllib.linalg.distributed import RowMatrix -from numpy.random import RandomState +Refer to the [`RowMatrix` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.linalg.distributed.RowMatrix) for details on the API. -# Generate random data with 50 samples and 30 features. -rng = RandomState(0) -data = sc.parallelize(rng.randn(50, 30)) -mat = RowMatrix(data) - -# Compute the top 10 principal components stored in a local dense matrix. -pc = rm.computePrincipalComponents(10) - -# Project the rows to the linear space spanned by the top 10 principal components. -projected = rm.multiply(pc) -{% endhighlight %} +{% include_example python/mllib/pca_rowmatrix_example.py %}
- -In order to run the above application, follow the instructions -provided in the [Self-Contained Applications](quick-start.html#self-contained-applications) -section of the Spark -quick-start guide. Be sure to also include *spark-mllib* to your build file as -a dependency. diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaPCAExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaPCAExample.java index 3077f557ef88..0a7dc621e111 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaPCAExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaPCAExample.java @@ -18,7 +18,8 @@ package org.apache.spark.examples.mllib; // $example on$ -import java.util.LinkedList; +import java.util.Arrays; +import java.util.List; // $example off$ import org.apache.spark.SparkConf; @@ -39,21 +40,25 @@ public class JavaPCAExample { public static void main(String[] args) { SparkConf conf = new SparkConf().setAppName("PCA Example"); SparkContext sc = new SparkContext(conf); + JavaSparkContext jsc = JavaSparkContext.fromSparkContext(sc); // $example on$ - double[][] array = {{1.12, 2.05, 3.12}, {5.56, 6.28, 8.94}, {10.2, 8.0, 20.5}}; - LinkedList rowsList = new LinkedList<>(); - for (int i = 0; i < array.length; i++) { - Vector currentRow = Vectors.dense(array[i]); - rowsList.add(currentRow); - } - JavaRDD rows = JavaSparkContext.fromSparkContext(sc).parallelize(rowsList); + List data = Arrays.asList( + Vectors.sparse(5, new int[] {1, 3}, new double[] {1.0, 7.0}), + Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0), + Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0) + ); + + JavaRDD rows = jsc.parallelize(data); // Create a RowMatrix from JavaRDD. RowMatrix mat = new RowMatrix(rows.rdd()); - // Compute the top 3 principal components. - Matrix pc = mat.computePrincipalComponents(3); + // Compute the top 4 principal components. + // Principal components are stored in a local dense matrix. + Matrix pc = mat.computePrincipalComponents(4); + + // Project the rows to the linear space spanned by the top 4 principal components. 
RowMatrix projected = mat.multiply(pc); // $example off$ Vector[] collectPartitions = (Vector[])projected.rows().collect(); @@ -61,6 +66,6 @@ public static void main(String[] args) { for (Vector vector : collectPartitions) { System.out.println("\t" + vector); } - sc.stop(); + jsc.stop(); } } diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaSVDExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaSVDExample.java index 3730e60f6880..802be3960a33 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaSVDExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaSVDExample.java @@ -18,7 +18,8 @@ package org.apache.spark.examples.mllib; // $example on$ -import java.util.LinkedList; +import java.util.Arrays; +import java.util.List; // $example off$ import org.apache.spark.SparkConf; @@ -43,22 +44,22 @@ public static void main(String[] args) { JavaSparkContext jsc = JavaSparkContext.fromSparkContext(sc); // $example on$ - double[][] array = {{1.12, 2.05, 3.12}, {5.56, 6.28, 8.94}, {10.2, 8.0, 20.5}}; - LinkedList rowsList = new LinkedList<>(); - for (int i = 0; i < array.length; i++) { - Vector currentRow = Vectors.dense(array[i]); - rowsList.add(currentRow); - } - JavaRDD rows = jsc.parallelize(rowsList); + List data = Arrays.asList( + Vectors.sparse(5, new int[] {1, 3}, new double[] {1.0, 7.0}), + Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0), + Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0) + ); + + JavaRDD rows = jsc.parallelize(data); // Create a RowMatrix from JavaRDD. RowMatrix mat = new RowMatrix(rows.rdd()); - // Compute the top 3 singular values and corresponding singular vectors. - SingularValueDecomposition svd = mat.computeSVD(3, true, 1.0E-9d); - RowMatrix U = svd.U(); - Vector s = svd.s(); - Matrix V = svd.V(); + // Compute the top 5 singular values and corresponding singular vectors. + SingularValueDecomposition svd = mat.computeSVD(5, true, 1.0E-9d); + RowMatrix U = svd.U(); // The U factor is a RowMatrix. + Vector s = svd.s(); // The singular values are stored in a local dense vector. + Matrix V = svd.V(); // The V factor is a local dense matrix. // $example off$ Vector[] collectPartitions = (Vector[]) U.rows().collect(); System.out.println("U factor is:"); diff --git a/examples/src/main/python/mllib/pca_rowmatrix_example.py b/examples/src/main/python/mllib/pca_rowmatrix_example.py new file mode 100644 index 000000000000..9b20f11b8651 --- /dev/null +++ b/examples/src/main/python/mllib/pca_rowmatrix_example.py @@ -0,0 +1,46 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +from pyspark import SparkContext +# $example on$ +from pyspark.mllib.linalg import Vectors +from pyspark.mllib.linalg.distributed import RowMatrix +# $example off$ + +if __name__ == "__main__": + sc = SparkContext(appName="PythonPCAOnRowMatrixExample") + + # $example on$ + rows = sc.parallelize([ + Vectors.sparse(5, {1: 1.0, 3: 7.0}), + Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0), + Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0) + ]) + + mat = RowMatrix(rows) + # Compute the top 4 principal components. + # Principal components are stored in a local dense matrix. + pc = mat.computePrincipalComponents(4) + + # Project the rows to the linear space spanned by the top 4 principal components. + projected = mat.multiply(pc) + # $example off$ + collected = projected.rows.collect() + print("Projected Row Matrix of principal component:") + for vector in collected: + print vector + sc.stop() diff --git a/examples/src/main/python/mllib/svd_example.py b/examples/src/main/python/mllib/svd_example.py new file mode 100644 index 000000000000..09085139f17f --- /dev/null +++ b/examples/src/main/python/mllib/svd_example.py @@ -0,0 +1,48 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from pyspark import SparkContext +# $example on$ +from pyspark.mllib.linalg import Vectors +from pyspark.mllib.linalg.distributed import RowMatrix +# $example off$ + +if __name__ == "__main__": + sc = SparkContext(appName="PythonSVDExample") + + # $example on$ + rows = sc.parallelize([ + Vectors.sparse(5, {1: 1.0, 3: 7.0}), + Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0), + Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0) + ]) + + mat = RowMatrix(rows) + + # Compute the top 5 singular values and corresponding singular vectors. + svd = mat.computeSVD(5, computeU=True) + U = svd.U # The U factor is a RowMatrix. + s = svd.s # The singular values are stored in a local dense vector. + V = svd.V # The V factor is a local dense matrix. 
+ # $example off$ + collected = U.rows.collect() + print("U factor is:") + for vector in collected: + print vector + print("Singular values are: %s" % s) + print("V factor is:\n%s" % V) + sc.stop() diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/PCAOnRowMatrixExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/PCAOnRowMatrixExample.scala index a137ba2a2f9d..da43a8d9c7e8 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/PCAOnRowMatrixExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/PCAOnRowMatrixExample.scala @@ -39,9 +39,9 @@ object PCAOnRowMatrixExample { Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0), Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0)) - val dataRDD = sc.parallelize(data, 2) + val rows = sc.parallelize(data) - val mat: RowMatrix = new RowMatrix(dataRDD) + val mat: RowMatrix = new RowMatrix(rows) // Compute the top 4 principal components. // Principal components are stored in a local dense matrix. diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/SVDExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/SVDExample.scala index b286a3f7b909..769ae2a3a88b 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/SVDExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/SVDExample.scala @@ -28,6 +28,9 @@ import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.linalg.distributed.RowMatrix // $example off$ +/** + * Example for SingularValueDecomposition. + */ object SVDExample { def main(args: Array[String]): Unit = { @@ -41,15 +44,15 @@ object SVDExample { Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0), Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0)) - val dataRDD = sc.parallelize(data, 2) + val rows = sc.parallelize(data) - val mat: RowMatrix = new RowMatrix(dataRDD) + val mat: RowMatrix = new RowMatrix(rows) // Compute the top 5 singular values and corresponding singular vectors. val svd: SingularValueDecomposition[RowMatrix, Matrix] = mat.computeSVD(5, computeU = true) val U: RowMatrix = svd.U // The U factor is a RowMatrix. - val s: Vector = svd.s // The singular values are stored in a local dense vector. - val V: Matrix = svd.V // The V factor is a local dense matrix. + val s: Vector = svd.s // The singular values are stored in a local dense vector. + val V: Matrix = svd.V // The V factor is a local dense matrix. // $example off$ val collect = U.rows.collect() println("U factor is:") diff --git a/python/pyspark/mllib/linalg/distributed.py b/python/pyspark/mllib/linalg/distributed.py index 5a4d147985dd..413a83dd41f2 100644 --- a/python/pyspark/mllib/linalg/distributed.py +++ b/python/pyspark/mllib/linalg/distributed.py @@ -33,9 +33,8 @@ from pyspark.storagelevel import StorageLevel -__all__ = ['DistributedMatrix', 'RowMatrix', 'IndexedRow', - 'IndexedRowMatrix', 'MatrixEntry', 'CoordinateMatrix', - 'BlockMatrix'] +__all__ = ['BlockMatrix', 'CoordinateMatrix', 'DistributedMatrix', 'IndexedRow', + 'IndexedRowMatrix', 'MatrixEntry', 'RowMatrix', 'SingularValueDecomposition'] class DistributedMatrix(object): @@ -301,6 +300,7 @@ def tallSkinnyQR(self, computeQ=False): R = decomp.call("R") return QRDecomposition(Q, R) + @since('2.2.0') def computeSVD(self, k, computeU=False, rCond=1e-9): """ Computes the singular value decomposition of the RowMatrix. @@ -316,18 +316,22 @@ def computeSVD(self, k, computeU=False, rCond=1e-9): are the eigenvectors of (A' X A) For more specific details on implementation, please refer - the scala documentation. 
+ the Scala documentation. - :param k: Set the number of singular values to keep. + :param k: Number of leading singular values to keep (`0 < k <= n`). + It might return less than k if there are numerically zero singular values + or there are not enough Ritz values converged before the maximum number of + Arnoldi update iterations is reached (in case that matrix A is ill-conditioned). :param computeU: Whether or not to compute U. If set to be True, then U is computed by A * V * s^-1 :param rCond: Reciprocal condition number. All singular values smaller than rCond * s[0] are treated as zero where s[0] is the largest singular value. - :returns: SingularValueDecomposition object + :returns: :py:class:`SingularValueDecomposition` - >>> data = [(3, 1, 1), (-1, 3, 1)] - >>> rm = RowMatrix(sc.parallelize(data)) + >>> rows = sc.parallelize[[3, 1, 1], [-1, 3, 1]] + >>> rm = RowMatrix(rows) + >>> svd_model = rm.computeSVD(2, True) >>> svd_model.U.rows.collect() [DenseVector([-0.7071, 0.7071]), DenseVector([-0.7071, -0.7071])] @@ -340,15 +344,18 @@ def computeSVD(self, k, computeU=False, rCond=1e-9): "computeSVD", int(k), bool(computeU), float(rCond)) return SingularValueDecomposition(j_model) + @since('2.2.0') def computePrincipalComponents(self, k): """ Computes the k principal components of the given row matrix + + .. note:: This cannot be computed on matrices with more than 65535 columns. :param k: Number of principal components to keep. - :returns: DenseMatrix + :returns: :py:class:`pyspark.mllib.linalg.DenseMatrix` - >>> data = sc.parallelize([[1, 2, 3], [2, 4, 5], [3, 6, 1]]) - >>> rm = RowMatrix(data) + >>> rows = sc.parallelize([[1, 2, 3], [2, 4, 5], [3, 6, 1]]) + >>> rm = RowMatrix(rows) >>> # Returns the two principal components of rm >>> pca = rm.computePrincipalComponents(2) @@ -381,14 +388,18 @@ def multiply(self, matrix): class SingularValueDecomposition(JavaModelWrapper): - """Wrapper around the SingularValueDecomposition scala case class""" + """ + Represents singular value decomposition (SVD) factors. + + .. versionadded:: 2.2.0 + """ @property + @since('2.2.0') def U(self): """ Returns a distributed matrix whose columns are the left - singular vectors of the SingularValueDecomposition if - computeU was set to be True. + singular vectors of the SingularValueDecomposition if computeU was set to be True. """ u = self.call("U") if u is not None: @@ -401,14 +412,15 @@ def U(self): raise TypeError("Expected RowMatrix/IndexedRowMatrix got %s" % mat_name) @property + @since('2.2.0') def s(self): """ - Returns a DenseVector with singular values in - descending order. + Returns a DenseVector with singular values in descending order. """ return self.call("s") @property + @since('2.2.0') def V(self): """ Returns a DenseMatrix whose columns are the right singular @@ -643,6 +655,7 @@ def toBlockMatrix(self, rowsPerBlock=1024, colsPerBlock=1024): colsPerBlock) return BlockMatrix(java_block_matrix, rowsPerBlock, colsPerBlock) + @since('2.2.0') def computeSVD(self, k, computeU=False, rCond=1e-9): """ Computes the singular value decomposition of the IndexedRowMatrix. @@ -660,7 +673,10 @@ def computeSVD(self, k, computeU=False, rCond=1e-9): For more specific details on implementation, please refer the scala documentation. - :param k: Set the number of singular values to keep. + :param k: Number of leading singular values to keep (`0 < k <= n`). 
+ It might return less than k if there are numerically zero singular values + or there are not enough Ritz values converged before the maximum number of + Arnoldi update iterations is reached (in case that matrix A is ill-conditioned). :param computeU: Whether or not to compute U. If set to be True, then U is computed by A * V * s^-1 :param rCond: Reciprocal condition number. All singular values @@ -668,8 +684,8 @@ def computeSVD(self, k, computeU=False, rCond=1e-9): where s[0] is the largest singular value. :returns: SingularValueDecomposition object - >>> data = [(0, (3, 1, 1)), (1, (-1, 3, 1))] - >>> irm = IndexedRowMatrix(sc.parallelize(data)) + >>> rows = [(0, (3, 1, 1)), (1, (-1, 3, 1))] + >>> irm = IndexedRowMatrix(sc.parallelize(rows)) >>> svd_model = irm.computeSVD(2, True) >>> svd_model.U.rows.collect() # doctest: +NORMALIZE_WHITESPACE [IndexedRow(0, [-0.707106781187,0.707106781187]),\ From 3ea88e27aa2d91adda7457afc64b8b5839e6396c Mon Sep 17 00:00:00 2001 From: Nick Pentreath Date: Wed, 12 Apr 2017 12:08:14 +0200 Subject: [PATCH 09/13] Parens for print statements --- examples/src/main/python/mllib/pca_rowmatrix_example.py | 2 +- examples/src/main/python/mllib/svd_example.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/src/main/python/mllib/pca_rowmatrix_example.py b/examples/src/main/python/mllib/pca_rowmatrix_example.py index 9b20f11b8651..49b9b1bbe08e 100644 --- a/examples/src/main/python/mllib/pca_rowmatrix_example.py +++ b/examples/src/main/python/mllib/pca_rowmatrix_example.py @@ -42,5 +42,5 @@ collected = projected.rows.collect() print("Projected Row Matrix of principal component:") for vector in collected: - print vector + print(vector) sc.stop() diff --git a/examples/src/main/python/mllib/svd_example.py b/examples/src/main/python/mllib/svd_example.py index 09085139f17f..5b220fdb3fd6 100644 --- a/examples/src/main/python/mllib/svd_example.py +++ b/examples/src/main/python/mllib/svd_example.py @@ -42,7 +42,7 @@ collected = U.rows.collect() print("U factor is:") for vector in collected: - print vector + print(vector) print("Singular values are: %s" % s) print("V factor is:\n%s" % V) sc.stop() From 0118f2c0839a45ea68a65a639efaf9fc2eab4883 Mon Sep 17 00:00:00 2001 From: Nick Pentreath Date: Wed, 12 Apr 2017 12:21:33 +0200 Subject: [PATCH 10/13] PEP8 fixes --- python/pyspark/mllib/linalg/distributed.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/pyspark/mllib/linalg/distributed.py b/python/pyspark/mllib/linalg/distributed.py index 413a83dd41f2..a12b95c89c4a 100644 --- a/python/pyspark/mllib/linalg/distributed.py +++ b/python/pyspark/mllib/linalg/distributed.py @@ -331,7 +331,7 @@ def computeSVD(self, k, computeU=False, rCond=1e-9): >>> rows = sc.parallelize[[3, 1, 1], [-1, 3, 1]] >>> rm = RowMatrix(rows) - + >>> svd_model = rm.computeSVD(2, True) >>> svd_model.U.rows.collect() [DenseVector([-0.7071, 0.7071]), DenseVector([-0.7071, -0.7071])] @@ -348,7 +348,7 @@ def computeSVD(self, k, computeU=False, rCond=1e-9): def computePrincipalComponents(self, k): """ Computes the k principal components of the given row matrix - + .. note:: This cannot be computed on matrices with more than 65535 columns. :param k: Number of principal components to keep. @@ -390,7 +390,7 @@ def multiply(self, matrix): class SingularValueDecomposition(JavaModelWrapper): """ Represents singular value decomposition (SVD) factors. - + .. 
versionadded:: 2.2.0 """ From 56205d2c7d9f0ff1a803bc2e2bcdf435ceee6f82 Mon Sep 17 00:00:00 2001 From: Nick Pentreath Date: Wed, 12 Apr 2017 16:52:08 +0200 Subject: [PATCH 11/13] Fix incorrect bracket --- python/pyspark/mllib/linalg/distributed.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyspark/mllib/linalg/distributed.py b/python/pyspark/mllib/linalg/distributed.py index a12b95c89c4a..2fefa201fc22 100644 --- a/python/pyspark/mllib/linalg/distributed.py +++ b/python/pyspark/mllib/linalg/distributed.py @@ -329,7 +329,7 @@ def computeSVD(self, k, computeU=False, rCond=1e-9): where s[0] is the largest singular value. :returns: :py:class:`SingularValueDecomposition` - >>> rows = sc.parallelize[[3, 1, 1], [-1, 3, 1]] + >>> rows = sc.parallelize([[3, 1, 1], [-1, 3, 1]]) >>> rm = RowMatrix(rows) >>> svd_model = rm.computeSVD(2, True) From 07808fc98609420f1c12ee131ad5e48204704a93 Mon Sep 17 00:00:00 2001 From: Nick Pentreath Date: Mon, 24 Apr 2017 13:46:09 +0200 Subject: [PATCH 12/13] Add since tag and update doc string for multiply methods --- python/pyspark/mllib/linalg/distributed.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/python/pyspark/mllib/linalg/distributed.py b/python/pyspark/mllib/linalg/distributed.py index 2fefa201fc22..2d6d921a6930 100644 --- a/python/pyspark/mllib/linalg/distributed.py +++ b/python/pyspark/mllib/linalg/distributed.py @@ -369,12 +369,14 @@ def computePrincipalComponents(self, k): """ return self._java_matrix_wrapper.call("computePrincipalComponents", k) + @since('2.2.0') def multiply(self, matrix): """ - Multiplies the given RowMatrix with another matrix. + Multiply this matrix by a local dense matrix on the right. - :param matrix: Matrix to multiply with. - :returns: RowMatrix + :param matrix: a local dense matrix whose number of rows must match the number of columns + of this matrix + :returns: :py:class:`RowMatrix` >>> rm = RowMatrix(sc.parallelize([[0, 1], [2, 3]])) >>> rm.multiply(DenseMatrix(2, 2, [0, 2, 1, 3])).rows.collect() @@ -699,12 +701,14 @@ def computeSVD(self, k, computeU=False, rCond=1e-9): "computeSVD", int(k), bool(computeU), float(rCond)) return SingularValueDecomposition(j_model) + @since('2.2.0') def multiply(self, matrix): """ - Multiplies the given IndexedRowMatrix with another matrix. + Multiply this matrix by a local dense matrix on the right. - :param matrix: Matrix to multiply with. - :returns: IndexedRowMatrix + :param matrix: a local dense matrix whose number of rows must match the number of columns + of this matrix + :returns: :py:class:`IndexedRowMatrix` >>> mat = IndexedRowMatrix(sc.parallelize([(0, (0, 1)), (1, (2, 3))])) >>> mat.multiply(DenseMatrix(2, 2, [0, 2, 1, 3])).rows.collect() From 94006a404aeb2c9b05643080eece64f0506ebafd Mon Sep 17 00:00:00 2001 From: Nick Pentreath Date: Mon, 24 Apr 2017 13:56:41 +0200 Subject: [PATCH 13/13] Fix doc indent --- python/pyspark/mllib/linalg/distributed.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/pyspark/mllib/linalg/distributed.py b/python/pyspark/mllib/linalg/distributed.py index 2d6d921a6930..4cb802514be5 100644 --- a/python/pyspark/mllib/linalg/distributed.py +++ b/python/pyspark/mllib/linalg/distributed.py @@ -375,7 +375,7 @@ def multiply(self, matrix): Multiply this matrix by a local dense matrix on the right. 
:param matrix: a local dense matrix whose number of rows must match the number of columns - of this matrix + of this matrix :returns: :py:class:`RowMatrix` >>> rm = RowMatrix(sc.parallelize([[0, 1], [2, 3]])) @@ -707,7 +707,7 @@ def multiply(self, matrix): Multiply this matrix by a local dense matrix on the right. :param matrix: a local dense matrix whose number of rows must match the number of columns - of this matrix + of this matrix :returns: :py:class:`IndexedRowMatrix` >>> mat = IndexedRowMatrix(sc.parallelize([(0, (0, 1)), (1, (2, 3))]))
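
The series adds a standalone example file only for the RowMatrix path; the IndexedRowMatrix path from patches 04 and 07 is covered by doctests alone. For completeness, a minimal sketch of that path, under the same assumption of a PySpark shell providing `sc`:

    from pyspark.mllib.linalg import DenseMatrix
    from pyspark.mllib.linalg.distributed import IndexedRowMatrix

    irm = IndexedRowMatrix(sc.parallelize([(0, (3, 1, 1)), (1, (-1, 3, 1))]))

    # computeSVD mirrors the RowMatrix version, but U comes back as an
    # IndexedRowMatrix, so the row indices survive the factorization.
    svd = irm.computeSVD(2, computeU=True)
    print(svd.U.rows.collect())  # IndexedRow(0, ...), IndexedRow(1, ...)
    print(svd.s)                 # DenseVector([3.4641, 3.1623])

    # multiply() accepts only a local DenseMatrix; since patch 07 it
    # raises a ValueError for any other matrix type.
    projected = irm.multiply(DenseMatrix(3, 2, [0, 2, 4, 1, 3, 5]))
    print(projected.rows.collect())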