From 6248d0e85bdb69ac64fd664049b38819cb021daf Mon Sep 17 00:00:00 2001
From: MechCoder
Date: Wed, 5 Aug 2015 18:38:36 +0530
Subject: [PATCH 01/13] [SPARK-6227] [MLlib] [PySpark] Implement PySpark wrappers for SVD

---
 python/pyspark/mllib/linalg/distributed.py | 64 ++++++++++++++++++++++
 1 file changed, 64 insertions(+)

diff --git a/python/pyspark/mllib/linalg/distributed.py b/python/pyspark/mllib/linalg/distributed.py
index ea4f27cf4ffe..612442e59a73 100644
--- a/python/pyspark/mllib/linalg/distributed.py
+++ b/python/pyspark/mllib/linalg/distributed.py
@@ -304,6 +304,70 @@ def tallSkinnyQR(self, computeQ=False):
 
         return QRDecomposition(Q, R)
 
+    def computeSVD(self, k, computeU=False, rCond=1e-9):
+        """
+        Computes the singular value decomposition of the RowMatrix.
+
+        The given row matrix A of dimension (m X n) is decomposed into U * s * V^T where
+
+        * U: (m X k) (left singular vectors) is a RowMatrix whose columns are the
+          eigenvectors of (A X A')
+        * s: DenseVector consisting of the square roots of the eigenvalues (singular values)
+          in descending order.
+        * V: (n X k) (right singular vectors) is a Matrix whose columns are the
+          eigenvectors of (A' X A)
+
+        For more specific details on implementation, please refer the scala documentation.
+
+        :param k: Set the number of singular values to keep.
+        :param computeU: Whether of not to compute U. If set to be True, then U is computed
+                         by A * V * s^-1
+        :param rCond: Reciprocal condition number. All singular values smaller than
+                      rCond * s[0] are treated as zero, where s[0] is the largest
+                      singular value.
+        :returns: SingularValueDecomposition object
+
+        >>> data = [(3, 1, 1), (-1, 3, 1)]
+        >>> rm = RowMatrix(sc.parallelize(data))
+        >>> svd_model = rm.computeSVD(2, True)
+        >>> svd_model.U.rows.collect()
+        [DenseVector([-0.7071, 0.7071]), DenseVector([-0.7071, -0.7071])]
+        >>> svd_model.s
+        DenseVector([3.4641, 3.1623])
+        >>> svd_model.V
+        DenseMatrix(3, 2, [-0.4082, -0.8165, -0.4082, 0.8944, -0.4472, 0.0], 0)
+        """
+        j_model = self._java_matrix_wrapper.call("computeSVD", int(k), computeU, float(rCond))
+        return SingularValueDecomposition(j_model)
+
+
+class SingularValueDecomposition(JavaModelWrapper):
+    """Wrapper around the SingularValueDecomposition Java case class"""
+
+    @property
+    def U(self):
+        """
+        Returns a RowMatrix whose columns are the left singular vectors of the
+        SingularValueDecomposition if computeU was set to be True.
+        """
+        u = self.call("U")
+        if u is not None:
+            return RowMatrix(u)
+
+    @property
+    def s(self):
+        """Returns a DenseVector with singular values in descending order."""
+        return self.call("s")
+
+    @property
+    def V(self):
+        """
+        Returns a DenseMatrix whose columns are the right singular vectors of the
+        SingularValueDecomposition.
+        """
+        return self.call("V")
+
+
 class IndexedRow(object):
     """
    .. 
note:: Experimental From 921d5b6ea7d74032dcf1b9e7c41db22a090de547 Mon Sep 17 00:00:00 2001 From: MechCoder Date: Wed, 5 Aug 2015 23:25:13 +0530 Subject: [PATCH 02/13] Add PCA Wrappers --- python/pyspark/mllib/linalg/distributed.py | 33 +++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) diff --git a/python/pyspark/mllib/linalg/distributed.py b/python/pyspark/mllib/linalg/distributed.py index 612442e59a73..0ba58460ace5 100644 --- a/python/pyspark/mllib/linalg/distributed.py +++ b/python/pyspark/mllib/linalg/distributed.py @@ -340,9 +340,40 @@ def computeSVD(self, k, computeU=False, rCond=1e-9): j_model = self._java_matrix_wrapper.call("computeSVD", int(k), computeU, float(rCond)) return SingularValueDecomposition(j_model) + def computePrincipalComponents(self, k): + """ + Computes the k principal components of the given row matrix + + :param k: Number of principal components to keep. + :returns: DenseMatrix + + >>> data = sc.parallelize([[1, 2, 3], [2, 4, 5], [3, 6, 1]]) + >>> rm = RowMatrix(data) + + >>> # Returns the two principal components of rm + >>> pca = rm.computePrincipalComponents(2) + >>> pca + DenseMatrix(3, 2, [-0.349, -0.6981, 0.6252, -0.2796, -0.5592, -0.7805], 0) + + >>> # Transform into new dimensions with the greatest variance. + >>> rm.multiply(pca).rows.collect() # doctest: +NORMALIZE_WHITESPACE + [DenseVector([0.1305, -3.7394]), DenseVector([-0.3642, -6.6983]), \ + DenseVector([-4.6102, -4.9745])] + """ + return self._java_matrix_wrapper.call("computePrincipalComponents", k) + + def multiply(self, matrix): + """ + Multiplies the given row matrix with another matrix. + + :param matrix: Matrix to multiply with. + :returns: RowMatrix + """ + return RowMatrix(self._java_matrix_wrapper.call("multiply", matrix)) + class SingularValueDecomposition(JavaModelWrapper): - """Wrapper around the SingularValueDecomposition Java case class""" + """Wrapper around the SingularValueDecomposition scala case class""" @property def U(self): From 5558fa9bffca7d81b0f35152512cd3692455ec99 Mon Sep 17 00:00:00 2001 From: MechCoder Date: Wed, 5 Aug 2015 23:48:06 +0530 Subject: [PATCH 03/13] Added docs --- docs/mllib-dimensionality-reduction.md | 44 ++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/docs/mllib-dimensionality-reduction.md b/docs/mllib-dimensionality-reduction.md index cceddce9f79a..b0261aa1db21 100644 --- a/docs/mllib-dimensionality-reduction.md +++ b/docs/mllib-dimensionality-reduction.md @@ -83,6 +83,26 @@ Applications](quick-start.html#self-contained-applications) section of the Spark quick-start guide. Be sure to also include *spark-mllib* to your build file as a dependency. + +
+{% highlight scala %} +from pyspark.mllib.linalg import Matrix +from pyspark.mllib.linalg.distributed import RowMatrix +from numpy.random import RandomState + +# Generate random data with 50 samples and 30 features. +rng = RandomState(0) +mat = RowMatrix(sc.parallelize(rng.randn(50, 30))) + +# Compute the top 20 singular values and corresponding singular vectors. +svd = mat.computeSVD(20, computeU=True) +u = svd.U # The U factor is a RowMatrix. +s = svd.s # The singular values are stored in a local dense vector. +V = svd.V # The V factor is a local dense matrix. +{% endhighlight %} + +The same code applies to `IndexedRowMatrix` if `U` is defined as an +`IndexedRowMatrix`.
@@ -124,6 +144,30 @@ Refer to the [`RowMatrix` Java docs](api/java/org/apache/spark/mllib/linalg/dist {% include_example java/org/apache/spark/examples/mllib/JavaPCAExample.java %} + + +
+ +The following code demonstrates how to compute principal components on a `RowMatrix` +and use them to project the vectors into a low-dimensional space. + +{% highlight python %} +from pyspark.mllib.linalg import Matrix +from pyspark.mllib.linalg.distributed import RowMatrix +from numpy.random import RandomState + +# Generate random data with 50 samples and 30 features. +rng = RandomState(0) +mat = sc.parallelize(rng.randn(50, 30)) +rm = RowMatrix(mat) + +# Compute the top 10 principal components. +pc = rm.computePrincipalComponents(10) # Principal components are stored in a local dense matrix. + +# Project the rows to the linear space spanned by the top 10 principal components. +projected = rm.multiply(pc) +{% endhighlight %} +
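
Taken together, patches 01-03 already support an end-to-end session. The sketch below is illustrative only and not part of the patch series: it assumes a PySpark shell (which provides the SparkContext `sc`) running with these patches applied, and it reuses the toy data from the doctests above.

    from pyspark.mllib.linalg.distributed import RowMatrix

    rm = RowMatrix(sc.parallelize([(3, 1, 1), (-1, 3, 1)]))

    # Truncated SVD from patch 01: keep the top k = 2 singular values;
    # computeU=True additionally materializes the distributed U factor.
    svd = rm.computeSVD(2, computeU=True)
    print(svd.s)                 # DenseVector([3.4641, 3.1623])
    print(svd.V)                 # local DenseMatrix of right singular vectors
    print(svd.U.rows.collect())  # rows of the distributed U factor

    # PCA from patch 02: top 2 principal components, then project the
    # rows onto them with multiply().
    mat = RowMatrix(sc.parallelize([[1, 2, 3], [2, 4, 5], [3, 6, 1]]))
    pc = mat.computePrincipalComponents(2)  # local DenseMatrix
    projected = mat.multiply(pc)            # RowMatrix in the new basis
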
From 50ed70011d47521a4a20e415afd87736f357876e Mon Sep 17 00:00:00 2001 From: MechCoder Date: Thu, 6 Aug 2015 00:31:08 +0530 Subject: [PATCH 04/13] Add support for multiply and computeSVD in IRM --- docs/mllib-dimensionality-reduction.md | 6 +- python/pyspark/mllib/linalg/distributed.py | 72 ++++++++++++++++++++-- 2 files changed, 70 insertions(+), 8 deletions(-) diff --git a/docs/mllib-dimensionality-reduction.md b/docs/mllib-dimensionality-reduction.md index b0261aa1db21..190edf9c9122 100644 --- a/docs/mllib-dimensionality-reduction.md +++ b/docs/mllib-dimensionality-reduction.md @@ -85,7 +85,7 @@ a dependency.
-{% highlight scala %}
+{% highlight python %}
 from pyspark.mllib.linalg import Matrix
 from pyspark.mllib.linalg.distributed import RowMatrix
 from numpy.random import RandomState
@@ -161,8 +161,8 @@ rng = RandomState(0)
 mat = sc.parallelize(rng.randn(50, 30))
 rm = RowMatrix(mat)
 
-# Compute the top 10 principal components.
-pc = rm.computePrincipalComponents(10) # Principal components are stored in a local dense matrix.
+# Compute the top 10 principal components stored in a local dense matrix.
+pc = rm.computePrincipalComponents(10)
 
 # Project the rows to the linear space spanned by the top 10 principal components.
 projected = rm.multiply(pc)
diff --git a/python/pyspark/mllib/linalg/distributed.py b/python/pyspark/mllib/linalg/distributed.py
index 0ba58460ace5..4c152ec04c0b 100644
--- a/python/pyspark/mllib/linalg/distributed.py
+++ b/python/pyspark/mllib/linalg/distributed.py
@@ -28,7 +28,7 @@
 
 from pyspark import RDD, since
 from pyspark.mllib.common import callMLlibFunc, JavaModelWrapper
-from pyspark.mllib.linalg import _convert_to_vector, Matrix, QRDecomposition
+from pyspark.mllib.linalg import _convert_to_vector, DenseMatrix, Matrix, QRDecomposition
 from pyspark.mllib.stat import MultivariateStatisticalSummary
 from pyspark.storagelevel import StorageLevel
 
@@ -364,12 +364,18 @@ def computePrincipalComponents(self, k):
 
     def multiply(self, matrix):
         """
-        Multiplies the given row matrix with another matrix.
+        Multiplies the given RowMatrix with another matrix.
 
         :param matrix: Matrix to multiply with.
         :returns: RowMatrix
+
+        >>> rm = RowMatrix(sc.parallelize([[0, 1], [2, 3]]))
+        >>> rm.multiply(DenseMatrix(2, 2, [0, 2, 1, 3])).rows.collect()
+        [DenseVector([2.0, 3.0]), DenseVector([6.0, 11.0])]
         """
-        return RowMatrix(self._java_matrix_wrapper.call("multiply", matrix))
+        # TODO: Only DenseMatrices are supported on the Scala side.
+        j_model = self._java_matrix_wrapper.call("multiply", matrix)
+        return RowMatrix(j_model)
 
 
 class SingularValueDecomposition(JavaModelWrapper):
@@ -378,12 +384,18 @@ class SingularValueDecomposition(JavaModelWrapper):
 
     @property
     def U(self):
         """
-        Returns a RowMatrix whose columns are the left singular vectors of the
+        Returns a distributed matrix whose columns are the left singular vectors of the
         SingularValueDecomposition if computeU was set to be True.
         """
         u = self.call("U")
         if u is not None:
-            return RowMatrix(u)
+            mat_name = u.getClass().getSimpleName()
+            if mat_name == "RowMatrix":
+                return RowMatrix(u)
+            elif mat_name == "IndexedRowMatrix":
+                return IndexedRowMatrix(u)
+            else:
+                raise TypeError("Expected RowMatrix/IndexedRowMatrix got %s" % mat_name)
 
     @property
     def s(self):
@@ -628,6 +640,56 @@ def toBlockMatrix(self, rowsPerBlock=1024, colsPerBlock=1024):
                                                            colsPerBlock)
         return BlockMatrix(java_block_matrix, rowsPerBlock, colsPerBlock)
 
+    def computeSVD(self, k, computeU=False, rCond=1e-9):
+        """
+        Computes the singular value decomposition of the IndexedRowMatrix.
+
+        The given row matrix A of dimension (m X n) is decomposed into U * s * V^T where
+
+        * U: (m X k) (left singular vectors) is an IndexedRowMatrix whose columns are the
+          eigenvectors of (A X A')
+        * s: DenseVector consisting of the square roots of the eigenvalues (singular values)
+          in descending order.
+        * V: (n X k) (right singular vectors) is a Matrix whose columns are the
+          eigenvectors of (A' X A)
+
+        For more specific details on implementation, please refer the scala documentation.
+
+        :param k: Set the number of singular values to keep.
+        :param computeU: Whether of not to compute U. 
If set to be True, then U is computed + by A * V * s^-1 + :param rCond: Reciprocal condition number. All singular values smaller than + rCond * s[0] are treated as zero, where s[0] is the largest + singular value. + :returns: SingularValueDecomposition object + + >>> data = [(0, (3, 1, 1)), (1, (-1, 3, 1))] + >>> irm = IndexedRowMatrix(sc.parallelize(data)) + >>> svd_model = irm.computeSVD(2, True) + >>> svd_model.U.rows.collect() # doctest: +NORMALIZE_WHITESPACE + [IndexedRow(0, [-0.707106781187,0.707106781187]),\ + IndexedRow(1, [-0.707106781187,-0.707106781187])] + >>> svd_model.s + DenseVector([3.4641, 3.1623]) + >>> svd_model.V + DenseMatrix(3, 2, [-0.4082, -0.8165, -0.4082, 0.8944, -0.4472, 0.0], 0) + """ + j_model = self._java_matrix_wrapper.call("computeSVD", int(k), computeU, float(rCond)) + return SingularValueDecomposition(j_model) + + def multiply(self, matrix): + """ + Multiplies the given IndexedRowMatrix with another matrix. + + :param matrix: Matrix to multiply with. + :returns: IndexedRowMatrix + + >>> data = IndexedRowMatrix(sc.parallelize([(0, (0, 1)), (1, (2, 3))])) + >>> data.multiply(DenseMatrix(2, 2, [0, 2, 1, 3])).rows.collect() + [IndexedRow(0, [2.0,3.0]), IndexedRow(1, [6.0,11.0])] + """ + return IndexedRowMatrix(self._java_matrix_wrapper.call("multiply", matrix)) + class MatrixEntry(object): """ From 59f53d51cda6f9744fd2eed3cd633f1fecf11063 Mon Sep 17 00:00:00 2001 From: MechCoder Date: Thu, 6 Aug 2015 02:26:56 +0530 Subject: [PATCH 05/13] Added tests --- python/pyspark/mllib/tests.py | 63 +++++++++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) diff --git a/python/pyspark/mllib/tests.py b/python/pyspark/mllib/tests.py index 74cf7bb8eaf9..3219619bdc17 100644 --- a/python/pyspark/mllib/tests.py +++ b/python/pyspark/mllib/tests.py @@ -23,6 +23,7 @@ import sys import tempfile import array as pyarray +from math import sqrt from time import time, sleep from shutil import rmtree @@ -53,6 +54,7 @@ from pyspark.mllib.clustering import StreamingKMeans, StreamingKMeansModel from pyspark.mllib.linalg import Vector, SparseVector, DenseVector, VectorUDT, _convert_to_vector,\ DenseMatrix, SparseMatrix, Vectors, Matrices, MatrixUDT +from pyspark.mllib.linalg.distributed import RowMatrix from pyspark.mllib.classification import StreamingLogisticRegressionWithSGD from pyspark.mllib.recommendation import Rating from pyspark.mllib.regression import LabeledPoint, StreamingLinearRegressionWithSGD @@ -1608,6 +1610,67 @@ def test_binary_term_freqs(self): ": expected " + str(expected[i]) + ", got " + str(output[i])) +class DimensionalityReductionTests(MLlibTestCase): + + denseData = [ + Vectors.dense([0.0, 1.0, 2.0]), + Vectors.dense([3.0, 4.0, 5.0]), + Vectors.dense([6.0, 7.0, 8.0]), + Vectors.dense([9.0, 0.0, 1.0]) + ] + sparseData = [ + Vectors.sparse(3, [(1, 1.0), (2, 2.0)]), + Vectors.sparse(3, [(0, 3.0), (1, 4.0), (2, 5.0)]), + Vectors.sparse(3, [(0, 6.0), (1, 7.0), (2, 8.0)]), + Vectors.sparse(3, [(0, 9.0), (2, 1.0)]) + ] + + def assertEqualUpToSign(self, vecA, vecB): + eq1 = vecA - vecB + eq2 = vecA + vecB + self.assertTrue(sum(abs(eq1)) < 1e-6 or sum(abs(eq2)) < 1e-6) + + def test_svd(self): + denseMat = RowMatrix(self.sc.parallelize(self.denseData)) + sparseMat = RowMatrix(self.sc.parallelize(self.sparseData)) + m = 4 + n = 3 + for mat in [denseMat, sparseMat]: + for k in range(1, 4): + rm = mat.computeSVD(k, computeU=True) + self.assertEqual(rm.s.size, k) + self.assertEqual(rm.U.numRows(), m) + self.assertEqual(rm.U.numCols(), k) + self.assertEqual(rm.V.numRows, 
n) + self.assertEqual(rm.V.numCols, k) + + # Test that U returned is None if computeU is set to False. + self.assertEqual(mat.computeSVD(1).U, None) + + # Test that low rank matrices cannot have number of singular values + # greater than a limit. + rm = RowMatrix(self.sc.parallelize(tile([1, 2, 3], (3, 1)))) + self.assertEqual(rm.computeSVD(3, False, 1e-6).s.size, 1) + + def test_pca(self): + expected_pcs = array([ + [0.0, 1.0, 0.0], + [sqrt(2.0) / 2.0, 0.0, sqrt(2.0) / 2.0], + [sqrt(2.0) / 2.0, 0.0, -sqrt(2.0) / 2.0] + ]) + n = 3 + denseMat = RowMatrix(self.sc.parallelize(self.denseData)) + sparseMat = RowMatrix(self.sc.parallelize(self.sparseData)) + for mat in [denseMat, sparseMat]: + for k in range(1, 4): + pcs = mat.computePrincipalComponents(k) + self.assertEqual(pcs.numRows, n) + self.assertEqual(pcs.numCols, k) + + # We can just test the updated principal component for equality. + self.assertEqualUpToSign(pcs.toArray()[:, k - 1], expected_pcs[:, k - 1]) + + if __name__ == "__main__": from pyspark.mllib.tests import * if not _have_scipy: From 70a871d189d92c41e477e6f426e12a9fbb549ce7 Mon Sep 17 00:00:00 2001 From: MechCoder Date: Mon, 10 Aug 2015 13:15:46 +0530 Subject: [PATCH 06/13] minor changes to doc --- docs/mllib-dimensionality-reduction.md | 6 +- python/pyspark/mllib/linalg/distributed.py | 80 ++++++++++++---------- 2 files changed, 47 insertions(+), 39 deletions(-) diff --git a/docs/mllib-dimensionality-reduction.md b/docs/mllib-dimensionality-reduction.md index 190edf9c9122..b1b11610ddff 100644 --- a/docs/mllib-dimensionality-reduction.md +++ b/docs/mllib-dimensionality-reduction.md @@ -86,7 +86,6 @@ a dependency.
{% highlight python %}
-from pyspark.mllib.linalg import Matrix
 from pyspark.mllib.linalg.distributed import RowMatrix
 from numpy.random import RandomState
 
@@ -152,14 +151,13 @@ The following code demonstrates how to compute principal components on a `RowMat
 and use them to project the vectors into a low-dimensional space.
 
 {% highlight python %}
-from pyspark.mllib.linalg import Matrix
 from pyspark.mllib.linalg.distributed import RowMatrix
 from numpy.random import RandomState
 
 # Generate random data with 50 samples and 30 features.
 rng = RandomState(0)
-mat = sc.parallelize(rng.randn(50, 30))
-rm = RowMatrix(mat)
+data = sc.parallelize(rng.randn(50, 30))
+mat = RowMatrix(data)
 
 # Compute the top 10 principal components stored in a local dense matrix.
 pc = rm.computePrincipalComponents(10)
diff --git a/python/pyspark/mllib/linalg/distributed.py b/python/pyspark/mllib/linalg/distributed.py
index 4c152ec04c0b..dd87aada2878 100644
--- a/python/pyspark/mllib/linalg/distributed.py
+++ b/python/pyspark/mllib/linalg/distributed.py
@@ -308,23 +308,25 @@ def computeSVD(self, k, computeU=False, rCond=1e-9):
         """
         Computes the singular value decomposition of the RowMatrix.
 
-        The given row matrix A of dimension (m X n) is decomposed into U * s * V^T where
+        The given row matrix A of dimension (m X n) is decomposed into
+        U * s * V^T where
 
-        * U: (m X k) (left singular vectors) is a RowMatrix whose columns are the
-          eigenvectors of (A X A')
-        * s: DenseVector consisting of the square roots of the eigenvalues (singular values)
-          in descending order.
-        * V: (n X k) (right singular vectors) is a Matrix whose columns are the
-          eigenvectors of (A' X A)
+        * U: (m X k) (left singular vectors) is a RowMatrix whose
+          columns are the eigenvectors of (A X A')
+        * s: DenseVector consisting of the square roots of the
+          eigenvalues (singular values) in descending order.
+        * V: (n X k) (right singular vectors) is a Matrix whose columns
+          are the eigenvectors of (A' X A)
 
-        For more specific details on implementation, please refer the scala documentation.
+        For more specific details on implementation, please refer
+        the scala documentation.
 
         :param k: Set the number of singular values to keep.
-        :param computeU: Whether of not to compute U. If set to be True, then U is computed
-                         by A * V * s^-1
-        :param rCond: Reciprocal condition number. All singular values smaller than
-                      rCond * s[0] are treated as zero, where s[0] is the largest
-                      singular value.
+        :param computeU: Whether or not to compute U. If set to be
+                         True, then U is computed by A * V * s^-1
+        :param rCond: Reciprocal condition number. All singular values
+                      smaller than rCond * s[0] are treated as zero
+                      where s[0] is the largest singular value.
         :returns: SingularValueDecomposition object
 
         >>> data = [(3, 1, 1), (-1, 3, 1)]
@@ -337,7 +339,8 @@ def computeSVD(self, k, computeU=False, rCond=1e-9):
         >>> svd_model.V
         DenseMatrix(3, 2, [-0.4082, -0.8165, -0.4082, 0.8944, -0.4472, 0.0], 0)
         """
-        j_model = self._java_matrix_wrapper.call("computeSVD", int(k), computeU, float(rCond))
+        j_model = self._java_matrix_wrapper.call(
+            "computeSVD", int(k), bool(computeU), float(rCond))
         return SingularValueDecomposition(j_model)
 
     def computePrincipalComponents(self, k):
@@ -384,8 +387,9 @@ class SingularValueDecomposition(JavaModelWrapper):
 
     @property
     def U(self):
         """
-        Returns a distributed matrix whose columns are the left singular vectors of the
-        SingularValueDecomposition if computeU was set to be True.
+        Returns a distributed matrix whose columns are the left
+        singular vectors of the SingularValueDecomposition if
+        computeU was set to be True.
         """
         u = self.call("U")
         if u is not None:
@@ -399,14 +403,17 @@ def U(self):
 
     @property
     def s(self):
-        """Returns a DenseVector with singular values in descending order."""
+        """
+        Returns a DenseVector with singular values in
+        descending order.
+        """
         return self.call("s")
 
     @property
     def V(self):
         """
-        Returns a DenseMatrix whose columns are the right singular vectors of the
-        SingularValueDecomposition.
+        Returns a DenseMatrix whose columns are the right singular
+        vectors of the SingularValueDecomposition.
         """
         return self.call("V")
 
@@ -644,23 +651,25 @@ def computeSVD(self, k, computeU=False, rCond=1e-9):
         """
         Computes the singular value decomposition of the IndexedRowMatrix.
 
-        The given row matrix A of dimension (m X n) is decomposed into U * s * V^T where
+        The given row matrix A of dimension (m X n) is decomposed into
+        U * s * V^T where
 
-        * U: (m X k) (left singular vectors) is an IndexedRowMatrix whose columns are the
-          eigenvectors of (A X A')
-        * s: DenseVector consisting of the square roots of the eigenvalues (singular values)
-          in descending order.
-        * V: (n X k) (right singular vectors) is a Matrix whose columns are the
-          eigenvectors of (A' X A)
+        * U: (m X k) (left singular vectors) is an IndexedRowMatrix
+          whose columns are the eigenvectors of (A X A')
+        * s: DenseVector consisting of the square roots of the
+          eigenvalues (singular values) in descending order.
+        * V: (n X k) (right singular vectors) is a Matrix whose columns
+          are the eigenvectors of (A' X A)
 
-        For more specific details on implementation, please refer the scala documentation.
+        For more specific details on implementation, please refer
+        the scala documentation.
 
         :param k: Set the number of singular values to keep.
-        :param computeU: Whether of not to compute U. If set to be True, then U is computed
-                         by A * V * s^-1
-        :param rCond: Reciprocal condition number. All singular values smaller than
-                      rCond * s[0] are treated as zero, where s[0] is the largest
-                      singular value.
+        :param computeU: Whether or not to compute U. If set to be
+                         True, then U is computed by A * V * s^-1
+        :param rCond: Reciprocal condition number. All singular values
+                      smaller than rCond * s[0] are treated as zero
+                      where s[0] is the largest singular value.
         :returns: SingularValueDecomposition object
 
         >>> data = [(0, (3, 1, 1)), (1, (-1, 3, 1))]
@@ -674,7 +683,8 @@ def computeSVD(self, k, computeU=False, rCond=1e-9):
         >>> svd_model.V
         DenseMatrix(3, 2, [-0.4082, -0.8165, -0.4082, 0.8944, -0.4472, 0.0], 0)
         """
-        j_model = self._java_matrix_wrapper.call("computeSVD", int(k), computeU, float(rCond))
+        j_model = self._java_matrix_wrapper.call(
+            "computeSVD", int(k), bool(computeU), float(rCond))
         return SingularValueDecomposition(j_model)
 
     def multiply(self, matrix):
@@ -684,8 +694,8 @@ def multiply(self, matrix):
         """
         Multiplies the given IndexedRowMatrix with another matrix.
 
         :param matrix: Matrix to multiply with.
        
:returns: IndexedRowMatrix - >>> data = IndexedRowMatrix(sc.parallelize([(0, (0, 1)), (1, (2, 3))])) - >>> data.multiply(DenseMatrix(2, 2, [0, 2, 1, 3])).rows.collect() + >>> mat = IndexedRowMatrix(sc.parallelize([(0, (0, 1)), (1, (2, 3))])) + >>> mat.multiply(DenseMatrix(2, 2, [0, 2, 1, 3])).rows.collect() [IndexedRow(0, [2.0,3.0]), IndexedRow(1, [6.0,11.0])] """ return IndexedRowMatrix(self._java_matrix_wrapper.call("multiply", matrix)) From 0bc6a3caf25c5c5a3cfcc661432890aff2293584 Mon Sep 17 00:00:00 2001 From: MechCoder Date: Thu, 26 May 2016 21:27:09 -0400 Subject: [PATCH 07/13] Add check for DenseMatrix --- python/pyspark/mllib/linalg/distributed.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/python/pyspark/mllib/linalg/distributed.py b/python/pyspark/mllib/linalg/distributed.py index dd87aada2878..4a2b53c73179 100644 --- a/python/pyspark/mllib/linalg/distributed.py +++ b/python/pyspark/mllib/linalg/distributed.py @@ -303,7 +303,6 @@ def tallSkinnyQR(self, computeQ=False): R = decomp.call("R") return QRDecomposition(Q, R) - def computeSVD(self, k, computeU=False, rCond=1e-9): """ Computes the singular value decomposition of the RowMatrix. @@ -376,7 +375,9 @@ def multiply(self, matrix): >>> rm.multiply(DenseMatrix(2, 2, [0, 2, 1, 3])).rows.collect() [DenseVector([2.0, 3.0]), DenseVector([6.0, 11.0])] """ - # TODO: Only DenseMatrices are supported on the Scala side. + if not isinstance(matrix, DenseMatrix): + raise ValueError("Only multiplication with DenseMatrix " + "is supported.") j_model = self._java_matrix_wrapper.call("multiply", matrix) return RowMatrix(j_model) @@ -698,6 +699,9 @@ def multiply(self, matrix): >>> mat.multiply(DenseMatrix(2, 2, [0, 2, 1, 3])).rows.collect() [IndexedRow(0, [2.0,3.0]), IndexedRow(1, [6.0,11.0])] """ + if not isinstance(matrix, DenseMatrix): + raise ValueError("Only multiplication with DenseMatrix " + "is supported.") return IndexedRowMatrix(self._java_matrix_wrapper.call("multiply", matrix)) From 3ba24114c0281502c93b82e2d781bd1cdc8c5aa1 Mon Sep 17 00:00:00 2001 From: Nick Pentreath Date: Wed, 12 Apr 2017 11:36:52 +0200 Subject: [PATCH 08/13] Update examples, add Python examples, doc and since tags --- docs/mllib-dimensionality-reduction.md | 45 ++-------------- .../spark/examples/mllib/JavaPCAExample.java | 27 ++++++---- .../spark/examples/mllib/JavaSVDExample.java | 27 +++++----- .../python/mllib/pca_rowmatrix_example.py | 46 ++++++++++++++++ examples/src/main/python/mllib/svd_example.py | 48 +++++++++++++++++ .../mllib/PCAOnRowMatrixExample.scala | 4 +- .../spark/examples/mllib/SVDExample.scala | 11 ++-- python/pyspark/mllib/linalg/distributed.py | 54 ++++++++++++------- 8 files changed, 172 insertions(+), 90 deletions(-) create mode 100644 examples/src/main/python/mllib/pca_rowmatrix_example.py create mode 100644 examples/src/main/python/mllib/svd_example.py diff --git a/docs/mllib-dimensionality-reduction.md b/docs/mllib-dimensionality-reduction.md index 7a492734a344..a72680d52a26 100644 --- a/docs/mllib-dimensionality-reduction.md +++ b/docs/mllib-dimensionality-reduction.md @@ -76,29 +76,11 @@ Refer to the [`SingularValueDecomposition` Java docs](api/java/org/apache/spark/ The same code applies to `IndexedRowMatrix` if `U` is defined as an `IndexedRowMatrix`. - -In order to run the above application, follow the instructions -provided in the [Self-Contained -Applications](quick-start.html#self-contained-applications) section of the Spark -quick-start guide. 
Be sure to also include *spark-mllib* to your build file as -a dependency. -
-{% highlight python %} -from pyspark.mllib.linalg.distributed import RowMatrix -from numpy.random import RandomState - -# Generate random data with 50 samples and 30 features. -rng = RandomState(0) -mat = RowMatrix(sc.parallelize(rng.randn(50, 30))) +Refer to the [`SingularValueDecomposition` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.linalg.distributed.SingularValueDecomposition) for details on the API. -# Compute the top 20 singular values and corresponding singular vectors. -svd = mat.computeSVD(20, computeU=True) -u = svd.U # The U factor is a RowMatrix. -s = svd.s # The singular values are stored in a local dense vector. -V = svd.V # The V factor is a local dense matrix. -{% endhighlight %} +{% include_example python/mllib/svd_example.py %} The same code applies to `IndexedRowMatrix` if `U` is defined as an `IndexedRowMatrix`. @@ -137,7 +119,6 @@ Refer to the [`PCA` Scala docs](api/scala/index.html#org.apache.spark.mllib.feat The following code demonstrates how to compute principal components on a `RowMatrix` and use them to project the vectors into a low-dimensional space. -The number of columns should be small, e.g, less than 1000. Refer to the [`RowMatrix` Java docs](api/java/org/apache/spark/mllib/linalg/distributed/RowMatrix.html) for details on the API. @@ -150,27 +131,9 @@ Refer to the [`RowMatrix` Java docs](api/java/org/apache/spark/mllib/linalg/dist The following code demonstrates how to compute principal components on a `RowMatrix` and use them to project the vectors into a low-dimensional space. -{% highlight python %} -from pyspark.mllib.linalg.distributed import RowMatrix -from numpy.random import RandomState +Refer to the [`RowMatrix` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.linalg.distributed.RowMatrix) for details on the API. -# Generate random data with 50 samples and 30 features. -rng = RandomState(0) -data = sc.parallelize(rng.randn(50, 30)) -mat = RowMatrix(data) - -# Compute the top 10 principal components stored in a local dense matrix. -pc = rm.computePrincipalComponents(10) - -# Project the rows to the linear space spanned by the top 10 principal components. -projected = rm.multiply(pc) -{% endhighlight %} +{% include_example python/mllib/pca_rowmatrix_example.py %}
- -In order to run the above application, follow the instructions -provided in the [Self-Contained Applications](quick-start.html#self-contained-applications) -section of the Spark -quick-start guide. Be sure to also include *spark-mllib* to your build file as -a dependency. diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaPCAExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaPCAExample.java index 3077f557ef88..0a7dc621e111 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaPCAExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaPCAExample.java @@ -18,7 +18,8 @@ package org.apache.spark.examples.mllib; // $example on$ -import java.util.LinkedList; +import java.util.Arrays; +import java.util.List; // $example off$ import org.apache.spark.SparkConf; @@ -39,21 +40,25 @@ public class JavaPCAExample { public static void main(String[] args) { SparkConf conf = new SparkConf().setAppName("PCA Example"); SparkContext sc = new SparkContext(conf); + JavaSparkContext jsc = JavaSparkContext.fromSparkContext(sc); // $example on$ - double[][] array = {{1.12, 2.05, 3.12}, {5.56, 6.28, 8.94}, {10.2, 8.0, 20.5}}; - LinkedList rowsList = new LinkedList<>(); - for (int i = 0; i < array.length; i++) { - Vector currentRow = Vectors.dense(array[i]); - rowsList.add(currentRow); - } - JavaRDD rows = JavaSparkContext.fromSparkContext(sc).parallelize(rowsList); + List data = Arrays.asList( + Vectors.sparse(5, new int[] {1, 3}, new double[] {1.0, 7.0}), + Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0), + Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0) + ); + + JavaRDD rows = jsc.parallelize(data); // Create a RowMatrix from JavaRDD. RowMatrix mat = new RowMatrix(rows.rdd()); - // Compute the top 3 principal components. - Matrix pc = mat.computePrincipalComponents(3); + // Compute the top 4 principal components. + // Principal components are stored in a local dense matrix. + Matrix pc = mat.computePrincipalComponents(4); + + // Project the rows to the linear space spanned by the top 4 principal components. 
RowMatrix projected = mat.multiply(pc); // $example off$ Vector[] collectPartitions = (Vector[])projected.rows().collect(); @@ -61,6 +66,6 @@ public static void main(String[] args) { for (Vector vector : collectPartitions) { System.out.println("\t" + vector); } - sc.stop(); + jsc.stop(); } } diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaSVDExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaSVDExample.java index 3730e60f6880..802be3960a33 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaSVDExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaSVDExample.java @@ -18,7 +18,8 @@ package org.apache.spark.examples.mllib; // $example on$ -import java.util.LinkedList; +import java.util.Arrays; +import java.util.List; // $example off$ import org.apache.spark.SparkConf; @@ -43,22 +44,22 @@ public static void main(String[] args) { JavaSparkContext jsc = JavaSparkContext.fromSparkContext(sc); // $example on$ - double[][] array = {{1.12, 2.05, 3.12}, {5.56, 6.28, 8.94}, {10.2, 8.0, 20.5}}; - LinkedList rowsList = new LinkedList<>(); - for (int i = 0; i < array.length; i++) { - Vector currentRow = Vectors.dense(array[i]); - rowsList.add(currentRow); - } - JavaRDD rows = jsc.parallelize(rowsList); + List data = Arrays.asList( + Vectors.sparse(5, new int[] {1, 3}, new double[] {1.0, 7.0}), + Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0), + Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0) + ); + + JavaRDD rows = jsc.parallelize(data); // Create a RowMatrix from JavaRDD. RowMatrix mat = new RowMatrix(rows.rdd()); - // Compute the top 3 singular values and corresponding singular vectors. - SingularValueDecomposition svd = mat.computeSVD(3, true, 1.0E-9d); - RowMatrix U = svd.U(); - Vector s = svd.s(); - Matrix V = svd.V(); + // Compute the top 5 singular values and corresponding singular vectors. + SingularValueDecomposition svd = mat.computeSVD(5, true, 1.0E-9d); + RowMatrix U = svd.U(); // The U factor is a RowMatrix. + Vector s = svd.s(); // The singular values are stored in a local dense vector. + Matrix V = svd.V(); // The V factor is a local dense matrix. // $example off$ Vector[] collectPartitions = (Vector[]) U.rows().collect(); System.out.println("U factor is:"); diff --git a/examples/src/main/python/mllib/pca_rowmatrix_example.py b/examples/src/main/python/mllib/pca_rowmatrix_example.py new file mode 100644 index 000000000000..9b20f11b8651 --- /dev/null +++ b/examples/src/main/python/mllib/pca_rowmatrix_example.py @@ -0,0 +1,46 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +from pyspark import SparkContext +# $example on$ +from pyspark.mllib.linalg import Vectors +from pyspark.mllib.linalg.distributed import RowMatrix +# $example off$ + +if __name__ == "__main__": + sc = SparkContext(appName="PythonPCAOnRowMatrixExample") + + # $example on$ + rows = sc.parallelize([ + Vectors.sparse(5, {1: 1.0, 3: 7.0}), + Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0), + Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0) + ]) + + mat = RowMatrix(rows) + # Compute the top 4 principal components. + # Principal components are stored in a local dense matrix. + pc = mat.computePrincipalComponents(4) + + # Project the rows to the linear space spanned by the top 4 principal components. + projected = mat.multiply(pc) + # $example off$ + collected = projected.rows.collect() + print("Projected Row Matrix of principal component:") + for vector in collected: + print vector + sc.stop() diff --git a/examples/src/main/python/mllib/svd_example.py b/examples/src/main/python/mllib/svd_example.py new file mode 100644 index 000000000000..09085139f17f --- /dev/null +++ b/examples/src/main/python/mllib/svd_example.py @@ -0,0 +1,48 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from pyspark import SparkContext +# $example on$ +from pyspark.mllib.linalg import Vectors +from pyspark.mllib.linalg.distributed import RowMatrix +# $example off$ + +if __name__ == "__main__": + sc = SparkContext(appName="PythonSVDExample") + + # $example on$ + rows = sc.parallelize([ + Vectors.sparse(5, {1: 1.0, 3: 7.0}), + Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0), + Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0) + ]) + + mat = RowMatrix(rows) + + # Compute the top 5 singular values and corresponding singular vectors. + svd = mat.computeSVD(5, computeU=True) + U = svd.U # The U factor is a RowMatrix. + s = svd.s # The singular values are stored in a local dense vector. + V = svd.V # The V factor is a local dense matrix. 
+ # $example off$ + collected = U.rows.collect() + print("U factor is:") + for vector in collected: + print vector + print("Singular values are: %s" % s) + print("V factor is:\n%s" % V) + sc.stop() diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/PCAOnRowMatrixExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/PCAOnRowMatrixExample.scala index a137ba2a2f9d..da43a8d9c7e8 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/PCAOnRowMatrixExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/PCAOnRowMatrixExample.scala @@ -39,9 +39,9 @@ object PCAOnRowMatrixExample { Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0), Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0)) - val dataRDD = sc.parallelize(data, 2) + val rows = sc.parallelize(data) - val mat: RowMatrix = new RowMatrix(dataRDD) + val mat: RowMatrix = new RowMatrix(rows) // Compute the top 4 principal components. // Principal components are stored in a local dense matrix. diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/SVDExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/SVDExample.scala index b286a3f7b909..769ae2a3a88b 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/SVDExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/SVDExample.scala @@ -28,6 +28,9 @@ import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.linalg.distributed.RowMatrix // $example off$ +/** + * Example for SingularValueDecomposition. + */ object SVDExample { def main(args: Array[String]): Unit = { @@ -41,15 +44,15 @@ object SVDExample { Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0), Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0)) - val dataRDD = sc.parallelize(data, 2) + val rows = sc.parallelize(data) - val mat: RowMatrix = new RowMatrix(dataRDD) + val mat: RowMatrix = new RowMatrix(rows) // Compute the top 5 singular values and corresponding singular vectors. val svd: SingularValueDecomposition[RowMatrix, Matrix] = mat.computeSVD(5, computeU = true) val U: RowMatrix = svd.U // The U factor is a RowMatrix. - val s: Vector = svd.s // The singular values are stored in a local dense vector. - val V: Matrix = svd.V // The V factor is a local dense matrix. + val s: Vector = svd.s // The singular values are stored in a local dense vector. + val V: Matrix = svd.V // The V factor is a local dense matrix. // $example off$ val collect = U.rows.collect() println("U factor is:") diff --git a/python/pyspark/mllib/linalg/distributed.py b/python/pyspark/mllib/linalg/distributed.py index 5a4d147985dd..413a83dd41f2 100644 --- a/python/pyspark/mllib/linalg/distributed.py +++ b/python/pyspark/mllib/linalg/distributed.py @@ -33,9 +33,8 @@ from pyspark.storagelevel import StorageLevel -__all__ = ['DistributedMatrix', 'RowMatrix', 'IndexedRow', - 'IndexedRowMatrix', 'MatrixEntry', 'CoordinateMatrix', - 'BlockMatrix'] +__all__ = ['BlockMatrix', 'CoordinateMatrix', 'DistributedMatrix', 'IndexedRow', + 'IndexedRowMatrix', 'MatrixEntry', 'RowMatrix', 'SingularValueDecomposition'] class DistributedMatrix(object): @@ -301,6 +300,7 @@ def tallSkinnyQR(self, computeQ=False): R = decomp.call("R") return QRDecomposition(Q, R) + @since('2.2.0') def computeSVD(self, k, computeU=False, rCond=1e-9): """ Computes the singular value decomposition of the RowMatrix. @@ -316,18 +316,22 @@ def computeSVD(self, k, computeU=False, rCond=1e-9): are the eigenvectors of (A' X A) For more specific details on implementation, please refer - the scala documentation. 
+ the Scala documentation. - :param k: Set the number of singular values to keep. + :param k: Number of leading singular values to keep (`0 < k <= n`). + It might return less than k if there are numerically zero singular values + or there are not enough Ritz values converged before the maximum number of + Arnoldi update iterations is reached (in case that matrix A is ill-conditioned). :param computeU: Whether or not to compute U. If set to be True, then U is computed by A * V * s^-1 :param rCond: Reciprocal condition number. All singular values smaller than rCond * s[0] are treated as zero where s[0] is the largest singular value. - :returns: SingularValueDecomposition object + :returns: :py:class:`SingularValueDecomposition` - >>> data = [(3, 1, 1), (-1, 3, 1)] - >>> rm = RowMatrix(sc.parallelize(data)) + >>> rows = sc.parallelize[[3, 1, 1], [-1, 3, 1]] + >>> rm = RowMatrix(rows) + >>> svd_model = rm.computeSVD(2, True) >>> svd_model.U.rows.collect() [DenseVector([-0.7071, 0.7071]), DenseVector([-0.7071, -0.7071])] @@ -340,15 +344,18 @@ def computeSVD(self, k, computeU=False, rCond=1e-9): "computeSVD", int(k), bool(computeU), float(rCond)) return SingularValueDecomposition(j_model) + @since('2.2.0') def computePrincipalComponents(self, k): """ Computes the k principal components of the given row matrix + + .. note:: This cannot be computed on matrices with more than 65535 columns. :param k: Number of principal components to keep. - :returns: DenseMatrix + :returns: :py:class:`pyspark.mllib.linalg.DenseMatrix` - >>> data = sc.parallelize([[1, 2, 3], [2, 4, 5], [3, 6, 1]]) - >>> rm = RowMatrix(data) + >>> rows = sc.parallelize([[1, 2, 3], [2, 4, 5], [3, 6, 1]]) + >>> rm = RowMatrix(rows) >>> # Returns the two principal components of rm >>> pca = rm.computePrincipalComponents(2) @@ -381,14 +388,18 @@ def multiply(self, matrix): class SingularValueDecomposition(JavaModelWrapper): - """Wrapper around the SingularValueDecomposition scala case class""" + """ + Represents singular value decomposition (SVD) factors. + + .. versionadded:: 2.2.0 + """ @property + @since('2.2.0') def U(self): """ Returns a distributed matrix whose columns are the left - singular vectors of the SingularValueDecomposition if - computeU was set to be True. + singular vectors of the SingularValueDecomposition if computeU was set to be True. """ u = self.call("U") if u is not None: @@ -401,14 +412,15 @@ def U(self): raise TypeError("Expected RowMatrix/IndexedRowMatrix got %s" % mat_name) @property + @since('2.2.0') def s(self): """ - Returns a DenseVector with singular values in - descending order. + Returns a DenseVector with singular values in descending order. """ return self.call("s") @property + @since('2.2.0') def V(self): """ Returns a DenseMatrix whose columns are the right singular @@ -643,6 +655,7 @@ def toBlockMatrix(self, rowsPerBlock=1024, colsPerBlock=1024): colsPerBlock) return BlockMatrix(java_block_matrix, rowsPerBlock, colsPerBlock) + @since('2.2.0') def computeSVD(self, k, computeU=False, rCond=1e-9): """ Computes the singular value decomposition of the IndexedRowMatrix. @@ -660,7 +673,10 @@ def computeSVD(self, k, computeU=False, rCond=1e-9): For more specific details on implementation, please refer the scala documentation. - :param k: Set the number of singular values to keep. + :param k: Number of leading singular values to keep (`0 < k <= n`). 
+ It might return less than k if there are numerically zero singular values + or there are not enough Ritz values converged before the maximum number of + Arnoldi update iterations is reached (in case that matrix A is ill-conditioned). :param computeU: Whether or not to compute U. If set to be True, then U is computed by A * V * s^-1 :param rCond: Reciprocal condition number. All singular values @@ -668,8 +684,8 @@ def computeSVD(self, k, computeU=False, rCond=1e-9): where s[0] is the largest singular value. :returns: SingularValueDecomposition object - >>> data = [(0, (3, 1, 1)), (1, (-1, 3, 1))] - >>> irm = IndexedRowMatrix(sc.parallelize(data)) + >>> rows = [(0, (3, 1, 1)), (1, (-1, 3, 1))] + >>> irm = IndexedRowMatrix(sc.parallelize(rows)) >>> svd_model = irm.computeSVD(2, True) >>> svd_model.U.rows.collect() # doctest: +NORMALIZE_WHITESPACE [IndexedRow(0, [-0.707106781187,0.707106781187]),\ From 3ea88e27aa2d91adda7457afc64b8b5839e6396c Mon Sep 17 00:00:00 2001 From: Nick Pentreath Date: Wed, 12 Apr 2017 12:08:14 +0200 Subject: [PATCH 09/13] Parens for print statements --- examples/src/main/python/mllib/pca_rowmatrix_example.py | 2 +- examples/src/main/python/mllib/svd_example.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/src/main/python/mllib/pca_rowmatrix_example.py b/examples/src/main/python/mllib/pca_rowmatrix_example.py index 9b20f11b8651..49b9b1bbe08e 100644 --- a/examples/src/main/python/mllib/pca_rowmatrix_example.py +++ b/examples/src/main/python/mllib/pca_rowmatrix_example.py @@ -42,5 +42,5 @@ collected = projected.rows.collect() print("Projected Row Matrix of principal component:") for vector in collected: - print vector + print(vector) sc.stop() diff --git a/examples/src/main/python/mllib/svd_example.py b/examples/src/main/python/mllib/svd_example.py index 09085139f17f..5b220fdb3fd6 100644 --- a/examples/src/main/python/mllib/svd_example.py +++ b/examples/src/main/python/mllib/svd_example.py @@ -42,7 +42,7 @@ collected = U.rows.collect() print("U factor is:") for vector in collected: - print vector + print(vector) print("Singular values are: %s" % s) print("V factor is:\n%s" % V) sc.stop() From 0118f2c0839a45ea68a65a639efaf9fc2eab4883 Mon Sep 17 00:00:00 2001 From: Nick Pentreath Date: Wed, 12 Apr 2017 12:21:33 +0200 Subject: [PATCH 10/13] PEP8 fixes --- python/pyspark/mllib/linalg/distributed.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/pyspark/mllib/linalg/distributed.py b/python/pyspark/mllib/linalg/distributed.py index 413a83dd41f2..a12b95c89c4a 100644 --- a/python/pyspark/mllib/linalg/distributed.py +++ b/python/pyspark/mllib/linalg/distributed.py @@ -331,7 +331,7 @@ def computeSVD(self, k, computeU=False, rCond=1e-9): >>> rows = sc.parallelize[[3, 1, 1], [-1, 3, 1]] >>> rm = RowMatrix(rows) - + >>> svd_model = rm.computeSVD(2, True) >>> svd_model.U.rows.collect() [DenseVector([-0.7071, 0.7071]), DenseVector([-0.7071, -0.7071])] @@ -348,7 +348,7 @@ def computeSVD(self, k, computeU=False, rCond=1e-9): def computePrincipalComponents(self, k): """ Computes the k principal components of the given row matrix - + .. note:: This cannot be computed on matrices with more than 65535 columns. :param k: Number of principal components to keep. @@ -390,7 +390,7 @@ def multiply(self, matrix): class SingularValueDecomposition(JavaModelWrapper): """ Represents singular value decomposition (SVD) factors. - + .. 
versionadded:: 2.2.0 """ From 56205d2c7d9f0ff1a803bc2e2bcdf435ceee6f82 Mon Sep 17 00:00:00 2001 From: Nick Pentreath Date: Wed, 12 Apr 2017 16:52:08 +0200 Subject: [PATCH 11/13] Fix incorrect bracket --- python/pyspark/mllib/linalg/distributed.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyspark/mllib/linalg/distributed.py b/python/pyspark/mllib/linalg/distributed.py index a12b95c89c4a..2fefa201fc22 100644 --- a/python/pyspark/mllib/linalg/distributed.py +++ b/python/pyspark/mllib/linalg/distributed.py @@ -329,7 +329,7 @@ def computeSVD(self, k, computeU=False, rCond=1e-9): where s[0] is the largest singular value. :returns: :py:class:`SingularValueDecomposition` - >>> rows = sc.parallelize[[3, 1, 1], [-1, 3, 1]] + >>> rows = sc.parallelize([[3, 1, 1], [-1, 3, 1]]) >>> rm = RowMatrix(rows) >>> svd_model = rm.computeSVD(2, True) From 07808fc98609420f1c12ee131ad5e48204704a93 Mon Sep 17 00:00:00 2001 From: Nick Pentreath Date: Mon, 24 Apr 2017 13:46:09 +0200 Subject: [PATCH 12/13] Add since tag and update doc string for multiply methods --- python/pyspark/mllib/linalg/distributed.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/python/pyspark/mllib/linalg/distributed.py b/python/pyspark/mllib/linalg/distributed.py index 2fefa201fc22..2d6d921a6930 100644 --- a/python/pyspark/mllib/linalg/distributed.py +++ b/python/pyspark/mllib/linalg/distributed.py @@ -369,12 +369,14 @@ def computePrincipalComponents(self, k): """ return self._java_matrix_wrapper.call("computePrincipalComponents", k) + @since('2.2.0') def multiply(self, matrix): """ - Multiplies the given RowMatrix with another matrix. + Multiply this matrix by a local dense matrix on the right. - :param matrix: Matrix to multiply with. - :returns: RowMatrix + :param matrix: a local dense matrix whose number of rows must match the number of columns + of this matrix + :returns: :py:class:`RowMatrix` >>> rm = RowMatrix(sc.parallelize([[0, 1], [2, 3]])) >>> rm.multiply(DenseMatrix(2, 2, [0, 2, 1, 3])).rows.collect() @@ -699,12 +701,14 @@ def computeSVD(self, k, computeU=False, rCond=1e-9): "computeSVD", int(k), bool(computeU), float(rCond)) return SingularValueDecomposition(j_model) + @since('2.2.0') def multiply(self, matrix): """ - Multiplies the given IndexedRowMatrix with another matrix. + Multiply this matrix by a local dense matrix on the right. - :param matrix: Matrix to multiply with. - :returns: IndexedRowMatrix + :param matrix: a local dense matrix whose number of rows must match the number of columns + of this matrix + :returns: :py:class:`IndexedRowMatrix` >>> mat = IndexedRowMatrix(sc.parallelize([(0, (0, 1)), (1, (2, 3))])) >>> mat.multiply(DenseMatrix(2, 2, [0, 2, 1, 3])).rows.collect() From 94006a404aeb2c9b05643080eece64f0506ebafd Mon Sep 17 00:00:00 2001 From: Nick Pentreath Date: Mon, 24 Apr 2017 13:56:41 +0200 Subject: [PATCH 13/13] Fix doc indent --- python/pyspark/mllib/linalg/distributed.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/pyspark/mllib/linalg/distributed.py b/python/pyspark/mllib/linalg/distributed.py index 2d6d921a6930..4cb802514be5 100644 --- a/python/pyspark/mllib/linalg/distributed.py +++ b/python/pyspark/mllib/linalg/distributed.py @@ -375,7 +375,7 @@ def multiply(self, matrix): Multiply this matrix by a local dense matrix on the right. 
:param matrix: a local dense matrix whose number of rows must match the number of columns - of this matrix + of this matrix :returns: :py:class:`RowMatrix` >>> rm = RowMatrix(sc.parallelize([[0, 1], [2, 3]])) @@ -707,7 +707,7 @@ def multiply(self, matrix): Multiply this matrix by a local dense matrix on the right. :param matrix: a local dense matrix whose number of rows must match the number of columns - of this matrix + of this matrix :returns: :py:class:`IndexedRowMatrix` >>> mat = IndexedRowMatrix(sc.parallelize([(0, (0, 1)), (1, (2, 3))]))
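
The series adds a standalone example file only for the RowMatrix path; the IndexedRowMatrix path from patches 04 and 07 is covered by doctests alone. For completeness, a minimal sketch of that path, under the same assumption of a PySpark shell providing `sc`:

    from pyspark.mllib.linalg import DenseMatrix
    from pyspark.mllib.linalg.distributed import IndexedRowMatrix

    irm = IndexedRowMatrix(sc.parallelize([(0, (3, 1, 1)), (1, (-1, 3, 1))]))

    # computeSVD mirrors the RowMatrix version, but U comes back as an
    # IndexedRowMatrix, so the row indices survive the factorization.
    svd = irm.computeSVD(2, computeU=True)
    print(svd.U.rows.collect())  # IndexedRow(0, ...), IndexedRow(1, ...)
    print(svd.s)                 # DenseVector([3.4641, 3.1623])

    # multiply() accepts only a local DenseMatrix; since patch 07 it
    # raises a ValueError for any other matrix type.
    projected = irm.multiply(DenseMatrix(3, 2, [0, 2, 4, 1, 3, 5]))
    print(projected.rows.collect())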