From e4afaa8642cc27e7fc9c16306653c7d8fdab16d5 Mon Sep 17 00:00:00 2001
From: Xiangrui Meng
Date: Sun, 18 May 2014 10:36:07 -0700
Subject: [PATCH 01/11] explicitly state what might change

---
 docs/mllib-guide.md | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/docs/mllib-guide.md b/docs/mllib-guide.md
index 842ca5c8c6d8a..1c6d5fe03218c 100644
--- a/docs/mllib-guide.md
+++ b/docs/mllib-guide.md
@@ -27,8 +27,9 @@ filtering, dimensionality reduction, as well as underlying optimization primitiv
* stochastic gradient descent
* limited-memory BFGS (L-BFGS)

-MLlib is currently a *beta* component under active development.
-The APIs may change in the future releases, and we will provide migration guide between releases.
+MLlib is a new component under active development.
+The APIs marked `Experimental`/`DeveloperApi` may change in the future releases,
+and we will provide a migration guide between releases.

## Dependencies

From 35bdeb93f17143a4441d239e81e831f7edf75d2e Mon Sep 17 00:00:00 2001
From: Xiangrui Meng
Date: Sun, 18 May 2014 11:27:27 -0700
Subject: [PATCH 02/11] api/mllib -> api/scala

---
 docs/mllib-basics.md                  | 58 +++++++++++++--------------
 docs/mllib-clustering.md              |  2 +-
 docs/mllib-collaborative-filtering.md |  2 +-
 docs/mllib-guide.md                   | 14 +++----
 docs/mllib-linear-methods.md          | 18 ++++-----
 docs/mllib-naive-bayes.md             | 12 +++---
 docs/mllib-optimization.md            |  8 ++--
 7 files changed, 57 insertions(+), 57 deletions(-)

diff --git a/docs/mllib-basics.md b/docs/mllib-basics.md
index aa9321a547097..e229bd23c3fba 100644
--- a/docs/mllib-basics.md
+++ b/docs/mllib-basics.md
@@ -26,11 +26,11 @@ of the vector.
The base class of local vectors is
-[`Vector`](api/mllib/index.html#org.apache.spark.mllib.linalg.Vector), and we provide two
-implementations: [`DenseVector`](api/mllib/index.html#org.apache.spark.mllib.linalg.DenseVector) and
-[`SparseVector`](api/mllib/index.html#org.apache.spark.mllib.linalg.SparseVector). We recommend
+[`Vector`](api/scala/index.html#org.apache.spark.mllib.linalg.Vector), and we provide two
+implementations: [`DenseVector`](api/scala/index.html#org.apache.spark.mllib.linalg.DenseVector) and
+[`SparseVector`](api/scala/index.html#org.apache.spark.mllib.linalg.SparseVector). We recommend
using the factory methods implemented in
-[`Vectors`](api/mllib/index.html#org.apache.spark.mllib.linalg.Vector) to create local vectors.
+[`Vectors`](api/scala/index.html#org.apache.spark.mllib.linalg.Vectors$) to create local vectors.

{% highlight scala %}
import org.apache.spark.mllib.linalg.{Vector, Vectors}
@@ -53,11 +53,11 @@ Scala imports `scala.collection.immutable.Vector` by default, so you have to imp
The base class of local vectors is
-[`Vector`](api/mllib/index.html#org.apache.spark.mllib.linalg.Vector), and we provide two
-implementations: [`DenseVector`](api/mllib/index.html#org.apache.spark.mllib.linalg.DenseVector) and
-[`SparseVector`](api/mllib/index.html#org.apache.spark.mllib.linalg.SparseVector). We recommend
+[`Vector`](api/scala/index.html#org.apache.spark.mllib.linalg.Vector), and we provide two
+implementations: [`DenseVector`](api/scala/index.html#org.apache.spark.mllib.linalg.DenseVector) and
+[`SparseVector`](api/scala/index.html#org.apache.spark.mllib.linalg.SparseVector). We recommend
using the factory methods implemented in
-[`Vectors`](api/mllib/index.html#org.apache.spark.mllib.linalg.Vector) to create local vectors.
+[`Vectors`](api/scala/index.html#org.apache.spark.mllib.linalg.Vectors$) to create local vectors.

{% highlight java %}
import org.apache.spark.mllib.linalg.Vector;
@@ -117,7 +117,7 @@ For multiclass classification, labels should be class indices starting from zero:
A labeled point is represented by the case class -[`LabeledPoint`](api/mllib/index.html#org.apache.spark.mllib.regression.LabeledPoint). +[`LabeledPoint`](api/scala/index.html#org.apache.spark.mllib.regression.LabeledPoint). {% highlight scala %} import org.apache.spark.mllib.linalg.Vectors @@ -134,7 +134,7 @@ val neg = LabeledPoint(0.0, Vectors.sparse(3, Array(0, 2), Array(1.0, 3.0)))
A labeled point is represented by -[`LabeledPoint`](api/mllib/index.html#org.apache.spark.mllib.regression.LabeledPoint). +[`LabeledPoint`](api/scala/index.html#org.apache.spark.mllib.regression.LabeledPoint). {% highlight java %} import org.apache.spark.mllib.linalg.Vectors; @@ -184,7 +184,7 @@ After loading, the feature indices are converted to zero-based.
-[`MLUtils.loadLibSVMFile`](api/mllib/index.html#org.apache.spark.mllib.util.MLUtils$) reads training +[`MLUtils.loadLibSVMFile`](api/scala/index.html#org.apache.spark.mllib.util.MLUtils$) reads training examples stored in LIBSVM format. {% highlight scala %} @@ -197,7 +197,7 @@ val training: RDD[LabeledPoint] = MLUtils.loadLibSVMFile(sc, "mllib/data/sample_
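To make the format concrete, here is an illustrative sample of LIBSVM text. It is not taken
from the patch; it assumes the usual LIBSVM convention of a label followed by one-based
`index:value` pairs for the nonzero features:

{% highlight text %}
1.0 1:0.5 3:2.5
0.0 2:1.0 4:-1.5
{% endhighlight %}

Each such line is parsed into a `LabeledPoint` with a sparse feature vector, and the one-based
indices are shifted to zero-based after loading, as noted above.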
-[`MLUtils.loadLibSVMFile`](api/mllib/index.html#org.apache.spark.mllib.util.MLUtils$) reads training
+[`MLUtils.loadLibSVMFile`](api/scala/index.html#org.apache.spark.mllib.util.MLUtils$) reads training
examples stored in LIBSVM format.

{% highlight java %}
import org.apache.spark.mllib.regression.LabeledPoint;
import org.apache.spark.mllib.util.MLUtils;
import org.apache.spark.rdd.RDDimport;

RDD<LabeledPoint> training = MLUtils.loadLibSVMFile(jsc, "mllib/data/sample_libsvm_data.txt");
{% endhighlight %}
@@ -227,10 +227,10 @@ We are going to add sparse matrix in the next release.
The base class of local matrices is -[`Matrix`](api/mllib/index.html#org.apache.spark.mllib.linalg.Matrix), and we provide one -implementation: [`DenseMatrix`](api/mllib/index.html#org.apache.spark.mllib.linalg.DenseMatrix). +[`Matrix`](api/scala/index.html#org.apache.spark.mllib.linalg.Matrix), and we provide one +implementation: [`DenseMatrix`](api/scala/index.html#org.apache.spark.mllib.linalg.DenseMatrix). Sparse matrix will be added in the next release. We recommend using the factory methods implemented -in [`Matrices`](api/mllib/index.html#org.apache.spark.mllib.linalg.Matrices) to create local +in [`Matrices`](api/scala/index.html#org.apache.spark.mllib.linalg.Matrices) to create local matrices. {% highlight scala %} @@ -244,10 +244,10 @@ val dm: Matrix = Matrices.dense(3, 2, Array(1.0, 3.0, 5.0, 2.0, 4.0, 6.0))
The base class of local matrices is -[`Matrix`](api/mllib/index.html#org.apache.spark.mllib.linalg.Matrix), and we provide one -implementation: [`DenseMatrix`](api/mllib/index.html#org.apache.spark.mllib.linalg.DenseMatrix). +[`Matrix`](api/scala/index.html#org.apache.spark.mllib.linalg.Matrix), and we provide one +implementation: [`DenseMatrix`](api/scala/index.html#org.apache.spark.mllib.linalg.DenseMatrix). Sparse matrix will be added in the next release. We recommend using the factory methods implemented -in [`Matrices`](api/mllib/index.html#org.apache.spark.mllib.linalg.Matrices) to create local +in [`Matrices`](api/scala/index.html#org.apache.spark.mllib.linalg.Matrices) to create local matrices. {% highlight java %} @@ -284,7 +284,7 @@ limited by the integer range but it should be much smaller in practice.
-A [`RowMatrix`](api/mllib/index.html#org.apache.spark.mllib.linalg.distributed.RowMatrix) can be +A [`RowMatrix`](api/scala/index.html#org.apache.spark.mllib.linalg.distributed.RowMatrix) can be created from an `RDD[Vector]` instance. Then we can compute its column summary statistics. {% highlight scala %} @@ -303,7 +303,7 @@ val n = mat.numCols()
-A [`RowMatrix`](api/mllib/index.html#org.apache.spark.mllib.linalg.distributed.RowMatrix) can be
+A [`RowMatrix`](api/scala/index.html#org.apache.spark.mllib.linalg.distributed.RowMatrix) can be
created from a `JavaRDD<Vector>` instance. Then we can compute its column summary statistics.

{% highlight java %}
@@ -334,7 +334,7 @@ which could be faster if the rows are sparse.
`RowMatrix#computeColumnSummaryStatistics` returns an instance of
-[`MultivariateStatisticalSummary`](api/mllib/index.html#org.apache.spark.mllib.stat.MultivariateStatisticalSummary),
+[`MultivariateStatisticalSummary`](api/scala/index.html#org.apache.spark.mllib.stat.MultivariateStatisticalSummary),
which contains the column-wise max, min, mean, variance, and number of nonzeros, as well as the
total count.
@@ -366,9 +366,9 @@ an RDD of indexed rows, where each row is represented by its index (long-typed)
An -[`IndexedRowMatrix`](api/mllib/index.html#org.apache.spark.mllib.linalg.distributed.IndexedRowMatrix) +[`IndexedRowMatrix`](api/scala/index.html#org.apache.spark.mllib.linalg.distributed.IndexedRowMatrix) can be created from an `RDD[IndexedRow]` instance, where -[`IndexedRow`](api/mllib/index.html#org.apache.spark.mllib.linalg.distributed.IndexedRow) is a +[`IndexedRow`](api/scala/index.html#org.apache.spark.mllib.linalg.distributed.IndexedRow) is a wrapper over `(Long, Vector)`. An `IndexedRowMatrix` can be converted to a `RowMatrix` by dropping its row indices. @@ -391,9 +391,9 @@ val rowMat: RowMatrix = mat.toRowMatrix()
An
-[`IndexedRowMatrix`](api/mllib/index.html#org.apache.spark.mllib.linalg.distributed.IndexedRowMatrix)
+[`IndexedRowMatrix`](api/scala/index.html#org.apache.spark.mllib.linalg.distributed.IndexedRowMatrix)
can be created from a `JavaRDD<IndexedRow>` instance, where
-[`IndexedRow`](api/mllib/index.html#org.apache.spark.mllib.linalg.distributed.IndexedRow) is a
+[`IndexedRow`](api/scala/index.html#org.apache.spark.mllib.linalg.distributed.IndexedRow) is a
wrapper over `(long, Vector)`. An `IndexedRowMatrix` can be converted to a `RowMatrix` by dropping
its row indices.
@@ -427,9 +427,9 @@ dimensions of the matrix are huge and the matrix is very sparse.
A
-[`CoordinateMatrix`](api/mllib/index.html#org.apache.spark.mllib.linalg.distributed.CoordinateMatrix)
+[`CoordinateMatrix`](api/scala/index.html#org.apache.spark.mllib.linalg.distributed.CoordinateMatrix)
can be created from an `RDD[MatrixEntry]` instance, where
-[`MatrixEntry`](api/mllib/index.html#org.apache.spark.mllib.linalg.distributed.MatrixEntry) is a
+[`MatrixEntry`](api/scala/index.html#org.apache.spark.mllib.linalg.distributed.MatrixEntry) is a
wrapper over `(Long, Long, Double)`. A `CoordinateMatrix` can be converted to an `IndexedRowMatrix`
with sparse rows by calling `toIndexedRowMatrix`. In this release, we do not provide other
computation for `CoordinateMatrix`.
@@ -453,9 +453,9 @@ val indexedRowMatrix = mat.toIndexedRowMatrix()
A
-[`CoordinateMatrix`](api/mllib/index.html#org.apache.spark.mllib.linalg.distributed.CoordinateMatrix)
+[`CoordinateMatrix`](api/scala/index.html#org.apache.spark.mllib.linalg.distributed.CoordinateMatrix)
can be created from a `JavaRDD<MatrixEntry>` instance, where
-[`MatrixEntry`](api/mllib/index.html#org.apache.spark.mllib.linalg.distributed.MatrixEntry) is a
+[`MatrixEntry`](api/scala/index.html#org.apache.spark.mllib.linalg.distributed.MatrixEntry) is a
wrapper over `(long, long, double)`. A `CoordinateMatrix` can be converted to an `IndexedRowMatrix`
with sparse rows by calling `toIndexedRowMatrix`.

diff --git a/docs/mllib-clustering.md b/docs/mllib-clustering.md
index 276868fa8490d..2f909b30fa9e5 100644
--- a/docs/mllib-clustering.md
+++ b/docs/mllib-clustering.md
@@ -40,7 +40,7 @@ a given dataset, the algorithm returns the best clustering result).
The following code snippets can be executed in `spark-shell`.

In the following example, after loading and parsing data, we use the
-[`KMeans`](api/mllib/index.html#org.apache.spark.mllib.clustering.KMeans) object to cluster the data
+[`KMeans`](api/scala/index.html#org.apache.spark.mllib.clustering.KMeans) object to cluster the data
into two clusters. The number of desired clusters is passed to the algorithm. We then compute the
Within Set Sum of Squared Error (WSSSE). You can reduce this error measure by increasing *k*. In
fact, the optimal *k* is usually one where there is an "elbow" in the WSSSE graph.

diff --git a/docs/mllib-collaborative-filtering.md b/docs/mllib-collaborative-filtering.md
index f486c56e55907..f6fb261a1a7e5 100644
--- a/docs/mllib-collaborative-filtering.md
+++ b/docs/mllib-collaborative-filtering.md
@@ -48,7 +48,7 @@ user for an item.
In the following example, we load rating data. Each row consists of a user, a product and a rating.
-We use the default [ALS.train()](api/mllib/index.html#org.apache.spark.mllib.recommendation.ALS$)
+We use the default [ALS.train()](api/scala/index.html#org.apache.spark.mllib.recommendation.ALS$)
method which assumes ratings are explicit. We evaluate the recommendation model by measuring the
Mean Squared Error of rating prediction.

diff --git a/docs/mllib-guide.md b/docs/mllib-guide.md
index 1c6d5fe03218c..8c38a543fd522 100644
--- a/docs/mllib-guide.md
+++ b/docs/mllib-guide.md
@@ -28,7 +28,7 @@ filtering, dimensionality reduction, as well as underlying optimization primitiv
* limited-memory BFGS (L-BFGS)

MLlib is a new component under active development.
-The APIs marked `Experimental`/`DeveloperApi` may change in the future releases,
+The APIs marked `Experimental`/`DeveloperApi` may change in future releases,
and we will provide a migration guide between releases.

## Dependencies
@@ -62,9 +62,9 @@ take advantage of sparsity in both storage and computation.
We used to represent a feature vector by `Array[Double]`, which is replaced by -[`Vector`](api/mllib/index.html#org.apache.spark.mllib.linalg.Vector) in v1.0. Algorithms that used +[`Vector`](api/scala/index.html#org.apache.spark.mllib.linalg.Vector) in v1.0. Algorithms that used to accept `RDD[Array[Double]]` now take -`RDD[Vector]`. [`LabeledPoint`](api/mllib/index.html#org.apache.spark.mllib.regression.LabeledPoint) +`RDD[Vector]`. [`LabeledPoint`](api/scala/index.html#org.apache.spark.mllib.regression.LabeledPoint) is now a wrapper of `(Double, Vector)` instead of `(Double, Array[Double])`. Converting `Array[Double]` to `Vector` is straightforward: @@ -75,7 +75,7 @@ val array: Array[Double] = ... // a double array val vector: Vector = Vectors.dense(array) // a dense vector {% endhighlight %} -[`Vectors`](api/mllib/index.html#org.apache.spark.mllib.linalg.Vectors$) provides factory methods to create sparse vectors. +[`Vectors`](api/scala/index.html#org.apache.spark.mllib.linalg.Vectors$) provides factory methods to create sparse vectors. *Note*. Scala imports `scala.collection.immutable.Vector` by default, so you have to import `org.apache.spark.mllib.linalg.Vector` explicitly to use MLlib's `Vector`. @@ -84,9 +84,9 @@ val vector: Vector = Vectors.dense(array) // a dense vector
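Since the paragraph above mentions the sparse factory methods without showing one, here is a
hedged Scala sketch, illustrative and not part of the patch, that creates a three-dimensional
vector in sparse form:

{% highlight scala %}
import org.apache.spark.mllib.linalg.{Vector, Vectors}

// A vector of size 3 with nonzero entries 1.0 at index 0 and 3.0 at index 2.
val sv: Vector = Vectors.sparse(3, Array(0, 2), Array(1.0, 3.0))
{% endhighlight %}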
We used to represent a feature vector by `double[]`, which is replaced by
-[`Vector`](api/mllib/index.html#org.apache.spark.mllib.linalg.Vector) in v1.0. Algorithms that used
+[`Vector`](api/scala/index.html#org.apache.spark.mllib.linalg.Vector) in v1.0. Algorithms that used
to accept `RDD<double[]>` now take
-`RDD<Vector>`. [`LabeledPoint`](api/mllib/index.html#org.apache.spark.mllib.regression.LabeledPoint)
+`RDD<Vector>`. [`LabeledPoint`](api/scala/index.html#org.apache.spark.mllib.regression.LabeledPoint)
is now a wrapper of `(double, Vector)` instead of `(double, double[])`. Converting `double[]` to
`Vector` is straightforward:

{% highlight java %}
import org.apache.spark.mllib.linalg.Vector;
import org.apache.spark.mllib.linalg.Vectors;

double[] array = ... // a double array
Vector vector = Vectors.dense(array); // a dense vector
{% endhighlight %}

-[`Vectors`](api/mllib/index.html#org.apache.spark.mllib.linalg.Vectors$) provides factory methods to
+[`Vectors`](api/scala/index.html#org.apache.spark.mllib.linalg.Vectors$) provides factory methods to
create sparse vectors.
diff --git a/docs/mllib-linear-methods.md b/docs/mllib-linear-methods.md
index eff617d8641e2..d80fb327789a5 100644
--- a/docs/mllib-linear-methods.md
+++ b/docs/mllib-linear-methods.md
@@ -233,7 +233,7 @@ val modelL1 = svmAlg.run(training)
{% endhighlight %}

Similarly, you can replace `SVMWithSGD` with
-[`LogisticRegressionWithSGD`](api/mllib/index.html#org.apache.spark.mllib.classification.LogisticRegressionWithSGD).
+[`LogisticRegressionWithSGD`](api/scala/index.html#org.apache.spark.mllib.classification.LogisticRegressionWithSGD).
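To illustrate the swap described in the hunk above, here is a hedged sketch, not part of the
patch, of what the logistic regression variant could look like. It assumes the `training` RDD
and a `numIterations` value from the surrounding SVM example:

{% highlight scala %}
import org.apache.spark.mllib.classification.LogisticRegressionWithSGD

// Train a logistic regression model on the same data as the SVM example.
val lrModel = LogisticRegressionWithSGD.train(training, numIterations)
{% endhighlight %}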
@@ -328,8 +328,8 @@ println("training Mean Squared Error = " + MSE)
{% endhighlight %}

Similarly, you can use
-[`RidgeRegressionWithSGD`](api/mllib/index.html#org.apache.spark.mllib.regression.RidgeRegressionWithSGD)
-and [`LassoWithSGD`](api/mllib/index.html#org.apache.spark.mllib.regression.LassoWithSGD).
+[`RidgeRegressionWithSGD`](api/scala/index.html#org.apache.spark.mllib.regression.RidgeRegressionWithSGD)
+and [`LassoWithSGD`](api/scala/index.html#org.apache.spark.mllib.regression.LassoWithSGD).
@@ -380,11 +380,11 @@ all three possible regularizations (none, L1 or L2). Algorithms are all implemented in Scala: -* [SVMWithSGD](api/mllib/index.html#org.apache.spark.mllib.classification.SVMWithSGD) -* [LogisticRegressionWithSGD](api/mllib/index.html#org.apache.spark.mllib.classification.LogisticRegressionWithSGD) -* [LinearRegressionWithSGD](api/mllib/index.html#org.apache.spark.mllib.regression.LinearRegressionWithSGD) -* [RidgeRegressionWithSGD](api/mllib/index.html#org.apache.spark.mllib.regression.RidgeRegressionWithSGD) -* [LassoWithSGD](api/mllib/index.html#org.apache.spark.mllib.regression.LassoWithSGD) +* [SVMWithSGD](api/scala/index.html#org.apache.spark.mllib.classification.SVMWithSGD) +* [LogisticRegressionWithSGD](api/scala/index.html#org.apache.spark.mllib.classification.LogisticRegressionWithSGD) +* [LinearRegressionWithSGD](api/scala/index.html#org.apache.spark.mllib.regression.LinearRegressionWithSGD) +* [RidgeRegressionWithSGD](api/scala/index.html#org.apache.spark.mllib.regression.RidgeRegressionWithSGD) +* [LassoWithSGD](api/scala/index.html#org.apache.spark.mllib.regression.LassoWithSGD) Python calls the Scala implementation via -[PythonMLLibAPI](api/mllib/index.html#org.apache.spark.mllib.api.python.PythonMLLibAPI). +[PythonMLLibAPI](api/scala/index.html#org.apache.spark.mllib.api.python.PythonMLLibAPI). diff --git a/docs/mllib-naive-bayes.md b/docs/mllib-naive-bayes.md index c47508b7daa2d..050c3323cfdcb 100644 --- a/docs/mllib-naive-bayes.md +++ b/docs/mllib-naive-bayes.md @@ -27,11 +27,11 @@ sparsity. Since the training data is only used once, it is not necessary to cach
-[NaiveBayes](api/mllib/index.html#org.apache.spark.mllib.classification.NaiveBayes$) implements
+[NaiveBayes](api/scala/index.html#org.apache.spark.mllib.classification.NaiveBayes$) implements
multinomial naive Bayes. It takes an RDD of
-[LabeledPoint](api/mllib/index.html#org.apache.spark.mllib.regression.LabeledPoint) and an optional
+[LabeledPoint](api/scala/index.html#org.apache.spark.mllib.regression.LabeledPoint) and an optional
smoothing parameter `lambda` as input, and outputs a
-[NaiveBayesModel](api/mllib/index.html#org.apache.spark.mllib.classification.NaiveBayesModel), which
+[NaiveBayesModel](api/scala/index.html#org.apache.spark.mllib.classification.NaiveBayesModel), which
can be used for evaluation and prediction.

{% highlight scala %}
@@ -59,11 +59,11 @@ val accuracy = 1.0 * predictionAndLabel.filter(x => x._1 == x._2).count() / test
-[NaiveBayes](api/mllib/index.html#org.apache.spark.mllib.classification.NaiveBayes$) implements
+[NaiveBayes](api/scala/index.html#org.apache.spark.mllib.classification.NaiveBayes$) implements
multinomial naive Bayes. It takes a Scala RDD of
-[LabeledPoint](api/mllib/index.html#org.apache.spark.mllib.regression.LabeledPoint) and an
+[LabeledPoint](api/scala/index.html#org.apache.spark.mllib.regression.LabeledPoint) and an
optional smoothing parameter `lambda` as input, and outputs a
-[NaiveBayesModel](api/mllib/index.html#org.apache.spark.mllib.classification.NaiveBayesModel), which
+[NaiveBayesModel](api/scala/index.html#org.apache.spark.mllib.classification.NaiveBayesModel), which
can be used for evaluation and prediction.

{% highlight java %}

diff --git a/docs/mllib-optimization.md b/docs/mllib-optimization.md
index aa0dec2130593..7cb62e71d0236 100644
--- a/docs/mllib-optimization.md
+++ b/docs/mllib-optimization.md
@@ -170,17 +170,17 @@ each iteration, to compute the gradient direction.

Available algorithms for gradient descent:

-* [GradientDescent.runMiniBatchSGD](api/mllib/index.html#org.apache.spark.mllib.optimization.GradientDescent)
+* [GradientDescent.runMiniBatchSGD](api/scala/index.html#org.apache.spark.mllib.optimization.GradientDescent)

### L-BFGS

L-BFGS is currently only a low-level optimization primitive in `MLlib`. If you want to use L-BFGS in
various ML algorithms such as Linear Regression and Logistic Regression, you have to pass the
gradient of the objective function and the updater into the optimizer yourself instead of using the
training APIs like
-[LogisticRegressionWithSGD](api/mllib/index.html#org.apache.spark.mllib.classification.LogisticRegressionWithSGD).
+[LogisticRegressionWithSGD](api/scala/index.html#org.apache.spark.mllib.classification.LogisticRegressionWithSGD).
See the example below. It will be addressed in the next release.

The L1 regularization by using
-[L1Updater](api/mllib/index.html#org.apache.spark.mllib.optimization.L1Updater) will not work since the
+[L1Updater](api/scala/index.html#org.apache.spark.mllib.optimization.L1Updater) will not work since the
soft-thresholding logic in L1Updater is designed for gradient descent. See the developer's note.

The L-BFGS method
@@ -274,4 +274,4 @@ the actual gradient descent step.

However, we're able to take the gradient and loss of the objective function of regularization
for L-BFGS by ignoring the part of logic only for gradient descent such as adaptive step size
stuff. We will refactor this into a regularizer to replace the updater and separate the logic between
-regularization and step update later.
\ No newline at end of file
+regularization and step update later.

From 3a0f4a63109a6d63d2e4e6bc626738dc41ba99a9 Mon Sep 17 00:00:00 2001
From: Xiangrui Meng
Date: Sun, 18 May 2014 11:29:43 -0700
Subject: [PATCH 03/11] api/pyspark -> api/python

---
 docs/mllib-basics.md      | 6 +++---
 docs/mllib-guide.md       | 2 +-
 docs/mllib-naive-bayes.md | 6 +++---
 3 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/docs/mllib-basics.md b/docs/mllib-basics.md
index e229bd23c3fba..edb71e900820e 100644
--- a/docs/mllib-basics.md
+++ b/docs/mllib-basics.md
@@ -78,13 +78,13 @@ MLlib recognizes the following types as dense vectors:

and the following as sparse vectors:

-* MLlib's [`SparseVector`](api/pyspark/pyspark.mllib.linalg.SparseVector-class.html).
* SciPy's [`csc_matrix`](http://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.csc_matrix.html#scipy.sparse.csc_matrix) with a single column We recommend using NumPy arrays over lists for efficiency, and using the factory methods implemented -in [`Vectors`](api/pyspark/pyspark.mllib.linalg.Vectors-class.html) to create sparse vectors. +in [`Vectors`](api/python/pyspark.mllib.linalg.Vectors-class.html) to create sparse vectors. {% highlight python %} import numpy as np @@ -151,7 +151,7 @@ LabeledPoint neg = new LabeledPoint(1.0, Vectors.sparse(3, new int[] {0, 2}, new
A labeled point is represented by
-[`LabeledPoint`](api/pyspark/pyspark.mllib.regression.LabeledPoint-class.html).
+[`LabeledPoint`](api/python/pyspark.mllib.regression.LabeledPoint-class.html).

{% highlight python %}
from pyspark.mllib.linalg import SparseVector

diff --git a/docs/mllib-guide.md b/docs/mllib-guide.md
index 8c38a543fd522..640ca83085387 100644
--- a/docs/mllib-guide.md
+++ b/docs/mllib-guide.md
@@ -107,7 +107,7 @@ create sparse vectors.

We used to represent a labeled feature vector in a NumPy array, where the first entry corresponds to
the label and the rest are features. This representation is replaced by class
-[`LabeledPoint`](api/pyspark/pyspark.mllib.regression.LabeledPoint-class.html), which takes both
+[`LabeledPoint`](api/python/pyspark.mllib.regression.LabeledPoint-class.html), which takes both
dense and sparse feature vectors.

{% highlight python %}

diff --git a/docs/mllib-naive-bayes.md b/docs/mllib-naive-bayes.md
index 050c3323cfdcb..eef68d2f9d9a4 100644
--- a/docs/mllib-naive-bayes.md
+++ b/docs/mllib-naive-bayes.md
@@ -102,11 +102,11 @@ double accuracy = 1.0 * predictionAndLabel.filter(new Function

-[NaiveBayes](api/pyspark/pyspark.mllib.classification.NaiveBayes-class.html) implements multinomial
+[NaiveBayes](api/python/pyspark.mllib.classification.NaiveBayes-class.html) implements multinomial
naive Bayes. It takes an RDD of
-[LabeledPoint](api/pyspark/pyspark.mllib.regression.LabeledPoint-class.html) and an optionally
+[LabeledPoint](api/python/pyspark.mllib.regression.LabeledPoint-class.html) and an optional
smoothing parameter `lambda` as input, and outputs a
-[NaiveBayesModel](api/pyspark/pyspark.mllib.classification.NaiveBayesModel-class.html), which can be
+[NaiveBayesModel](api/python/pyspark.mllib.classification.NaiveBayesModel-class.html), which can be
used for evaluation and prediction.

From 7dad18eaabb5609be1beaadd0d29f6a0d0e6a453 Mon Sep 17 00:00:00 2001
From: Xiangrui Meng
Date: Sun, 18 May 2014 11:43:02 -0700
Subject: [PATCH 04/11] update java api links in NB

---
 docs/mllib-naive-bayes.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/docs/mllib-naive-bayes.md b/docs/mllib-naive-bayes.md
index eef68d2f9d9a4..bded1b7eb93b3 100644
--- a/docs/mllib-naive-bayes.md
+++ b/docs/mllib-naive-bayes.md
@@ -59,11 +59,11 @@ val accuracy = 1.0 * predictionAndLabel.filter(x => x._1 == x._2).count() / test
-[NaiveBayes](api/scala/index.html#org.apache.spark.mllib.classification.NaiveBayes$) implements
+[NaiveBayes](api/java/org/apache/spark/mllib/classification/NaiveBayes.html) implements
multinomial naive Bayes. It takes a Scala RDD of
-[LabeledPoint](api/scala/index.html#org.apache.spark.mllib.regression.LabeledPoint) and an
+[LabeledPoint](api/java/org/apache/spark/mllib/regression/LabeledPoint.html) and an
optional smoothing parameter `lambda` as input, and outputs a
-[NaiveBayesModel](api/scala/index.html#org.apache.spark.mllib.classification.NaiveBayesModel), which
+[NaiveBayesModel](api/java/org/apache/spark/mllib/classification/NaiveBayesModel.html), which
can be used for evaluation and prediction.

{% highlight java %}

From 9f1ff8913cd16e96d97d75b32007d2dbf11634ce Mon Sep 17 00:00:00 2001
From: Xiangrui Meng
Date: Sun, 18 May 2014 11:51:43 -0700
Subject: [PATCH 05/11] update java api links in mllib-basics

---
 docs/mllib-basics.md | 30 +++++++++++++++---------------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/docs/mllib-basics.md b/docs/mllib-basics.md
index edb71e900820e..f400dae8d9777 100644
--- a/docs/mllib-basics.md
+++ b/docs/mllib-basics.md
@@ -53,11 +53,11 @@ Scala imports `scala.collection.immutable.Vector` by default, so you have to imp
The base class of local vectors is
-[`Vector`](api/scala/index.html#org.apache.spark.mllib.linalg.Vector), and we provide two
-implementations: [`DenseVector`](api/scala/index.html#org.apache.spark.mllib.linalg.DenseVector) and
-[`SparseVector`](api/scala/index.html#org.apache.spark.mllib.linalg.SparseVector). We recommend
+[`Vector`](api/java/org/apache/spark/mllib/linalg/Vector.html), and we provide two
+implementations: [`DenseVector`](api/java/org/apache/spark/mllib/linalg/DenseVector.html) and
+[`SparseVector`](api/java/org/apache/spark/mllib/linalg/SparseVector.html). We recommend
using the factory methods implemented in
-[`Vectors`](api/scala/index.html#org.apache.spark.mllib.linalg.Vectors$) to create local vectors.
+[`Vectors`](api/java/org/apache/spark/mllib/linalg/Vectors.html) to create local vectors.

{% highlight java %}
import org.apache.spark.mllib.linalg.Vector;
@@ -134,7 +134,7 @@ val neg = LabeledPoint(0.0, Vectors.sparse(3, Array(0, 2), Array(1.0, 3.0)))
A labeled point is represented by -[`LabeledPoint`](api/scala/index.html#org.apache.spark.mllib.regression.LabeledPoint). +[`LabeledPoint`](api/java/org/apache/spark/mllib/regression/LabeledPoint.html). {% highlight java %} import org.apache.spark.mllib.linalg.Vectors; @@ -197,7 +197,7 @@ val training: RDD[LabeledPoint] = MLUtils.loadLibSVMFile(sc, "mllib/data/sample_
-[`MLUtils.loadLibSVMFile`](api/scala/index.html#org.apache.spark.mllib.util.MLUtils$) reads training +[`MLUtils.loadLibSVMFile`](api/java/org/apache/spark/mllib/util/MLUtils.html) reads training examples stored in LIBSVM format. {% highlight java %} @@ -244,10 +244,10 @@ val dm: Matrix = Matrices.dense(3, 2, Array(1.0, 3.0, 5.0, 2.0, 4.0, 6.0))
The base class of local matrices is -[`Matrix`](api/scala/index.html#org.apache.spark.mllib.linalg.Matrix), and we provide one -implementation: [`DenseMatrix`](api/scala/index.html#org.apache.spark.mllib.linalg.DenseMatrix). +[`Matrix`](api/java/org/apache/spark/mllib/linalg/Matrix.html), and we provide one +implementation: [`DenseMatrix`](api/java/org/apache/spark/mllib/linalg/DenseMatrix.html). Sparse matrix will be added in the next release. We recommend using the factory methods implemented -in [`Matrices`](api/scala/index.html#org.apache.spark.mllib.linalg.Matrices) to create local +in [`Matrices`](api/java/org/apache/spark/mllib/linalg/Matrices.html) to create local matrices. {% highlight java %} @@ -303,7 +303,7 @@ val n = mat.numCols()
-A [`RowMatrix`](api/scala/index.html#org.apache.spark.mllib.linalg.distributed.RowMatrix) can be
+A [`RowMatrix`](api/java/org/apache/spark/mllib/linalg/distributed/RowMatrix.html) can be
created from a `JavaRDD<Vector>` instance. Then we can compute its column summary statistics.

{% highlight java %}
@@ -391,9 +391,9 @@ val rowMat: RowMatrix = mat.toRowMatrix()
An
-[`IndexedRowMatrix`](api/scala/index.html#org.apache.spark.mllib.linalg.distributed.IndexedRowMatrix)
+[`IndexedRowMatrix`](api/java/org/apache/spark/mllib/linalg/distributed/IndexedRowMatrix.html)
can be created from a `JavaRDD<IndexedRow>` instance, where
-[`IndexedRow`](api/scala/index.html#org.apache.spark.mllib.linalg.distributed.IndexedRow) is a
+[`IndexedRow`](api/java/org/apache/spark/mllib/linalg/distributed/IndexedRow.html) is a
wrapper over `(long, Vector)`. An `IndexedRowMatrix` can be converted to a `RowMatrix` by dropping
its row indices.
@@ -453,13 +453,13 @@
A
-[`CoordinateMatrix`](api/scala/index.html#org.apache.spark.mllib.linalg.distributed.CoordinateMatrix)
+[`CoordinateMatrix`](api/java/org/apache/spark/mllib/linalg/distributed/CoordinateMatrix.html)
can be created from a `JavaRDD<MatrixEntry>` instance, where
-[`MatrixEntry`](api/scala/index.html#org.apache.spark.mllib.linalg.distributed.MatrixEntry) is a
+[`MatrixEntry`](api/java/org/apache/spark/mllib/linalg/distributed/MatrixEntry.html) is a
wrapper over `(long, long, double)`. A `CoordinateMatrix` can be converted to an `IndexedRowMatrix`
with sparse rows by calling `toIndexedRowMatrix`.

-{% highlight scala %}
+{% highlight java %}
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.mllib.linalg.distributed.CoordinateMatrix;
import org.apache.spark.mllib.linalg.distributed.IndexedRowMatrix;

From 195d06f24f8536b1fa0eb12313e1d54067baed7b Mon Sep 17 00:00:00 2001
From: Xiangrui Meng
Date: Sun, 18 May 2014 11:57:04 -0700
Subject: [PATCH 06/11] add Java example for summary stats and minor fix

---
 docs/mllib-basics.md | 29 ++++++++++++++++++++++++++---
 1 file changed, 27 insertions(+), 2 deletions(-)

diff --git a/docs/mllib-basics.md b/docs/mllib-basics.md
index f400dae8d9777..51b0dba110630 100644
--- a/docs/mllib-basics.md
+++ b/docs/mllib-basics.md
@@ -333,7 +333,7 @@
-`RowMatrix#computeColumnSummaryStatistics` returns an instance of +[`RowMatrix#computeColumnSummaryStatistics`](api/scala/index.html#org.apache.spark.mllib.linalg.distributed.RowMatrix) returns an instance of [`MultivariateStatisticalSummary`](api/scala/index.html#org.apache.spark.mllib.stat.MultivariateStatisticalSummary), which contains the column-wise max, min, mean, variance, and number of nonzeros, as well as the total count. @@ -355,6 +355,31 @@ println(summary.numNonzeros) // number of nonzeros in each column val cov: Matrix = mat.computeCovariance() {% endhighlight %}
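The remaining statistics named in the paragraph above can be read the same way. A small
illustrative addition, not part of the patch, using the `max`, `min`, and `count` members that
`MultivariateStatisticalSummary` also declares:

{% highlight scala %}
println(summary.max) // a dense vector containing the largest value of each column
println(summary.min) // a dense vector containing the smallest value of each column
println(summary.count) // the total number of rows
{% endhighlight %}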
+ +
+ +[`RowMatrix#computeColumnSummaryStatistics`](api/java/org/apache/spark/mllib/linalg/distributed/RowMatrix.html#computeColumnSummaryStatistics()) returns an instance of +[`MultivariateStatisticalSummary`](api/java/org/apache/spark/mllib/stat/MultivariateStatisticalSummary.html), +which contains the column-wise max, min, mean, variance, and number of nonzeros, as well as the +total count. + +{% highlight java %} +import org.apache.spark.mllib.linalg.Matrix; +import org.apache.spark.mllib.linalg.distributed.RowMatrix; +import org.apache.spark.mllib.stat.MultivariateStatisticalSummary; + +RowMatrix mat = ... // a RowMatrix + +// Compute column summary statistics. +MultivariateStatisticalSummary summary = mat.computeColumnSummaryStatistics(); +System.out.println(summary.mean()); // a dense vector containing the mean value for each column +System.out.println(summary.variance()); // column-wise variance +System.out.println(summary.numNonzeros()); // number of nonzeros in each column + +// Compute the covariance matrix. +Matrix cov = mat.computeCovariance(); +{% endhighlight %} +
### IndexedRowMatrix
@@ -467,7 +492,7 @@ import org.apache.spark.mllib.linalg.distributed.MatrixEntry;

JavaRDD<MatrixEntry> entries = ... // a JavaRDD of matrix entries
// Create a CoordinateMatrix from a JavaRDD.
-CoordinateMatrix mat = new CoordinateMatrix(entries);
+CoordinateMatrix mat = new CoordinateMatrix(entries.rdd());

// Get its size.
long m = mat.numRows();

From 561fdc09fc4060a4adc9f39dfd4ed3fb8abdcc16 Mon Sep 17 00:00:00 2001
From: Xiangrui Meng
Date: Sun, 18 May 2014 12:22:26 -0700
Subject: [PATCH 07/11] add a displayTitle option to global layout

---
 docs/_layouts/global.html | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/docs/_layouts/global.html b/docs/_layouts/global.html
index 8b543de574622..fb808129bb65d 100755
--- a/docs/_layouts/global.html
+++ b/docs/_layouts/global.html
@@ -114,7 +114,11 @@
-            <h1 class="title">{{ page.title }}</h1>
+            {% if page.displayTitle %}
+              <h1 class="title">{{ page.displayTitle }}</h1>
+            {% else %}
+              <h1 class="title">{{ page.title }}</h1>
+ {% endif %} {{ content }} From d6509c20f3bc216b93c1b967ca0a31aee2bb8204 Mon Sep 17 00:00:00 2001 From: Xiangrui Meng Date: Sun, 18 May 2014 12:28:53 -0700 Subject: [PATCH 08/11] [SPARK-1783] update mllib titles --- docs/mllib-basics.md | 3 ++- docs/mllib-clustering.md | 3 ++- docs/mllib-collaborative-filtering.md | 3 ++- docs/mllib-decision-tree.md | 3 ++- docs/mllib-dimensionality-reduction.md | 3 ++- docs/mllib-linear-methods.md | 3 ++- docs/mllib-naive-bayes.md | 3 ++- docs/mllib-optimization.md | 3 ++- 8 files changed, 16 insertions(+), 8 deletions(-) diff --git a/docs/mllib-basics.md b/docs/mllib-basics.md index 51b0dba110630..8d93dff46ec6d 100644 --- a/docs/mllib-basics.md +++ b/docs/mllib-basics.md @@ -1,6 +1,7 @@ --- layout: global -title: MLlib - Basics +title: Basics - MLlib +displayTitle: MLlib - Basics --- * Table of contents diff --git a/docs/mllib-clustering.md b/docs/mllib-clustering.md index 2f909b30fa9e5..429cdf8d40cec 100644 --- a/docs/mllib-clustering.md +++ b/docs/mllib-clustering.md @@ -1,6 +1,7 @@ --- layout: global -title: MLlib - Clustering +title: Clustering - MLlib +displayTitle: MLlib - Clustering --- * Table of contents diff --git a/docs/mllib-collaborative-filtering.md b/docs/mllib-collaborative-filtering.md index f6fb261a1a7e5..f4fd158b729d9 100644 --- a/docs/mllib-collaborative-filtering.md +++ b/docs/mllib-collaborative-filtering.md @@ -1,6 +1,7 @@ --- layout: global -title: MLlib - Collaborative Filtering +title: Collaborative Filtering - MLlib +displayTitle: MLlib - Collaborative Filtering --- * Table of contents diff --git a/docs/mllib-decision-tree.md b/docs/mllib-decision-tree.md index acf0feff42a8d..3002a66a4fdb3 100644 --- a/docs/mllib-decision-tree.md +++ b/docs/mllib-decision-tree.md @@ -1,6 +1,7 @@ --- layout: global -title: MLlib - Decision Tree +title: Decision Tree - MLlib +displayTitle: MLlib - Decision Tree --- * Table of contents diff --git a/docs/mllib-dimensionality-reduction.md b/docs/mllib-dimensionality-reduction.md index ab24663cfe258..e3608075fbb13 100644 --- a/docs/mllib-dimensionality-reduction.md +++ b/docs/mllib-dimensionality-reduction.md @@ -1,6 +1,7 @@ --- layout: global -title: MLlib - Dimensionality Reduction +title: Dimensionality Reduction - MLlib +displayTitle: MLlib - Dimensionality Reduction --- * Table of contents diff --git a/docs/mllib-linear-methods.md b/docs/mllib-linear-methods.md index d80fb327789a5..4dfbebbcd04b7 100644 --- a/docs/mllib-linear-methods.md +++ b/docs/mllib-linear-methods.md @@ -1,6 +1,7 @@ --- layout: global -title: MLlib - Linear Methods +title: Linear Methods - MLlib +displayTitle: MLlib - Linear Methods --- * Table of contents diff --git a/docs/mllib-naive-bayes.md b/docs/mllib-naive-bayes.md index bded1b7eb93b3..4b3a7cab32118 100644 --- a/docs/mllib-naive-bayes.md +++ b/docs/mllib-naive-bayes.md @@ -1,6 +1,7 @@ --- layout: global -title: MLlib - Naive Bayes +title: Naive Bayes - MLlib +displayTitle: MLlib - Naive Bayes --- Naive Bayes is a simple multiclass classification algorithm with the assumption of independence diff --git a/docs/mllib-optimization.md b/docs/mllib-optimization.md index 7cb62e71d0236..a22980d03a2f0 100644 --- a/docs/mllib-optimization.md +++ b/docs/mllib-optimization.md @@ -1,6 +1,7 @@ --- layout: global -title: MLlib - Optimization +title: Optimization - MLlib +displayTitle: MLlib - Optimization --- * Table of contents From 4617f04b50a444728552920340a34d90c55be810 Mon Sep 17 00:00:00 2001 From: Xiangrui Meng Date: Sun, 18 May 2014 12:39:02 -0700 Subject: [PATCH 09/11] add 
python example to loadLibSVMFile and fix Java example --- docs/mllib-basics.md | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/docs/mllib-basics.md b/docs/mllib-basics.md index 8d93dff46ec6d..3a8835ca2ab37 100644 --- a/docs/mllib-basics.md +++ b/docs/mllib-basics.md @@ -193,7 +193,7 @@ import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.util.MLUtils import org.apache.spark.rdd.RDD -val training: RDD[LabeledPoint] = MLUtils.loadLibSVMFile(sc, "mllib/data/sample_libsvm_data.txt") +val examples: RDD[LabeledPoint] = MLUtils.loadLibSVMFile(sc, "mllib/data/sample_libsvm_data.txt") {% endhighlight %}
@@ -204,9 +204,21 @@ examples stored in LIBSVM format. {% highlight java %} import org.apache.spark.mllib.regression.LabeledPoint; import org.apache.spark.mllib.util.MLUtils; -import org.apache.spark.rdd.RDDimport; +import org.apache.spark.api.java.JavaRDD; + +JavaRDD examples = + MLUtils.loadLibSVMFile(jsc.sc(), "mllib/data/sample_libsvm_data.txt").toJavaRDD(); +{% endhighlight %} +
+ +
[`MLUtils.loadLibSVMFile`](api/python/pyspark.mllib.util.MLUtils-class.html) reads training
examples stored in LIBSVM format.

{% highlight python %}
from pyspark.mllib.util import MLUtils

-RDD<LabeledPoint> training = MLUtils.loadLibSVMFile(jsc, "mllib/data/sample_libsvm_data.txt");
+examples = MLUtils.loadLibSVMFile(sc, "mllib/data/sample_libsvm_data.txt")
{% endhighlight %}
From cd9f40b40ba2b12d5c56ac668b24556a55352f82 Mon Sep 17 00:00:00 2001 From: Xiangrui Meng Date: Sun, 18 May 2014 12:51:56 -0700 Subject: [PATCH 10/11] add a paragraph to summarize distributed matrix types --- docs/mllib-basics.md | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/docs/mllib-basics.md b/docs/mllib-basics.md index 3a8835ca2ab37..5796e16e8f99c 100644 --- a/docs/mllib-basics.md +++ b/docs/mllib-basics.md @@ -282,6 +282,15 @@ and distributed matrices. Converting a distributed matrix to a different format global shuffle, which is quite expensive. We implemented three types of distributed matrices in this release and will add more types in the future. +The basic type is called `RowMatrix`. A `RowMatrix` is a row-oriented distributed +matrix without meaningful row indices, e.g., a collection of feature vectors. +It is backed by an RDD of its rows, where each row is a local vector. +We assume that the number of columns is not huge for a `RowMatrix`. +An `IndexedRowMatrix` is similar to a `RowMatrix` but with row indices, +which can be used for identifying rows and joins. +A `CoordinateMatrix` is a distributed matrix stored in [coordinate list (COO)](https://en.wikipedia.org/wiki/Sparse_matrix) format, +backed by an RDD of its entries. + ***Note*** The underlying RDDs of a distributed matrix must be deterministic, because we cache the matrix size. From ec2e407fede9a56bfe6828331e3313d8e030beb2 Mon Sep 17 00:00:00 2001 From: Xiangrui Meng Date: Sun, 18 May 2014 12:58:37 -0700 Subject: [PATCH 11/11] format scala example for ALS --- docs/mllib-collaborative-filtering.md | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/docs/mllib-collaborative-filtering.md b/docs/mllib-collaborative-filtering.md index f4fd158b729d9..d51002f015670 100644 --- a/docs/mllib-collaborative-filtering.md +++ b/docs/mllib-collaborative-filtering.md @@ -59,9 +59,9 @@ import org.apache.spark.mllib.recommendation.Rating // Load and parse the data val data = sc.textFile("mllib/data/als/test.data") -val ratings = data.map(_.split(',') match { - case Array(user, item, rate) => Rating(user.toInt, item.toInt, rate.toDouble) -}) +val ratings = data.map(_.split(',') match { case Array(user, item, rate) => + Rating(user.toInt, item.toInt, rate.toDouble) + }) // Build the recommendation model using ALS val rank = 10 @@ -69,15 +69,19 @@ val numIterations = 20 val model = ALS.train(ratings, rank, numIterations, 0.01) // Evaluate the model on rating data -val usersProducts = ratings.map{ case Rating(user, product, rate) => (user, product)} -val predictions = model.predict(usersProducts).map{ - case Rating(user, product, rate) => ((user, product), rate) +val usersProducts = ratings.map { case Rating(user, product, rate) => + (user, product) } -val ratesAndPreds = ratings.map{ - case Rating(user, product, rate) => ((user, product), rate) +val predictions = + model.predict(usersProducts).map { case Rating(user, product, rate) => + ((user, product), rate) + } +val ratesAndPreds = ratings.map { case Rating(user, product, rate) => + ((user, product), rate) }.join(predictions) -val MSE = ratesAndPreds.map{ - case ((user, product), (r1, r2)) => math.pow((r1- r2), 2) +val MSE = ratesAndPreds.map { case ((user, product), (r1, r2)) => + val err = (r1 - r2) + err * err }.mean() println("Mean Squared Error = " + MSE) {% endhighlight %}
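As a closing note on the reformatted ALS example, the trained model can also score a single
user-product pair. A hedged sketch, not part of the patch series, assuming the `model` from the
example above and the `predict(user: Int, product: Int)` method of `MatrixFactorizationModel`:

{% highlight scala %}
// Predict the rating that user 1 would give product 2.
val predictedRating: Double = model.predict(1, 2)
{% endhighlight %}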