45 changes: 27 additions & 18 deletions docs/mllib-classification-regression.md
@@ -356,16 +356,17 @@ error.
import org.apache.spark.SparkContext
import org.apache.spark.mllib.classification.SVMWithSGD
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.linalg.Vectors

// Load and parse the data file
val data = sc.textFile("mllib/data/sample_svm_data.txt")
val parsedData = data.map { line =>
  val parts = line.split(' ')
  LabeledPoint(parts(0).toDouble, parts.tail.map(x => x.toDouble).toArray)
  val parts = line.split(' ').map(_.toDouble)
  LabeledPoint(parts(0), Vectors.dense(parts.tail))
}

// Run training algorithm to build the model
val numIterations = 20
val numIterations = 100
val model = SVMWithSGD.train(parsedData, numIterations)

// Evaluate model on training examples and compute training error
@@ -401,29 +402,30 @@ val modelL1 = svmAlg.run(parsedData)
The following example demonstrates how to load training data and parse it as an RDD of LabeledPoint.
The example then uses LinearRegressionWithSGD to build a simple linear model to predict label
values. We compute the Mean Squared Error at the end to evaluate
[goodness of fit](http://en.wikipedia.org/wiki/Goodness_of_fit)
[goodness of fit](http://en.wikipedia.org/wiki/Goodness_of_fit).

{% highlight scala %}
import org.apache.spark.mllib.regression.LinearRegressionWithSGD
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.linalg.Vectors

// Load and parse the data
val data = sc.textFile("mllib/data/ridge-data/lpsa.data")
val parsedData = data.map { line =>
  val parts = line.split(',')
  LabeledPoint(parts(0).toDouble, parts(1).split(' ').map(x => x.toDouble).toArray)
  LabeledPoint(parts(0).toDouble, Vectors.dense(parts(1).split(' ').map(_.toDouble)))
}

// Building the model
val numIterations = 20
val numIterations = 100
val model = LinearRegressionWithSGD.train(parsedData, numIterations)

// Evaluate model on training examples and compute training error
val valuesAndPreds = parsedData.map { point =>
  val prediction = model.predict(point.features)
  (point.label, prediction)
}
val MSE = valuesAndPreds.map{ case(v, p) => math.pow((v - p), 2)}.reduce(_ + _)/valuesAndPreds.count
val MSE = valuesAndPreds.map{case(v, p) => math.pow((v - p), 2)}.reduce(_ + _) / valuesAndPreds.count
println("training Mean Squared Error = " + MSE)
{% endhighlight %}

@@ -518,18 +520,22 @@ and make predictions with the resulting model to compute the training error.

{% highlight python %}
from pyspark.mllib.classification import LogisticRegressionWithSGD
from pyspark.mllib.regression import LabeledPoint
from numpy import array

# Load and parse the data
def parsePoint(line):
    values = [float(x) for x in line.split(' ')]
    return LabeledPoint(values[0], values[1:])

data = sc.textFile("mllib/data/sample_svm_data.txt")
parsedData = data.map(lambda line: array([float(x) for x in line.split(' ')]))
model = LogisticRegressionWithSGD.train(parsedData)
parsedData = data.map(parsePoint)

# Build the model
labelsAndPreds = parsedData.map(lambda point: (int(point.item(0)),
        model.predict(point.take(range(1, point.size)))))
model = LogisticRegressionWithSGD.train(parsedData)

# Evaluating the model on training data
labelsAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features)))
trainErr = labelsAndPreds.filter(lambda (v, p): v != p).count() / float(parsedData.count())
print("Training Error = " + str(trainErr))
{% endhighlight %}
@@ -538,22 +544,25 @@
The following example demonstrates how to load training data and parse it as an RDD of LabeledPoint.
The example then uses LinearRegressionWithSGD to build a simple linear model to predict label
values. We compute the Mean Squared Error at the end to evaluate
[goodness of fit](http://en.wikipedia.org/wiki/Goodness_of_fit)
[goodness of fit](http://en.wikipedia.org/wiki/Goodness_of_fit).

{% highlight python %}
from pyspark.mllib.regression import LinearRegressionWithSGD
from pyspark.mllib.regression import LabeledPoint, LinearRegressionWithSGD
from numpy import array

# Load and parse the data
def parsePoint(line):
    values = [float(x) for x in line.replace(',', ' ').split(' ')]
    return LabeledPoint(values[0], values[1:])

data = sc.textFile("mllib/data/ridge-data/lpsa.data")
parsedData = data.map(lambda line: array([float(x) for x in line.replace(',', ' ').split(' ')]))
parsedData = data.map(parsePoint)

# Build the model
model = LinearRegressionWithSGD.train(parsedData)

# Evaluate the model on training data
valuesAndPreds = parsedData.map(lambda point: (point.item(0),
        model.predict(point.take(range(1, point.size)))))
MSE = valuesAndPreds.map(lambda (v, p): (v - p)**2).reduce(lambda x, y: x + y)/valuesAndPreds.count()
valuesAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features)))
MSE = valuesAndPreds.map(lambda (v, p): (v - p)**2).reduce(lambda x, y: x + y) / valuesAndPreds.count()
print("Mean Squared Error = " + str(MSE))
{% endhighlight %}
{% endhighlight %}
11 changes: 6 additions & 5 deletions docs/mllib-clustering.md
@@ -48,14 +48,15 @@ optimal *k* is usually one where there is an "elbow" in the WSSSE graph.

{% highlight scala %}
import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.linalg.Vectors

// Load and parse the data
val data = sc.textFile("kmeans_data.txt")
val parsedData = data.map( _.split(' ').map(_.toDouble))
val data = sc.textFile("data/kmeans_data.txt")
val parsedData = data.map(s => Vectors.dense(s.split(' ').map(_.toDouble)))

// Cluster the data into two classes using KMeans
val numIterations = 20
val numClusters = 2
val numIterations = 20
val clusters = KMeans.train(parsedData, numClusters, numIterations)

// Evaluate clustering by computing Within Set Sum of Squared Errors
@@ -85,12 +86,12 @@ from numpy import array
from math import sqrt

# Load and parse the data
data = sc.textFile("kmeans_data.txt")
data = sc.textFile("data/kmeans_data.txt")
parsedData = data.map(lambda line: array([float(x) for x in line.split(' ')]))

# Build the model (cluster the data)
clusters = KMeans.train(parsedData, 2, maxIterations=10,
        runs=30, initialization_mode="random")
        runs=10, initialization_mode="random")

# Evaluate clustering by computing Within Set Sum of Squared Errors
def error(point):
27 changes: 25 additions & 2 deletions docs/mllib-guide.md
@@ -7,8 +7,9 @@ title: Machine Learning Library (MLlib)
MLlib is a Spark implementation of some common machine learning (ML)
functionality, as well as associated tests and data generators. MLlib
currently supports four common types of machine learning problem settings,
namely, binary classification, regression, clustering and collaborative
filtering, as well as an underlying gradient descent optimization primitive.
namely classification, regression, clustering and collaborative filtering,
as well as an underlying gradient descent optimization primitive and several
linear algebra methods.

# Available Methods
The following links provide a detailed explanation of the methods and usage examples for each of them:
@@ -32,6 +33,28 @@ The following links provide a detailed explanation of the methods and usage exam
* Singular Value Decomposition
* Principal Component Analysis

# Data Types

Most MLlib algorithms operate on RDDs containing vectors. In Java and Scala, the
[Vector](api/mllib/index.html#org.apache.spark.mllib.linalg.Vector) class is used to
represent vectors. You can create either dense or sparse vectors using the
[Vectors](api/mllib/index.html#org.apache.spark.mllib.linalg.Vectors$) factory.

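For example, dense and sparse versions of the same vector could be constructed through this
factory roughly as follows (a minimal sketch; the three element values are arbitrary):

{% highlight scala %}
import org.apache.spark.mllib.linalg.Vectors

// Create the dense vector (1.0, 0.0, 3.0).
val dv = Vectors.dense(1.0, 0.0, 3.0)

// Create the same vector in sparse form: size 3, with nonzero entries at indices 0 and 2.
val sv = Vectors.sparse(3, Array(0, 2), Array(1.0, 3.0))
{% endhighlight %}
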
In Python, MLlib can take the following vector types:

* [NumPy](http://www.numpy.org) arrays
* Standard Python lists (e.g. `[1, 2, 3]`)
* The MLlib [SparseVector](api/pyspark/pyspark.mllib.linalg.SparseVector-class.html) class
* [SciPy sparse matrices](http://docs.scipy.org/doc/scipy/reference/sparse.html)

For efficiency, we recommend using NumPy arrays over lists, and using the
[CSC format](http://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.csc_matrix.html#scipy.sparse.csc_matrix)
for SciPy matrices, or MLlib's own SparseVector class.

Several other simple data types are used throughout the library, e.g. the LabeledPoint
class ([Java/Scala](api/mllib/index.html#org.apache.spark.mllib.regression.LabeledPoint),
[Python](api/pyspark/pyspark.mllib.regression.LabeledPoint-class.html)) for labeled data.
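
As a brief illustration (the label and feature values here are arbitrary), a LabeledPoint pairs a
double label with a feature vector:

{% highlight scala %}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.linalg.Vectors

// A positive example (label 1.0) with dense features.
val pos = LabeledPoint(1.0, Vectors.dense(1.0, 0.0, 3.0))

// A negative example (label 0.0) with sparse features.
val neg = LabeledPoint(0.0, Vectors.sparse(3, Array(0, 2), Array(1.0, 3.0)))
{% endhighlight %}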

# Dependencies
MLlib uses the [jblas](https://github.com/mikiobraun/jblas) linear algebra library, which itself
depends on native Fortran routines. You may need to install the