30 commits
19aa523 update toString and add parsers for Vectors and LabeledPoint (mengxr, May 7, 2014)
9e63a02 add loadVectors and loadLabeledPoints (mengxr, May 7, 2014)
e761d32 make LabelPoint.parse compatible with the dense format used before v1… (mengxr, May 7, 2014)
7853f88 update pyspark's SparseVector.__str__ (mengxr, May 7, 2014)
5c2dbfa add parse to pyspark's Vectors (mengxr, May 7, 2014)
a7a178e add stringify to pyspark's Vectors (mengxr, May 7, 2014)
cd6c78f add __str__ and parse to LabeledPoint (mengxr, May 7, 2014)
ea122b5 Merge branch 'master' into labeled-io (mengxr, May 8, 2014)
63dc396 add loadLabeledPoints to pyspark (mengxr, May 8, 2014)
d731817 style update (mengxr, May 8, 2014)
b0c50cb add customized parser (mengxr, May 8, 2014)
c1885c1 add headers and minor changes (mengxr, May 9, 2014)
7aac03a remove Scala parsers (mengxr, May 9, 2014)
810d6df update tokenizer/parser implementation (mengxr, May 14, 2014)
aea4ae3 minor updates (mengxr, May 14, 2014)
e9fcd49 add serializeLabeledPoint and tests (mengxr, May 14, 2014)
ce9a475 add deserialize_labeled_point to pyspark with tests (mengxr, May 14, 2014)
a41675a python loadLabeledPoint uses Scala's implementation (mengxr, May 14, 2014)
f644438 remove parse methods based on eval from pyspark (mengxr, May 14, 2014)
6155b75 merge master (mengxr, May 14, 2014)
050fca4 use StringTokenizer (mengxr, May 14, 2014)
e86bf38 remove NumericTokenizer (mengxr, May 14, 2014)
5bcfbc4 update test to add scientific notations (mengxr, May 14, 2014)
640fe0c throw SparkException (mengxr, May 14, 2014)
f06d5ba add docs and minor updates (mengxr, May 15, 2014)
623a5f0 merge master (mengxr, Jun 2, 2014)
56746ea replace # by . (mengxr, Jun 2, 2014)
d6b1473 Merge branch 'master' into labeled-io (mengxr, Jun 2, 2014)
297be75 change LabeledPoint.parse to LabeledPointParser.parse to maintain bin… (mengxr, Jun 3, 2014)
2d1116a make loadLabeledData/saveLabeledData deprecated since 1.0.1 (mengxr, Jun 3, 2014)
@@ -99,7 +99,7 @@ object DecisionTreeRunner {
val sc = new SparkContext(conf)

// Load training data and cache it.
val examples = MLUtils.loadLabeledData(sc, params.input).cache()
val examples = MLUtils.loadLabeledPoints(sc, params.input).cache()

val splits = examples.randomSplit(Array(0.8, 0.2))
val training = splits(0).cache()
@@ -20,12 +20,13 @@ package org.apache.spark.mllib.api.python
import java.nio.{ByteBuffer, ByteOrder}

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.api.java.JavaRDD
import org.apache.spark.api.java.{JavaSparkContext, JavaRDD}
import org.apache.spark.mllib.classification._
import org.apache.spark.mllib.clustering._
import org.apache.spark.mllib.linalg.{SparseVector, Vector, Vectors}
import org.apache.spark.mllib.recommendation._
import org.apache.spark.mllib.regression._
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.rdd.RDD

/**
@@ -41,7 +42,7 @@ class PythonMLLibAPI extends Serializable {
private val DENSE_MATRIX_MAGIC: Byte = 3
private val LABELED_POINT_MAGIC: Byte = 4

private def deserializeDoubleVector(bytes: Array[Byte], offset: Int = 0): Vector = {
private[python] def deserializeDoubleVector(bytes: Array[Byte], offset: Int = 0): Vector = {
require(bytes.length - offset >= 5, "Byte array too short")
val magic = bytes(offset)
if (magic == DENSE_VECTOR_MAGIC) {
@@ -116,7 +117,7 @@ class PythonMLLibAPI extends Serializable {
bytes
}

private def serializeDoubleVector(vector: Vector): Array[Byte] = vector match {
private[python] def serializeDoubleVector(vector: Vector): Array[Byte] = vector match {
case s: SparseVector =>
serializeSparseVector(s)
case _ =>
@@ -167,7 +168,18 @@ class PythonMLLibAPI extends Serializable {
bytes
}

private def deserializeLabeledPoint(bytes: Array[Byte]): LabeledPoint = {
private[python] def serializeLabeledPoint(p: LabeledPoint): Array[Byte] = {
val fb = serializeDoubleVector(p.features)
val bytes = new Array[Byte](1 + 8 + fb.length)
val bb = ByteBuffer.wrap(bytes)
bb.order(ByteOrder.nativeOrder())
bb.put(LABELED_POINT_MAGIC)
bb.putDouble(p.label)
bb.put(fb)
bytes
}

private[python] def deserializeLabeledPoint(bytes: Array[Byte]): LabeledPoint = {
require(bytes.length >= 9, "Byte array too short")
val magic = bytes(0)
if (magic != LABELED_POINT_MAGIC) {
@@ -179,6 +191,19 @@ class PythonMLLibAPI extends Serializable {
LabeledPoint(label, deserializeDoubleVector(bytes, 9))
}

/**
* Loads and serializes labeled points saved with `RDD#saveAsTextFile`.
* @param jsc Java SparkContext
* @param path file or directory path in any Hadoop-supported file system URI
* @param minPartitions min number of partitions
* @return serialized labeled points stored in a JavaRDD of byte array
*/
def loadLabeledPoints(
jsc: JavaSparkContext,
path: String,
minPartitions: Int): JavaRDD[Array[Byte]] =
MLUtils.loadLabeledPoints(jsc.sc, path, minPartitions).map(serializeLabeledPoint).toJavaRDD()

private def trainRegressionModel(
trainFunc: (RDD[LabeledPoint], Vector) => GeneralizedLinearModel,
dataBytesJRDD: JavaRDD[Array[Byte]],
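The labeled-point record built by serializeLabeledPoint above is a flat byte array: one LABELED_POINT_MAGIC byte (4, per the constant shown earlier in this file), the label as an 8-byte double in native byte order, and then whatever bytes serializeDoubleVector produced for the features. A minimal standalone sketch of reading the label back out of such a record; the helper object and method names are hypothetical, not part of the PR, and decoding the feature bytes is omitted because the vector encoding is defined elsewhere:

import java.nio.{ByteBuffer, ByteOrder}

object LabeledPointRecord {
  // Mirrors LABELED_POINT_MAGIC in PythonMLLibAPI.
  private val LabeledPointMagic: Byte = 4

  // Reads the label from a record produced by serializeLabeledPoint.
  // Byte 0: magic, bytes 1-8: label as a native-order double, bytes 9+: features.
  def readLabel(bytes: Array[Byte]): Double = {
    require(bytes.length >= 9, "Byte array too short")
    require(bytes(0) == LabeledPointMagic, "Not a labeled point record")
    val bb = ByteBuffer.wrap(bytes, 1, 8)
    bb.order(ByteOrder.nativeOrder())
    bb.getDouble
  }
}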
33 changes: 28 additions & 5 deletions mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala
@@ -17,13 +17,16 @@

package org.apache.spark.mllib.linalg

import java.lang.{Iterable => JavaIterable, Integer => JavaInteger, Double => JavaDouble}
import java.lang.{Double => JavaDouble, Integer => JavaInteger, Iterable => JavaIterable}
import java.util.Arrays

import scala.annotation.varargs
import scala.collection.JavaConverters._

import breeze.linalg.{Vector => BV, DenseVector => BDV, SparseVector => BSV}
import breeze.linalg.{DenseVector => BDV, SparseVector => BSV, Vector => BV}

import org.apache.spark.mllib.util.NumericParser
import org.apache.spark.SparkException

/**
* Represents a numeric vector, whose index type is Int and value type is Double.
@@ -124,6 +127,25 @@ object Vectors {
}.toSeq)
}

/**
* Parses a string resulted from `Vector#toString` into
* an [[org.apache.spark.mllib.linalg.Vector]].
*/
def parse(s: String): Vector = {
parseNumeric(NumericParser.parse(s))
}

private[mllib] def parseNumeric(any: Any): Vector = {
any match {
case values: Array[Double] =>
Vectors.dense(values)
case Seq(size: Double, indices: Array[Double], values: Array[Double]) =>
Vectors.sparse(size.toInt, indices.map(_.toInt), values)
case other =>
throw new SparkException(s"Cannot parse $other.")
}
}

/**
* Creates a vector instance from a breeze vector.
*/
@@ -175,9 +197,10 @@ class SparseVector(
val indices: Array[Int],
val values: Array[Double]) extends Vector {

override def toString: String = {
"(" + size + "," + indices.zip(values).mkString("[", "," ,"]") + ")"
}
require(indices.length == values.length)

override def toString: String =
"(%s,%s,%s)".format(size, indices.mkString("[", ",", "]"), values.mkString("[", ",", "]"))

override def toArray: Array[Double] = {
val data = new Array[Double](size)
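With the updated toString, a sparse vector now prints as "(size,[indices],[values])", and Vectors.parse reverses that via parseNumeric. A small round-trip sketch; the comments show the strings the format implies rather than captured output, and the dense example assumes NumericParser turns a bracketed list into an Array[Double], which the Array[Double] branch of parseNumeric implies:

import org.apache.spark.mllib.linalg.Vectors

// Sparse vector of size 4 with non-zeros at indices 1 and 3;
// its toString is "(4,[1,3],[3.5,-1.0])" under the format above.
val sv = Vectors.sparse(4, Array(1, 3), Array(3.5, -1.0))
val roundTripped = Vectors.parse(sv.toString)

// A bracketed list of values goes through the Array[Double] branch
// of parseNumeric and comes back as a dense vector.
val dv = Vectors.parse("[1.0,0.0,2.5]")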
@@ -17,7 +17,9 @@

package org.apache.spark.mllib.regression

import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.mllib.linalg.{Vectors, Vector}
import org.apache.spark.mllib.util.NumericParser
import org.apache.spark.SparkException

/**
* Class that represents the features and labels of a data point.
@@ -27,6 +29,31 @@ import org.apache.spark.mllib.linalg.Vector
*/
case class LabeledPoint(label: Double, features: Vector) {
override def toString: String = {
"LabeledPoint(%s, %s)".format(label, features)
"(%s,%s)".format(label, features)
}
}

/**
* Parser for [[org.apache.spark.mllib.regression.LabeledPoint]].
*/
private[mllib] object LabeledPointParser {
/**
* Parses a string resulted from `LabeledPoint#toString` into
* an [[org.apache.spark.mllib.regression.LabeledPoint]].
*/
def parse(s: String): LabeledPoint = {
if (s.startsWith("(")) {
NumericParser.parse(s) match {
case Seq(label: Double, numeric: Any) =>
LabeledPoint(label, Vectors.parseNumeric(numeric))
case other =>
throw new SparkException(s"Cannot parse $other.")
}
} else { // dense format used before v1.0
Review comment (Contributor): Did we ever expose this to users before 1.0? I don't think we need compatibility with it.
Review comment (Contributor): Oh I see, we actually did; never mind.
val parts = s.split(',')
val label = java.lang.Double.parseDouble(parts(0))
val features = Vectors.dense(parts(1).trim().split(' ').map(java.lang.Double.parseDouble))
LabeledPoint(label, features)
}
}
}
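LabeledPointParser accepts both the new "(label,features)" form produced by the updated toString and the comma-separated dense form used before v1.0. A short sketch; since the parser is private[mllib], this assumes the snippet lives under the org.apache.spark.mllib package, and the sparse example additionally assumes NumericParser handles the nested "(size,[indices],[values])" form:

import org.apache.spark.mllib.regression.LabeledPointParser

// New format: "(label,features)", where features use Vector#toString.
val dense = LabeledPointParser.parse("(1.0,[1.0,0.0,3.0])")
val sparse = LabeledPointParser.parse("(0.0,(4,[1,3],[3.5,-1.0]))")

// Pre-1.0 dense format: "label, f1 f2 f3", kept for compatibility.
val legacy = LabeledPointParser.parse("-1.0, 1.5 0.0 3.2")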
@@ -129,7 +129,8 @@ object LinearDataGenerator {
val sc = new SparkContext(sparkMaster, "LinearDataGenerator")
val data = generateLinearRDD(sc, nexamples, nfeatures, eps, nparts = parts)

MLUtils.saveLabeledData(data, outputPath)
data.saveAsTextFile(outputPath)

sc.stop()
}
}
@@ -79,7 +79,8 @@ object LogisticRegressionDataGenerator {
val sc = new SparkContext(sparkMaster, "LogisticRegressionDataGenerator")
val data = generateLogisticRDD(sc, nexamples, nfeatures, eps, parts)

MLUtils.saveLabeledData(data, outputPath)
data.saveAsTextFile(outputPath)

sc.stop()
}
}
47 changes: 42 additions & 5 deletions mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala
@@ -27,7 +27,7 @@ import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.rdd.PartitionwiseSampledRDD
import org.apache.spark.util.random.BernoulliSampler
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.regression.{LabeledPointParser, LabeledPoint}
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.storage.StorageLevel

@@ -180,7 +180,39 @@
}

/**
* :: Experimental ::
* Loads vectors saved using `RDD[Vector].saveAsTextFile`.
* @param sc Spark context
* @param path file or directory path in any Hadoop-supported file system URI
* @param minPartitions min number of partitions
* @return vectors stored as an RDD[Vector]
*/
def loadVectors(sc: SparkContext, path: String, minPartitions: Int): RDD[Vector] =
sc.textFile(path, minPartitions).map(Vectors.parse)

/**
* Loads vectors saved using `RDD[Vector].saveAsTextFile` with the default number of partitions.
*/
def loadVectors(sc: SparkContext, path: String): RDD[Vector] =
sc.textFile(path, sc.defaultMinPartitions).map(Vectors.parse)

/**
* Loads labeled points saved using `RDD[LabeledPoint].saveAsTextFile`.
* @param sc Spark context
* @param path file or directory path in any Hadoop-supported file system URI
* @param minPartitions min number of partitions
* @return labeled points stored as an RDD[LabeledPoint]
*/
def loadLabeledPoints(sc: SparkContext, path: String, minPartitions: Int): RDD[LabeledPoint] =
sc.textFile(path, minPartitions).map(LabeledPointParser.parse)

/**
* Loads labeled points saved using `RDD[LabeledPoint].saveAsTextFile` with the default number of
* partitions.
*/
def loadLabeledPoints(sc: SparkContext, dir: String): RDD[LabeledPoint] =
loadLabeledPoints(sc, dir, sc.defaultMinPartitions)

/**
* Load labeled data from a file. The data format used here is
* <L>, <f1> <f2> ...
* where <f1>, <f2> are feature values in Double and <L> is the corresponding label as Double.
@@ -189,8 +221,11 @@ object MLUtils {
* @param dir Directory to the input data files.
* @return An RDD of LabeledPoint. Each labeled point has two elements: the first element is
* the label, and the second element represents the feature values (an array of Double).
*
* @deprecated Should use [[org.apache.spark.rdd.RDD#saveAsTextFile]] for saving and
* [[org.apache.spark.mllib.util.MLUtils#loadLabeledPoints]] for loading.
*/
@Experimental
@deprecated("Should use MLUtils.loadLabeledPoints instead.", "1.0.1")
def loadLabeledData(sc: SparkContext, dir: String): RDD[LabeledPoint] = {
sc.textFile(dir).map { line =>
val parts = line.split(',')
@@ -201,15 +236,17 @@
}

/**
* :: Experimental ::
* Save labeled data to a file. The data format used here is
* <L>, <f1> <f2> ...
* where <f1>, <f2> are feature values in Double and <L> is the corresponding label as Double.
*
* @param data An RDD of LabeledPoints containing data to be saved.
* @param dir Directory to save the data.
*
* @deprecated Should use [[org.apache.spark.rdd.RDD#saveAsTextFile]] for saving and
* [[org.apache.spark.mllib.util.MLUtils#loadLabeledPoints]] for loading.
*/
@Experimental
@deprecated("Should use RDD[LabeledPoint].saveAsTextFile instead.", "1.0.1")
def saveLabeledData(data: RDD[LabeledPoint], dir: String) {
val dataStr = data.map(x => x.label + "," + x.features.toArray.mkString(" "))
dataStr.saveAsTextFile(dir)
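Taken together, the deprecated saveLabeledData/loadLabeledData pair gives way to plain RDD#saveAsTextFile plus the new loadLabeledPoints. A usage sketch under assumed local settings; the master, app name, and /tmp path are placeholders:

import org.apache.spark.SparkContext
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.rdd.RDD

val sc = new SparkContext("local", "labeled-io-example")

val points: RDD[LabeledPoint] = sc.parallelize(Seq(
  LabeledPoint(1.0, Vectors.dense(1.0, 0.0, 3.0)),
  LabeledPoint(0.0, Vectors.sparse(3, Array(1), Array(2.5)))))

// Each point is written on its own line in the "(label,features)" text format.
points.saveAsTextFile("/tmp/labeled-points")

// Read the text back as LabeledPoints with the default number of partitions.
val loaded: RDD[LabeledPoint] = MLUtils.loadLabeledPoints(sc, "/tmp/labeled-points")

sc.stop()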