 
 package org.apache.spark.examples.mllib
 
-import org.apache.spark.{Logging, SparkConf, SparkContext}
+import scopt.OptionParser
+
+import org.apache.spark.{SparkConf, SparkContext}
 import org.apache.spark.SparkContext._
-import org.apache.spark.mllib.linalg.{Vector, Vectors}
+import org.apache.spark.mllib.linalg.Vector
 import org.apache.spark.mllib.regression.LabeledPoint
-import org.apache.spark.mllib.tree.DecisionTree
+import org.apache.spark.mllib.tree.{DecisionTree, impurity}
 import org.apache.spark.mllib.tree.configuration._
 import org.apache.spark.mllib.tree.configuration.Algo._
-import org.apache.spark.mllib.tree.impurity._
 import org.apache.spark.mllib.tree.model.DecisionTreeModel
+import org.apache.spark.mllib.util.MLUtils
 import org.apache.spark.rdd.RDD
 
 /**
@@ -35,124 +37,118 @@ import org.apache.spark.rdd.RDD
  * }}}
  * If you use it as a template to create your own app, please use `spark-submit` to submit your app.
  */
-object DecisionTreeRunner extends Logging {
+object DecisionTreeRunner {
+
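+  /** Impurity measures accepted on the command line. */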
+  object ImpurityType extends Enumeration {
+    type ImpurityType = Value
+    val Gini, Entropy, Variance = Value
+  }
 
-  private val usage =
-    """
-      |Usage: DecisionTreeRunner --algo <Classification, Regression> --trainDataDir path
-      |  --testDataDir path --maxDepth num [--impurity <Gini,Entropy,Variance>] [--maxBins num]
-    """.stripMargin
+  import ImpurityType._
+
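+  /** Parameters for the example, with defaults that command-line flags may override. */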
+  case class Params(
+      input: String = null,
+      algo: Algo = Classification,
+      maxDepth: Int = 5,
+      impurity: ImpurityType = Gini,
+      maxBins: Int = 20)
 
   def main(args: Array[String]) {
+    val defaultParams = Params()
+
+    val parser = new OptionParser[Params]("DecisionTreeRunner") {
+      head("DecisionTreeRunner: an example decision tree app.")
+      opt[String]("algo")
+        .text(s"algorithm (${Algo.values.mkString(",")}), default: ${defaultParams.algo}")
+        .action((x, c) => c.copy(algo = Algo.withName(x)))
+      opt[String]("impurity")
+        .text(s"impurity type (${ImpurityType.values.mkString(",")}), " +
+          s"default: ${defaultParams.impurity}")
+        .action((x, c) => c.copy(impurity = ImpurityType.withName(x)))
+      opt[Int]("maxDepth")
+        .text(s"max depth of the tree, default: ${defaultParams.maxDepth}")
+        .action((x, c) => c.copy(maxDepth = x))
+      opt[Int]("maxBins")
+        .text(s"max number of bins, default: ${defaultParams.maxBins}")
+        .action((x, c) => c.copy(maxBins = x))
+      arg[String]("<input>")
+        .text("input paths to labeled examples in dense format (label,f0 f1 f2 ...)")
+        .required()
+        .action((x, c) => c.copy(input = x))
+      checkConfig { params =>
+        if (params.algo == Classification &&
+            (params.impurity == Gini || params.impurity == Entropy)) {
+          success
+        } else if (params.algo == Regression && params.impurity == Variance) {
+          success
+        } else {
+          failure(s"Algo ${params.algo} is not compatible with impurity ${params.impurity}.")
+        }
+      }
+    }
 
-    if (args.length < 2) {
-      System.err.println(usage)
-      System.exit(1)
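+    // Run with the parsed parameters, or exit with a nonzero status if parsing fails.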
+    parser.parse(args, defaultParams).map { params =>
+      run(params)
+    }.getOrElse {
+      sys.exit(1)
     }
+  }
 
+  def run(params: Params) {
     val conf = new SparkConf().setAppName("DecisionTreeRunner")
     val sc = new SparkContext(conf)
 
-    val argList = args.toList
-    type OptionMap = Map[Symbol, Any]
-
-    def nextOption(map: OptionMap, list: List[String]): OptionMap = {
-      list match {
-        case Nil => map
-        case "--algo" :: string :: tail => nextOption(map ++ Map('algo -> string), tail)
-        case "--impurity" :: string :: tail => nextOption(map ++ Map('impurity -> string), tail)
-        case "--maxDepth" :: string :: tail => nextOption(map ++ Map('maxDepth -> string), tail)
-        case "--maxBins" :: string :: tail => nextOption(map ++ Map('maxBins -> string), tail)
-        case "--trainDataDir" :: string :: tail => nextOption(map ++ Map('trainDataDir -> string)
-          , tail)
-        case "--testDataDir" :: string :: tail => nextOption(map ++ Map('testDataDir -> string),
-          tail)
-        case string :: Nil => nextOption(map ++ Map('infile -> string), list.tail)
-        case option :: tail => logError("Unknown option " + option)
-          sys.exit(1)
-      }
-    }
-    val options = nextOption(Map(), argList)
-    logDebug(options.toString())
+    // Load training data and cache it.
+    val examples = MLUtils.loadLabeledData(sc, params.input).cache()
 
-    // Load training data.
-    val trainData = loadLabeledData(sc, options.get('trainDataDir).get.toString)
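+    // Randomly split the examples into 80% for training and 20% for testing.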
+    val splits = examples.randomSplit(Array(0.8, 0.2))
+    val training = splits(0).cache()
+    val test = splits(1).cache()
 
-    // Identify the type of algorithm.
-    val algoStr = options.get('algo).get.toString
-    val algo = algoStr match {
-      case "Classification" => Classification
-      case "Regression" => Regression
-    }
+    val numTraining = training.count()
+    val numTest = test.count()
 
-    // Identify the type of impurity.
-    val impurityStr = options.getOrElse('impurity,
-      if (algo == Classification) "Gini" else "Variance").toString
-    val impurity = impurityStr match {
-      case "Gini" => Gini
-      case "Entropy" => Entropy
-      case "Variance" => Variance
-    }
+    println(s"numTraining = $numTraining, numTest = $numTest.")
 
-    val maxDepth = options.getOrElse('maxDepth, "1").toString.toInt
-    val maxBins = options.getOrElse('maxBins, "100").toString.toInt
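+    // The training and test splits are cached, so the original RDD can be unpersisted.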
+    examples.unpersist(blocking = false)
 
-    val strategy = new Strategy(algo, impurity, maxDepth, maxBins)
-    val model = DecisionTree.train(trainData, strategy)
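+    // Map the parsed impurity type to the corresponding MLlib impurity object.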
+    val impurityCalculator = params.impurity match {
+      case Gini => impurity.Gini
+      case Entropy => impurity.Entropy
+      case Variance => impurity.Variance
+    }
 
-    // Load test data.
-    val testData = loadLabeledData(sc, options.get('testDataDir).get.toString)
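+    // Configure the tree strategy and train the model on the training split.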
+    val strategy = new Strategy(params.algo, impurityCalculator, params.maxDepth, params.maxBins)
+    val model = DecisionTree.train(training, strategy)
 
-    // Measure algorithm accuracy
-    if (algo == Classification) {
-      val accuracy = accuracyScore(model, testData)
-      logDebug("accuracy = " + accuracy)
+    if (params.algo == Classification) {
+      val accuracy = accuracyScore(model, test)
+      println(s"Test accuracy = $accuracy.")
     }
 
-    if (algo == Regression) {
-      val mse = meanSquaredError(model, testData)
-      logDebug("mean square error = " + mse)
+    if (params.algo == Regression) {
+      val mse = meanSquaredError(model, test)
+      println(s"Test mean squared error = $mse.")
     }
 
     sc.stop()
   }
 
-  /**
-   * Load labeled data from a file. The data format used here is
-   * <L>, <f1> <f2> ...,
-   * where <f1>, <f2> are feature values in Double and <L> is the corresponding label as Double.
-   *
-   * @param sc SparkContext
-   * @param dir Directory to the input data files.
-   * @return An RDD of LabeledPoint. Each labeled point has two elements: the first element is
-   *         the label, and the second element represents the feature values (an array of Double).
-   */
-  private def loadLabeledData(sc: SparkContext, dir: String): RDD[LabeledPoint] = {
-    sc.textFile(dir).map { line =>
-      val parts = line.trim().split(",")
-      val label = parts(0).toDouble
-      val features = Vectors.dense(parts.slice(1, parts.length).map(_.toDouble))
-      LabeledPoint(label, features)
-    }
-  }
-
-  // TODO: Port this method to a generic metrics package.
   /**
    * Calculates the classifier accuracy.
    */
-  private def accuracyScore(model: DecisionTreeModel, data: RDD[LabeledPoint],
-      threshold: Double = 0.5): Double = {
-    def predictedValue(features: Vector) = {
+  private def accuracyScore(
+      model: DecisionTreeModel,
+      data: RDD[LabeledPoint],
+      threshold: Double = 0.5): Double = {
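+    // Binarize the raw model prediction at the given threshold before comparing with the label.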
+    def predictedValue(features: Vector): Double = {
       if (model.predict(features) < threshold) 0.0 else 1.0
     }
     val correctCount = data.filter(y => predictedValue(y.features) == y.label).count()
     val count = data.count()
-    logDebug("correct prediction count = " + correctCount)
-    logDebug("data count = " + count)
     correctCount.toDouble / count
   }
 
-  // TODO: Port this method to a generic metrics package
   /**
    * Calculates the mean squared error for regression.
    */