Merge branch 'decisiontree-bugfix' into decisiontree-python-new

jkbradley · jkbradley · commit 8a758dbb18ed · 2014-07-30T16:08:48.000-07:00
diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/DecisionTreeRunner.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/DecisionTreeRunner.scala
@@ -21,7 +21,6 @@ import scopt.OptionParser
 
 import org.apache.spark.{SparkConf, SparkContext}
 import org.apache.spark.SparkContext._
-import org.apache.spark.mllib.linalg.Vector
 import org.apache.spark.mllib.regression.LabeledPoint
 import org.apache.spark.mllib.tree.{DecisionTree, impurity}
 import org.apache.spark.mllib.tree.configuration.{Algo, Strategy}
@@ -36,6 +35,9 @@ import org.apache.spark.rdd.RDD
  * ./bin/spark-example org.apache.spark.examples.mllib.DecisionTreeRunner [options]
  * }}}
  * If you use it as a template to create your own app, please use `spark-submit` to submit your app.
+ *
+ * Note: This script treats all features as real-valued (not categorical).
+ *       To include categorical features, modify categoricalFeaturesInfo.
  */
 object DecisionTreeRunner {
 
@@ -48,11 +50,13 @@ object DecisionTreeRunner {
 
   case class Params(
       input: String = null,
+      dataFormat: String = null,
       algo: Algo = Classification,
       numClassesForClassification: Int = 2,
-      maxDepth: Int = 5,
+      maxDepth: Int = 4,
       impurity: ImpurityType = Gini,
-      maxBins: Int = 100)
+      maxBins: Int = 100,
+      fracTest: Double = 0.2)
 
   def main(args: Array[String]) {
     val defaultParams = Params()
@@ -69,25 +73,32 @@ object DecisionTreeRunner {
       opt[Int]("maxDepth")
         .text(s"max depth of the tree, default: ${defaultParams.maxDepth}")
         .action((x, c) => c.copy(maxDepth = x))
-      opt[Int]("numClassesForClassification")
-        .text(s"number of classes for classification, "
-          + s"default: ${defaultParams.numClassesForClassification}")
-        .action((x, c) => c.copy(numClassesForClassification = x))
       opt[Int]("maxBins")
         .text(s"max number of bins, default: ${defaultParams.maxBins}")
         .action((x, c) => c.copy(maxBins = x))
+      opt[Double]("fracTest")
+        .text(s"fraction of data to hold out for testing, default: ${defaultParams.fracTest}")
+        .action((x, c) => c.copy(fracTest = x))
       arg[String]("<input>")
         .text("input paths to labeled examples in dense format (label,f0 f1 f2 ...)")
         .required()
         .action((x, c) => c.copy(input = x))
+      arg[String]("<dataFormat>")
+        .text("data format: dense/libsvm")
+        .required()
+        .action((x, c) => c.copy(dataFormat = x))
       checkConfig { params =>
-        if (params.algo == Classification &&
-            (params.impurity == Gini || params.impurity == Entropy)) {
-          success
-        } else if (params.algo == Regression && params.impurity == Variance) {
-          success
+        if (params.fracTest < 0 || params.fracTest > 1) {
+          failure(s"fracTest ${params.fracTest} value incorrect; should be in [0,1].")
         } else {
-          failure(s"Algo ${params.algo} is not compatible with impurity ${params.impurity}.")
+          if (params.algo == Classification &&
+            (params.impurity == Gini || params.impurity == Entropy)) {
+            success
+          } else if (params.algo == Regression && params.impurity == Variance) {
+            success
+          } else {
+            failure(s"Algo ${params.algo} is not compatible with impurity ${params.impurity}.")
+          }
         }
       }
     }
@@ -100,16 +111,57 @@ object DecisionTreeRunner {
   }
 
   def run(params: Params) {
+
     val conf = new SparkConf().setAppName("DecisionTreeRunner")
     val sc = new SparkContext(conf)
 
     // Load training data and cache it.
-    val examples = MLUtils.loadLabeledPoints(sc, params.input).cache()
+    val origExamples = params.dataFormat match {
+      case "dense" => MLUtils.loadLabeledPoints(sc, params.input).cache()
+      case "libsvm" => MLUtils.loadLibSVMFile(sc, params.input, multiclass = true).cache()
+    }
+    // For classification, re-index classes if needed.
+    val (examples, numClasses) = params.algo match {
+      case Classification => {
+        // classCounts: class --> # examples in class
+        val classCounts = origExamples.map(_.label).countByValue
+        val numClasses = classCounts.size
+        // classIndex: class --> index in 0,...,numClasses-1
+        val classIndex = {
+          if (classCounts.keySet != Set[Double](0.0, 1.0)) {
+            classCounts.keys.toList.sorted.zipWithIndex.toMap
+          } else {
+            Map[Double, Int]()
+          }
+        }
+        val examples = {
+          if (classIndex.isEmpty) {
+            origExamples
+          } else {
+            origExamples.map(lp => LabeledPoint(classIndex(lp.label), lp.features))
+          }
+        }
+        println(s"numClasses = $numClasses.")
+        println(s"Per-class example fractions, counts:")
+        println(s"Class\tFrac\tCount")
+        classCounts.keys.toList.sorted.foreach(c => {
+          val frac = classCounts(c) / (0.0 + examples.count())
+          println(s"$c\t$frac\t${classCounts(c)}")
+        })
+        (examples, numClasses)
+      }
+      case Regression => {
+        (origExamples, 0)
+      }
+      case _ => {
+        throw new IllegalArgumentException("Algo ${params.algo} not supported.")
+      }
+    }
 
-    val splits = examples.randomSplit(Array(0.8, 0.2))
+    // Split into training, test.
+    val splits = examples.randomSplit(Array(1.0 - params.fracTest, params.fracTest))
     val training = splits(0).cache()
     val test = splits(1).cache()
-
     val numTraining = training.count()
     val numTest = test.count()
 
@@ -129,17 +181,19 @@ object DecisionTreeRunner {
           impurity = impurityCalculator,
           maxDepth = params.maxDepth,
           maxBins = params.maxBins,
-          numClassesForClassification = params.numClassesForClassification)
+          numClassesForClassification = numClasses)
     val model = DecisionTree.train(training, strategy)
 
+    println(model)
+
     if (params.algo == Classification) {
       val accuracy = accuracyScore(model, test)
-      println(s"Test accuracy = $accuracy.")
+      println(s"Test accuracy = $accuracy")
     }
 
     if (params.algo == Regression) {
       val mse = meanSquaredError(model, test)
-      println(s"Test mean squared error = $mse.")
+      println(s"Test mean squared error = $mse")
     }
 
     sc.stop()
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala
@@ -598,9 +598,12 @@ object DecisionTree extends Serializable with Logging {
      // Find feature bins for all nodes at a level.
     val binMappedRDD = input.map(x => findBinsForLevel(x))
 
-    def updateBinForOrderedFeature(arr: Array[Double], agg: Array[Double], nodeIndex: Int,
-        label: Double, featureIndex: Int) = {
-
+    def updateBinForOrderedFeature(
+        arr: Array[Double],
+        agg: Array[Double],
+        nodeIndex: Int,
+        label: Double,
+        featureIndex: Int) = {
       // Find the bin index for this feature.
       val arrShift = 1 + numFeatures * nodeIndex
       val arrIndex = arrShift + featureIndex
@@ -612,27 +615,31 @@ object DecisionTree extends Serializable with Logging {
       agg(aggIndex + labelInt) = agg(aggIndex + labelInt) + 1
     }
 
-    def updateBinForUnorderedFeature(nodeIndex: Int, featureIndex: Int, arr: Array[Double],
-        label: Double, agg: Array[Double], rightChildShift: Int) = {
+    def updateBinForUnorderedFeature(
+        nodeIndex: Int,
+        featureIndex: Int,
+        arr: Array[Double],
+        label: Double,
+        agg: Array[Double],
+        rightChildShift: Int) = {
       // Find the bin index for this feature.
-      val arrShift = 1 + numFeatures * nodeIndex
-      val arrIndex = arrShift + featureIndex
+      val arrIndex = 1 + numFeatures * nodeIndex + featureIndex
+      val featureValue = arr(arrIndex).toInt
       // Update the left or right count for one bin.
-      val aggShift = numClasses * numBins * numFeatures * nodeIndex
-      val aggIndex
-        = aggShift + numClasses * featureIndex * numBins + arr(arrIndex).toInt * numClasses
+      val aggShift =
+        numClasses * numBins * numFeatures * nodeIndex +
+        numClasses * numBins * featureIndex +
+        label.toInt
       // Find all matching bins and increment their values
       val featureCategories = strategy.categoricalFeaturesInfo(featureIndex)
       val numCategoricalBins = math.pow(2.0, featureCategories - 1).toInt - 1
       var binIndex = 0
       while (binIndex < numCategoricalBins) {
-        val labelInt = label.toInt
-        if (bins(featureIndex)(binIndex).highSplit.categories.contains(labelInt)) {
-          agg(aggIndex + binIndex)
-            = agg(aggIndex + binIndex) + 1
+        val aggIndex = aggShift + binIndex * numClasses
+        if (bins(featureIndex)(binIndex).highSplit.categories.contains(featureValue)) {
+          agg(aggIndex) += 1
         } else {
-          agg(rightChildShift + aggIndex + binIndex)
-            = agg(rightChildShift + aggIndex + binIndex) + 1
+          agg(rightChildShift + aggIndex) += 1
         }
         binIndex += 1
       }
@@ -815,20 +822,10 @@ object DecisionTree extends Serializable with Logging {
         topImpurity: Double): InformationGainStats = {
       strategy.algo match {
         case Classification =>
-          var classIndex = 0
-          val leftCounts: Array[Double] = new Array[Double](numClasses)
-          val rightCounts: Array[Double] = new Array[Double](numClasses)
-          var leftTotalCount = 0.0
-          var rightTotalCount = 0.0
-          while (classIndex < numClasses) {
-            val leftClassCount = leftNodeAgg(featureIndex)(splitIndex)(classIndex)
-            val rightClassCount = rightNodeAgg(featureIndex)(splitIndex)(classIndex)
-            leftCounts(classIndex) = leftClassCount
-            leftTotalCount += leftClassCount
-            rightCounts(classIndex) = rightClassCount
-            rightTotalCount += rightClassCount
-            classIndex += 1
-          }
+          val leftCounts: Array[Double] = leftNodeAgg(featureIndex)(splitIndex)
+          val rightCounts: Array[Double] = rightNodeAgg(featureIndex)(splitIndex)
+          var leftTotalCount = leftCounts.sum
+          var rightTotalCount = rightCounts.sum
 
           val impurity = {
             if (level > 0) {
@@ -845,33 +842,15 @@ object DecisionTree extends Serializable with Logging {
             }
           }
 
-          if (leftTotalCount == 0) {
-            return new InformationGainStats(0, topImpurity, topImpurity, Double.MinValue, 1)
-          }
-          if (rightTotalCount == 0) {
-            return new InformationGainStats(0, topImpurity, Double.MinValue, topImpurity, 1)
-          }
-
-          val leftImpurity = strategy.impurity.calculate(leftCounts, leftTotalCount)
-          val rightImpurity = strategy.impurity.calculate(rightCounts, rightTotalCount)
-
-          val leftWeight = leftTotalCount / (leftTotalCount + rightTotalCount)
-          val rightWeight = rightTotalCount / (leftTotalCount + rightTotalCount)
-
-          val gain = {
-            if (level > 0) {
-              impurity - leftWeight * leftImpurity - rightWeight * rightImpurity
-            } else {
-              impurity - leftWeight * leftImpurity - rightWeight * rightImpurity
-            }
-          }
-
           val totalCount = leftTotalCount + rightTotalCount
+          if (totalCount == 0) {
+            // Return arbitrary prediction.
+            return new InformationGainStats(0, topImpurity, topImpurity, topImpurity, 0)
+          }
 
           // Sum of count for each label
-          val leftRightCounts: Array[Double]
-            = leftCounts.zip(rightCounts)
-              .map{case (leftCount, rightCount) => leftCount + rightCount}
+          val leftRightCounts: Array[Double] = leftCounts.zip(rightCounts).map {
+              case (leftCount, rightCount) => leftCount + rightCount }
 
           def indexOfLargestArrayElement(array: Array[Double]): Int = {
             val result = array.foldLeft(-1, Double.MinValue, 0) {
@@ -885,6 +864,22 @@ object DecisionTree extends Serializable with Logging {
           val predict = indexOfLargestArrayElement(leftRightCounts)
           val prob = leftRightCounts(predict) / totalCount
 
+          val leftImpurity = if (leftTotalCount == 0) {
+            topImpurity
+          } else {
+            strategy.impurity.calculate(leftCounts, leftTotalCount)
+          }
+          val rightImpurity = if (rightTotalCount == 0) {
+            topImpurity
+          } else {
+            strategy.impurity.calculate(rightCounts, rightTotalCount)
+          }
+
+          val leftWeight = leftTotalCount / totalCount
+          val rightWeight = rightTotalCount / totalCount
+
+          val gain = impurity - leftWeight * leftImpurity - rightWeight * rightImpurity
+
           new InformationGainStats(gain, impurity, leftImpurity, rightImpurity, predict, prob)
         case Regression =>
           val leftCount = leftNodeAgg(featureIndex)(splitIndex)(0)
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/impurity/Entropy.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/impurity/Entropy.scala
@@ -34,10 +34,13 @@ object Entropy extends Impurity {
    * information calculation for multiclass classification
    * @param counts Array[Double] with counts for each label
    * @param totalCount sum of counts for all labels
-   * @return information value
+   * @return information value, or 0 if totalCount = 0
    */
   @DeveloperApi
   override def calculate(counts: Array[Double], totalCount: Double): Double = {
+    if (totalCount == 0) {
+      return 0
+    }
     val numClasses = counts.length
     var impurity = 0.0
     var classIndex = 0
@@ -58,6 +61,7 @@ object Entropy extends Impurity {
    * @param count number of instances
    * @param sum sum of labels
    * @param sumSquares summation of squares of the labels
+   * @return information value, or 0 if count = 0
    */
   @DeveloperApi
   override def calculate(count: Double, sum: Double, sumSquares: Double): Double =
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/impurity/Gini.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/impurity/Gini.scala
@@ -33,10 +33,13 @@ object Gini extends Impurity {
    * information calculation for multiclass classification
    * @param counts Array[Double] with counts for each label
    * @param totalCount sum of counts for all labels
-   * @return information value
+   * @return information value, or 0 if totalCount = 0
    */
   @DeveloperApi
   override def calculate(counts: Array[Double], totalCount: Double): Double = {
+    if (totalCount == 0) {
+      return 0
+    }
     val numClasses = counts.length
     var impurity = 1.0
     var classIndex = 0
@@ -54,6 +57,7 @@ object Gini extends Impurity {
    * @param count number of instances
    * @param sum sum of labels
    * @param sumSquares summation of squares of the labels
+   * @return information value, or 0 if count = 0
    */
   @DeveloperApi
   override def calculate(count: Double, sum: Double, sumSquares: Double): Double =
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/impurity/Impurity.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/impurity/Impurity.scala
@@ -31,7 +31,7 @@ trait Impurity extends Serializable {
    * information calculation for multiclass classification
    * @param counts Array[Double] with counts for each label
    * @param totalCount sum of counts for all labels
-   * @return information value
+   * @return information value, or 0 if totalCount = 0
    */
   @DeveloperApi
   def calculate(counts: Array[Double], totalCount: Double): Double
@@ -42,7 +42,7 @@ trait Impurity extends Serializable {
    * @param count number of instances
    * @param sum sum of labels
    * @param sumSquares summation of squares of the labels
-   * @return information value
+   * @return information value, or 0 if count = 0
    */
   @DeveloperApi
   def calculate(count: Double, sum: Double, sumSquares: Double): Double
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/impurity/Variance.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/impurity/Variance.scala
@@ -31,7 +31,7 @@ object Variance extends Impurity {
    * information calculation for multiclass classification
    * @param counts Array[Double] with counts for each label
    * @param totalCount sum of counts for all labels
-   * @return information value
+   * @return information value, or 0 if totalCount = 0
    */
   @DeveloperApi
   override def calculate(counts: Array[Double], totalCount: Double): Double =
@@ -43,9 +43,13 @@ object Variance extends Impurity {
    * @param count number of instances
    * @param sum sum of labels
    * @param sumSquares summation of squares of the labels
+   * @return information value, or 0 if count = 0
    */
   @DeveloperApi
   override def calculate(count: Double, sum: Double, sumSquares: Double): Double = {
+    if (count == 0) {
+      return 0
+    }
     val squaredLoss = sumSquares - (sum * sum) / count
     squaredLoss / count
   }
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/tree/DecisionTreeSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/tree/DecisionTreeSuite.scala