Wrote Python API and example for DecisionTree. Also added toString, depth, and numNodes methods to DecisionTreeModel.

jkbradley · jkbradley · commit f8253520045d · 2014-07-30T14:48:41.000-07:00
diff --git a/examples/src/main/python/mllib/tree.py b/examples/src/main/python/mllib/tree.py
@@ -0,0 +1,76 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+"""
+Decision tree classification and regression using MLlib.
+"""
+
+import sys
+
+from operator import add
+
+from pyspark import SparkContext
+from pyspark.mllib.regression import LabeledPoint
+from pyspark.mllib.tree import DecisionTree
+
+
+# Convert one comma-separated line of text (label first, then features) into a LabeledPoint.
+def parsePoint(line):
+    fields = [float(tok) for tok in line.split(',')]
+    # Remap a -1 label to 0 for MLlib; every other label is kept as parsed.
+    label = 0 if fields[0] == -1 else fields[0]
+    return LabeledPoint(label, fields[1:])
+
+# Return the fraction of points in the RDD[LabeledPoint] `data` that dtModel labels correctly.
+def getAccuracy(dtModel, data):
+    seqOp = (lambda acc, x: acc + (x[0] == x[1]))  # x is a (prediction, true label) pair
+    labels = data.map(lambda p: p.label)
+    trainCorrect = dtModel.predict(data).zip(labels).aggregate(0, seqOp, add)
+    return trainCorrect / (0.0 + data.count())  # 0.0 forces floating-point division
+
+
+if __name__ == "__main__":
+    if len(sys.argv) != 1:
+        print >> sys.stderr, "Usage: tree"
+        sys.exit(-1)
+    sc = SparkContext(appName="PythonDT")
+
+    # Load data: each line of the CSV file becomes one LabeledPoint.
+    dataPath = 'data/mllib/sample_tree_data.csv'
+    points = sc.textFile(dataPath).map(parsePoint)
+
+    # Train a classifier.
+    model = DecisionTree.trainClassifier(points, numClasses=2)
+    # Print learned tree. Numbers are wrapped in str() because str + number raises TypeError.
+    print "Model numNodes: " + str(model.numNodes()) + "\n"
+    print "Model depth: " + str(model.depth()) + "\n"
+    print model
+    # Check accuracy on the training data itself.
+    print "Training accuracy: " + str(getAccuracy(model, points)) + "\n"
+
+    # Switch labels and first feature to create a regression dataset with categorical features.
+    """
+    datasetInfo = DatasetInfo(numClasses=0, numFeatures=numFeatures)
+    dtParams = DecisionTreeRegressor.defaultParams()
+    model = DecisionTreeRegressor.train(points, datasetInfo, dtParams)
+    # Print learned tree.
+    print "Model numNodes: " + model.numNodes() + "\n"
+    print "Model depth: " + model.depth() + "\n"
+    print model
+    # Check error.
+    print "Training accuracy: " + getAccuracy(model, points) + "\n"
+    """
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
@@ -19,15 +19,23 @@ package org.apache.spark.mllib.api.python
 
 import java.nio.{ByteBuffer, ByteOrder}
 
+import scala.collection.JavaConversions._
+
 import org.apache.spark.annotation.DeveloperApi
 import org.apache.spark.api.java.{JavaSparkContext, JavaRDD}
 import org.apache.spark.mllib.classification._
 import org.apache.spark.mllib.clustering._
 import org.apache.spark.mllib.linalg.{SparseVector, Vector, Vectors}
 import org.apache.spark.mllib.recommendation._
 import org.apache.spark.mllib.regression._
+import org.apache.spark.mllib.tree.configuration.Algo._
+import org.apache.spark.mllib.tree.configuration.Strategy
+import org.apache.spark.mllib.tree.DecisionTree
+import org.apache.spark.mllib.tree.impurity.{Entropy, Gini, Impurity, Variance}
+import org.apache.spark.mllib.tree.model.DecisionTreeModel
 import org.apache.spark.mllib.util.MLUtils
 import org.apache.spark.rdd.RDD
+import org.apache.spark.util.Utils
 
 /**
  * :: DeveloperApi ::
@@ -453,4 +461,75 @@ class PythonMLLibAPI extends Serializable {
     val ratings = ratingsBytesJRDD.rdd.map(unpackRating)
     ALS.trainImplicit(ratings, rank, iterations, lambda, blocks, alpha)
   }
+
+  /**
+   * Java stub for Python mllib DecisionTree.train().
+   * The returned value is a handle to the Java model object, not a copy of its content, so
+   * the Python code must take extra care that the handle gets freed on exit;
+   * see the Py4J documentation.
+   * @param dataBytesJRDD  Training data, as serialized labeled points
+   * @param algoStr  Kind of tree to train: "classification" or "regression"
+   * @param numClasses  Passed to Strategy as numClassesForClassification
+   * @param categoricalFeaturesInfoJMap  Categorical features info, as Java map
+   * @param impurityStr  Impurity measure: "gini", "entropy" or "variance"
+   * @param maxDepth  Passed to Strategy as maxDepth
+   * @param maxBins  Passed to Strategy as maxBins
+   * @return the model produced by DecisionTree.train
+   * @throws IllegalArgumentException if algoStr or impurityStr is not one of the names above
+   */
+  def trainDecisionTreeModel(
+      dataBytesJRDD: JavaRDD[Array[Byte]],
+      algoStr: String,
+      numClasses: Int,
+      categoricalFeaturesInfoJMap: java.util.Map[Int,Int],
+      impurityStr: String,
+      maxDepth: Int,
+      maxBins: Int): DecisionTreeModel = {
+    val trainingPoints = dataBytesJRDD.rdd.map(deserializeLabeledPoint)
+    val treeAlgo: Algo = algoStr match {
+      case "classification" => Classification
+      case "regression" => Regression
+      case _ => throw new IllegalArgumentException(s"Bad algoStr parameter: $algoStr")
+    }
+    val splitImpurity: Impurity = impurityStr match {
+      case "gini" => Gini
+      case "entropy" => Entropy
+      case "variance" => Variance
+      case _ => throw new IllegalArgumentException(s"Bad impurityStr parameter: $impurityStr")
+    }
+    // .toMap is reached through the implicit scala.collection.JavaConversions view of the map.
+    val treeStrategy = new Strategy(algo = treeAlgo, impurity = splitImpurity,
+      maxDepth = maxDepth, numClassesForClassification = numClasses, maxBins = maxBins,
+      categoricalFeaturesInfo = categoricalFeaturesInfoJMap.toMap)
+    DecisionTree.train(trainingPoints, treeStrategy)
+  }
+
+  /**
+   * Java stub for python DecisionTreeModel.predict() on a single data point.
+   * Deserializes the feature vector and returns the model's prediction for it.
+   *
+   * @param model  Decision tree model used to make the prediction
+   * @param featuresBytes  Serialized feature vector for data point
+   * @return predicted label
+   */
+  def predictDecisionTreeModel(
+      model: DecisionTreeModel,
+      featuresBytes: Array[Byte]): Double = {
+    model.predict(deserializeDoubleVector(featuresBytes))
+  }
+
+  /**
+   * Java stub for python DecisionTreeModel.predict() on a whole dataset: deserializes each
+   * feature vector, predicts its label, and serializes each prediction.
+   * @param model  Decision tree model used to make the predictions
+   * @param dataJRDD  A JavaRDD with serialized feature vectors
+   * @return JavaRDD of serialized predictions
+   */
+  def predictDecisionTreeModel(
+      model: DecisionTreeModel,
+      dataJRDD: JavaRDD[Array[Byte]]): JavaRDD[Array[Byte]] = {
+    val featureVectors = dataJRDD.rdd.map(deserializeDoubleVector(_))
+    model.predict(featureVectors).map(prediction => Utils.serialize(prediction))
+  }
+
 }
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/DecisionTreeModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/DecisionTreeModel.scala
@@ -50,4 +50,32 @@ class DecisionTreeModel(val topNode: Node, val algo: Algo) extends Serializable
   def predict(features: RDD[Vector]): RDD[Double] = {
     features.map(x => predict(x))
   }
+
+  /**
+   * Get number of nodes in tree, including leaf nodes.
+   * The count is taken from the top node's numNodesRecursive.
+   * @return total node count
+   */
+  def numNodes: Int = topNode.numNodesRecursive
+
+  /**
+   * Get depth of tree, as computed by the top node's depthRecursive.
+   * E.g.: Depth 0 means 1 leaf node.  Depth 1 means 1 internal node and 2 leaf nodes.
+   *
+   * @return depth of the tree
+   */
+  def depth: Int = topNode.depthRecursive
+
+  /** Return the full model as text: a header naming the model kind, then the whole tree. */
+  override def toString: String = {
+    val modelKind = algo match {
+      case Classification => "classifier"
+      case Regression => "regressor"
+      case _ => throw new IllegalArgumentException(
+        s"DecisionTreeModel given unknown algo parameter: $algo.")
+    }
+    // 2 is the starting indentation handed to the top node's recursive renderer.
+    s"DecisionTreeModel $modelKind\n" + topNode.toStringRecursive(2)
+  }
+
 }
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/Node.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/Node.scala
@@ -91,4 +91,59 @@ class Node (
       }
     }
   }
+
+  /**
+   * Count the nodes in the subtree rooted at this node, leaf nodes included.
+   * @return 1 for a leaf; otherwise 1 plus the node counts of both children
+   */
+  def numNodesRecursive: Int = {
+    // A leaf has no descendants; an internal node sums the sizes of its two subtrees.
+    val numDescendants =
+      if (isLeaf) 0 else leftNode.get.numNodesRecursive + rightNode.get.numNodesRecursive
+    1 + numDescendants
+  }
+
+  /**
+   * Depth of the subtree rooted at this node: 0 for a leaf, else 1 + the deeper child's depth.
+   */
+  def depthRecursive: Int = {
+    if (isLeaf) {
+      0
+    } else {
+      val deeperChild = math.max(leftNode.get.depthRecursive, rightNode.get.depthRecursive)
+      deeperChild + 1
+    }
+  }
+
+  /**
+   * Render the subtree rooted at this node as text (recursive).
+   * @param indentFactor  The number of spaces this node's own lines are indented by.
+   *                      Each level below this node is indented by one additional space.
+   * @return the rendered subtree; every line ends with a newline
+   */
+  def toStringRecursive(indentFactor: Int = 0): String = {
+    // Text of the condition printed before the left child (left = true) or the right child.
+    def splitToString(split: Split, left: Boolean): String = {
+      val feature = s"feature ${split.feature}"
+      split.featureType match {
+        case Continuous =>
+          val relation = if (left) "<=" else ">"
+          s"($feature $relation ${split.threshold})"
+        case Categorical =>
+          val relation = if (left) "in" else "not in"
+          s"($feature $relation ${split.categories})"
+      }
+    }
+    val indent: String = " " * indentFactor
+    if (isLeaf) {
+      s"${indent}Predict: $predict\n"
+    } else {
+      val condition = split.get
+      s"${indent}If ${splitToString(condition, left = true)}\n" +
+        leftNode.get.toStringRecursive(indentFactor + 1) +
+        s"${indent}Else ${splitToString(condition, left = false)}\n" +
+        rightNode.get.toStringRecursive(indentFactor + 1)
+    }
+  }
+
 }
diff --git a/python/pyspark/mllib/tree.py b/python/pyspark/mllib/tree.py