jkbradley · feynmanliang · Nov 7, 2014 · Nov 7, 2014 · Nov 7, 2014 · Nov 30, 2014
diff --git a/core/src/main/scala/org/apache/spark/util/collection/BitSet.scala b/core/src/main/scala/org/apache/spark/util/collection/BitSet.scala
@@ -219,4 +219,37 @@ class BitSet(numBits: Int) extends Serializable {
 
   /** Return the number of longs it would take to hold numBits. */
   private def bit2words(numBits: Int) = ((numBits - 1) >> 6) + 1
+
+  /**
+   * Bit-wise OR between two BitSets where the ith bit of other is ORed against the i+offset bit
+   * of this instance. For performance, the OR is computed word-by-word rather than bit-by-bit.
+   * Bits of `other` that would land past the last word of this instance are dropped.
+   *
+   * This function mutates the current BitSet instance (i.e. not `other`).
+   *
+   * @param other the BitSet whose bits are ORed into this one; it is only read.
+   * @param offset the amount to left-shift (with zero padding) `other` before performing the OR,
+   *               must be >= 0.
+   */
+  private[spark] def orWithOffset(other: BitSet, offset: Int): Unit = {
+    require(offset >= 0, s"offset must be >= 0 but was $offset")
+    val wordOffset = offset >> 6 // divide by 64
+    val bitOffset = offset & 0x3f // mod 64
+    // Only words of `other` whose shifted position still falls inside this.words are visited.
+    val numWords = math.min(other.words.length, words.length - wordOffset)
+
+    var wordIndex = 0
+    while (wordIndex < numWords) {
+      val otherWord = other.words(wordIndex)
+      val target = wordIndex + wordOffset
+      // The low (64 - bitOffset) bits of otherWord move up within the target word.
+      words(target) |= otherWord << bitOffset
+      // The high bitOffset bits carry into the low end of the following word, if there is one.
+      // The shift must be unsigned (>>>) so that a set bit 63 is not sign-extended.
+      if (bitOffset > 0 && target + 1 < words.length) {
+        words(target + 1) |= otherWord >>> (64 - bitOffset)
+      }
+      wordIndex += 1
+    }
+  }
 }
diff --git a/core/src/test/scala/org/apache/spark/util/collection/BitSetSuite.scala b/core/src/test/scala/org/apache/spark/util/collection/BitSetSuite.scala
@@ -41,6 +41,29 @@ class BitSetSuite extends SparkFunSuite {
     assert(bitset.cardinality() === setBits.size)
   }
 
+  test("orWithOffset") {
+    val numBits = 100
+    val setBits = Seq(0, 9, 1, 10, 90, 96)
+    val source = new BitSet(numBits)
+    setBits.foreach(i => source.set(i))
+
+    // Offsets exercise no shift, shifts inside a word, a whole-word shift and word + bit shift.
+    Seq(0, 1, 63, 64, 65).foreach { offset =>
+      val shifted = new BitSet(numBits)
+      shifted.orWithOffset(source, offset)
+
+      // Positions below the offset are zero padding and must stay clear.
+      (0 until offset).foreach { i =>
+        assert(!shifted.get(i))
+      }
+
+      // Every remaining position mirrors the source bit `offset` places lower.
+      (offset until numBits).foreach { i =>
+        assert(shifted.get(i) === setBits.contains(i - offset))
+      }
+    }
+  }
+
   test("100% full bit set") {
     val bitset = new BitSet(10000)
     for (i <- 0 until 10000) {

diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeExample.scala
@@ -60,6 +60,7 @@ object DecisionTreeExample {
       testInput: String = "",
       dataFormat: String = "libsvm",
       algo: String = "Classification",
+      algorithm: String = "byRow",
       maxDepth: Int = 5,
       maxBins: Int = 32,
       minInstancesPerNode: Int = 1,
@@ -77,6 +78,9 @@ object DecisionTreeExample {
       opt[String]("algo")
         .text(s"algorithm (classification, regression), default: ${defaultParams.algo}")
         .action((x, c) => c.copy(algo = x))
+      opt[String]("algorithm")
+        .text(s"algorithm (byRow, byCol), default: ${defaultParams.algorithm}")
+        .action((x, c) => c.copy(algorithm = x))
       opt[Int]("maxDepth")
         .text(s"max depth of the tree, default: ${defaultParams.maxDepth}")
         .action((x, c) => c.copy(maxDepth = x))
@@ -236,33 +240,37 @@ object DecisionTreeExample {
     }
     // (2) Identify categorical features using VectorIndexer.
     //     Features with more than maxCategories values will be treated as continuous.
+    /*
     val featuresIndexer = new VectorIndexer()
       .setInputCol("features")
       .setOutputCol("indexedFeatures")
       .setMaxCategories(10)
     stages += featuresIndexer
+    */
     // (3) Learn Decision Tree
     val dt = algo match {
       case "classification" =>
         new DecisionTreeClassifier()
-          .setFeaturesCol("indexedFeatures")
+          .setFeaturesCol("features") // indexedFeatures
           .setLabelCol(labelColName)
           .setMaxDepth(params.maxDepth)
           .setMaxBins(params.maxBins)
           .setMinInstancesPerNode(params.minInstancesPerNode)
           .setMinInfoGain(params.minInfoGain)
           .setCacheNodeIds(params.cacheNodeIds)
           .setCheckpointInterval(params.checkpointInterval)
+          .setAlgorithm(params.algorithm)
       case "regression" =>
         new DecisionTreeRegressor()
-          .setFeaturesCol("indexedFeatures")
+          .setFeaturesCol("features") // indexedFeatures
           .setLabelCol(labelColName)
           .setMaxDepth(params.maxDepth)
           .setMaxBins(params.maxBins)
           .setMinInstancesPerNode(params.minInstancesPerNode)
           .setMinInfoGain(params.minInfoGain)
           .setCacheNodeIds(params.cacheNodeIds)
           .setCheckpointInterval(params.checkpointInterval)
+          .setAlgorithm(params.algorithm)
       case _ => throw new IllegalArgumentException("Algo ${params.algo} not supported.")
     }
     stages += dt
@@ -278,14 +286,14 @@ object DecisionTreeExample {
     algo match {
       case "classification" =>
         val treeModel = pipelineModel.stages.last.asInstanceOf[DecisionTreeClassificationModel]
-        if (treeModel.numNodes < 20) {
+        if (treeModel.numNodes < 200) {
           println(treeModel.toDebugString) // Print full model.
         } else {
           println(treeModel) // Print model summary.
         }
       case "regression" =>
         val treeModel = pipelineModel.stages.last.asInstanceOf[DecisionTreeRegressionModel]
-        if (treeModel.numNodes < 20) {
+        if (treeModel.numNodes < 200) {
           println(treeModel.toDebugString) // Print full model.
         } else {
           println(treeModel) // Print model summary.

diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala
@@ -18,9 +18,9 @@
 package org.apache.spark.ml.classification
 
 import org.apache.spark.annotation.Experimental
-import org.apache.spark.ml.param.ParamMap
+import org.apache.spark.ml.param.{Param, ParamMap}
 import org.apache.spark.ml.tree.{DecisionTreeModel, DecisionTreeParams, Node, TreeClassifierParams}
-import org.apache.spark.ml.tree.impl.RandomForest
+import org.apache.spark.ml.tree.impl.{AltDT, RandomForest}
 import org.apache.spark.ml.util.{Identifiable, MetadataUtils}
 import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors}
 import org.apache.spark.mllib.regression.LabeledPoint
@@ -62,6 +62,25 @@ final class DecisionTreeClassifier(override val uid: String)
 
   override def setImpurity(value: String): this.type = super.setImpurity(value)
 
+  /**
+   * Algorithm used for learning: "byRow" runs `RandomForest.run` with a single tree and
+   * "byCol" runs `AltDT.train`. Supported names are case sensitive.
+   * (default = "byRow")
+   * @group param
+   */
+  val algorithm: Param[String] = new Param[String](this, "algorithm", "Algorithm used " +
+    "for learning. Supported options:" +
+    s" ${DecisionTreeClassifier.supportedAlgorithms.mkString(", ")}",
+    (value: String) => DecisionTreeClassifier.supportedAlgorithms.contains(value))
+
+  setDefault(algorithm -> "byRow")
+
+  /** @group setParam */
+  def setAlgorithm(value: String): this.type = set(algorithm, value)
+
+  /** @group getParam */
+  def getAlgorithm: String = $(algorithm)
+
   override protected def train(dataset: DataFrame): DecisionTreeClassificationModel = {
     val categoricalFeatures: Map[Int, Int] =
       MetadataUtils.getCategoricalFeatures(dataset.schema($(featuresCol)))
@@ -74,9 +93,15 @@ final class DecisionTreeClassifier(override val uid: String)
     }
     val oldDataset: RDD[LabeledPoint] = extractLabeledPoints(dataset)
     val strategy = getOldStrategy(categoricalFeatures, numClasses)
-    val trees = RandomForest.run(oldDataset, strategy, numTrees = 1, featureSubsetStrategy = "all",
-      seed = 0L, parentUID = Some(uid))
-    trees.head.asInstanceOf[DecisionTreeClassificationModel]
+    val model = getAlgorithm match {
+      case "byRow" =>
+        val trees = RandomForest.run(oldDataset, strategy, numTrees = 1,
+          featureSubsetStrategy = "all", seed = 0L, parentUID = Some(uid))
+        trees.head
+      case "byCol" =>
+        AltDT.train(oldDataset, strategy, parentUID = Some(uid))
+    }
+    model.asInstanceOf[DecisionTreeClassificationModel]
   }
 
   /** (private[ml]) Create a Strategy instance to use with the old API. */
@@ -94,6 +119,8 @@ final class DecisionTreeClassifier(override val uid: String)
 object DecisionTreeClassifier {
   /** Accessor for supported impurities: entropy, gini */
   final val supportedImpurities: Array[String] = TreeClassifierParams.supportedImpurities
+
+  final val supportedAlgorithms: Array[String] = Array("byRow", "byCol")
 }
 
 /**

diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala
@@ -19,9 +19,9 @@ package org.apache.spark.ml.regression
 
 import org.apache.spark.annotation.Experimental
 import org.apache.spark.ml.{PredictionModel, Predictor}
-import org.apache.spark.ml.param.ParamMap
+import org.apache.spark.ml.param.{Param, ParamMap}
 import org.apache.spark.ml.tree.{DecisionTreeModel, DecisionTreeParams, Node, TreeRegressorParams}
-import org.apache.spark.ml.tree.impl.RandomForest
+import org.apache.spark.ml.tree.impl.{AltDT, RandomForest}
 import org.apache.spark.ml.util.{Identifiable, MetadataUtils}
 import org.apache.spark.mllib.linalg.Vector
 import org.apache.spark.mllib.regression.LabeledPoint
@@ -62,14 +62,39 @@ final class DecisionTreeRegressor(override val uid: String)
 
   override def setImpurity(value: String): this.type = super.setImpurity(value)
 
+  /**
+   * Algorithm used for learning: "byRow" runs `RandomForest.run` with a single tree and
+   * "byCol" runs `AltDT.train`. Supported names are case sensitive.
+   * (default = "byRow")
+   * @group param
+   */
+  val algorithm: Param[String] = new Param[String](this, "algorithm", "Algorithm used " +
+    "for learning. Supported options:" +
+    s" ${DecisionTreeRegressor.supportedAlgorithms.mkString(", ")}",
+    (value: String) => DecisionTreeRegressor.supportedAlgorithms.contains(value))
+
+  setDefault(algorithm -> "byRow")
+
+  /** @group setParam */
+  def setAlgorithm(value: String): this.type = set(algorithm, value)
+
+  /** @group getParam */
+  def getAlgorithm: String = $(algorithm)
+
   override protected def train(dataset: DataFrame): DecisionTreeRegressionModel = {
     val categoricalFeatures: Map[Int, Int] =
       MetadataUtils.getCategoricalFeatures(dataset.schema($(featuresCol)))
     val oldDataset: RDD[LabeledPoint] = extractLabeledPoints(dataset)
     val strategy = getOldStrategy(categoricalFeatures)
-    val trees = RandomForest.run(oldDataset, strategy, numTrees = 1, featureSubsetStrategy = "all",
-      seed = 0L, parentUID = Some(uid))
-    trees.head.asInstanceOf[DecisionTreeRegressionModel]
+    val model = getAlgorithm match {
+      case "byRow" =>
+        val trees = RandomForest.run(oldDataset, strategy, numTrees = 1,
+          featureSubsetStrategy = "all", seed = 0L, parentUID = Some(uid))
+        trees.head
+      case "byCol" =>
+        AltDT.train(oldDataset, strategy, parentUID = Some(uid))
+    }
+    model.asInstanceOf[DecisionTreeRegressionModel]
   }
 
   /** (private[ml]) Create a Strategy instance to use with the old API. */
@@ -85,6 +110,8 @@ final class DecisionTreeRegressor(override val uid: String)
 object DecisionTreeRegressor {
   /** Accessor for supported impurities: variance */
   final val supportedImpurities: Array[String] = TreeRegressorParams.supportedImpurities
+
+  final val supportedAlgorithms: Array[String] = Array("byRow", "byCol")
 }
 
 /**

diff --git a/mllib/src/main/scala/org/apache/spark/ml/tree/Node.scala b/mllib/src/main/scala/org/apache/spark/ml/tree/Node.scala
@@ -288,7 +288,7 @@ private[tree] object LearningNode {
       id: Int,
       isLeaf: Boolean,
       stats: ImpurityStats): LearningNode = {
-    new LearningNode(id, None, None, None, false, stats)
+    new LearningNode(id, None, None, None, isLeaf, stats)
   }
 
   /** Create an empty node with the given node index.  Values must be set later on. */

diff --git a/mllib/src/main/scala/org/apache/spark/ml/tree/Split.scala b/mllib/src/main/scala/org/apache/spark/ml/tree/Split.scala
@@ -47,6 +47,12 @@ sealed trait Split extends Serializable {
    */
   private[tree] def shouldGoLeft(binnedFeature: Int, splits: Array[Split]): Boolean
 
+  /**
+   * Return true (split to left) or false (split to right).
+   * @param feature Feature value (original value, not binned)
+   */
+  private[tree] def shouldGoLeft(feature: Double): Boolean
+
   /** Convert to old Split format */
   private[tree] def toOld: OldSplit
 }
@@ -112,6 +118,14 @@ final class CategoricalSplit private[ml] (
     }
   }
 
+  override private[tree] def shouldGoLeft(feature: Double): Boolean = {
+    // `categories` lists the values routed to the side named by `isLeft`, so a value goes
+    // left when it is listed and that side is the left one, or when it is unlisted and the
+    // listed side is the right one.
+    val listed = categories.contains(feature)
+    listed == isLeft
+  }
+
   override def equals(o: Any): Boolean = {
     o match {
       case other: CategoricalSplit => featureIndex == other.featureIndex &&
@@ -172,6 +186,10 @@ final class ContinuousSplit private[ml] (override val featureIndex: Int, val thr
     }
   }
 
+  override private[tree] def shouldGoLeft(feature: Double): Boolean = {
+    feature <= threshold // a value equal to the threshold goes left
+  }
+
   override def equals(o: Any): Boolean = {
     o match {
       case other: ContinuousSplit =>