
Commit bee4868

DecisionTree Wrapper in SparkR

1 parent 7aeb20b · commit bee4868

File tree

R/pkg/NAMESPACE
R/pkg/R/generics.R
R/pkg/R/mllib.R
mllib/src/main/scala/org/apache/spark/ml/r/DecisionTreeClassifierWrapper.scala
mllib/src/main/scala/org/apache/spark/ml/r/DecisionTreeRegressorWrapper.scala
mllib/src/main/scala/org/apache/spark/ml/r/RWrappers.scala

6 files changed: +322 −1 lines changed

R/pkg/NAMESPACE

Lines changed: 2 additions & 1 deletion

@@ -43,7 +43,8 @@ exportMethods("glm",
               "spark.isoreg",
               "spark.gaussianMixture",
               "spark.als",
-              "spark.kstest")
+              "spark.kstest",
+              "spark.decisionTree")

 # Job group lifecycle management methods
 export("setJobGroup",

R/pkg/R/generics.R

Lines changed: 5 additions & 0 deletions

@@ -1358,6 +1358,11 @@ setGeneric("spark.perplexity", function(object, data) { standardGeneric("spark.perplexity") })
 #' @export
 setGeneric("spark.isoreg", function(data, formula, ...) { standardGeneric("spark.isoreg") })

+#' @rdname spark.decisionTree
+#' @export
+setGeneric("spark.decisionTree",
+           function(data, formula, ...) { standardGeneric("spark.decisionTree") })
+
 #' @rdname spark.gaussianMixture
 #' @export
 setGeneric("spark.gaussianMixture",

R/pkg/R/mllib.R

Lines changed: 68 additions & 0 deletions

@@ -95,6 +95,20 @@ setClass("ALSModel", representation(jobj = "jobj"))
 #' @note KSTest since 2.1.0
 setClass("KSTest", representation(jobj = "jobj"))

+#' S4 class that represents a DecisionTreeRegressionModel
+#'
+#' @param jobj a Java object reference to the backing Scala DecisionTreeRegressionModel
+#' @export
+#' @note DecisionTreeRegressionModel since 2.1.0
+setClass("DecisionTreeRegressionModel", representation(jobj = "jobj"))
+
+#' S4 class that represents a DecisionTreeClassificationModel
+#'
+#' @param jobj a Java object reference to the backing Scala DecisionTreeClassificationModel
+#' @export
+#' @note DecisionTreeClassificationModel since 2.1.0
+setClass("DecisionTreeClassificationModel", representation(jobj = "jobj"))
+
 #' Saves the MLlib model to the input path
 #'
 #' Saves the MLlib model to the input path. For more information, see the specific

@@ -897,6 +911,22 @@ setMethod("write.ml", signature(object = "GaussianMixtureModel", path = "character"),
   write_internal(object, path, overwrite)
 })

+#' Save the decision tree regression model to the input path.
+#'
+#' @param object A fitted decision tree regression model.
+#' @param path The directory where the model is saved.
+#' @param overwrite Whether to overwrite if the output path already exists. Default is FALSE,
+#'                  which means an exception is thrown if the output path exists.
+#'
+#' @aliases write.ml,DecisionTreeRegressionModel,character-method
+#' @rdname spark.decisionTree
+#' @export
+#' @note write.ml(DecisionTreeRegressionModel, character) since 2.1.0
+setMethod("write.ml", signature(object = "DecisionTreeRegressionModel", path = "character"),
+          function(object, path, overwrite = FALSE) {
+            write_internal(object, path, overwrite)
+          })
+
 #' Load a fitted MLlib model from the input path.
 #'
 #' @param path path of the model to read.

@@ -932,6 +962,8 @@ read.ml <- function(path) {
     new("GaussianMixtureModel", jobj = jobj)
   } else if (isInstanceOf(jobj, "org.apache.spark.ml.r.ALSWrapper")) {
     new("ALSModel", jobj = jobj)
+  } else if (isInstanceOf(jobj, "org.apache.spark.ml.r.DecisionTreeRegressorWrapper")) {
+    new("DecisionTreeRegressionModel", jobj = jobj)
   } else {
     stop("Unsupported model: ", jobj)
   }

@@ -1427,3 +1459,39 @@ print.summary.KSTest <- function(x, ...) {
   cat(summaryStr, "\n")
   invisible(x)
 }
+
+#' Decision tree model.
+#'
+#' Fit a decision tree regression or classification model on a SparkDataFrame.
+#'
+#' @param data SparkDataFrame for training.
+#' @param formula A symbolic description of the model to be fitted. Currently only a few formula
+#'                operators are supported, including '~', ':', '+', and '-'.
+#'                Note that operator '.' is not supported currently.
+#' @param type Type of model to fit, either "regression" or "classification".
+#' @return A fitted decision tree model.
+#' @rdname spark.decisionTree
+#' @seealso rpart: \url{https://cran.r-project.org/web/packages/rpart/}
+#' @export
+#' @examples
+#' \dontrun{
+#' df <- createDataFrame(kyphosis)
+#' model <- spark.decisionTree(df, Kyphosis ~ Age + Number + Start, type = "classification")
+#' }
+setMethod("spark.decisionTree", signature(data = "SparkDataFrame", formula = "formula"),
+          function(data, formula, type = c("regression", "classification")) {
+            # match.arg() resolves the default vector to a single validated choice
+            type <- match.arg(type)
+            formula <- paste(deparse(formula), collapse = "")
+            if (type == "regression") {
+              jobj <- callJStatic("org.apache.spark.ml.r.DecisionTreeRegressorWrapper", "fit",
+                                  data@sdf, formula)
+              new("DecisionTreeRegressionModel", jobj = jobj)
+            } else {
+              jobj <- callJStatic("org.apache.spark.ml.r.DecisionTreeClassifierWrapper", "fit",
+                                  data@sdf, formula)
+              new("DecisionTreeClassificationModel", jobj = jobj)
+            }
+          })
+
+#' @rdname spark.decisionTree
+#' @export
+setMethod("predict", signature(object = "DecisionTreeRegressionModel"),
+          function(object, newData) {
+            dataFrame(callJMethod(object@jobj, "transform", newData@sdf))
+          })
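Taken together, the R-side changes give a single entry point for fitting and scoring. A minimal usage sketch, assuming an active SparkR session; longley is a stock numeric R dataset chosen here for illustration and is not part of this diff:

# Fit a regression tree on a numeric label (type defaults to "regression")
df <- createDataFrame(longley)
model <- spark.decisionTree(df, Employed ~ GNP + Population)

# Score the training data; predictions come back in a "prediction" column
predictions <- predict(model, df)
head(predictions)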
mllib/src/main/scala/org/apache/spark/ml/r/DecisionTreeClassifierWrapper.scala

Lines changed: 123 additions & 0 deletions

@@ -0,0 +1,123 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.ml.r

import org.apache.hadoop.fs.Path
import org.json4s._
import org.json4s.JsonDSL._
import org.json4s.jackson.JsonMethods._

import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.attribute.{Attribute, AttributeGroup, NominalAttribute}
import org.apache.spark.ml.classification.{DecisionTreeClassificationModel, DecisionTreeClassifier}
import org.apache.spark.ml.feature.{IndexToString, RFormula}
import org.apache.spark.ml.util._
import org.apache.spark.sql.{DataFrame, Dataset}

private[r] class DecisionTreeClassifierWrapper private (
    val pipeline: PipelineModel,
    val features: Array[String],
    val labels: Array[String]) extends MLWritable {

  import DecisionTreeClassifierWrapper.PREDICTED_LABEL_INDEX_COL

  private val DTModel: DecisionTreeClassificationModel =
    pipeline.stages(1).asInstanceOf[DecisionTreeClassificationModel]

  lazy val maxDepth: Int = DTModel.getMaxDepth

  lazy val maxBins: Int = DTModel.getMaxBins

  def transform(dataset: Dataset[_]): DataFrame = {
    pipeline.transform(dataset)
      .drop(PREDICTED_LABEL_INDEX_COL)
      .drop(DTModel.getFeaturesCol)
  }

  override def write: MLWriter =
    new DecisionTreeClassifierWrapper.DecisionTreeClassifierWrapperWriter(this)
}

private[r] object DecisionTreeClassifierWrapper extends MLReadable[DecisionTreeClassifierWrapper] {

  val PREDICTED_LABEL_INDEX_COL = "pred_label_idx"
  val PREDICTED_LABEL_COL = "prediction"

  def fit(data: DataFrame, formula: String): DecisionTreeClassifierWrapper = {
    val rFormula = new RFormula()
      .setFormula(formula)
      .fit(data)
    // get labels and feature names from output schema
    val schema = rFormula.transform(data).schema
    val labelAttr = Attribute.fromStructField(schema(rFormula.getLabelCol))
      .asInstanceOf[NominalAttribute]
    val labels = labelAttr.values.get
    val featureAttrs = AttributeGroup.fromStructField(schema(rFormula.getFeaturesCol))
      .attributes.get
    val features = featureAttrs.map(_.name.get)
    // assemble and fit the pipeline
    val decisionTree = new DecisionTreeClassifier()
      .setPredictionCol(PREDICTED_LABEL_INDEX_COL)
    val idxToStr = new IndexToString()
      .setInputCol(PREDICTED_LABEL_INDEX_COL)
      .setOutputCol(PREDICTED_LABEL_COL)
      .setLabels(labels)
    val pipeline = new Pipeline()
      .setStages(Array(rFormula, decisionTree, idxToStr))
      .fit(data)
    new DecisionTreeClassifierWrapper(pipeline, features, labels)
  }

  override def read: MLReader[DecisionTreeClassifierWrapper] =
    new DecisionTreeClassifierWrapperReader

  override def load(path: String): DecisionTreeClassifierWrapper = super.load(path)

  class DecisionTreeClassifierWrapperWriter(instance: DecisionTreeClassifierWrapper)
    extends MLWriter {

    override protected def saveImpl(path: String): Unit = {
      val rMetadataPath = new Path(path, "rMetadata").toString
      val pipelinePath = new Path(path, "pipeline").toString

      val rMetadata = ("class" -> instance.getClass.getName) ~
        ("features" -> instance.features.toSeq) ~
        ("labels" -> instance.labels.toSeq)
      val rMetadataJson: String = compact(render(rMetadata))

      sc.parallelize(Seq(rMetadataJson), 1).saveAsTextFile(rMetadataPath)
      instance.pipeline.save(pipelinePath)
    }
  }

  class DecisionTreeClassifierWrapperReader extends MLReader[DecisionTreeClassifierWrapper] {

    override def load(path: String): DecisionTreeClassifierWrapper = {
      implicit val format = DefaultFormats
      val rMetadataPath = new Path(path, "rMetadata").toString
      val pipelinePath = new Path(path, "pipeline").toString
      val pipeline = PipelineModel.load(pipelinePath)

      val rMetadataStr = sc.textFile(rMetadataPath, 1).first()
      val rMetadata = parse(rMetadataStr)
      val features = (rMetadata \ "features").extract[Array[String]]
      val labels = (rMetadata \ "labels").extract[Array[String]]
      new DecisionTreeClassifierWrapper(pipeline, features, labels)
    }
  }
}
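The classifier wrapper is reachable from R through the same generic. A sketch, assuming an active SparkR session; note that this commit registers predict and write.ml only for the regression model, so a classification model can be fitted but not yet scored or saved from R:

# iris is a stock R dataset; createDataFrame replaces "." in column names with "_"
df <- createDataFrame(iris)
model <- spark.decisionTree(df, Species ~ Sepal_Length + Sepal_Width,
                            type = "classification")
class(model)  # "DecisionTreeClassificationModel"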
mllib/src/main/scala/org/apache/spark/ml/r/DecisionTreeRegressorWrapper.scala

Lines changed: 122 additions & 0 deletions

@@ -0,0 +1,122 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.ml.r

import org.apache.hadoop.fs.Path
import org.json4s._
import org.json4s.JsonDSL._
import org.json4s.jackson.JsonMethods._

import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.attribute.AttributeGroup
import org.apache.spark.ml.feature.RFormula
import org.apache.spark.ml.regression.{DecisionTreeRegressionModel, DecisionTreeRegressor}
import org.apache.spark.ml.util._
import org.apache.spark.sql.{DataFrame, Dataset}

private[r] class DecisionTreeRegressorWrapper private (
    val pipeline: PipelineModel,
    val features: Array[String]) extends MLWritable {

  private val DTModel: DecisionTreeRegressionModel =
    pipeline.stages(1).asInstanceOf[DecisionTreeRegressionModel]

  lazy val maxDepth: Int = DTModel.getMaxDepth

  lazy val maxBins: Int = DTModel.getMaxBins

  def transform(dataset: Dataset[_]): DataFrame = {
    pipeline.transform(dataset)
      .drop(DTModel.getFeaturesCol)
  }

  override def write: MLWriter =
    new DecisionTreeRegressorWrapper.DecisionTreeRegressorWrapperWriter(this)
}

private[r] object DecisionTreeRegressorWrapper extends MLReadable[DecisionTreeRegressorWrapper] {

  def fit(data: DataFrame, formula: String): DecisionTreeRegressorWrapper = {
    val rFormula = new RFormula()
      .setFormula(formula)
      .fit(data)
    // get feature names from output schema; the label is continuous for
    // regression, so no nominal label attributes are extracted here
    val schema = rFormula.transform(data).schema
    val featureAttrs = AttributeGroup.fromStructField(schema(rFormula.getFeaturesCol))
      .attributes.get
    val features = featureAttrs.map(_.name.get)
    // assemble and fit the pipeline; the regressor writes its output to the
    // default "prediction" column, so no index-to-string stage is needed
    val decisionTree = new DecisionTreeRegressor()
    val pipeline = new Pipeline()
      .setStages(Array(rFormula, decisionTree))
      .fit(data)
    new DecisionTreeRegressorWrapper(pipeline, features)
  }

  override def read: MLReader[DecisionTreeRegressorWrapper] = new DecisionTreeRegressorWrapperReader

  override def load(path: String): DecisionTreeRegressorWrapper = super.load(path)

  class DecisionTreeRegressorWrapperWriter(instance: DecisionTreeRegressorWrapper)
    extends MLWriter {

    override protected def saveImpl(path: String): Unit = {
      val rMetadataPath = new Path(path, "rMetadata").toString
      val pipelinePath = new Path(path, "pipeline").toString

      val rMetadata = ("class" -> instance.getClass.getName) ~
        ("features" -> instance.features.toSeq)
      val rMetadataJson: String = compact(render(rMetadata))

      sc.parallelize(Seq(rMetadataJson), 1).saveAsTextFile(rMetadataPath)
      instance.pipeline.save(pipelinePath)
    }
  }

  class DecisionTreeRegressorWrapperReader extends MLReader[DecisionTreeRegressorWrapper] {

    override def load(path: String): DecisionTreeRegressorWrapper = {
      implicit val format = DefaultFormats
      val rMetadataPath = new Path(path, "rMetadata").toString
      val pipelinePath = new Path(path, "pipeline").toString
      val pipeline = PipelineModel.load(pipelinePath)

      val rMetadataStr = sc.textFile(rMetadataPath, 1).first()
      val rMetadata = parse(rMetadataStr)
      val features = (rMetadata \ "features").extract[Array[String]]
      new DecisionTreeRegressorWrapper(pipeline, features)
    }
  }
}
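The wrapper exposes maxDepth and maxBins, but this commit adds no summary() method on the R side. Until one exists, the values can be reached through SparkR's internal JVM bridge; a sketch, for illustration only, since callJMethod is not public API:

# Scala vals compile to accessor methods, so they can be invoked by name
callJMethod(model@jobj, "maxDepth")
callJMethod(model@jobj, "maxBins")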

mllib/src/main/scala/org/apache/spark/ml/r/RWrappers.scala

Lines changed: 2 additions & 0 deletions

@@ -54,6 +54,8 @@ private[r] object RWrappers extends MLReader[Object] {
       GaussianMixtureWrapper.load(path)
     case "org.apache.spark.ml.r.ALSWrapper" =>
       ALSWrapper.load(path)
+    case "org.apache.spark.ml.r.DecisionTreeRegressorWrapper" =>
+      DecisionTreeRegressorWrapper.load(path)
     case _ =>
       throw new SparkException(s"SparkR read.ml does not support load $className")
   }
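With this dispatch entry in place, a fitted regression tree survives a save/load round trip from R. Continuing the earlier sketch, with an illustrative temporary path:

# Save the model, then restore it; read.ml keys off the stored wrapper class name
modelPath <- tempfile(pattern = "spark-decisionTree")
write.ml(model, modelPath)
model2 <- read.ml(modelPath)
predictions <- predict(model2, df)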
