
Commit b18b718

regression pass unit test
1 parent 463f965 commit b18b718

5 files changed: +207 -97 lines changed

R/pkg/NAMESPACE

Lines changed: 5 additions & 1 deletion
@@ -348,7 +348,9 @@ export("as.DataFrame",
        "uncacheTable",
        "print.summary.GeneralizedLinearRegressionModel",
        "read.ml",
-       "print.summary.KSTest")
+       "print.summary.KSTest",
+       "print.summary.DecisionTreeRegressionModel",
+       "print.summary.DecisionTreeClassificationModel")
 
 export("structField",
        "structField.jobj",
@@ -373,6 +375,8 @@ S3method(print, structField)
 S3method(print, structType)
 S3method(print, summary.GeneralizedLinearRegressionModel)
 S3method(print, summary.KSTest)
+S3method(print, summary.DecisionTreeRegressionModel)
+S3method(print, summary.DecisionTreeClassificationModel)
 S3method(structField, character)
 S3method(structField, jobj)
 S3method(structType, jobj)
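
These NAMESPACE entries are what let base R's print generic dispatch on the summary objects returned by the new methods in mllib.R below. A minimal sketch of the intended call pattern, reusing the longley data and formula from the new unit test (the SparkR session setup is assumed, not part of this commit):

df <- suppressWarnings(createDataFrame(longley))
model <- spark.decisionTree(df, Employed ~ ., "regression")
s <- summary(model)   # a list tagged with class "summary.DecisionTreeRegressionModel"
print(s)              # dispatches to print.summary.DecisionTreeRegressionModel via the S3method above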

R/pkg/R/mllib.R

Lines changed: 105 additions & 34 deletions
@@ -1447,39 +1447,31 @@ print.summary.KSTest <- function(x, ...) {
   invisible(x)
 }
 
-#' Decision tree regression model.
+#' Decision Tree
 #'
-#' Fit Decision Tree regression model on a SparkDataFrame.
+#' @description
+#' \code{spark.decisionTree} tree
 #'
-#' @param data SparkDataFrame for training.
-#' @param formula A symbolic description of the model to be fitted. Currently only a few formula
-#'                operators are supported, including '~', ':', '+', and '-'.
-#'                Note that operator '.' is not supported currently.
-#' @return a fitted decision tree regression model
-#' @rdname spark.decisionTreeRegressor
-#' @seealso rpart: \url{https://cran.r-project.org/web/packages/rpart/}
-#' @export
-#' @examples
-#' \dontrun{
-#' df <- createDataFrame(sqlContext, kyphosis)
-#' model <- spark.decisionTree(df, Kyphosis ~ Age + Number + Start)
-#' }
+#' Decision Tree
+#'
+#' @param data a SparkDataFrame of user data.
 #' @note spark.decisionTree since 2.1.0
 setMethod("spark.decisionTree", signature(data = "SparkDataFrame", formula = "formula"),
-          function(data, formula, type = c("regression", "classification")) {
+          function(data, formula, type = c("regression", "classification"), maxDepth = 5, maxBins = 32) {
             formula <- paste(deparse(formula), collapse = "")
             if (identical(type, "regression")) {
               jobj <- callJStatic("org.apache.spark.ml.r.DecisionTreeRegressorWrapper", "fit",
-                                  data@sdf, formula)
+                                  data@sdf, formula, as.integer(maxDepth), as.integer(maxBins))
               new("DecisionTreeRegressionModel", jobj = jobj)
             } else if (identical(type, "classification")) {
-              jobj <- callJStatic("org.apache.spark.ml.r.DecisionTreeClassificationWrapper", "fit",
-                                  data@sdf, formula)
+              jobj <- callJStatic("org.apache.spark.ml.r.DecisionTreeClassifierWrapper", "fit",
+                                  data@sdf, formula, as.integer(maxDepth), as.integer(maxBins))
               new("DecisionTreeClassificationModel", jobj = jobj)
             }
           })
 
-# Makes predictions from a Decision Tree model or a model produced by spark.decisionTree()
+# Makes predictions from a Decision Tree Regression model or
+# a model produced by spark.decisionTree()
 
 #' @param newData a SparkDataFrame for testing.
 #' @return \code{predict} returns a SparkDataFrame containing predicted labeled in a column named
@@ -1492,6 +1484,20 @@ setMethod("predict", signature(object = "DecisionTreeRegressionModel"),
             predict_internal(object, newData)
           })
 
+# Makes predictions from a Decision Tree Classification model or
+# a model produced by spark.decisionTree()
+
+#' @param newData a SparkDataFrame for testing.
+#' @return \code{predict} returns a SparkDataFrame containing predicted labels in a column named
+#'         "prediction"
+#' @rdname spark.decisionTree
+#' @export
+#' @note predict(decisionTreeClassificationModel) since 2.1.0
+setMethod("predict", signature(object = "DecisionTreeClassificationModel"),
+          function(object, newData) {
+            predict_internal(object, newData)
+          })
+
 #' Save the Decision Tree Regression model to the input path.
 #'
 #' @param object A fitted Decision tree regression model
@@ -1504,23 +1510,88 @@ setMethod("predict", signature(object = "DecisionTreeRegressionModel"),
 #' @export
 #' @note write.ml(DecisionTreeRegressionModel, character) since 2.1.0
 setMethod("write.ml", signature(object = "DecisionTreeRegressionModel", path = "character"),
-         function(object, path, overwrite = FALSE) {
-           write_internal(object, path, overwrite)
-         })
+          function(object, path, overwrite = FALSE) {
+            write_internal(object, path, overwrite)
+          })
 
-# Get the summary of an IsotonicRegressionModel model
+#' Save the Decision Tree Classification model to the input path.
+#'
+#' @param object A fitted Decision tree classification model
+#' @param path The directory where the model is saved
+#' @param overwrite Overwrites or not if the output path already exists. Default is FALSE
+#'                  which means throw exception if the output path exists.
+#'
+#' @aliases write.ml,DecisionTreeClassificationModel,character-method
+#' @rdname spark.decisionTreeClassification
+#' @export
+#' @note write.ml(DecisionTreeClassificationModel, character) since 2.1.0
+setMethod("write.ml", signature(object = "DecisionTreeClassificationModel", path = "character"),
+          function(object, path, overwrite = FALSE) {
+            write_internal(object, path, overwrite)
+          })
 
-#' @param object a fitted IsotonicRegressionModel
-#' @param ... Other optional arguments to summary of an IsotonicRegressionModel
-#' @return \code{summary} returns the model's boundaries and prediction as lists
-#' @rdname spark.isoreg
-#' @aliases summary,IsotonicRegressionModel-method
+# Get the summary of a DecisionTreeRegressionModel
+
+#' @param object a fitted DecisionTreeRegressionModel
+#' @param ... Other optional arguments to summary of a DecisionTreeRegressionModel
+#' @return \code{summary} returns the model's features as lists, depth and number of nodes
+#' @rdname spark.decisionTree
+#' @aliases summary,DecisionTreeRegressionModel-method
 #' @export
-#' @note summary(IsotonicRegressionModel) since 2.1.0
+#' @note summary(DecisionTreeRegressionModel) since 2.1.0
 setMethod("summary", signature(object = "DecisionTreeRegressionModel"),
           function(object, ...) {
             jobj <- object@jobj
-            boundaries <- callJMethod(jobj, "boundaries")
-            predictions <- callJMethod(jobj, "predictions")
-            return(list(boundaries = boundaries, predictions = predictions))
-          })
+            features <- callJMethod(jobj, "features")
+            depth <- callJMethod(jobj, "depth")
+            numNodes <- callJMethod(jobj, "numNodes")
+            ans <- list(features = features, depth = depth, numNodes = numNodes)
+            class(ans) <- "summary.DecisionTreeRegressionModel"
+            ans
+          })
+
+# Get the summary of a DecisionTreeClassificationModel
+
+#' @param object a fitted DecisionTreeClassificationModel
+#' @param ... Other optional arguments to summary of a DecisionTreeClassificationModel
+#' @return \code{summary} returns the model's features as lists, depth and number of nodes
+#' @rdname spark.decisionTree
+#' @aliases summary,DecisionTreeClassificationModel-method
+#' @export
+#' @note summary(DecisionTreeClassificationModel) since 2.1.0
+setMethod("summary", signature(object = "DecisionTreeClassificationModel"),
+          function(object, ...) {
+            jobj <- object@jobj
+            features <- callJMethod(jobj, "features")
+            depth <- callJMethod(jobj, "depth")
+            numNodes <- callJMethod(jobj, "numNodes")
+            ans <- list(features = features, depth = depth, numNodes = numNodes)
+            class(ans) <- "summary.DecisionTreeClassificationModel"
+            ans
+          })
+
+# Prints the summary of Decision Tree Regression Model
+
+#' @rdname spark.decisionTree
+#' @param x summary object of DecisionTreeRegressionModel returned by \code{summary}.
+#' @export
+#' @note print.summary.DecisionTreeRegressionModel since 2.1.0
+print.summary.DecisionTreeRegressionModel <- function(x, ...) {
+  jobj <- x@jobj
+  summaryStr <- callJMethod(jobj, "summary")
+  cat(summaryStr, "\n")
+  invisible(x)
+}
+
+# Prints the summary of Decision Tree Classification Model
+
+#' @rdname spark.decisionTree
+#' @param x summary object of DecisionTreeClassificationModel returned by \code{summary}.
+#' @export
+#' @note print.summary.DecisionTreeClassificationModel since 2.1.0
+print.summary.DecisionTreeClassificationModel <- function(x, ...) {
+  jobj <- x@jobj
+  summaryStr <- callJMethod(jobj, "summary")
+  cat(summaryStr, "\n")
+  invisible(x)
+}
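
Taken together, the changes above give SparkR a spark.decisionTree front end with regression and classification variants, plus summary, predict, and write.ml/read.ml support. A short usage sketch of the API as defined above; the classification example with iris is illustrative only and is not part of this commit:

df <- suppressWarnings(createDataFrame(longley))

# regression, with the new maxDepth/maxBins arguments forwarded to the JVM wrapper
regModel <- spark.decisionTree(df, Employed ~ ., "regression", maxDepth = 5, maxBins = 16)
summary(regModel)            # list of features, depth, numNodes
head(predict(regModel, df))  # adds a "prediction" column

# classification takes the same arguments (iris here is a placeholder dataset)
irisDF <- suppressWarnings(createDataFrame(iris))
clsModel <- spark.decisionTree(irisDF, Species ~ Petal_Length + Petal_Width, "classification")

# both model types can be persisted and reloaded
path <- tempfile(pattern = "spark-decisionTree", fileext = ".tmp")
write.ml(regModel, path)
regModel2 <- read.ml(path)
unlink(path)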

R/pkg/inst/tests/testthat/test_mllib.R

Lines changed: 30 additions & 0 deletions
@@ -791,4 +791,34 @@ test_that("spark.kstest", {
   expect_match(capture.output(stats)[1], "Kolmogorov-Smirnov test summary:")
 })
 
+test_that("spark.decisionTree Regression", {
+  data <- suppressWarnings(createDataFrame(longley))
+  model <- spark.decisionTree(data, Employed ~ ., "regression", maxDepth = 5, maxBins = 16)
+
+  # Test summary
+  stats <- summary(model)
+  expect_equal(stats$depth, 5)
+  expect_equal(stats$numNodes, 31)
+
+  # Test model predict
+  predictions <- collect(predict(model, data))
+  expect_equal(predictions$prediction, c(60.323, 61.122, 60.171, 61.187,
+                                         63.221, 63.639, 64.989, 63.761,
+                                         66.019, 67.857, 68.169, 66.513,
+                                         68.655, 69.564, 69.331, 70.551),
+               tolerance = 1e-4)
+
+  # Test model save/load
+  modelPath <- tempfile(pattern = "spark-decisionTreeRegression", fileext = ".tmp")
+  write.ml(model, modelPath)
+  expect_error(write.ml(model, modelPath))
+  write.ml(model, modelPath, overwrite = TRUE)
+  model2 <- read.ml(modelPath)
+  stats2 <- summary(model2)
+  expect_equal(stats$depth, stats2$depth)
+  expect_equal(stats$numNodes, stats2$numNodes)
+
+  unlink(modelPath)
+})
+
 sparkR.session.stop()
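
Only the regression path is exercised here, which matches the commit message; a classification counterpart would presumably mirror it. A hedged sketch of what such a test could look like, where the dataset, formula, and expected values are assumptions rather than anything from this change:

test_that("spark.decisionTree Classification", {
  # iris and the formula are placeholder choices, not from this commit
  df <- suppressWarnings(createDataFrame(iris))
  model <- spark.decisionTree(df, Species ~ Petal_Length + Petal_Width,
                              "classification", maxDepth = 5, maxBins = 32)

  stats <- summary(model)
  expect_true(stats$depth <= 5)          # depth is capped by maxDepth

  predictions <- collect(predict(model, df))
  expect_equal(nrow(predictions), 150)   # one prediction per input row

  # save/load round trip should preserve the tree structure
  modelPath <- tempfile(pattern = "spark-decisionTreeClassification", fileext = ".tmp")
  write.ml(model, modelPath)
  model2 <- read.ml(modelPath)
  expect_equal(summary(model2)$numNodes, stats$numNodes)

  unlink(modelPath)
})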

mllib/src/main/scala/org/apache/spark/ml/r/DecisionTreeClassifierWrapper.scala

Lines changed: 32 additions & 32 deletions
@@ -23,29 +23,23 @@ import org.json4s.JsonDSL._
 import org.json4s.jackson.JsonMethods._
 
 import org.apache.spark.ml.{Pipeline, PipelineModel}
-import org.apache.spark.ml.attribute.{Attribute, AttributeGroup, NominalAttribute}
+import org.apache.spark.ml.attribute.AttributeGroup
 import org.apache.spark.ml.classification.{DecisionTreeClassificationModel, DecisionTreeClassifier}
-import org.apache.spark.ml.feature.{IndexToString, RFormula}
+import org.apache.spark.ml.feature.RFormula
 import org.apache.spark.ml.util._
 import org.apache.spark.sql.{DataFrame, Dataset}
 
 private[r] class DecisionTreeClassifierWrapper private (
     val pipeline: PipelineModel,
     val features: Array[String],
-    val labels: Array[String]) extends MLWritable {
-
-  import DecisionTreeClassifierWrapper.PREDICTED_LABEL_INDEX_COL
+    val maxDepth: Int,
+    val maxBins: Int) extends MLWritable {
 
   private val DTModel: DecisionTreeClassificationModel =
     pipeline.stages(1).asInstanceOf[DecisionTreeClassificationModel]
 
-  lazy val maxDepth: Int = DTModel.getMaxDepth
-
-  lazy val maxBins: Int = DTModel.getMaxBins
-
   def transform(dataset: Dataset[_]): DataFrame = {
     pipeline.transform(dataset)
-      .drop(PREDICTED_LABEL_INDEX_COL)
       .drop(DTModel.getFeaturesCol)
   }
 
@@ -54,33 +48,36 @@ private[r] class DecisionTreeClassifierWrapper private (
 }
 
 private[r] object DecisionTreeClassifierWrapper extends MLReadable[DecisionTreeClassifierWrapper] {
+  def fit(data: DataFrame,
+      formula: String,
+      maxDepth: Int,
+      maxBins: Int): DecisionTreeClassifierWrapper = {
 
-  val PREDICTED_LABEL_INDEX_COL = "pred_label_idx"
-  val PREDICTED_LABEL_COL = "prediction"
-
-  def fit(data: DataFrame, formula: String): DecisionTreeClassifierWrapper = {
     val rFormula = new RFormula()
       .setFormula(formula)
-      .fit(data)
-    // get labels and feature names from output schema
-    val schema = rFormula.transform(data).schema
-    val labelAttr = Attribute.fromStructField(schema(rFormula.getLabelCol))
-      .asInstanceOf[NominalAttribute]
-    val labels = labelAttr.values.get
-    val featureAttrs = AttributeGroup.fromStructField(schema(rFormula.getFeaturesCol))
+      .setFeaturesCol("features")
+      .setLabelCol("label")
+
+    RWrapperUtils.checkDataColumns(rFormula, data)
+    val rFormulaModel = rFormula.fit(data)
+
+    // get feature names from output schema
+    val schema = rFormulaModel.transform(data).schema
+    val featureAttrs = AttributeGroup.fromStructField(schema(rFormulaModel.getFeaturesCol))
       .attributes.get
     val features = featureAttrs.map(_.name.get)
+
     // assemble and fit the pipeline
-    val decisionTree = new DecisionTreeClassifier()
-      .setPredictionCol(PREDICTED_LABEL_INDEX_COL)
-    val idxToStr = new IndexToString()
-      .setInputCol(PREDICTED_LABEL_INDEX_COL)
-      .setOutputCol(PREDICTED_LABEL_COL)
-      .setLabels(labels)
+    val decisionTreeClassification = new DecisionTreeClassifier()
+      .setMaxDepth(maxDepth)
+      .setMaxBins(maxBins)
+      .setFeaturesCol(rFormula.getFeaturesCol)
+
     val pipeline = new Pipeline()
-      .setStages(Array(rFormula, decisionTree, idxToStr))
+      .setStages(Array(rFormulaModel, decisionTreeClassification))
       .fit(data)
-    new DecisionTreeClassifierWrapper(pipeline, features, labels)
+
+    new DecisionTreeClassifierWrapper(pipeline, features, maxDepth, maxBins)
   }
 
   override def read: MLReader[DecisionTreeClassifierWrapper] =
@@ -97,7 +94,8 @@ private[r] object DecisionTreeClassifierWrapper extends MLReadable[DecisionTreeC
 
       val rMetadata = ("class" -> instance.getClass.getName) ~
        ("features" -> instance.features.toSeq) ~
-        ("labels" -> instance.labels.toSeq)
+        ("maxDepth" -> instance.maxDepth) ~
+        ("maxBins" -> instance.maxBins)
      val rMetadataJson: String = compact(render(rMetadata))
 
      sc.parallelize(Seq(rMetadataJson), 1).saveAsTextFile(rMetadataPath)
@@ -116,8 +114,10 @@ private[r] object DecisionTreeClassifierWrapper extends MLReadable[DecisionTreeC
      val rMetadataStr = sc.textFile(rMetadataPath, 1).first()
      val rMetadata = parse(rMetadataStr)
      val features = (rMetadata \ "features").extract[Array[String]]
-      val labels = (rMetadata \ "labels").extract[Array[String]]
-      new DecisionTreeClassifierWrapper(pipeline, features, labels)
+      val maxDepth = (rMetadata \ "maxDepth").extract[Int]
+      val maxBins = (rMetadata \ "maxBins").extract[Int]
+
+      new DecisionTreeClassifierWrapper(pipeline, features, maxDepth, maxBins)
    }
  }
 }