refactor with NaiveBayesWrapper

mengxr · mengxr · commit 49f36f304fd9 · 2016-03-22T08:48:21.000-07:00
diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R
@@ -22,6 +22,11 @@
 #' @export
 setClass("PipelineModel", representation(model = "jobj"))
 
+#' @title S4 class that represents a NaiveBayesModel
+#' @param jobj a Java object reference to the backing Scala NaiveBayesWrapper
+#' @export
+setClass("NaiveBayesModel", representation(jobj = "jobj"))
+
 #' Fits a generalized linear model
 #'
 #' Fits a generalized linear model, similarly to R's glm(). Also see the glmnet package.
@@ -61,7 +66,7 @@ setMethod("glm", signature(formula = "formula", family = "ANY", data = "DataFram
             return(new("PipelineModel", model = model))
           })
 
-#' Make predictions from a model
+#' Make predictions from a model
 #'
 #' Makes predictions from a model produced by glm(), similarly to R's predict().
 #'
@@ -81,6 +86,26 @@ setMethod("predict", signature(object = "PipelineModel"),
             return(dataFrame(callJMethod(object@model, "transform", newData@sdf)))
           })
 
+#' Make predictions from a naive Bayes model
+#'
+#' Makes predictions from a model produced by naiveBayes(), similarly to R package e1071's predict.
+#'
+#' @param object A fitted naive Bayes model
+#' @param newData DataFrame for testing
+#' @return DataFrame containing predicted labels in a column named "prediction"
+#' @rdname predict
+#' @export
+#' @examples
+#' \dontrun{
+#' model <- naiveBayes(y ~ x, trainingData)
+#' predicted <- predict(model, testData)
+#' showDF(predicted)
+#'}
+setMethod("predict", signature(object = "NaiveBayesModel"),
+          function(object, newData) {
+            return(dataFrame(callJMethod(object@jobj, "transform", newData@sdf)))
+          })
+
 #' Get the summary of a model
 #'
 #' Returns the summary of a model produced by glm(), similarly to R's summary().
@@ -135,24 +160,40 @@ setMethod("summary", signature(object = "PipelineModel"),
               colnames(coefficients) <- unlist(features)
               rownames(coefficients) <- 1:k
               return(list(coefficients = coefficients, size = size, cluster = dataFrame(cluster)))
-            } else if (modelName == "NaiveBayesModel") {
-              labels <- callJStatic("org.apache.spark.ml.api.r.SparkRWrappers",
-                                    "getNaiveBayesLabels", object@model)
-              pi <- callJStatic("org.apache.spark.ml.api.r.SparkRWrappers",
-                                "getNaiveBayesPi", object@model)
-              theta <- callJStatic("org.apache.spark.ml.api.r.SparkRWrappers",
-                                   "getNaiveBayesTheta", object@model)
-              pi <- t(as.matrix(unlist(pi)))
-              colnames(pi) <- unlist(labels)
-              theta <- matrix(theta, nrow = length(labels))
-              rownames(theta) <- unlist(labels)
-              colnames(theta) <- unlist(features)
-              return(list(pi = pi, theta = theta))
             } else {
               stop(paste("Unsupported model", modelName, sep = " "))
             }
           })
 
+#' Get the summary of a naive Bayes model
+#'
+#' Returns the summary of a naive Bayes model produced by naiveBayes(), similarly to R's summary().
+#'
+#' @param object A fitted MLlib model
+#' @return a list containing 'apriori', the label distribution, and 'tables', conditional
+#'         probabilities given the target label
+#' @rdname summary
+#' @export
+#' @examples
+#' \dontrun{
+#' model <- naiveBayes(y ~ x, trainingData)
+#' summary(model)
+#'}
+setMethod("summary", signature(object = "NaiveBayesModel"),
+          function(object, ...) {
+            jobj <- object@jobj
+            features <- callJMethod(jobj, "features")
+            labels <- callJMethod(jobj, "labels")
+            apriori <- callJMethod(jobj, "apriori")
+            apriori <- t(as.matrix(unlist(apriori)))
+            colnames(apriori) <- unlist(labels)
+            tables <- callJMethod(jobj, "tables")
+            tables <- matrix(tables, nrow = length(labels))
+            rownames(tables) <- unlist(labels)
+            colnames(tables) <- unlist(features)
+            return(list(apriori = apriori, tables = tables))
+          })
+
 #' Fit a k-means model
 #'
 #' Fit a k-means model, similarly to R's kmeans().
@@ -206,34 +247,30 @@ setMethod("fitted", signature(object = "PipelineModel"),
             }
           })
 
-#' Fit a naive Bayes model
+#' Fit a Bernoulli naive Bayes model
 #'
-#' Fit a naive Bayes model, similarly to R's naiveBayes() except for omitting two arguments 'subset'
-#' and 'na.action'. Users can use 'subset' function and 'fillna' or 'na.omit' function of DataFrame,
-#' respectively, to preprocess their DataFrame. We use na.omit in this interface to remove rows with
-#' NA values.
+#' Fit a Bernoulli naive Bayes model, similarly to R package e1071's naiveBayes(), but only
+#' categorical features are supported. The input should be a DataFrame of observations instead of a
+#' contingency table.
 #'
 #' @param object A symbolic description of the model to be fitted. Currently only a few formula
-#'                operators are supported, including '~', '.', ':', '+', and '-'.
+#'               operators are supported, including '~', '.', ':', '+', and '-'.
 #' @param data DataFrame for training
-#' @param lambda Smoothing parameter
-#' @param modelType Either 'multinomial' or 'bernoulli'. Default "multinomial".
-#' @return A fitted naive Bayes model.
+#' @param laplace Smoothing parameter
+#' @return a fitted naive Bayes model
 #' @rdname naiveBayes
+#' @seealso e1071: \url{https://cran.r-project.org/web/packages/e1071/}
 #' @export
 #' @examples
 #' \dontrun{
-#' sc <- sparkR.init()
-#' sqlContext <- sparkRSQL.init(sc)
 #' df <- createDataFrame(sqlContext, infert)
-#' model <- naiveBayes(education ~ ., df, lambda = 1, modelType = "multinomial")
+#' model <- naiveBayes(education ~ ., df, laplace = 0)
 #'}
 setMethod("naiveBayes", signature(formula = "formula", data = "DataFrame"),
-          function(formula, data, lambda = 1, modelType = c("multinomial", "bernoulli"), ...) {
+          function(formula, data, laplace = 0, ...) {
             data <- na.omit(data)
             formula <- paste(deparse(formula), collapse = "")
-            modelType <- match.arg(modelType)
-            model <- callJStatic("org.apache.spark.ml.api.r.SparkRWrappers", "fitNaiveBayes",
-                                 formula, data@sdf, lambda, modelType)
-            return(new("PipelineModel", model = model))
+            jobj <- callJStatic("org.apache.spark.ml.r.NaiveBayesWrapper", "fit",
+                                 formula, data@sdf, laplace)
+            return(new("NaiveBayesModel", jobj = jobj))
           })
diff --git a/R/pkg/inst/tests/testthat/test_mllib.R b/R/pkg/inst/tests/testthat/test_mllib.R
@@ -143,25 +143,60 @@ test_that("kmeans", {
 })
 
 test_that("naiveBayes", {
-  training <- suppressWarnings(createDataFrame(sqlContext, infert))
-
-  model <- naiveBayes(education ~ ., data = training, lambda = 1, modelType = "multinomial")
-  sample <- take(select(predict(model, training), "rawLabelsPrediction"), 1)
-  expect_equal(typeof(sample$rawLabelsPrediction), "character")
-  expect_equal(sample$rawLabelsPrediction, "0-5yrs")
-
-  # Test summary works on naiveBayes
-  summary.model <- summary(model)
-  expect_equal(length(summary.model$pi), 3)
-  expect_equal(sum(summary.model$pi), 1)
-  l1 <- summary.model$theta[1, ]
-  l2 <- summary.model$theta[2, ]
-  expect_equal(sum(unlist(l1)), 1)
-  expect_equal(sum(unlist(l2)), 1)
+  # R code to reproduce the result.
+  # We do not support instance weights yet. So we ignore the frequencies.
+  #
+  # library(e1071)
+  # t <- as.data.frame(Titanic)
+  # t1 <- t[t$Freq > 0, -5]
+  # m <- naiveBayes(Survived ~ ., data = t1)
+  # m
+  # predict(m, t1)
+  #
+  # -- output of 'm'
+  #
+  # A-priori probabilities:
+  # Y
+  #        No       Yes
+  # 0.4166667 0.5833333
+  #
+  # Conditional probabilities:
+  #      Class
+  # Y           1st       2nd       3rd      Crew
+  #   No  0.2000000 0.2000000 0.4000000 0.2000000
+  #   Yes 0.2857143 0.2857143 0.2857143 0.1428571
+  #
+  #      Sex
+  # Y     Male Female
+  #   No   0.5    0.5
+  #   Yes  0.5    0.5
+  #
+  #      Age
+  # Y         Child     Adult
+  #   No  0.2000000 0.8000000
+  #   Yes 0.4285714 0.5714286
+  #
+  # -- output of 'predict(m, t1)'
+  #
+  # Yes Yes Yes Yes No  No  Yes Yes No  No  Yes Yes Yes Yes Yes Yes Yes Yes No  No  Yes Yes No  No
+  #
+
+  t <- as.data.frame(Titanic)
+  t1 <- t[t$Freq > 0, -5]
+  df <- suppressWarnings(createDataFrame(sqlContext, t1))
+  m <- naiveBayes(Survived ~ ., data = df)
+  s <- summary(m)
+  expect_equal(s$apriori[1, "Yes"], 0.5833333, tolerance = 1e-6)
+  expect_equal(sum(s$apriori), 1)
+  expect_equal(s$tables["Yes", "Age_Adult"], 0.5714286, tolerance = 1e-6)
+  p <- collect(select(predict(m, df), "prediction"))
+  expect_equal(p$prediction, c("Yes", "Yes", "Yes", "Yes", "No", "No", "Yes", "Yes", "No", "No",
+                               "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "No", "No",
+                               "Yes", "Yes", "No", "No"))
 
   # Test e1071::naiveBayes
   if (requireNamespace("e1071", quietly = TRUE)) {
-    expect_that(m <- e1071::naiveBayes(education ~ ., data = infert), not(throws_error()))
-    expect_equal(as.character(predict(m, infert[1, ])), "0-5yrs")
+    expect_that(m <- e1071::naiveBayes(Survived ~ ., data = t1), not(throws_error()))
+    expect_equal(as.character(predict(m, t1[1, ])), "Yes")
   }
 })
diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala
@@ -22,7 +22,6 @@ import org.apache.hadoop.fs.Path
 import org.apache.spark.SparkException
 import org.apache.spark.annotation.{Experimental, Since}
 import org.apache.spark.ml.PredictorParams
-import org.apache.spark.ml.attribute.AttributeGroup
 import org.apache.spark.ml.param.{DoubleParam, Param, ParamMap, ParamValidators}
 import org.apache.spark.ml.util._
 import org.apache.spark.mllib.classification.{NaiveBayes => OldNaiveBayes}
@@ -105,12 +104,7 @@ class NaiveBayes @Since("1.5.0") (
   override protected def train(dataset: DataFrame): NaiveBayesModel = {
     val oldDataset: RDD[LabeledPoint] = extractLabeledPoints(dataset)
     val oldModel = OldNaiveBayes.train(oldDataset, $(smoothing), $(modelType))
-    val nbModel = copyValues(NaiveBayesModel.fromOld(oldModel, this))
-    val attr = AttributeGroup.fromStructField(dataset.schema($(featuresCol))).attributes
-    if (attr.isDefined) {
-      nbModel.setFeatureNames(attr.get.map(_.name.getOrElse("NA")))
-    }
-    nbModel
+    NaiveBayesModel.fromOld(oldModel, this)
   }
 
   @Since("1.5.0")
@@ -233,21 +227,6 @@ class NaiveBayesModel private[ml] (
 
   @Since("1.6.0")
   override def write: MLWriter = new NaiveBayesModel.NaiveBayesModelWriter(this)
-
-  private var featureNames: Option[Array[String]] = None
-
-  private[classification] def setFeatureNames(names: Array[String]): this.type = {
-    this.featureNames = Some(names)
-    this
-  }
-
-  private[ml] def getFeatureNames: Array[String] = featureNames match {
-    case Some(names) => names
-    case None =>
-      throw new SparkException(
-        s"No training result available for the ${this.getClass.getSimpleName}",
-        new NullPointerException())
-  }
 }
 
 @Since("1.6.0")
@@ -258,6 +237,7 @@ object NaiveBayesModel extends MLReadable[NaiveBayesModel] {
       oldModel: OldNaiveBayesModel,
       parent: NaiveBayes): NaiveBayesModel = {
     val uid = if (parent != null) parent.uid else Identifiable.randomUID("nb")
+    val labels = Vectors.dense(oldModel.labels)
     val pi = Vectors.dense(oldModel.pi)
     val theta = new DenseMatrix(oldModel.labels.length, oldModel.theta(0).length,
       oldModel.theta.flatten, true)
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala
@@ -257,18 +257,6 @@ class RFormulaModel private[feature](
       "Label column already exists and is not of type DoubleType.")
   }
 
-  /**
-   * Get the original array of labels if exists.
-   */
-  private[ml] def getOriginalLabels: Option[Array[String]] = {
-    // According to the sequences of transformers in RFormula, if the last stage is a
-    // StringIndexerModel, then we can extract the original labels from it.
-    pipelineModel.stages.last match {
-      case m: StringIndexerModel => Some(m.labels)
-      case _ => None
-    }
-  }
-
   @Since("2.0.0")
   override def write: MLWriter = new RFormulaModel.RFormulaModelWriter(this)
 }
diff --git a/mllib/src/main/scala/org/apache/spark/ml/r/NaiveBayesWrapper.scala b/mllib/src/main/scala/org/apache/spark/ml/r/NaiveBayesWrapper.scala
@@ -0,0 +1,105 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.ml.r
+
+import org.apache.spark.ml.{Pipeline, PipelineModel}
+import org.apache.spark.ml.attribute.{Attribute, AttributeGroup, NominalAttribute}
+import org.apache.spark.ml.classification.{NaiveBayes, NaiveBayesModel}
+import org.apache.spark.ml.feature.{IndexToString, RFormula}
+import org.apache.spark.sql.DataFrame
+
+/**
+ * Wrapper exposing a fitted naive Bayes pipeline to SparkR.
+ *
+ * @param pipeline fitted pipeline whose stages are, in order, the RFormula model, the
+ *                 NaiveBayesModel and the IndexToString transformer (see `fit`)
+ * @param labels original label values read from the label column's nominal attribute
+ * @param features names of the assembled feature columns
+ */
+private[r] class NaiveBayesWrapper private (
+    pipeline: PipelineModel,
+    val labels: Array[String],
+    val features: Array[String]) {
+
+  import NaiveBayesWrapper._
+
+  // Stage 1 is the NaiveBayesModel: `fit` assembles Array(rFormula, naiveBayes, idxToStr).
+  private val naiveBayesModel: NaiveBayesModel = pipeline.stages(1).asInstanceOf[NaiveBayesModel]
+
+  /** Label distribution: `exp` applied to every entry of the model's `pi`. */
+  lazy val apriori: Array[Double] = naiveBayesModel.pi.toArray.map(math.exp)
+
+  /** Conditional probabilities: `exp` applied to every entry of the model's `theta`. */
+  lazy val tables: Array[Double] = naiveBayesModel.theta.toArray.map(math.exp)
+
+  /** Runs the pipeline on `dataset` and drops the intermediate predicted-index column. */
+  def transform(dataset: DataFrame): DataFrame = {
+    pipeline.transform(dataset).drop(PREDICTED_LABEL_INDEX_COL)
+  }
+}
+
+private[r] object NaiveBayesWrapper {
+
+  val PREDICTED_LABEL_INDEX_COL = "pred_label_idx"
+  val PREDICTED_LABEL_COL = "prediction"
+
+  /**
+   * Fits a Bernoulli naive Bayes model described by an R formula.
+   *
+   * @param formula R model formula as a string
+   * @param data training data
+   * @param laplace smoothing parameter passed to `NaiveBayes.setSmoothing`
+   * @return a wrapper around the fitted pipeline plus the label and feature names
+   * @throws IllegalArgumentException if the label is not nominal, or if the label values or
+   *                                  feature names cannot be read from the output schema
+   */
+  def fit(formula: String, data: DataFrame, laplace: Double): NaiveBayesWrapper = {
+    val rFormula = new RFormula()
+      .setFormula(formula)
+      .fit(data)
+    // get labels and feature names from output schema
+    val schema = rFormula.transform(data).schema
+    val labels = Attribute.fromStructField(schema(rFormula.getLabelCol)) match {
+      case nominal: NominalAttribute =>
+        nominal.values.getOrElse(throw new IllegalArgumentException(
+          s"Cannot find label values for formula '$formula'."))
+      case _ =>
+        throw new IllegalArgumentException(
+          s"The label in formula '$formula' must be categorical for naive Bayes.")
+    }
+    val features = AttributeGroup.fromStructField(schema(rFormula.getFeaturesCol))
+      .attributes
+      .getOrElse(throw new IllegalArgumentException(
+        s"Cannot find feature attributes for formula '$formula'."))
+      .map(_.name.getOrElse(throw new IllegalArgumentException(
+        s"Cannot find feature names for formula '$formula'.")))
+    // assemble and fit the pipeline
+    val naiveBayes = new NaiveBayes()
+      .setSmoothing(laplace)
+      .setModelType("bernoulli")
+      .setPredictionCol(PREDICTED_LABEL_INDEX_COL)
+    val idxToStr = new IndexToString()
+      .setInputCol(PREDICTED_LABEL_INDEX_COL)
+      .setOutputCol(PREDICTED_LABEL_COL)
+      .setLabels(labels)
+    val pipeline = new Pipeline()
+      .setStages(Array(rFormula, naiveBayes, idxToStr))
+      .fit(data)
+    new NaiveBayesWrapper(pipeline, labels, features)
+  }
+}
diff --git a/mllib/src/main/scala/org/apache/spark/ml/r/SparkRWrappers.scala b/mllib/src/main/scala/org/apache/spark/ml/r/SparkRWrappers.scala