extract link and family name in R

hhbyyh · hhbyyh · commit 8b3dd3eebd80 · 2016-03-17T01:58:30.000-07:00
diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R
@@ -29,7 +29,8 @@ setClass("PipelineModel", representation(model = "jobj"))
 #' @param formula A symbolic description of the model to be fitted. Currently only a few formula
 #'                operators are supported, including '~', '.', ':', '+', and '-'.
 #' @param data DataFrame for training
-#' @param family a description of the error distribution and link function to be used in the model..
+#' @param family a description of the error distribution and link function to be used in the model,
+#'               as in \url{https://stat.ethz.ch/R-manual/R-devel/library/stats/html/family.html}
 #' @param lambda Regularization parameter
 #' @param solver Currently only support "irls" which is also the default solver.
 #' @return a fitted MLlib model
@@ -45,12 +46,12 @@ setClass("PipelineModel", representation(model = "jobj"))
 #' summary(model)
 #'}
 setMethod("glm", signature(formula = "formula", family = "ANY", data = "DataFrame"),
-          function(formula, family = c("gaussian", "binomial", "poisson", "gamma"), data,
-              lambda = 0, solver = "irls") {
-            family <- match.arg(family)
+          function(formula, family = gaussian(), data, lambda = 0, solver = "auto") {
+            familyName <- family$family
+            linkName <- family$link
             formula <- paste(deparse(formula), collapse = "")
             model <- callJStatic("org.apache.spark.ml.api.r.SparkRWrappers",
-                                 "fitGLM", formula, data@sdf, family, lambda, solver)
+                                 "fitGLM", formula, data@sdf, familyName, linkName, lambda, solver)
             return(new("PipelineModel", model = model))
           })
 
@@ -117,11 +118,6 @@ setMethod("summary", signature(object = "PipelineModel"),
               colnames(coefficients) <- c("Estimate")
               rownames(coefficients) <- unlist(features)
               return(list(coefficients = coefficients))
-            } else if (modelName == "GeneralizedLinearRegressionModel") {
-                coefficients <- as.matrix(unlist(coefficients))
-                colnames(coefficients) <- c("Estimate")
-                rownames(coefficients) <- unlist(features)
-                return(list(coefficients = coefficients))
             } else if (modelName == "KMeansModel") {
               modelSize <- callJStatic("org.apache.spark.ml.api.r.SparkRWrappers",
                                        "getKMeansModelSize", object@model)
diff --git a/mllib/src/main/scala/org/apache/spark/ml/r/SparkRWrappers.scala b/mllib/src/main/scala/org/apache/spark/ml/r/SparkRWrappers.scala
@@ -31,23 +31,16 @@ private[r] object SparkRWrappers {
       value: String,
       df: DataFrame,
       family: String,
+      link: String,
       lambda: Double,
       solver: String): PipelineModel = {
-    if (solver.trim != "irls") throw new SparkException("Currently only support irls")
-
     val formula = new RFormula().setFormula(value)
-    val regex = "^\\s*(\\w+)\\s*(\\(\\s*link\\s*=\\s*\"(\\w+)\"\\s*\\))?\\s*$".r
-    val estimator = family match {
-      case regex(familyName, group2, linkName) =>
-        val estimator = new GeneralizedLinearRegression()
-          .setFamily(familyName)
-          .setRegParam(lambda)
-          .setFitIntercept(formula.hasIntercept)
-        if (linkName != null) estimator.setLink(linkName)
-        estimator
-      case _ => throw new SparkException(s"Could not parse family: $family")
-    }
+    val estimator = new GeneralizedLinearRegression()
+      .setFamily(family)
+      .setRegParam(lambda)
+      .setFitIntercept(formula.hasIntercept)
 
+    if (link != null) estimator.setLink(link)
     val pipeline = new Pipeline().setStages(Array(formula, estimator))
     pipeline.fit(df)
   }
@@ -117,12 +110,6 @@ private[r] object SparkRWrappers {
       }
       case m: KMeansModel =>
         m.clusterCenters.flatMap(_.toArray)
-      case m: GeneralizedLinearRegressionModel =>
-        if (m.getFitIntercept) {
-          Array(m.intercept) ++ m.coefficients.toArray
-        } else {
-          m.coefficients.toArray
-        }
     }
   }
 
@@ -183,14 +170,6 @@ private[r] object SparkRWrappers {
         val attrs = AttributeGroup.fromStructField(
           m.summary.predictions.schema(m.summary.featuresCol))
         attrs.attributes.get.map(_.name.get)
-      case m: GeneralizedLinearRegressionModel =>
-        val attrs = AttributeGroup.fromStructField(
-          m.summary.predictions.schema(m.summary.featuresCol))
-        if (m.getFitIntercept) {
-          Array("(Intercept)") ++ attrs.attributes.get.map(_.name.get)
-        } else {
-          attrs.attributes.get.map(_.name.get)
-        }
     }
   }
 
@@ -202,8 +181,6 @@ private[r] object SparkRWrappers {
         "LogisticRegressionModel"
       case m: KMeansModel =>
         "KMeansModel"
-      case m: GeneralizedLinearRegressionModel =>
-        "GeneralizedLinearRegressionModel"
     }
   }
 }