
# mllib.R: Provides methods for MLlib integration

-#' @title S4 class that represents a PipelineModel
-#' @param model A Java object reference to the backing Scala PipelineModel
+#' @title S4 class that represents a generalized linear model
+#' @param jobj a Java object reference to the backing Scala GeneralizedLinearRegressionWrapper
#' @export
-setClass("PipelineModel", representation(model = "jobj"))
+setClass("GeneralizedLinearRegressionModel", representation(jobj = "jobj"))

#' @title S4 class that represents a NaiveBayesModel
#' @param jobj a Java object reference to the backing Scala NaiveBayesWrapper
@@ -39,21 +39,18 @@ setClass("KMeansModel", representation(jobj = "jobj"))

#' Fits a generalized linear model
#'
-#' Fits a generalized linear model, similarly to R's glm(). Also see the glmnet package.
+#' Fits a generalized linear model, similarly to R's glm().
#'
#' @param formula A symbolic description of the model to be fitted. Currently only a few formula
#'                operators are supported, including '~', '.', ':', '+', and '-'.
-#' @param data DataFrame for training
-#' @param family Error distribution. "gaussian" -> linear regression, "binomial" -> logistic reg.
-#' @param lambda Regularization parameter
-#' @param alpha Elastic-net mixing parameter (see glmnet's documentation for details)
-#' @param standardize Whether to standardize features before training
-#' @param solver The solver algorithm used for optimization, this can be "l-bfgs", "normal" and
-#'               "auto". "l-bfgs" denotes Limited-memory BFGS which is a limited-memory
-#'               quasi-Newton optimization method. "normal" denotes using Normal Equation as an
-#'               analytical solution to the linear regression problem. The default value is "auto"
-#'               which means that the solver algorithm is selected automatically.
-#' @return a fitted MLlib model
+#' @param data DataFrame for training.
+#' @param family A description of the error distribution and link function to be used in the model.
+#'               This can be a character string naming a family function, a family function, or
+#'               the result of a call to a family function. Refer to R's family documentation at
+#'               \url{https://stat.ethz.ch/R-manual/R-devel/library/stats/html/family.html}.
+#' @param epsilon Positive convergence tolerance of iterations.
+#' @param maxit Integer giving the maximal number of IRLS iterations.
+#' @return a fitted generalized linear model
#' @rdname glm
#' @export
#' @examples
@@ -64,36 +61,70 @@ setClass("KMeansModel", representation(jobj = "jobj"))
#' df <- createDataFrame(sqlContext, iris)
#' model <- glm(Sepal_Length ~ Sepal_Width, df, family="gaussian")
#' summary(model)
-#'}
+#' }
6865setMethod ("glm ", signature(formula = "formula", family = "ANY", data = "DataFrame"),
69- function (formula , family = c(" gaussian" , " binomial" ), data , lambda = 0 , alpha = 0 ,
70- standardize = TRUE , solver = " auto" ) {
71- family <- match.arg(family )
66+ function (formula , family = gaussian , data , epsilon = 1e-06 , maxit = 25 ) {
67+ if (is.character(family )) {
68+ family <- get(family , mode = " function" , envir = parent.frame())
69+ }
70+ if (is.function(family )) {
71+ family <- family()
72+ }
73+ if (is.null(family $ family )) {
74+ print(family )
75+ stop(" 'family' not recognized" )
76+ }
77+
            formula <- paste(deparse(formula), collapse = "")
-            model <- callJStatic("org.apache.spark.ml.api.r.SparkRWrappers",
-                                 "fitRModelFormula", formula, data@sdf, family, lambda,
-                                 alpha, standardize, solver)
-            return(new("PipelineModel", model = model))
+
+            jobj <- callJStatic("org.apache.spark.ml.r.GeneralizedLinearRegressionWrapper",
+                                "fit", formula, data@sdf, family$family, family$link,
+                                epsilon, as.integer(maxit))
+            return(new("GeneralizedLinearRegressionModel", jobj = jobj))
          })
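A minimal usage sketch of the new family handling (illustrative, not part of this patch; assumes the iris DataFrame from the roxygen example above). All three forms below are normalized to the same family object by the character/function/family-object branches in the method:

    df <- createDataFrame(sqlContext, iris)
    m1 <- glm(Sepal_Length ~ Sepal_Width, df, family = "gaussian")                    # character string naming a family
    m2 <- glm(Sepal_Length ~ Sepal_Width, df, family = gaussian)                      # family function
    m3 <- glm(Sepal_Length ~ Sepal_Width, df, family = gaussian(link = "identity"))   # result of a call to a family function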

-#' Make predictions from a model
+#' Get the summary of a generalized linear model
#'
-#' Makes predictions from a model produced by glm(), similarly to R's predict().
+#' Returns the summary of a model produced by glm(), similarly to R's summary().
#'
-#' @param object A fitted MLlib model
+#' @param object A fitted generalized linear model
+#' @return a list with a 'coefficients' component: the estimated coefficients, including the intercept
+#' @rdname summary
+#' @export
+#' @examples
+#' \dontrun{
+#' model <- glm(y ~ x, trainingData)
+#' summary(model)
+#' }
+setMethod("summary", signature(object = "GeneralizedLinearRegressionModel"),
+          function(object, ...) {
+            jobj <- object@jobj
+            features <- callJMethod(jobj, "rFeatures")
+            coefficients <- callJMethod(jobj, "rCoefficients")
+            coefficients <- as.matrix(unlist(coefficients))
+            colnames(coefficients) <- c("Estimate")
+            rownames(coefficients) <- unlist(features)
+            return(list(coefficients = coefficients))
+          })
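A sketch of consuming the return value (illustrative, not part of this patch): summary() yields a list whose 'coefficients' entry is a one-column matrix of estimates with feature names as row names; "(Intercept)" as a row name is an assumption about what rFeatures reports:

    s <- summary(model)
    s$coefficients                    # matrix with a single "Estimate" column
    s$coefficients[, "Estimate"]      # named numeric vector of the estimates
    s$coefficients["(Intercept)", ]   # assumed row name for the intercept term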
+
+#' Make predictions from a generalized linear model
+#'
+#' Makes predictions from a generalized linear model produced by glm(), similarly to R's predict().
+#'
+#' @param object A fitted generalized linear model
#' @param newData DataFrame for testing
-#' @return DataFrame containing predicted values
+#' @return DataFrame containing predicted labels in a column named "prediction"
#' @rdname predict
#' @export
#' @examples
#' \dontrun{
#' model <- glm(y ~ x, trainingData)
#' predicted <- predict(model, testData)
#' showDF(predicted)
-#'}
-setMethod("predict", signature(object = "PipelineModel"),
+#' }
+setMethod("predict", signature(object = "GeneralizedLinearRegressionModel"),
          function(object, newData) {
-            return(dataFrame(callJMethod(object@model, "transform", newData@sdf)))
+            return(dataFrame(callJMethod(object@jobj, "transform", newData@sdf)))
          })
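For completeness, an illustrative way to inspect the output (assumes the model and testData from the roxygen example); select() and head() are existing SparkR DataFrame operations:

    predicted <- predict(model, testData)
    head(select(predicted, "prediction"))   # peek at the "prediction" column added by transform()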

#' Make predictions from a naive Bayes model
@@ -116,54 +147,6 @@ setMethod("predict", signature(object = "NaiveBayesModel"),
            return(dataFrame(callJMethod(object@jobj, "transform", newData@sdf)))
          })

-#' Get the summary of a model
-#'
-#' Returns the summary of a model produced by glm(), similarly to R's summary().
-#'
-#' @param object A fitted MLlib model
-#' @return a list with 'devianceResiduals' and 'coefficients' components for gaussian family
-#'         or a list with 'coefficients' component for binomial family. \cr
-#'         For gaussian family: the 'devianceResiduals' gives the min/max deviance residuals
-#'         of the estimation, the 'coefficients' gives the estimated coefficients and their
-#'         estimated standard errors, t values and p-values. (It only available when model
-#'         fitted by normal solver.) \cr
-#'         For binomial family: the 'coefficients' gives the estimated coefficients.
-#'         See summary.glm for more information. \cr
-#' @rdname summary
-#' @export
-#' @examples
-#' \dontrun{
-#' model <- glm(y ~ x, trainingData)
-#' summary(model)
-#'}
-setMethod("summary", signature(object = "PipelineModel"),
-          function(object, ...) {
-            modelName <- callJStatic("org.apache.spark.ml.api.r.SparkRWrappers",
-                                     "getModelName", object@model)
-            features <- callJStatic("org.apache.spark.ml.api.r.SparkRWrappers",
-                                    "getModelFeatures", object@model)
-            coefficients <- callJStatic("org.apache.spark.ml.api.r.SparkRWrappers",
-                                        "getModelCoefficients", object@model)
-            if (modelName == "LinearRegressionModel") {
-              devianceResiduals <- callJStatic("org.apache.spark.ml.api.r.SparkRWrappers",
-                                               "getModelDevianceResiduals", object@model)
-              devianceResiduals <- matrix(devianceResiduals, nrow = 1)
-              colnames(devianceResiduals) <- c("Min", "Max")
-              rownames(devianceResiduals) <- rep("", times = 1)
-              coefficients <- matrix(coefficients, ncol = 4)
-              colnames(coefficients) <- c("Estimate", "Std. Error", "t value", "Pr(>|t|)")
-              rownames(coefficients) <- unlist(features)
-              return(list(devianceResiduals = devianceResiduals, coefficients = coefficients))
-            } else if (modelName == "LogisticRegressionModel") {
-              coefficients <- as.matrix(unlist(coefficients))
-              colnames(coefficients) <- c("Estimate")
-              rownames(coefficients) <- unlist(features)
-              return(list(coefficients = coefficients))
-            } else {
-              stop(paste("Unsupported model", modelName, sep = " "))
-            }
-          })
-
#' Get the summary of a naive Bayes model
#'
#' Returns the summary of a naive Bayes model produced by naiveBayes(), similarly to R's summary().