From 9d7a0e16ae99ef2d4288a48f0fa13b60719390e1 Mon Sep 17 00:00:00 2001 From: GayathriMurali Date: Tue, 31 May 2016 15:48:48 -0700 Subject: [PATCH 1/4] Removing unnecessary commits --- docs/sparkr.md | 73 +++++++++++--------------------------------------- 1 file changed, 15 insertions(+), 58 deletions(-) diff --git a/docs/sparkr.md b/docs/sparkr.md index 59e486d1929f0..29a2df886686d 100644 --- a/docs/sparkr.md +++ b/docs/sparkr.md @@ -285,71 +285,28 @@ head(teenagers) # Machine Learning -SparkR allows the fitting of generalized linear models over DataFrames using the [glm()](api/R/glm.html) function. Under the hood, SparkR uses MLlib to train a model of the specified family. Currently the gaussian and binomial families are supported. We support a subset of the available R formula operators for model fitting, including '~', '.', ':', '+', and '-'. +SparkR supports the following Machine Learning algorithms. -The [summary()](api/R/summary.html) function gives the summary of a model produced by [glm()](api/R/glm.html). +* Generalized Linear Regression Model [spark.glm()](api/R/glm.html) +* Naive Bayes [spark.naiveBayes()](api/R/naiveBayes.html) +* KMeans [spark.kmeans()](api/R/kmeans.html) +* AFT Survival Regression [spark.survreg()](api/R/survreg.html) -* For gaussian GLM model, it returns a list with 'devianceResiduals' and 'coefficients' components. The 'devianceResiduals' gives the min/max deviance residuals of the estimation; the 'coefficients' gives the estimated coefficients and their estimated standard errors, t values and p-values. (It only available when model fitted by normal solver.) -* For binomial GLM model, it returns a list with 'coefficients' component which gives the estimated coefficients. +Generalized Linear Regression can be used to train a model from a specified family. Currently the Gaussian, Binomial, Poisson and Gamma families are supported. 
We support a subset of the available R formula operators for model fitting, including '~', '.', ':', '+', and '-'. -The examples below show the use of building gaussian GLM model and binomial GLM model using SparkR. +The [summary()](api/R/summary.html) function gives the summary of a model produced by different algorithms listed above. +This summary is same as the result of summary() function in R. -## Gaussian GLM model +## Model persistence -
-{% highlight r %} -# Create the DataFrame -df <- createDataFrame(sqlContext, iris) - -# Fit a gaussian GLM model over the dataset. -model <- glm(Sepal_Length ~ Sepal_Width + Species, data = df, family = "gaussian") - -# Model summary are returned in a similar format to R's native glm(). -summary(model) -##$devianceResiduals -## Min Max -## -1.307112 1.412532 -## -##$coefficients -## Estimate Std. Error t value Pr(>|t|) -##(Intercept) 2.251393 0.3697543 6.08889 9.568102e-09 -##Sepal_Width 0.8035609 0.106339 7.556598 4.187317e-12 -##Species_versicolor 1.458743 0.1121079 13.01195 0 -##Species_virginica 1.946817 0.100015 19.46525 0 - -# Make predictions based on the model. -predictions <- predict(model, newData = df) -head(select(predictions, "Sepal_Length", "prediction")) -## Sepal_Length prediction -##1 5.1 5.063856 -##2 4.9 4.662076 -##3 4.7 4.822788 -##4 4.6 4.742432 -##5 5.0 5.144212 -##6 5.4 5.385281 -{% endhighlight %} -
+* write.ml allows users to save a fitted model in a given input path +* read.ml allows users to read/load the model which was saved using write.ml in a given path -## Binomial GLM model +Model persistence is supported for all Machine Learning algorithms for all families. -
-{% highlight r %} -# Create the DataFrame -df <- createDataFrame(sqlContext, iris) -training <- filter(df, df$Species != "setosa") - -# Fit a binomial GLM model over the dataset. -model <- glm(Species ~ Sepal_Length + Sepal_Width, data = training, family = "binomial") - -# Model coefficients are returned in a similar format to R's native glm(). -summary(model) -##$coefficients -## Estimate -##(Intercept) -13.046005 -##Sepal_Length 1.902373 -##Sepal_Width 0.404655 -{% endhighlight %} -
+The examples below show the use of building Gaussian GLM, NaiveBayes, kMeans and AFTSurvivalReg using SparkR + +{% include_example r/ml.r %} # R Function Name Conflicts From 13530efdf23951b0725a5284a433992e8a3b10a4 Mon Sep 17 00:00:00 2001 From: GayathriMurali Date: Wed, 1 Jun 2016 15:42:14 -0700 Subject: [PATCH 2/4] Fixing example R file --- docs/sparkr.md | 4 ++-- examples/src/main/r/ml.R | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/sparkr.md b/docs/sparkr.md index 29a2df886686d..966c41f41ca23 100644 --- a/docs/sparkr.md +++ b/docs/sparkr.md @@ -304,9 +304,9 @@ This summary is same as the result of summary() function in R. Model persistence is supported for all Machine Learning algorithms for all families. -The examples below show the use of building Gaussian GLM, NaiveBayes, kMeans and AFTSurvivalReg using SparkR +The examples below show the use of building Gaussian GLM, NaiveBayes, kMeans and AFTSurvivalReg models using SparkR -{% include_example r/ml.r %} +{% include_example r/ml.R %} # R Function Name Conflicts diff --git a/examples/src/main/r/ml.R b/examples/src/main/r/ml.R index fd35936635334..7cb747c1fb20b 100644 --- a/examples/src/main/r/ml.R +++ b/examples/src/main/r/ml.R @@ -25,8 +25,9 @@ library(SparkR) sc <- sparkR.init(appName="SparkR-ML-example") sqlContext <- sparkRSQL.init(sc) -############################ spark.glm and glm ############################################## +############################ spark.glm and glm ############################################## +# $example on$ irisDF <- suppressWarnings(createDataFrame(sqlContext, iris)) # Fit a generalized linear model of family "gaussian" with spark.glm gaussianDF <- irisDF @@ -57,7 +58,6 @@ binomialPredictions <- predict(binomialGLM, binomialTestDF) showDF(binomialPredictions) ############################ spark.survreg ############################################## - # Use the ovarian dataset available in R survival package library(survival) @@ -121,7 +121,7 
@@ gaussianGLM <- spark.glm(gaussianDF, Sepal_Length ~ Sepal_Width + Species, famil modelPath <- tempfile(pattern = "ml", fileext = ".tmp") write.ml(gaussianGLM, modelPath) gaussianGLM2 <- read.ml(modelPath) - +# $example off$ # Check model summary summary(gaussianGLM2) From f4516c73cdb7da5578ee611d0ddac42ae6873c4f Mon Sep 17 00:00:00 2001 From: GayathriMurali Date: Sun, 5 Jun 2016 23:16:08 -0700 Subject: [PATCH 3/4] Review comments --- docs/sparkr.md | 18 +++++++++--------- examples/src/main/r/ml.R | 4 ++-- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/docs/sparkr.md b/docs/sparkr.md index 966c41f41ca23..0e4b9c416528f 100644 --- a/docs/sparkr.md +++ b/docs/sparkr.md @@ -287,24 +287,24 @@ head(teenagers) SparkR supports the following Machine Learning algorithms. -* Generalized Linear Regression Model [spark.glm()](api/R/glm.html) -* Naive Bayes [spark.naiveBayes()](api/R/naiveBayes.html) -* KMeans [spark.kmeans()](api/R/kmeans.html) -* AFT Survival Regression [spark.survreg()](api/R/survreg.html) +* Generalized Linear Regression Model [spark.glm()](api/R/spark.glm.html) +* Naive Bayes [spark.naiveBayes()](api/R/spark.naiveBayes.html) +* KMeans [spark.kmeans()](api/R/spark.kmeans.html) +* AFT Survival Regression [spark.survreg()](api/R/spark.survreg.html) -Generalized Linear Regression can be used to train a model from a specified family. Currently the Gaussian, Binomial, Poisson and Gamma families are supported. We support a subset of the available R formula operators for model fitting, including '~', '.', ':', '+', and '-'. +[Generalized Linear Regression](api/R/spark.glm.html) can be used to train a model from a specified family. Currently the Gaussian, Binomial, Poisson and Gamma families are supported. We support a subset of the available R formula operators for model fitting, including '~', '.', ':', '+', and '-'. The [summary()](api/R/summary.html) function gives the summary of a model produced by different algorithms listed above. 
-This summary is same as the result of summary() function in R. +It produces a result similar to that of R's summary() function. ## Model persistence -* write.ml allows users to save a fitted model in a given input path -* read.ml allows users to read/load the model which was saved using write.ml in a given path +* [write.ml](api/R/write.ml.html) allows users to save a fitted model in a given input path +* [read.ml](api/R/read.ml.html) allows users to read/load the model which was saved using write.ml in a given path Model persistence is supported for all Machine Learning algorithms for all families. -The examples below show the use of building Gaussian GLM, NaiveBayes, kMeans and AFTSurvivalReg models using SparkR +The examples below show the use of building glm with Gaussian family,glm with Binomial family, survreg, naiveBayes, kmeans models using SparkR {% include_example r/ml.R %} diff --git a/examples/src/main/r/ml.R b/examples/src/main/r/ml.R index 7cb747c1fb20b..495f392c26542 100644 --- a/examples/src/main/r/ml.R +++ b/examples/src/main/r/ml.R @@ -25,9 +25,9 @@ library(SparkR) sc <- sparkR.init(appName="SparkR-ML-example") sqlContext <- sparkRSQL.init(sc) - -############################ spark.glm and glm ############################################## # $example on$ +############################ spark.glm and glm ############################################## + irisDF <- suppressWarnings(createDataFrame(sqlContext, iris)) # Fit a generalized linear model of family "gaussian" with spark.glm gaussianDF <- irisDF From e773d038460e566394cefa9c183c636bbaba4a4e Mon Sep 17 00:00:00 2001 From: GayathriMurali Date: Wed, 15 Jun 2016 15:53:24 -0700 Subject: [PATCH 4/4] Review comment --- docs/sparkr.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/docs/sparkr.md b/docs/sparkr.md index 0e4b9c416528f..961bd323fabcb 100644 --- a/docs/sparkr.md +++ b/docs/sparkr.md @@ -304,7 +304,11 @@ It produces the similar result compared with R summary 
function. Model persistence is supported for all Machine Learning algorithms for all families. -The examples below show the use of building glm with Gaussian family,glm with Binomial family, survreg, naiveBayes, kmeans models using SparkR +The examples below show how to build several models: +* GLM using the Gaussian and Binomial model families +* AFT survival regression model +* Naive Bayes model +* K-Means model {% include_example r/ml.R %}