Commit 2fb4e3d

merge with master
2 parents: fd4fc11 + 62b7f30

498 files changed: 10033 additions & 5634 deletions

LICENSE

Lines changed: 2 additions & 3 deletions
@@ -257,9 +257,8 @@ The text of each license is also included at licenses/LICENSE-[project].txt.
      (BSD-style) scalacheck (org.scalacheck:scalacheck_2.11:1.10.0 - http://www.scalacheck.org)
      (BSD-style) spire (org.spire-math:spire_2.11:0.7.1 - http://spire-math.org)
      (BSD-style) spire-macros (org.spire-math:spire-macros_2.11:0.7.1 - http://spire-math.org)
-     (New BSD License) Kryo (com.esotericsoftware.kryo:kryo:2.21 - http://code.google.com/p/kryo/)
-     (New BSD License) MinLog (com.esotericsoftware.minlog:minlog:1.2 - http://code.google.com/p/minlog/)
-     (New BSD License) ReflectASM (com.esotericsoftware.reflectasm:reflectasm:1.07 - http://code.google.com/p/reflectasm/)
+     (New BSD License) Kryo (com.esotericsoftware:kryo:3.0.3 - https://github.com/EsotericSoftware/kryo)
+     (New BSD License) MinLog (com.esotericsoftware:minlog:1.3.0 - https://github.com/EsotericSoftware/minlog)
      (New BSD license) Protocol Buffer Java API (com.google.protobuf:protobuf-java:2.5.0 - http://code.google.com/p/protobuf)
      (New BSD license) Protocol Buffer Java API (org.spark-project.protobuf:protobuf-java:2.4.1-shaded - http://code.google.com/p/protobuf)
      (The BSD License) Fortran to Java ARPACK (net.sourceforge.f2j:arpack_combined_all:0.1 - http://f2j.sourceforge.net)

R/pkg/R/mllib.R

Lines changed: 61 additions & 78 deletions
@@ -17,10 +17,10 @@
 
 # mllib.R: Provides methods for MLlib integration
 
-#' @title S4 class that represents a PipelineModel
-#' @param model A Java object reference to the backing Scala PipelineModel
+#' @title S4 class that represents a generalized linear model
+#' @param jobj a Java object reference to the backing Scala GeneralizedLinearRegressionWrapper
 #' @export
-setClass("PipelineModel", representation(model = "jobj"))
+setClass("GeneralizedLinearRegressionModel", representation(jobj = "jobj"))
 
 #' @title S4 class that represents a NaiveBayesModel
 #' @param jobj a Java object reference to the backing Scala NaiveBayesWrapper
@@ -39,21 +39,18 @@ setClass("KMeansModel", representation(jobj = "jobj"))
 
 #' Fits a generalized linear model
 #'
-#' Fits a generalized linear model, similarly to R's glm(). Also see the glmnet package.
+#' Fits a generalized linear model, similarly to R's glm().
 #'
 #' @param formula A symbolic description of the model to be fitted. Currently only a few formula
 #'                operators are supported, including '~', '.', ':', '+', and '-'.
-#' @param data DataFrame for training
-#' @param family Error distribution. "gaussian" -> linear regression, "binomial" -> logistic reg.
-#' @param lambda Regularization parameter
-#' @param alpha Elastic-net mixing parameter (see glmnet's documentation for details)
-#' @param standardize Whether to standardize features before training
-#' @param solver The solver algorithm used for optimization, this can be "l-bfgs", "normal" and
-#'               "auto". "l-bfgs" denotes Limited-memory BFGS which is a limited-memory
-#'               quasi-Newton optimization method. "normal" denotes using Normal Equation as an
-#'               analytical solution to the linear regression problem. The default value is "auto"
-#'               which means that the solver algorithm is selected automatically.
-#' @return a fitted MLlib model
+#' @param data DataFrame for training.
+#' @param family A description of the error distribution and link function to be used in the model.
+#'               This can be a character string naming a family function, a family function or
+#'               the result of a call to a family function. Refer R family at
+#'               \url{https://stat.ethz.ch/R-manual/R-devel/library/stats/html/family.html}.
+#' @param epsilon Positive convergence tolerance of iterations.
+#' @param maxit Integer giving the maximal number of IRLS iterations.
+#' @return a fitted generalized linear model
 #' @rdname glm
 #' @export
 #' @examples
@@ -64,36 +61,70 @@ setClass("KMeansModel", representation(jobj = "jobj"))
 #' df <- createDataFrame(sqlContext, iris)
 #' model <- glm(Sepal_Length ~ Sepal_Width, df, family="gaussian")
 #' summary(model)
-#'}
+#' }
 setMethod("glm", signature(formula = "formula", family = "ANY", data = "DataFrame"),
-          function(formula, family = c("gaussian", "binomial"), data, lambda = 0, alpha = 0,
-                   standardize = TRUE, solver = "auto") {
-            family <- match.arg(family)
+          function(formula, family = gaussian, data, epsilon = 1e-06, maxit = 25) {
+            if (is.character(family)) {
+              family <- get(family, mode = "function", envir = parent.frame())
+            }
+            if (is.function(family)) {
+              family <- family()
+            }
+            if (is.null(family$family)) {
+              print(family)
+              stop("'family' not recognized")
+            }
+
             formula <- paste(deparse(formula), collapse = "")
-            model <- callJStatic("org.apache.spark.ml.api.r.SparkRWrappers",
-                                 "fitRModelFormula", formula, data@sdf, family, lambda,
-                                 alpha, standardize, solver)
-            return(new("PipelineModel", model = model))
+
+            jobj <- callJStatic("org.apache.spark.ml.r.GeneralizedLinearRegressionWrapper",
+                                "fit", formula, data@sdf, family$family, family$link,
+                                epsilon, as.integer(maxit))
+            return(new("GeneralizedLinearRegressionModel", jobj = jobj))
          })
 
-#' Make predictions from a model
+#' Get the summary of a generalized linear model
 #'
-#' Makes predictions from a model produced by glm(), similarly to R's predict().
+#' Returns the summary of a model produced by glm(), similarly to R's summary().
 #'
-#' @param object A fitted MLlib model
+#' @param object A fitted generalized linear model
+#' @return coefficients the model's coefficients, intercept
+#' @rdname summary
+#' @export
+#' @examples
+#' \dontrun{
+#' model <- glm(y ~ x, trainingData)
+#' summary(model)
+#' }
+setMethod("summary", signature(object = "GeneralizedLinearRegressionModel"),
+          function(object, ...) {
+            jobj <- object@jobj
+            features <- callJMethod(jobj, "rFeatures")
+            coefficients <- callJMethod(jobj, "rCoefficients")
+            coefficients <- as.matrix(unlist(coefficients))
+            colnames(coefficients) <- c("Estimate")
+            rownames(coefficients) <- unlist(features)
+            return(list(coefficients = coefficients))
+          })
+
+#' Make predictions from a generalized linear model
+#'
+#' Makes predictions from a generalized linear model produced by glm(), similarly to R's predict().
+#'
+#' @param object A fitted generalized linear model
 #' @param newData DataFrame for testing
-#' @return DataFrame containing predicted values
+#' @return DataFrame containing predicted labels in a column named "prediction"
 #' @rdname predict
 #' @export
 #' @examples
 #' \dontrun{
 #' model <- glm(y ~ x, trainingData)
 #' predicted <- predict(model, testData)
 #' showDF(predicted)
-#'}
-setMethod("predict", signature(object = "PipelineModel"),
+#' }
setMethod("predict", signature(object = "GeneralizedLinearRegressionModel"),
          function(object, newData) {
-            return(dataFrame(callJMethod(object@model, "transform", newData@sdf)))
+            return(dataFrame(callJMethod(object@jobj, "transform", newData@sdf)))
          })
 
 #' Make predictions from a naive Bayes model
@@ -116,54 +147,6 @@ setMethod("predict", signature(object = "NaiveBayesModel"),
             return(dataFrame(callJMethod(object@jobj, "transform", newData@sdf)))
           })
 
-#' Get the summary of a model
-#'
-#' Returns the summary of a model produced by glm(), similarly to R's summary().
-#'
-#' @param object A fitted MLlib model
-#' @return a list with 'devianceResiduals' and 'coefficients' components for gaussian family
-#'         or a list with 'coefficients' component for binomial family. \cr
-#'         For gaussian family: the 'devianceResiduals' gives the min/max deviance residuals
-#'         of the estimation, the 'coefficients' gives the estimated coefficients and their
-#'         estimated standard errors, t values and p-values. (It only available when model
-#'         fitted by normal solver.) \cr
-#'         For binomial family: the 'coefficients' gives the estimated coefficients.
-#'         See summary.glm for more information. \cr
-#' @rdname summary
-#' @export
-#' @examples
-#' \dontrun{
-#' model <- glm(y ~ x, trainingData)
-#' summary(model)
-#'}
-setMethod("summary", signature(object = "PipelineModel"),
-          function(object, ...) {
-            modelName <- callJStatic("org.apache.spark.ml.api.r.SparkRWrappers",
-                                     "getModelName", object@model)
-            features <- callJStatic("org.apache.spark.ml.api.r.SparkRWrappers",
-                                    "getModelFeatures", object@model)
-            coefficients <- callJStatic("org.apache.spark.ml.api.r.SparkRWrappers",
-                                        "getModelCoefficients", object@model)
-            if (modelName == "LinearRegressionModel") {
-              devianceResiduals <- callJStatic("org.apache.spark.ml.api.r.SparkRWrappers",
-                                               "getModelDevianceResiduals", object@model)
-              devianceResiduals <- matrix(devianceResiduals, nrow = 1)
-              colnames(devianceResiduals) <- c("Min", "Max")
-              rownames(devianceResiduals) <- rep("", times = 1)
-              coefficients <- matrix(coefficients, ncol = 4)
-              colnames(coefficients) <- c("Estimate", "Std. Error", "t value", "Pr(>|t|)")
-              rownames(coefficients) <- unlist(features)
-              return(list(devianceResiduals = devianceResiduals, coefficients = coefficients))
-            } else if (modelName == "LogisticRegressionModel") {
-              coefficients <- as.matrix(unlist(coefficients))
-              colnames(coefficients) <- c("Estimate")
-              rownames(coefficients) <- unlist(features)
-              return(list(coefficients = coefficients))
-            } else {
-              stop(paste("Unsupported model", modelName, sep = " "))
-            }
-          })
-
 #' Get the summary of a naive Bayes model
 #'
 #' Returns the summary of a naive Bayes model produced by naiveBayes(), similarly to R's summary().

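For orientation, here is a minimal sketch of how the reworked glm() API above would be exercised end to end. It assumes a running SparkR session with a sqlContext and uses the iris columns from the roxygen example; the shape of the summary() result (a single-column "Estimate" coefficients matrix) and of predict() (a DataFrame with a "prediction" column) follow from the new methods in this diff.

    df <- createDataFrame(sqlContext, iris)

    # 'family' now accepts a string, a family function, or a family object,
    # mirroring stats::glm(); epsilon and maxit control the IRLS fit.
    model <- glm(Sepal_Length ~ Sepal_Width + Species, data = df,
                 family = gaussian, epsilon = 1e-06, maxit = 25)

    # summary() returns a list whose 'coefficients' matrix has one "Estimate" column.
    stats <- summary(model)
    print(stats$coefficients)

    # predict() returns a DataFrame with the predicted values in a "prediction" column.
    predicted <- predict(model, df)
    showDF(select(predicted, "prediction"))
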
R/pkg/inst/tests/testthat/test_mllib.R

Lines changed: 29 additions & 66 deletions
@@ -25,20 +25,21 @@ sc <- sparkR.init()
 
 sqlContext <- sparkRSQL.init(sc)
 
-test_that("glm and predict", {
+test_that("formula of glm", {
   training <- suppressWarnings(createDataFrame(sqlContext, iris))
-  test <- select(training, "Sepal_Length")
-  model <- glm(Sepal_Width ~ Sepal_Length, training, family = "gaussian")
-  prediction <- predict(model, test)
-  expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "double")
+  # dot minus and intercept vs native glm
+  model <- glm(Sepal_Width ~ . - Species + 0, data = training)
+  vals <- collect(select(predict(model, training), "prediction"))
+  rVals <- predict(glm(Sepal.Width ~ . - Species + 0, data = iris), iris)
+  expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals)
 
-  # Test stats::predict is working
-  x <- rnorm(15)
-  y <- x + rnorm(15)
-  expect_equal(length(predict(lm(y ~ x))), 15)
-})
+  # feature interaction vs native glm
+  model <- glm(Sepal_Width ~ Species:Sepal_Length, data = training)
+  vals <- collect(select(predict(model, training), "prediction"))
+  rVals <- predict(glm(Sepal.Width ~ Species:Sepal.Length, data = iris), iris)
+  expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals)
 
-test_that("glm should work with long formula", {
+  # glm should work with long formula
   training <- suppressWarnings(createDataFrame(sqlContext, iris))
   training$LongLongLongLongLongName <- training$Sepal_Width
   training$VeryLongLongLongLonLongName <- training$Sepal_Length
@@ -50,68 +51,30 @@ test_that("glm should work with long formula", {
   expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals)
 })
 
-test_that("predictions match with native glm", {
+test_that("glm and predict", {
   training <- suppressWarnings(createDataFrame(sqlContext, iris))
+  # gaussian family
   model <- glm(Sepal_Width ~ Sepal_Length + Species, data = training)
-  vals <- collect(select(predict(model, training), "prediction"))
+  prediction <- predict(model, training)
+  expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "double")
+  vals <- collect(select(prediction, "prediction"))
   rVals <- predict(glm(Sepal.Width ~ Sepal.Length + Species, data = iris), iris)
   expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals)
-})
-
-test_that("dot minus and intercept vs native glm", {
-  training <- suppressWarnings(createDataFrame(sqlContext, iris))
-  model <- glm(Sepal_Width ~ . - Species + 0, data = training)
-  vals <- collect(select(predict(model, training), "prediction"))
-  rVals <- predict(glm(Sepal.Width ~ . - Species + 0, data = iris), iris)
-  expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals)
-})
 
-test_that("feature interaction vs native glm", {
-  training <- suppressWarnings(createDataFrame(sqlContext, iris))
-  model <- glm(Sepal_Width ~ Species:Sepal_Length, data = training)
-  vals <- collect(select(predict(model, training), "prediction"))
-  rVals <- predict(glm(Sepal.Width ~ Species:Sepal.Length, data = iris), iris)
+  # poisson family
+  model <- glm(Sepal_Width ~ Sepal_Length + Species, data = training,
+               family = poisson(link = identity))
+  prediction <- predict(model, training)
+  expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "double")
+  vals <- collect(select(prediction, "prediction"))
+  rVals <- suppressWarnings(predict(glm(Sepal.Width ~ Sepal.Length + Species,
+                                        data = iris, family = poisson(link = identity)), iris))
   expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals)
-})
 
-test_that("summary coefficients match with native glm", {
-  training <- suppressWarnings(createDataFrame(sqlContext, iris))
-  stats <- summary(glm(Sepal_Width ~ Sepal_Length + Species, data = training, solver = "normal"))
-  coefs <- unlist(stats$coefficients)
-  devianceResiduals <- unlist(stats$devianceResiduals)
-
-  rStats <- summary(glm(Sepal.Width ~ Sepal.Length + Species, data = iris))
-  rCoefs <- unlist(rStats$coefficients)
-  rDevianceResiduals <- c(-0.95096, 0.72918)
-
-  expect_true(all(abs(rCoefs - coefs) < 1e-5))
-  expect_true(all(abs(rDevianceResiduals - devianceResiduals) < 1e-5))
-  expect_true(all(
-    rownames(stats$coefficients) ==
-    c("(Intercept)", "Sepal_Length", "Species_versicolor", "Species_virginica")))
-})
-
-test_that("summary coefficients match with native glm of family 'binomial'", {
-  df <- suppressWarnings(createDataFrame(sqlContext, iris))
-  training <- filter(df, df$Species != "setosa")
-  stats <- summary(glm(Species ~ Sepal_Length + Sepal_Width, data = training,
-                       family = "binomial"))
-  coefs <- as.vector(stats$coefficients[, 1])
-
-  rTraining <- iris[iris$Species %in% c("versicolor", "virginica"), ]
-  rCoefs <- as.vector(coef(glm(Species ~ Sepal.Length + Sepal.Width, data = rTraining,
-                               family = binomial(link = "logit"))))
-
-  expect_true(all(abs(rCoefs - coefs) < 1e-4))
-  expect_true(all(
-    rownames(stats$coefficients) ==
-    c("(Intercept)", "Sepal_Length", "Sepal_Width")))
-})
-
-test_that("summary works on base GLM models", {
-  baseModel <- stats::glm(Sepal.Width ~ Sepal.Length + Species, data = iris)
-  baseSummary <- summary(baseModel)
-  expect_true(abs(baseSummary$deviance - 12.19313) < 1e-4)
+  # Test stats::predict is working
+  x <- rnorm(15)
+  y <- x + rnorm(15)
+  expect_equal(length(predict(lm(y ~ x))), 15)
 })
 
 test_that("kmeans", {

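As an aside on the family handling these tests exercise (for example poisson(link = identity)), the normalization chain added to glm() in mllib.R above follows the same idiom base R uses. The snippet below is a hypothetical, Spark-free illustration of why a string, a bare family function, or a family object all end up as the same kind of object; normalizeFamily is an invented helper name, not part of SparkR.

    # Hypothetical stand-alone version of the family handling used by the new SparkR glm().
    normalizeFamily <- function(family) {
      if (is.character(family)) {
        # e.g. "poisson" -> the stats::poisson function
        family <- get(family, mode = "function", envir = parent.frame())
      }
      if (is.function(family)) {
        # e.g. poisson -> poisson(), which builds a family object
        family <- family()
      }
      if (is.null(family$family)) {
        stop("'family' not recognized")
      }
      family
    }

    normalizeFamily("gaussian")$family              # "gaussian"
    normalizeFamily(poisson)$link                   # "log" (default link)
    normalizeFamily(poisson(link = identity))$link  # "identity"
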
R/pkg/inst/tests/testthat/test_sparkSQL.R

Lines changed: 1 addition & 1 deletion
@@ -1853,7 +1853,7 @@ test_that("approxQuantile() on a DataFrame", {
 
 test_that("SQL error message is returned from JVM", {
   retError <- tryCatch(sql(sqlContext, "select * from blah"), error = function(e) e)
-  expect_equal(grepl("Table not found", retError), TRUE)
+  expect_equal(grepl("Table or View not found", retError), TRUE)
   expect_equal(grepl("blah", retError), TRUE)
 })

bin/spark-class

Lines changed: 1 addition & 1 deletion
@@ -44,7 +44,7 @@ fi
 
 if [ ! -d "$SPARK_JARS_DIR" ] && [ -z "$SPARK_TESTING$SPARK_SQL_TESTING" ]; then
   echo "Failed to find Spark jars directory ($SPARK_JARS_DIR)." 1>&2
-  echo "You need to build Spark before running this program." 1>&2
+  echo "You need to build Spark with the target \"package\" before running this program." 1>&2
   exit 1
 else
   LAUNCH_CLASSPATH="$SPARK_JARS_DIR/*"

build/mvn

Lines changed: 7 additions & 3 deletions
@@ -70,9 +70,10 @@ install_app() {
 # Install maven under the build/ folder
 install_mvn() {
   local MVN_VERSION="3.3.9"
+  local APACHE_MIRROR=${APACHE_MIRROR:-'https://www.apache.org/dyn/closer.lua?action=download&filename='}
 
   install_app \
-    "https://archive.apache.org/dist/maven/maven-3/${MVN_VERSION}/binaries" \
+    "${APACHE_MIRROR}/maven/maven-3/${MVN_VERSION}/binaries" \
     "apache-maven-${MVN_VERSION}-bin.tar.gz" \
     "apache-maven-${MVN_VERSION}/bin/mvn"
 
@@ -83,8 +84,10 @@ install_mvn() {
 install_zinc() {
   local zinc_path="zinc-0.3.9/bin/zinc"
   [ ! -f "${_DIR}/${zinc_path}" ] && ZINC_INSTALL_FLAG=1
+  local TYPESAFE_MIRROR=${TYPESAFE_MIRROR:-https://downloads.typesafe.com}
+
   install_app \
-    "https://downloads.typesafe.com/zinc/0.3.9" \
+    "${TYPESAFE_MIRROR}/zinc/0.3.9" \
     "zinc-0.3.9.tgz" \
     "${zinc_path}"
   ZINC_BIN="${_DIR}/${zinc_path}"
@@ -98,9 +101,10 @@ install_scala() {
   local scala_version=`grep "scala.version" "${_DIR}/../pom.xml" | \
                        head -1 | cut -f2 -d'>' | cut -f1 -d'<'`
   local scala_bin="${_DIR}/scala-${scala_version}/bin/scala"
+  local TYPESAFE_MIRROR=${TYPESAFE_MIRROR:-https://downloads.typesafe.com}
 
   install_app \
-    "https://downloads.typesafe.com/scala/${scala_version}" \
+    "${TYPESAFE_MIRROR}/scala/${scala_version}" \
     "scala-${scala_version}.tgz" \
     "scala-${scala_version}/bin/scala"

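Because each of these new mirror locals uses the ${VAR:-default} form, the download hosts can presumably be overridden from the environment when invoking build/mvn (by exporting APACHE_MIRROR or TYPESAFE_MIRROR to point at a closer mirror). With no override, zinc and scala still come from downloads.typesafe.com, while the maven download now goes through the Apache closer.lua redirector instead of archive.apache.org.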