
Commit e28f201

Author: ArtRand
Message: merge master
Parents: ba119fe + d3abb36

563 files changed: +13611 −5346 lines


R/pkg/DESCRIPTION

Lines changed: 1 addition & 1 deletion
@@ -2,7 +2,7 @@ Package: SparkR
 Type: Package
 Version: 2.3.0
 Title: R Frontend for Apache Spark
-Description: The SparkR package provides an R Frontend for Apache Spark.
+Description: Provides an R Frontend for Apache Spark.
 Authors@R: c(person("Shivaram", "Venkataraman", role = c("aut", "cre"),
                     email = "[email protected]"),
              person("Xiangrui", "Meng", role = "aut",

R/pkg/R/DataFrame.R

Lines changed: 40 additions & 4 deletions
@@ -2930,7 +2930,7 @@ setMethod("saveAsTable",
             invisible(callJMethod(write, "saveAsTable", tableName))
           })
 
-#' summary
+#' describe
 #'
 #' Computes statistics for numeric and string columns.
 #' If no columns are given, this function computes statistics for all numerical or string columns.
@@ -2941,7 +2941,7 @@ setMethod("saveAsTable",
 #' @return A SparkDataFrame.
 #' @family SparkDataFrame functions
 #' @aliases describe,SparkDataFrame,character-method describe,SparkDataFrame,ANY-method
-#' @rdname summary
+#' @rdname describe
 #' @name describe
 #' @export
 #' @examples
@@ -2953,6 +2953,7 @@ setMethod("saveAsTable",
 #' describe(df, "col1")
 #' describe(df, "col1", "col2")
 #' }
+#' @seealso See \link{summary} for expanded statistics and control over which statistics to compute.
 #' @note describe(SparkDataFrame, character) since 1.4.0
 setMethod("describe",
           signature(x = "SparkDataFrame", col = "character"),
@@ -2962,7 +2963,7 @@ setMethod("describe",
             dataFrame(sdf)
           })
 
-#' @rdname summary
+#' @rdname describe
 #' @name describe
 #' @aliases describe,SparkDataFrame-method
 #' @note describe(SparkDataFrame) since 1.4.0
@@ -2973,15 +2974,50 @@ setMethod("describe",
             dataFrame(sdf)
           })
 
+#' summary
+#'
+#' Computes specified statistics for numeric and string columns. Available statistics are:
+#' \itemize{
+#' \item count
+#' \item mean
+#' \item stddev
+#' \item min
+#' \item max
+#' \item arbitrary approximate percentiles specified as a percentage (eg, "75%")
+#' }
+#' If no statistics are given, this function computes count, mean, stddev, min,
+#' approximate quartiles (percentiles at 25%, 50%, and 75%), and max.
+#' This function is meant for exploratory data analysis, as we make no guarantee about the
+#' backward compatibility of the schema of the resulting Dataset. If you want to
+#' programmatically compute summary statistics, use the \code{agg} function instead.
+#'
+#'
 #' @param object a SparkDataFrame to be summarized.
+#' @param ... (optional) statistics to be computed for all columns.
+#' @return A SparkDataFrame.
+#' @family SparkDataFrame functions
 #' @rdname summary
 #' @name summary
 #' @aliases summary,SparkDataFrame-method
+#' @export
+#' @examples
+#'\dontrun{
+#' sparkR.session()
+#' path <- "path/to/file.json"
+#' df <- read.json(path)
+#' summary(df)
+#' summary(df, "min", "25%", "75%", "max")
+#' summary(select(df, "age", "height"))
+#' }
 #' @note summary(SparkDataFrame) since 1.5.0
+#' @note The statistics provided by \code{summary} were change in 2.3.0 use \link{describe} for previous defaults.
+#' @seealso \link{describe}
 setMethod("summary",
           signature(object = "SparkDataFrame"),
           function(object, ...) {
-            describe(object)
+            statisticsList <- list(...)
+            sdf <- callJMethod(object@sdf, "summary", statisticsList)
+            dataFrame(sdf)
           })
 
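A brief usage sketch of the behaviour this hunk introduces, assuming an active SparkR session; the data frame below is only illustrative of the two call shapes:

library(SparkR)
sparkR.session()
df <- createDataFrame(faithful)

collect(describe(df))                              # count, mean, stddev, min, max (defaults unchanged)
collect(summary(df))                               # 2.3.0 default adds the 25%, 50%, 75% rows
collect(summary(df, "min", "25%", "75%", "max"))   # caller-selected statistics, as in the new @examples

With this change, summary() no longer delegates to describe(); it forwards the requested statistics to the JVM-side summary method, which is why the two functions now document different defaults.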

R/pkg/R/generics.R

Lines changed: 1 addition & 1 deletion
@@ -521,7 +521,7 @@ setGeneric("gapplyCollect", function(x, ...) { standardGeneric("gapplyCollect")
 # @export
 setGeneric("getNumPartitions", function(x) { standardGeneric("getNumPartitions") })
 
-#' @rdname summary
+#' @rdname describe
 #' @export
 setGeneric("describe", function(x, col, ...) { standardGeneric("describe") })
 
R/pkg/R/install.R

Lines changed: 5 additions & 1 deletion
@@ -270,7 +270,11 @@ sparkCachePath <- function() {
   if (is_windows()) {
     winAppPath <- Sys.getenv("LOCALAPPDATA", unset = NA)
     if (is.na(winAppPath)) {
-      stop(paste("%LOCALAPPDATA% not found.",
+      message("%LOCALAPPDATA% not found. Falling back to %USERPROFILE%.")
+      winAppPath <- Sys.getenv("USERPROFILE", unset = NA)
+    }
+    if (is.na(winAppPath)) {
+      stop(paste("%LOCALAPPDATA% and %USERPROFILE% not found.",
                  "Please define the environment variable",
                  "or restart and enter an installation path in localDir."))
     } else {
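The net effect of this hunk: on Windows, the cache-path lookup no longer fails outright when %LOCALAPPDATA% is unset; it first falls back to %USERPROFILE% and only stops when both are missing. A minimal standalone sketch of that resolution order (illustrative helper name, not the package function itself):

resolveWinAppBase <- function() {
  winAppPath <- Sys.getenv("LOCALAPPDATA", unset = NA)
  if (is.na(winAppPath)) {
    message("%LOCALAPPDATA% not found. Falling back to %USERPROFILE%.")
    winAppPath <- Sys.getenv("USERPROFILE", unset = NA)
  }
  if (is.na(winAppPath)) {
    stop("%LOCALAPPDATA% and %USERPROFILE% not found.")
  }
  winAppPath  # base directory under which the SparkR download cache would be placed
}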

R/pkg/R/mllib_regression.R

Lines changed: 18 additions & 4 deletions
@@ -76,6 +76,8 @@ setClass("IsotonicRegressionModel", representation(jobj = "jobj"))
 #'   "frequencyDesc", "frequencyAsc", "alphabetDesc", and "alphabetAsc".
 #'   The default value is "frequencyDesc". When the ordering is set to
 #'   "alphabetDesc", this drops the same category as R when encoding strings.
+#' @param offsetCol the offset column name. If this is not set or empty, we treat all instance offsets
+#'                  as 0.0. The feature specified as offset has a constant coefficient of 1.0.
 #' @param ... additional arguments passed to the method.
 #' @aliases spark.glm,SparkDataFrame,formula-method
 #' @return \code{spark.glm} returns a fitted generalized linear model.
@@ -127,7 +129,8 @@ setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"),
           function(data, formula, family = gaussian, tol = 1e-6, maxIter = 25, weightCol = NULL,
                    regParam = 0.0, var.power = 0.0, link.power = 1.0 - var.power,
                    stringIndexerOrderType = c("frequencyDesc", "frequencyAsc",
-                                              "alphabetDesc", "alphabetAsc")) {
+                                              "alphabetDesc", "alphabetAsc"),
+                   offsetCol = NULL) {
 
             stringIndexerOrderType <- match.arg(stringIndexerOrderType)
             if (is.character(family)) {
@@ -159,12 +162,19 @@ setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"),
               weightCol <- as.character(weightCol)
             }
 
+            if (!is.null(offsetCol)) {
+              offsetCol <- as.character(offsetCol)
+              if (nchar(offsetCol) == 0) {
+                offsetCol <- NULL
+              }
+            }
+
             # For known families, Gamma is upper-cased
             jobj <- callJStatic("org.apache.spark.ml.r.GeneralizedLinearRegressionWrapper",
                                 "fit", formula, data@sdf, tolower(family$family), family$link,
                                 tol, as.integer(maxIter), weightCol, regParam,
                                 as.double(var.power), as.double(link.power),
-                                stringIndexerOrderType)
+                                stringIndexerOrderType, offsetCol)
             new("GeneralizedLinearRegressionModel", jobj = jobj)
           })
 
@@ -192,6 +202,8 @@ setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"),
 #'   "frequencyDesc", "frequencyAsc", "alphabetDesc", and "alphabetAsc".
 #'   The default value is "frequencyDesc". When the ordering is set to
 #'   "alphabetDesc", this drops the same category as R when encoding strings.
+#' @param offsetCol the offset column name. If this is not set or empty, we treat all instance offsets
+#'                  as 0.0. The feature specified as offset has a constant coefficient of 1.0.
 #' @return \code{glm} returns a fitted generalized linear model.
 #' @rdname glm
 #' @export
@@ -209,10 +221,12 @@ setMethod("glm", signature(formula = "formula", family = "ANY", data = "SparkDat
           function(formula, family = gaussian, data, epsilon = 1e-6, maxit = 25, weightCol = NULL,
                    var.power = 0.0, link.power = 1.0 - var.power,
                    stringIndexerOrderType = c("frequencyDesc", "frequencyAsc",
-                                              "alphabetDesc", "alphabetAsc")) {
+                                              "alphabetDesc", "alphabetAsc"),
+                   offsetCol = NULL) {
             spark.glm(data, formula, family, tol = epsilon, maxIter = maxit, weightCol = weightCol,
                       var.power = var.power, link.power = link.power,
-                      stringIndexerOrderType = stringIndexerOrderType)
+                      stringIndexerOrderType = stringIndexerOrderType,
+                      offsetCol = offsetCol)
           })
 
 # Returns the summary of a model produced by glm() or spark.glm(), similarly to R's summary().
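For reference, a hedged usage sketch of the new offsetCol argument, mirroring the regression test added below: the named column enters the linear predictor with its coefficient fixed at 1.0, the usual way to model exposure in Poisson regressions. Column names follow the iris-derived test data, and an active SparkR session is assumed.

training <- suppressWarnings(createDataFrame(iris))
model <- spark.glm(training, Sepal_Width ~ Sepal_Length + Species,
                   family = poisson(), offsetCol = "Petal_Length")
summary(model)  # coefficients should track stats::glm(..., offset = iris$Petal.Length)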

R/pkg/tests/fulltests/test_mllib_regression.R

Lines changed: 8 additions & 0 deletions
@@ -173,6 +173,14 @@ test_that("spark.glm summary", {
   expect_equal(stats$df.residual, rStats$df.residual)
   expect_equal(stats$aic, rStats$aic)
 
+  # Test spark.glm works with offset
+  training <- suppressWarnings(createDataFrame(iris))
+  stats <- summary(spark.glm(training, Sepal_Width ~ Sepal_Length + Species,
+                             family = poisson(), offsetCol = "Petal_Length"))
+  rStats <- suppressWarnings(summary(glm(Sepal.Width ~ Sepal.Length + Species,
+                                         data = iris, family = poisson(), offset = iris$Petal.Length)))
+  expect_true(all(abs(rStats$coefficients - stats$coefficients) < 1e-3))
+
   # Test summary works on base GLM models
   baseModel <- stats::glm(Sepal.Width ~ Sepal.Length + Species, data = iris)
   baseSummary <- summary(baseModel)

R/pkg/tests/fulltests/test_sparkSQL.R

Lines changed: 13 additions & 6 deletions
@@ -2497,7 +2497,7 @@ test_that("read/write text files - compression option", {
   unlink(textPath)
 })
 
-test_that("describe() and summarize() on a DataFrame", {
+test_that("describe() and summary() on a DataFrame", {
   df <- read.json(jsonPath)
   stats <- describe(df, "age")
   expect_equal(collect(stats)[1, "summary"], "count")
@@ -2508,8 +2508,15 @@ test_that("describe() and summary() on a DataFrame", {
   expect_equal(collect(stats)[5, "age"], "30")
 
   stats2 <- summary(df)
-  expect_equal(collect(stats2)[4, "summary"], "min")
-  expect_equal(collect(stats2)[5, "age"], "30")
+  expect_equal(collect(stats2)[5, "summary"], "25%")
+  expect_equal(collect(stats2)[5, "age"], "30.0")
+
+  stats3 <- summary(df, "min", "max", "55.1%")
+
+  expect_equal(collect(stats3)[1, "summary"], "min")
+  expect_equal(collect(stats3)[2, "summary"], "max")
+  expect_equal(collect(stats3)[3, "summary"], "55.1%")
+  expect_equal(collect(stats3)[3, "age"], "30.0")
 
   # SPARK-16425: SparkR summary() fails on column of type logical
   df <- withColumn(df, "boolean", df$age == 30)
@@ -2742,15 +2749,15 @@ test_that("attach() on a DataFrame", {
   expected_age <- data.frame(age = c(NA, 30, 19))
   expect_equal(head(age), expected_age)
   stat <- summary(age)
-  expect_equal(collect(stat)[5, "age"], "30")
+  expect_equal(collect(stat)[8, "age"], "30")
   age <- age$age + 1
   expect_is(age, "Column")
   rm(age)
   stat2 <- summary(age)
-  expect_equal(collect(stat2)[5, "age"], "30")
+  expect_equal(collect(stat2)[8, "age"], "30")
   detach("df")
   stat3 <- summary(df[, "age", drop = F])
-  expect_equal(collect(stat3)[5, "age"], "30")
+  expect_equal(collect(stat3)[8, "age"], "30")
   expect_error(age)
 })
 
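The attach() assertions move from row 5 to row 8 because the default summary() output now carries eight statistics rows instead of five. A quick sketch of the new layout, assuming a SparkDataFrame df with an age column as in these tests:

collect(summary(df))$summary
# expected order: "count" "mean" "stddev" "min" "25%" "50%" "75%" "max"
# hence the max row, previously row 5, is now row 8, and the percentile rows report
# values such as "30.0", which is what the updated assertions check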

R/pkg/vignettes/sparkr-vignettes.Rmd

Lines changed: 11 additions & 0 deletions
@@ -27,6 +27,17 @@ vignette: >
 limitations under the License.
 -->
 
+```{r setup, include=FALSE}
+library(knitr)
+opts_hooks$set(eval = function(options) {
+  # override eval to FALSE only on windows
+  if (.Platform$OS.type == "windows") {
+    options$eval = FALSE
+  }
+  options
+})
+```
+
 ## Overview
 
 SparkR is an R package that provides a light-weight frontend to use Apache Spark from R. With Spark `r packageVersion("SparkR")`, SparkR provides a distributed data frame implementation that supports data processing operations like selection, filtering, aggregation etc. and distributed machine learning using [MLlib](http://spark.apache.org/mllib/).

assembly/pom.xml

Lines changed: 6 additions & 0 deletions
@@ -220,6 +220,12 @@
         <hive.deps.scope>provided</hive.deps.scope>
       </properties>
     </profile>
+    <profile>
+      <id>orc-provided</id>
+      <properties>
+        <orc.deps.scope>provided</orc.deps.scope>
+      </properties>
+    </profile>
     <profile>
       <id>parquet-provided</id>
       <properties>

common/kvstore/pom.xml

Lines changed: 5 additions & 0 deletions
@@ -35,6 +35,11 @@
   </properties>
 
   <dependencies>
+    <dependency>
+      <groupId>org.apache.spark</groupId>
+      <artifactId>spark-tags_${scala.binary.version}</artifactId>
+    </dependency>
+
     <dependency>
       <groupId>com.google.guava</groupId>
       <artifactId>guava</artifactId>
