Commit 4a5890d

Merge pull request apache-spark-on-k8s#178 from palantir/rk/merge-again
2 parents 9c3e14d + b3f45db commit 4a5890d

183 files changed, +2986 -1390 lines


R/pkg/R/DataFrame.R

Lines changed: 24 additions & 0 deletions
@@ -3745,3 +3745,27 @@ setMethod("hint",
             jdf <- callJMethod(x@sdf, "hint", name, parameters)
             dataFrame(jdf)
           })
+
+#' alias
+#'
+#' @aliases alias,SparkDataFrame-method
+#' @family SparkDataFrame functions
+#' @rdname alias
+#' @name alias
+#' @export
+#' @examples
+#' \dontrun{
+#' df <- alias(createDataFrame(mtcars), "mtcars")
+#' avg_mpg <- alias(agg(groupBy(df, df$cyl), avg(df$mpg)), "avg_mpg")
+#'
+#' head(select(df, column("mtcars.mpg")))
+#' head(join(df, avg_mpg, column("mtcars.cyl") == column("avg_mpg.cyl")))
+#' }
+#' @note alias(SparkDataFrame) since 2.3.0
+setMethod("alias",
+          signature(object = "SparkDataFrame"),
+          function(object, data) {
+            stopifnot(is.character(data))
+            sdf <- callJMethod(object@sdf, "alias", data)
+            dataFrame(sdf)
+          })
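
Restated outside the roxygen markup, the \dontrun example above amounts to the following session sketch (assumes an active SparkR session started with sparkR.session(); object names are illustrative):

  df <- alias(createDataFrame(mtcars), "mtcars")
  avg_mpg <- alias(agg(groupBy(df, df$cyl), avg(df$mpg)), "avg_mpg")

  # Columns can now be qualified by the alias, much like SQL "AS"
  head(select(df, column("mtcars.mpg")))
  head(join(df, avg_mpg, column("mtcars.cyl") == column("avg_mpg.cyl")))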

R/pkg/R/column.R

Lines changed: 8 additions & 8 deletions
@@ -130,19 +130,19 @@ createMethods <- function() {
 
 createMethods()
 
-#' alias
-#'
-#' Set a new name for a column
-#'
-#' @param object Column to rename
-#' @param data new name to use
-#'
 #' @rdname alias
 #' @name alias
 #' @aliases alias,Column-method
 #' @family colum_func
 #' @export
-#' @note alias since 1.4.0
+#' @examples \dontrun{
+#' df <- createDataFrame(iris)
+#'
+#' head(select(
+#'   df, alias(df$Sepal_Length, "slength"), alias(df$Petal_Length, "plength")
+#' ))
+#' }
+#' @note alias(Column) since 1.4.0
 setMethod("alias",
           signature(object = "Column"),
           function(object, data) {

R/pkg/R/generics.R

Lines changed: 11 additions & 0 deletions
@@ -387,6 +387,17 @@ setGeneric("value", function(bcast) { standardGeneric("value") })
 #' @export
 setGeneric("agg", function (x, ...) { standardGeneric("agg") })
 
+#' alias
+#'
+#' Returns a new SparkDataFrame or a Column with an alias set. Equivalent to SQL "AS" keyword.
+#'
+#' @name alias
+#' @rdname alias
+#' @param object x a SparkDataFrame or a Column
+#' @param data new name to use
+#' @return a SparkDataFrame or a Column
+NULL
+
 #' @rdname arrange
 #' @export
 setGeneric("arrange", function(x, col, ...) { standardGeneric("arrange") })

R/pkg/inst/tests/testthat/test_sparkSQL.R

Lines changed: 23 additions & 6 deletions
@@ -96,6 +96,10 @@ mockLinesMapType <- c("{\"name\":\"Bob\",\"info\":{\"age\":16,\"height\":176.5}}
 mapTypeJsonPath <- tempfile(pattern = "sparkr-test", fileext = ".tmp")
 writeLines(mockLinesMapType, mapTypeJsonPath)
 
+if (.Platform$OS.type == "windows") {
+  Sys.setenv(TZ = "GMT")
+}
+
 test_that("calling sparkRSQL.init returns existing SQL context", {
   skip_on_cran()
 
@@ -673,24 +677,27 @@ test_that("jsonRDD() on a RDD with json string", {
 })
 
 test_that("test tableNames and tables", {
+  count <- count(listTables())
+
   df <- read.json(jsonPath)
   createOrReplaceTempView(df, "table1")
-  expect_equal(length(tableNames()), 1)
-  expect_equal(length(tableNames("default")), 1)
+  expect_equal(length(tableNames()), count + 1)
+  expect_equal(length(tableNames("default")), count + 1)
+
   tables <- listTables()
-  expect_equal(count(tables), 1)
+  expect_equal(count(tables), count + 1)
   expect_equal(count(tables()), count(tables))
   expect_true("tableName" %in% colnames(tables()))
   expect_true(all(c("tableName", "database", "isTemporary") %in% colnames(tables())))
 
   suppressWarnings(registerTempTable(df, "table2"))
   tables <- listTables()
-  expect_equal(count(tables), 2)
+  expect_equal(count(tables), count + 2)
   suppressWarnings(dropTempTable("table1"))
   expect_true(dropTempView("table2"))
 
   tables <- listTables()
-  expect_equal(count(tables), 0)
+  expect_equal(count(tables), count + 0)
 })
 
 test_that(
@@ -1223,6 +1230,16 @@ test_that("select with column", {
   expect_equal(columns(df4), c("name", "age"))
   expect_equal(count(df4), 3)
 
+  # Test select with alias
+  df5 <- alias(df, "table")
+
+  expect_equal(columns(select(df5, column("table.name"))), "name")
+  expect_equal(columns(select(df5, "table.name")), "name")
+
+  # Test that stats::alias is not masked
+  expect_is(alias(aov(yield ~ block + N * P * K, npk)), "listof")
+
+
   expect_error(select(df, c("name", "age"), "name"),
                "To select multiple columns, use a character vector or list for col")
 })
@@ -3387,7 +3404,7 @@ compare_list <- function(list1, list2) {
 
 # This should always be the **very last test** in this test file.
 test_that("No extra files are created in SPARK_HOME by starting session and making calls", {
-  skip_on_cran()
+  skip_on_cran() # skip because when run from R CMD check SPARK_HOME is not the current directory
 
   # Check that it is not creating any extra file.
   # Does not check the tempdir which would be cleaned up after.

R/pkg/vignettes/sparkr-vignettes.Rmd

Lines changed: 18 additions & 18 deletions
@@ -65,7 +65,7 @@ We can view the first few rows of the `SparkDataFrame` by `head` or `showDF` fun
 head(carsDF)
 ```
 
-Common data processing operations such as `filter`, `select` are supported on the `SparkDataFrame`.
+Common data processing operations such as `filter` and `select` are supported on the `SparkDataFrame`.
 ```{r}
 carsSubDF <- select(carsDF, "model", "mpg", "hp")
 carsSubDF <- filter(carsSubDF, carsSubDF$hp >= 200)
@@ -379,7 +379,7 @@ out <- dapply(carsSubDF, function(x) { x <- cbind(x, x$mpg * 1.61) }, schema)
 head(collect(out))
 ```
 
-Like `dapply`, apply a function to each partition of a `SparkDataFrame` and collect the result back. The output of function should be a `data.frame`, but no schema is required in this case. Note that `dapplyCollect` can fail if the output of UDF run on all the partition cannot be pulled to the driver and fit in driver memory.
+Like `dapply`, `dapplyCollect` can apply a function to each partition of a `SparkDataFrame` and collect the result back. The output of the function should be a `data.frame`, but no schema is required in this case. Note that `dapplyCollect` can fail if the output of the UDF on all partitions cannot be pulled into the driver's memory.
 
 ```{r}
 out <- dapplyCollect(
@@ -405,7 +405,7 @@ result <- gapply(
 head(arrange(result, "max_mpg", decreasing = TRUE))
 ```
 
-Like gapply, `gapplyCollect` applies a function to each partition of a `SparkDataFrame` and collect the result back to R `data.frame`. The output of the function should be a `data.frame` but no schema is required in this case. Note that `gapplyCollect` can fail if the output of UDF run on all the partition cannot be pulled to the driver and fit in driver memory.
+Like `gapply`, `gapplyCollect` can apply a function to each partition of a `SparkDataFrame` and collect the result back to R `data.frame`. The output of the function should be a `data.frame` but no schema is required in this case. Note that `gapplyCollect` can fail if the output of the UDF on all partitions cannot be pulled into the driver's memory.
 
 ```{r}
 result <- gapplyCollect(
@@ -458,20 +458,20 @@ options(ops)
 
 
 ### SQL Queries
-A `SparkDataFrame` can also be registered as a temporary view in Spark SQL and that allows you to run SQL queries over its data. The sql function enables applications to run SQL queries programmatically and returns the result as a `SparkDataFrame`.
+A `SparkDataFrame` can also be registered as a temporary view in Spark SQL so that one can run SQL queries over its data. The sql function enables applications to run SQL queries programmatically and returns the result as a `SparkDataFrame`.
 
 ```{r}
 people <- read.df(paste0(sparkR.conf("spark.home"),
                          "/examples/src/main/resources/people.json"), "json")
 ```
 
-Register this SparkDataFrame as a temporary view.
+Register this `SparkDataFrame` as a temporary view.
 
 ```{r}
 createOrReplaceTempView(people, "people")
 ```
 
-SQL statements can be run by using the sql method.
+SQL statements can be run using the sql method.
 ```{r}
 teenagers <- sql("SELECT name FROM people WHERE age >= 13 AND age <= 19")
 head(teenagers)
@@ -780,7 +780,7 @@ head(predict(isoregModel, newDF))
 `spark.gbt` fits a [gradient-boosted tree](https://en.wikipedia.org/wiki/Gradient_boosting) classification or regression model on a `SparkDataFrame`.
 Users can call `summary` to get a summary of the fitted model, `predict` to make predictions, and `write.ml`/`read.ml` to save/load fitted models.
 
-Similar to the random forest example above, we use the `longley` dataset to train a gradient-boosted tree and make predictions:
+We use the `longley` dataset to train a gradient-boosted tree and make predictions:
 
 ```{r, warning=FALSE}
 df <- createDataFrame(longley)
@@ -820,7 +820,7 @@ head(select(fitted, "Class", "prediction"))
 
 `spark.gaussianMixture` fits multivariate [Gaussian Mixture Model](https://en.wikipedia.org/wiki/Mixture_model#Multivariate_Gaussian_mixture_model) (GMM) against a `SparkDataFrame`. [Expectation-Maximization](https://en.wikipedia.org/wiki/Expectation%E2%80%93maximization_algorithm) (EM) is used to approximate the maximum likelihood estimator (MLE) of the model.
 
-We use a simulated example to demostrate the usage.
+We use a simulated example to demonstrate the usage.
 ```{r}
 X1 <- data.frame(V1 = rnorm(4), V2 = rnorm(4))
 X2 <- data.frame(V1 = rnorm(6, 3), V2 = rnorm(6, 4))
@@ -851,9 +851,9 @@ head(select(kmeansPredictions, "model", "mpg", "hp", "wt", "prediction"), n = 20
 
 * Topics and documents both exist in a feature space, where feature vectors are vectors of word counts (bag of words).
 
-* Rather than estimating a clustering using a traditional distance, LDA uses a function based on a statistical model of how text documents are generated.
+* Rather than clustering using a traditional distance, LDA uses a function based on a statistical model of how text documents are generated.
 
-To use LDA, we need to specify a `features` column in `data` where each entry represents a document. There are two type options for the column:
+To use LDA, we need to specify a `features` column in `data` where each entry represents a document. There are two options for the column:
 
 * character string: This can be a string of the whole document. It will be parsed automatically. Additional stop words can be added in `customizedStopWords`.
 
@@ -901,7 +901,7 @@ perplexity
 
 `spark.als` learns latent factors in [collaborative filtering](https://en.wikipedia.org/wiki/Recommender_system#Collaborative_filtering) via [alternating least squares](http://dl.acm.org/citation.cfm?id=1608614).
 
-There are multiple options that can be configured in `spark.als`, including `rank`, `reg`, `nonnegative`. For a complete list, refer to the help file.
+There are multiple options that can be configured in `spark.als`, including `rank`, `reg`, and `nonnegative`. For a complete list, refer to the help file.
 
 ```{r, eval=FALSE}
 ratings <- list(list(0, 0, 4.0), list(0, 1, 2.0), list(1, 1, 3.0), list(1, 2, 4.0),
@@ -981,7 +981,7 @@ testSummary
 
 
 ### Model Persistence
-The following example shows how to save/load an ML model by SparkR.
+The following example shows how to save/load an ML model in SparkR.
 ```{r}
 t <- as.data.frame(Titanic)
 training <- createDataFrame(t)
@@ -1079,19 +1079,19 @@ There are three main object classes in SparkR you may be working with.
 + `sdf` stores a reference to the corresponding Spark Dataset in the Spark JVM backend.
 + `env` saves the meta-information of the object such as `isCached`.
 
-It can be created by data import methods or by transforming an existing `SparkDataFrame`. We can manipulate `SparkDataFrame` by numerous data processing functions and feed that into machine learning algorithms.
+It can be created by data import methods or by transforming an existing `SparkDataFrame`. We can manipulate `SparkDataFrame` by numerous data processing functions and feed that into machine learning algorithms.
 
-* `Column`: an S4 class representing column of `SparkDataFrame`. The slot `jc` saves a reference to the corresponding Column object in the Spark JVM backend.
+* `Column`: an S4 class representing a column of `SparkDataFrame`. The slot `jc` saves a reference to the corresponding `Column` object in the Spark JVM backend.
 
-It can be obtained from a `SparkDataFrame` by `$` operator, `df$col`. More often, it is used together with other functions, for example, with `select` to select particular columns, with `filter` and constructed conditions to select rows, with aggregation functions to compute aggregate statistics for each group.
+It can be obtained from a `SparkDataFrame` by `$` operator, e.g., `df$col`. More often, it is used together with other functions, for example, with `select` to select particular columns, with `filter` and constructed conditions to select rows, with aggregation functions to compute aggregate statistics for each group.
 
-* `GroupedData`: an S4 class representing grouped data created by `groupBy` or by transforming other `GroupedData`. Its `sgd` slot saves a reference to a RelationalGroupedDataset object in the backend.
+* `GroupedData`: an S4 class representing grouped data created by `groupBy` or by transforming other `GroupedData`. Its `sgd` slot saves a reference to a `RelationalGroupedDataset` object in the backend.
 
-This is often an intermediate object with group information and followed up by aggregation operations.
+This is often an intermediate object with group information and followed up by aggregation operations.
 
 ### Architecture
 
-A complete description of architecture can be seen in reference, in particular the paper *SparkR: Scaling R Programs with Spark*.
+A complete description of architecture can be seen in the references, in particular the paper *SparkR: Scaling R Programs with Spark*.
 
 Under the hood of SparkR is Spark SQL engine. This avoids the overheads of running interpreted R code, and the optimized SQL execution engine in Spark uses structural information about data and computation flow to perform a bunch of optimizations to speed up the computation.

appveyor.yml

Lines changed: 4 additions & 2 deletions
@@ -48,12 +48,14 @@ install:
 build_script:
   - cmd: mvn -DskipTests -Psparkr -Phive -Phive-thriftserver package
 
+environment:
+  NOT_CRAN: true
+
 test_script:
-  - cmd: .\bin\spark-submit2.cmd --conf spark.hadoop.fs.defaultFS="file:///" R\pkg\tests\run-all.R
+  - cmd: .\bin\spark-submit2.cmd --driver-java-options "-Dlog4j.configuration=file:///%CD:\=/%/R/log4j.properties" --conf spark.hadoop.fs.defaultFS="file:///" R\pkg\tests\run-all.R
 
 notifications:
   - provider: Email
     on_build_success: false
     on_build_failure: false
     on_build_status_changed: false
-

assembly/pom.xml

Lines changed: 2 additions & 2 deletions
@@ -224,10 +224,10 @@
   </profile>
 
   <!--
-    Pull in spark-hadoop-cloud and its associated JARs,
+    Pull in hadoop-cloud and its associated JARs,
   -->
   <profile>
-    <id>cloud</id>
+    <id>hadoop-cloud</id>
     <dependencies>
       <dependency>
         <groupId>org.apache.spark</groupId>

bin/spark-class

Lines changed: 2 additions & 0 deletions
@@ -72,6 +72,8 @@ build_command() {
   printf "%d\0" $?
 }
 
+# Turn off posix mode since it does not allow process substitution
+set +o posix
 CMD=()
 while IFS= read -d '' -r ARG; do
   CMD+=("$ARG")

bin/spark-class2.cmd

Lines changed: 1 addition & 1 deletion
@@ -51,7 +51,7 @@ if not "x%SPARK_PREPEND_CLASSES%"=="x" (
 rem Figure out where java is.
 set RUNNER=java
 if not "x%JAVA_HOME%"=="x" (
-  set RUNNER="%JAVA_HOME%\bin\java"
+  set RUNNER=%JAVA_HOME%\bin\java
 ) else (
   where /q "%RUNNER%"
   if ERRORLEVEL 1 (

circle.yml

Lines changed: 2 additions & 2 deletions
@@ -37,13 +37,13 @@ dependencies:
         # Copy contents into current build directory
         rsync --info=stats2,misc1,flist0 -a build_classes/ .
       fi
-  - ./build/mvn -DskipTests -Pcloud -Phadoop-palantir -Pkinesis-asl -Pkubernetes -Pyarn -Phive -Psparkr install
+  - ./build/mvn -DskipTests -Phadoop-cloud -Phadoop-palantir -Pkinesis-asl -Pkubernetes -Pyarn -Phive -Psparkr install
   # Copy all of */target/scala_2.11/classes to build_classes/
   - >
     rsync --info=stats2,misc1,flist0 -a --delete-excluded --prune-empty-dirs --exclude build_classes/ --exclude 'target/streams' --exclude 'assembly/target' --exclude 'common/network-yarn/target' --exclude 'examples/target' --exclude '***/*.jar' --include 'target/***' --include '**/' --exclude '*' . build_classes/
   - |
     # Make sbt fetch all the external deps to ~/.ivy2 so it gets cached
-    ./build/sbt -Pcloud -Phadoop-palantir -Pkinesis-asl -Pkubernetes -Pyarn -Phive -Psparkr externalDependencyClasspath
+    ./build/sbt -Phadoop-cloud -Phadoop-palantir -Pkinesis-asl -Pkubernetes -Pyarn -Phive -Psparkr externalDependencyClasspath
 cache_directories:
   - "build_classes"
   - "build"
