
Commit c56de6d

Merge remote-tracking branch 'origin/master' into agg-bug
2 parents: da6a285 + 79159a1


328 files changed (+7997, -4000 lines)


CONTRIBUTING.md

Lines changed: 1 addition & 1 deletion
@@ -6,7 +6,7 @@ It lists steps that are required before creating a PR. In particular, consider:
 
 - Is the change important and ready enough to ask the community to spend time reviewing?
 - Have you searched for existing, related JIRAs and pull requests?
-- Is this a new feature that can stand alone as a package on http://spark-packages.org ?
+- Is this a new feature that can stand alone as a [third party project](https://cwiki.apache.org/confluence/display/SPARK/Third+Party+Projects) ?
 - Is the change being proposed clearly explained and motivated?
 
 When you contribute code, you affirm that the contribution is your original work and that you

R/create-docs.sh

Lines changed: 28 additions & 2 deletions
@@ -17,17 +17,26 @@
 # limitations under the License.
 #
 
-# Script to create API docs for SparkR
-# This requires `devtools` and `knitr` to be installed on the machine.
+# Script to create API docs and vignettes for SparkR
+# This requires `devtools`, `knitr` and `rmarkdown` to be installed on the machine.
 
 # After running this script the html docs can be found in
 # $SPARK_HOME/R/pkg/html
+# The vignettes can be found in
+# $SPARK_HOME/R/pkg/vignettes/sparkr_vignettes.html
 
 set -o pipefail
 set -e
 
 # Figure out where the script is
 export FWDIR="$(cd "`dirname "$0"`"; pwd)"
+export SPARK_HOME="$(cd "`dirname "$0"`"/..; pwd)"
+
+# Required for setting SPARK_SCALA_VERSION
+. "${SPARK_HOME}"/bin/load-spark-env.sh
+
+echo "Using Scala $SPARK_SCALA_VERSION"
+
 pushd $FWDIR
 
 # Install the package (this will also generate the Rd files)
@@ -43,4 +52,21 @@ Rscript -e 'libDir <- "../../lib"; library(SparkR, lib.loc=libDir); library(knit
 
 popd
 
+# Find Spark jars.
+if [ -f "${SPARK_HOME}/RELEASE" ]; then
+  SPARK_JARS_DIR="${SPARK_HOME}/jars"
+else
+  SPARK_JARS_DIR="${SPARK_HOME}/assembly/target/scala-$SPARK_SCALA_VERSION/jars"
+fi
+
+# Only create vignettes if Spark JARs exist
+if [ -d "$SPARK_JARS_DIR" ]; then
+  # render creates SparkR vignettes
+  Rscript -e 'library(rmarkdown); paths <- .libPaths(); .libPaths(c("lib", paths)); Sys.setenv(SPARK_HOME=tools::file_path_as_absolute("..")); render("pkg/vignettes/sparkr-vignettes.Rmd"); .libPaths(paths)'
+
+  find pkg/vignettes/. -not -name '.' -not -name '*.Rmd' -not -name '*.md' -not -name '*.pdf' -not -name '*.html' -delete
+else
+  echo "Skipping R vignettes as Spark JARs not found in $SPARK_HOME"
+fi
+
 popd
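For readers who want to run the vignette step by hand, the added Rscript one-liner corresponds to the following R calls. This is a minimal sketch, assuming the working directory is $SPARK_HOME/R and the freshly built SparkR package sits under lib/, as in the script above:

# Sketch of the vignette-rendering step (run from $SPARK_HOME/R)
library(rmarkdown)
paths <- .libPaths()
.libPaths(c("lib", paths))                     # pick up the freshly built SparkR first
Sys.setenv(SPARK_HOME = tools::file_path_as_absolute(".."))
render("pkg/vignettes/sparkr-vignettes.Rmd")   # writes the HTML next to the Rmd
.libPaths(paths)                               # restore the original library paths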

R/pkg/NAMESPACE

Lines changed: 3 additions & 0 deletions
@@ -336,6 +336,9 @@ export("as.DataFrame",
        "read.parquet",
        "read.text",
        "spark.lapply",
+       "spark.addFile",
+       "spark.getSparkFilesRootDirectory",
+       "spark.getSparkFiles",
        "sql",
        "str",
        "tableToDF",

R/pkg/R/context.R

Lines changed: 48 additions & 0 deletions
@@ -225,6 +225,54 @@ setCheckpointDir <- function(sc, dirName) {
   invisible(callJMethod(sc, "setCheckpointDir", suppressWarnings(normalizePath(dirName))))
 }
 
+#' Add a file or directory to be downloaded with this Spark job on every node.
+#'
+#' The path passed can be either a local file, a file in HDFS (or other Hadoop-supported
+#' filesystems), or an HTTP, HTTPS or FTP URI. To access the file in Spark jobs,
+#' use spark.getSparkFiles(fileName) to find its download location.
+#'
+#' @rdname spark.addFile
+#' @param path The path of the file to be added
+#' @export
+#' @examples
+#'\dontrun{
+#' spark.addFile("~/myfile")
+#'}
+#' @note spark.addFile since 2.1.0
+spark.addFile <- function(path) {
+  sc <- getSparkContext()
+  invisible(callJMethod(sc, "addFile", suppressWarnings(normalizePath(path))))
+}
+
+#' Get the root directory that contains files added through spark.addFile.
+#'
+#' @rdname spark.getSparkFilesRootDirectory
+#' @return the root directory that contains files added through spark.addFile
+#' @export
+#' @examples
+#'\dontrun{
+#' spark.getSparkFilesRootDirectory()
+#'}
+#' @note spark.getSparkFilesRootDirectory since 2.1.0
+spark.getSparkFilesRootDirectory <- function() {
+  callJStatic("org.apache.spark.SparkFiles", "getRootDirectory")
+}
+
+#' Get the absolute path of a file added through spark.addFile.
+#'
+#' @rdname spark.getSparkFiles
+#' @param fileName The name of the file added through spark.addFile
+#' @return the absolute path of a file added through spark.addFile.
+#' @export
+#' @examples
+#'\dontrun{
+#' spark.getSparkFiles("myfile")
+#'}
+#' @note spark.getSparkFiles since 2.1.0
+spark.getSparkFiles <- function(fileName) {
+  callJStatic("org.apache.spark.SparkFiles", "get", as.character(fileName))
+}
+
 #' Run a function over a list of elements, distributing the computations with Spark
 #'
 #' Run a function over a list of elements, distributing the computations with Spark. Applies a
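Taken together, the new helpers can be exercised like this. A minimal sketch, assuming an active SparkR session; it mirrors the test added in test_context.R further down:

# Sketch of the new file-distribution API (requires an active SparkR session)
path <- tempfile(pattern = "hello", fileext = ".txt")
writeLines("Hello World!", path)
spark.addFile(path)                     # ship the file to every node

spark.getSparkFilesRootDirectory()      # root directory holding files added via spark.addFile
spark.getSparkFiles(basename(path))     # absolute path of the downloaded copy on a node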

R/pkg/R/functions.R

Lines changed: 6 additions & 2 deletions
@@ -2713,11 +2713,15 @@ setMethod("from_unixtime", signature(x = "Column"),
 #' @param x a time Column. Must be of TimestampType.
 #' @param windowDuration a string specifying the width of the window, e.g. '1 second',
 #'                       '1 day 12 hours', '2 minutes'. Valid interval strings are 'week',
-#'                       'day', 'hour', 'minute', 'second', 'millisecond', 'microsecond'.
+#'                       'day', 'hour', 'minute', 'second', 'millisecond', 'microsecond'. Note that
+#'                       the duration is a fixed length of time, and does not vary over time
+#'                       according to a calendar. For example, '1 day' always means 86,400,000
+#'                       milliseconds, not a calendar day.
 #' @param slideDuration a string specifying the sliding interval of the window. Same format as
 #'                      \code{windowDuration}. A new window will be generated every
 #'                      \code{slideDuration}. Must be less than or equal to
-#'                      the \code{windowDuration}.
+#'                      the \code{windowDuration}. This duration is likewise absolute, and does not
+#'                      vary according to a calendar.
 #' @param startTime the offset with respect to 1970-01-01 00:00:00 UTC with which to start
 #'                  window intervals. For example, in order to have hourly tumbling windows
 #'                  that start 15 minutes past the hour, e.g. 12:15-13:15, 13:15-14:15... provide
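To illustrate the fixed-length semantics this doc change describes, here is a rough SparkR sketch; it assumes a SparkDataFrame df with a TimestampType column "time" (the column and variable names are illustrative only):

# '1 day' is always 86,400,000 ms, regardless of DST or calendar-day length
w <- window(df$time, windowDuration = "1 day", slideDuration = "12 hours")
counts <- agg(groupBy(df, w), count = n(df$time))
head(counts)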

R/pkg/R/mllib.R

Lines changed: 9 additions & 7 deletions
@@ -1398,20 +1398,22 @@ setMethod("summary", signature(object = "KSTest"),
             distParams <- unlist(callJMethod(jobj, "distParams"))
             degreesOfFreedom <- callJMethod(jobj, "degreesOfFreedom")
 
-            list(p.value = pValue, statistic = statistic, nullHypothesis = nullHypothesis,
-                 nullHypothesis.name = distName, nullHypothesis.parameters = distParams,
-                 degreesOfFreedom = degreesOfFreedom)
+            ans <- list(p.value = pValue, statistic = statistic, nullHypothesis = nullHypothesis,
+                        nullHypothesis.name = distName, nullHypothesis.parameters = distParams,
+                        degreesOfFreedom = degreesOfFreedom, jobj = jobj)
+            class(ans) <- "summary.KSTest"
+            ans
           })
 
 # Prints the summary of KSTest
 
 #' @rdname spark.kstest
-#' @param x test result object of KSTest by \code{spark.kstest}.
+#' @param x summary object of KSTest returned by \code{summary}.
 #' @export
 #' @note print.summary.KSTest since 2.1.0
 print.summary.KSTest <- function(x, ...) {
-  jobj <- x@jobj
+  jobj <- x$jobj
   summaryStr <- callJMethod(jobj, "summary")
-  cat(summaryStr)
-  invisible(summaryStr)
+  cat(summaryStr, "\n")
+  invisible(x)
 }
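With this change, summary() on a KSTest result returns a classed list that carries the underlying Java object, so printing goes through normal S3 dispatch. A rough usage sketch, assuming an active SparkR session and a SparkDataFrame df with a numeric column "test" (names are illustrative):

# Sketch of the revised summary/print flow for spark.kstest
testResult <- spark.kstest(df, "test", "norm")
stats <- summary(testResult)   # list of class "summary.KSTest"; keeps the Java object in stats$jobj
stats$p.value                  # individual fields are still plain list elements
stats                          # auto-printing dispatches to print.summary.KSTest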

R/pkg/R/sparkR.R

Lines changed: 2 additions & 2 deletions
@@ -100,7 +100,7 @@ sparkR.stop <- function() {
 #' @param sparkEnvir Named list of environment variables to set on worker nodes
 #' @param sparkExecutorEnv Named list of environment variables to be used when launching executors
 #' @param sparkJars Character vector of jar files to pass to the worker nodes
-#' @param sparkPackages Character vector of packages from spark-packages.org
+#' @param sparkPackages Character vector of package coordinates
 #' @seealso \link{sparkR.session}
 #' @rdname sparkR.init-deprecated
 #' @export
@@ -327,7 +327,7 @@ sparkRHive.init <- function(jsc = NULL) {
 #' @param sparkHome Spark Home directory.
 #' @param sparkConfig named list of Spark configuration to set on worker nodes.
 #' @param sparkJars character vector of jar files to pass to the worker nodes.
-#' @param sparkPackages character vector of packages from spark-packages.org
+#' @param sparkPackages character vector of package coordinates
 #' @param enableHiveSupport enable support for Hive, fallback if not built with Hive support; once
 #'        set, this cannot be turned off on an existing session
 #' @param ... named Spark properties passed to the method.
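The reworded doc reflects that sparkPackages takes Maven-style coordinates rather than a spark-packages.org listing. A hypothetical example; the spark-avro coordinate is purely illustrative and not part of this commit:

# sparkPackages takes package coordinates of the form "groupId:artifactId:version"
sparkR.session(sparkPackages = "com.databricks:spark-avro_2.11:3.0.0")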

R/pkg/inst/tests/testthat/test_context.R

Lines changed: 13 additions & 0 deletions
@@ -166,3 +166,16 @@ test_that("spark.lapply should perform simple transforms", {
   expect_equal(doubled, as.list(2 * 1:10))
   sparkR.session.stop()
 })
+
+test_that("add and get file to be downloaded with Spark job on every node", {
+  sparkR.sparkContext()
+  path <- tempfile(pattern = "hello", fileext = ".txt")
+  filename <- basename(path)
+  words <- "Hello World!"
+  writeLines(words, path)
+  spark.addFile(path)
+  download_path <- spark.getSparkFiles(filename)
+  expect_equal(readLines(download_path), words)
+  unlink(path)
+  sparkR.session.stop()
+})

R/pkg/inst/tests/testthat/test_mllib.R

Lines changed: 2 additions & 14 deletions
@@ -760,13 +760,7 @@ test_that("spark.kstest", {
 
   expect_equal(stats$p.value, rStats$p.value, tolerance = 1e-4)
   expect_equal(stats$statistic, unname(rStats$statistic), tolerance = 1e-4)
-
-  printStr <- print.summary.KSTest(testResult)
-  expect_match(printStr, paste0("Kolmogorov-Smirnov test summary:\\n",
-                                "degrees of freedom = 0 \\n",
-                                "statistic = 0.38208[0-9]* \\n",
-                                "pValue = 0.19849[0-9]* \\n",
-                                ".*"), perl = TRUE)
+  expect_match(capture.output(stats)[1], "Kolmogorov-Smirnov test summary:")
 
   testResult <- spark.kstest(df, "test", "norm", -0.5)
   stats <- summary(testResult)
@@ -775,13 +769,7 @@ test_that("spark.kstest", {
 
   expect_equal(stats$p.value, rStats$p.value, tolerance = 1e-4)
  expect_equal(stats$statistic, unname(rStats$statistic), tolerance = 1e-4)
-
-  printStr <- print.summary.KSTest(testResult)
-  expect_match(printStr, paste0("Kolmogorov-Smirnov test summary:\\n",
-                                "degrees of freedom = 0 \\n",
-                                "statistic = 0.44003[0-9]* \\n",
-                                "pValue = 0.09470[0-9]* \\n",
-                                ".*"), perl = TRUE)
+  expect_match(capture.output(stats)[1], "Kolmogorov-Smirnov test summary:")
 })
 
 sparkR.session.stop()
