Closed

Changes from all commits (49 commits)
66cfb6c
add install_spark
junyangq Jul 19, 2016
9d52d19
add doc for install_spark
junyangq Jul 19, 2016
89efb04
changes to conform to R code style
junyangq Jul 19, 2016
7ba5213
add install into sparkR.session if spark jar is not found
junyangq Jul 19, 2016
6203223
message when SPARK_HOME is non-empty
junyangq Jul 19, 2016
98087ad
change options of spark mirror url
junyangq Jul 21, 2016
0db89b7
minor changes
junyangq Jul 21, 2016
503cb9f
fix R style issue: don't use absolute paths
junyangq Jul 21, 2016
f4522a6
remove spark version function, and use return value of packageVersion…
junyangq Jul 21, 2016
78d6f91
remove unnecessary dir create
junyangq Jul 21, 2016
9666e06
fix issue that dir.exists not available before 3.2.0
junyangq Jul 25, 2016
e4fe002
another dir.exists
junyangq Jul 26, 2016
c105d88
temporary change of test
junyangq Jul 26, 2016
d19853a
recover test changes
junyangq Jul 26, 2016
124110a
minor fix
junyangq Jul 26, 2016
03b8320
fix docs
junyangq Jul 26, 2016
d727be8
delete csv file option
junyangq Jul 26, 2016
ab3789f
rename functions of default mirror and check local master
junyangq Jul 26, 2016
626e4a1
fix mustWork issue in normalizePath, and create cache folder if not f…
junyangq Jul 26, 2016
785de93
output OS name if not matched
junyangq Jul 26, 2016
e3fa259
more specific message and reference for spark_cache_path
junyangq Jul 26, 2016
cf0f66d
more precise message under free build
junyangq Jul 26, 2016
4f4a899
fix typo on hadoop version message
junyangq Jul 26, 2016
14e4943
remove the setting of global spark.install.dir option
junyangq Jul 26, 2016
976472f
add overwrite option and adjust layout (#chars per line) of doc
junyangq Jul 26, 2016
8821b56
remove hardcoded hadoop versions
junyangq Jul 26, 2016
328408b
edit doc of install.spark
junyangq Jul 26, 2016
0091615
check installation of spark only when using ordinary R console
junyangq Jul 26, 2016
dbabb56
minor fix of overwrite message
junyangq Jul 26, 2016
6b2a897
minor change of messages
junyangq Jul 27, 2016
907f37d
set env var SPARK_HOME after installation
junyangq Jul 27, 2016
fa94e3c
minor change of doc
junyangq Jul 27, 2016
22f2f78
delete trailing space
junyangq Jul 27, 2016
7aa3239
celebrate 2.0 release
junyangq Jul 27, 2016
3ad99bc
add explanation of remote url structure, change default hadoop version
junyangq Jul 27, 2016
e421c30
change default mirror
junyangq Jul 27, 2016
aa4ba4d
add additional a step of searching for suggested url and only use def…
junyangq Jul 28, 2016
2bb00e1
separate sub functions
junyangq Jul 28, 2016
699420d
fix typos in fun defs
junyangq Jul 28, 2016
d58e080
fix mirror path and typo
junyangq Jul 28, 2016
82d24a6
fix message
junyangq Jul 28, 2016
0ebef8a
fix message
junyangq Jul 28, 2016
26d4518
fix typo
junyangq Jul 28, 2016
f37a07c
message update
junyangq Jul 28, 2016
64756de
concise message, improve doc for windows, fix regex match
junyangq Jul 28, 2016
29bdf30
Disable (temporarily) some test cases of describe and summary functions
junyangq Jul 29, 2016
5decac6
Send message of reset SPARK_HOME in installation
junyangq Aug 9, 2016
d84ba06
Specify path in jsonUrl and add alias to install function doc
junyangq Aug 9, 2016
3aeb4eb
remove comment
junyangq Aug 9, 2016
2 changes: 1 addition & 1 deletion R/check-cran.sh
@@ -47,6 +47,6 @@ $FWDIR/create-docs.sh

VERSION=`grep Version $FWDIR/pkg/DESCRIPTION | awk '{print $NF}'`

"$R_SCRIPT_PATH/"R CMD check --as-cran --no-tests SparkR_"$VERSION".tar.gz
"$R_SCRIPT_PATH/"R CMD check --as-cran SparkR_"$VERSION".tar.gz

popd > /dev/null
3 changes: 2 additions & 1 deletion R/pkg/DESCRIPTION
@@ -7,7 +7,7 @@ Author: The Apache Software Foundation
Maintainer: Shivaram Venkataraman <[email protected]>
Depends:
R (>= 3.0),
-methods,
+methods
Suggests:
testthat,
e1071,
@@ -31,6 +31,7 @@ Collate:
'context.R'
'deserialize.R'
'functions.R'
'install.R'
'mllib.R'
'serialize.R'
'sparkR.R'
2 changes: 2 additions & 0 deletions R/pkg/NAMESPACE
@@ -352,3 +352,5 @@ S3method(structField, character)
S3method(structField, jobj)
S3method(structType, jobj)
S3method(structType, structField)

export("install.spark")
235 changes: 235 additions & 0 deletions R/pkg/R/install.R
@@ -0,0 +1,235 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# Functions to install Spark in case the user directly downloads SparkR
# from CRAN.

#' Download and Install Apache Spark to a Local Directory
#'
#' \code{install.spark} downloads and installs Spark to a local directory if
#' it is not found. The Spark version we use is the same as the SparkR version.
#' Users can specify a desired Hadoop version, the remote mirror site, and
#' the directory where the package is installed locally.
#'
#' The full url of remote file is inferred from \code{mirrorUrl} and \code{hadoopVersion}.
#' \code{mirrorUrl} specifies the remote path to a Spark folder. It is followed by a subfolder
#' named after the Spark version (that corresponds to SparkR), and then the tar filename.
#' The filename is composed of four parts, i.e. [Spark version]-bin-[Hadoop version].tgz.
#' For example, the full path for a Spark 2.0.0 package for Hadoop 2.7 from
#' \code{http://apache.osuosl.org} has path:
#' \code{http://apache.osuosl.org/spark/spark-2.0.0/spark-2.0.0-bin-hadoop2.7.tgz}.
#' For \code{hadoopVersion = "without"}, [Hadoop version] in the filename is then
#' \code{without-hadoop}.
#'
#' @param hadoopVersion Version of Hadoop to install. Default is \code{"2.7"}. It can take other
#' version number in the format of "x.y" where x and y are integer.
Member: "are integers"?

Contributor Author: Yes, thanks!

#' If \code{hadoopVersion = "without"}, "Hadoop free" build is installed.
#' See
#' \href{http://spark.apache.org/docs/latest/hadoop-provided.html}{
#' "Hadoop Free" Build} for more information.
#' Other patched version names can also be used, e.g. \code{"cdh4"}
#' @param mirrorUrl base URL of the repositories to use. The directory layout should follow
#' \href{http://www.apache.org/dyn/closer.lua/spark/}{Apache mirrors}.
#' @param localDir a local directory where Spark is installed. The directory contains
#' version-specific folders of Spark packages. Default is path to
#' the cache directory:
#' \itemize{
#' \item Mac OS X: \file{~/Library/Caches/spark}
#' \item Unix: \env{$XDG_CACHE_HOME} if defined, otherwise \file{~/.cache/spark}
#' \item Windows: \file{\%LOCALAPPDATA\%\\spark\\spark\\Cache}. See
#' \href{https://www.microsoft.com/security/portal/mmpc/shared/variables.aspx}{
#' Windows Common Folder Variables} about \%LOCALAPPDATA\%
#' }
#' @param overwrite If \code{TRUE}, download and overwrite the existing tar file in localDir
#' and force re-install Spark (in case the local directory or file is corrupted)
#' @return \code{install.spark} returns the local directory where Spark is found or installed
#' @rdname install.spark
#' @name install.spark
Member: add @aliases

Member: I think it's supposed to have the parameter types, like this:

#' @aliases withColumnRenamed,SparkDataFrame,character,character-method

Contributor Author: I thought that was the format for S4 methods? Do we want to make it S4?

Contributor: I'd vote to leave this as an S3 method. Also, I think the CRAN checks should show the missing aliases if there are any.

Contributor Author: I think it's just a normal function for now. @shivaram Did you mean that, or the S3 method? Just to clarify...

felixcheung (Member), Aug 10, 2016: Ah, you are right - I'd missed the fact that it's not S4. I think @shivaram means leaving it as you have it right now.

Contributor Author: Sounds good, thanks!

#' @aliases install.spark
#' @export
#' @examples
#'\dontrun{
#' install.spark()
#'}
#' @note install.spark since 2.1.0
#' @seealso See available Hadoop versions:
#' \href{http://spark.apache.org/downloads.html}{Apache Spark}
install.spark <- function(hadoopVersion = "2.7", mirrorUrl = NULL,
localDir = NULL, overwrite = FALSE) {
version <- paste0("spark-", packageVersion("SparkR"))
Contributor: Does this mean we can only publish SparkR with released Spark versions? Then how do we make patched releases, say "2.0.0-1"? Can we overwrite an existing release on CRAN?

cc: @felixcheung

felixcheung (Member), Jul 26, 2016: That's an excellent point. We could have a fourth digit (separated with "-") that we remove to match the Spark version:
http://r-pkgs.had.co.nz/description.html#version

> numeric_version("2.0.0-10")
[1] ‘2.0.0.10’

In that case we could patch the SparkR package and still have it match major/minor/patch with Spark.
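
A tiny sketch of that versioning scheme (the helper name is hypothetical and not part of this PR):

# Drop a fourth, SparkR-only component so the remaining version matches the
# Spark release it was built for, e.g. "2.0.0-10" -> "2.0.0".
sparkrToSparkVersion <- function(pkgVersion) {
  v <- unlist(numeric_version(pkgVersion))  # "2.0.0-10" parses to c(2, 0, 0, 10)
  paste(v[1:3], collapse = ".")
}
sparkrToSparkVersion("2.0.0-10")  # "2.0.0"
sparkrToSparkVersion("2.0.0")     # "2.0.0" (unchanged when there is no fourth component)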

hadoopVersion <- tolower(hadoopVersion)
hadoopVersionName <- hadoop_version_name(hadoopVersion)
packageName <- paste(version, "bin", hadoopVersionName, sep = "-")
localDir <- ifelse(is.null(localDir), spark_cache_path(),
normalizePath(localDir, mustWork = FALSE))

if (is.na(file.info(localDir)$isdir)) {
dir.create(localDir, recursive = TRUE)
}

packageLocalDir <- file.path(localDir, packageName)

if (overwrite) {
message(paste0("Overwrite = TRUE: download and overwrite the tar file",
"and Spark package directory if they exist."))
}

Contributor: remove extra empty line

# can use dir.exists(packageLocalDir) under R 3.2.0 or later
if (!is.na(file.info(packageLocalDir)$isdir) && !overwrite) {
fmt <- "Spark %s for Hadoop %s is found, and SPARK_HOME set to %s"
msg <- sprintf(fmt, version, ifelse(hadoopVersion == "without", "Free build", hadoopVersion),
packageLocalDir)
message(msg)
Sys.setenv(SPARK_HOME = packageLocalDir)
return(invisible(packageLocalDir))
}

packageLocalPath <- paste0(packageLocalDir, ".tgz")
tarExists <- file.exists(packageLocalPath)

if (tarExists && !overwrite) {
message("tar file found.")
} else {
robust_download_tar(mirrorUrl, version, hadoopVersion, packageName, packageLocalPath)
}

message(sprintf("Installing to %s", localDir))
untar(tarfile = packageLocalPath, exdir = localDir)
if (!tarExists || overwrite) {
unlink(packageLocalPath)
}
message("DONE.")
Sys.setenv(SPARK_HOME = packageLocalDir)
message(paste("SPARK_HOME set to", packageLocalDir))
invisible(packageLocalDir)
}

robust_download_tar <- function(mirrorUrl, version, hadoopVersion, packageName, packageLocalPath) {
# step 1: use user-provided url
if (!is.null(mirrorUrl)) {
msg <- sprintf("Use user-provided mirror site: %s.", mirrorUrl)
message(msg)
success <- direct_download_tar(mirrorUrl, version, hadoopVersion,
packageName, packageLocalPath)
if (success) return()
} else {
message("Mirror site not provided.")
}

# step 2: use url suggested from apache website
message("Looking for site suggested from apache website...")
mirrorUrl <- get_preferred_mirror(version, packageName)
if (!is.null(mirrorUrl)) {
success <- direct_download_tar(mirrorUrl, version, hadoopVersion,
packageName, packageLocalPath)
if (success) return()
} else {
message("Unable to find suggested mirror site.")
}

# step 3: use backup option
message("To use backup site...")
mirrorUrl <- default_mirror_url()
success <- direct_download_tar(mirrorUrl, version, hadoopVersion,
packageName, packageLocalPath)
if (success) {
return(packageLocalPath)
} else {
msg <- sprintf(paste("Unable to download Spark %s for Hadoop %s.",
"Please check network connection, Hadoop version,",
"or provide other mirror sites."),
version, ifelse(hadoopVersion == "without", "Free build", hadoopVersion))
stop(msg)
}
}

get_preferred_mirror <- function(version, packageName) {
jsonUrl <- paste0("http://www.apache.org/dyn/closer.cgi?path=",
file.path("spark", version, packageName),
".tgz&as_json=1")
textLines <- readLines(jsonUrl, warn = FALSE)
rowNum <- grep("\"preferred\"", textLines)
linePreferred <- textLines[rowNum]
matchInfo <- regexpr("\"[A-Za-z][A-Za-z0-9+-.]*://.+\"", linePreferred)
if (matchInfo != -1) {
startPos <- matchInfo + 1
endPos <- matchInfo + attr(matchInfo, "match.length") - 2
mirrorPreferred <- base::substr(linePreferred, startPos, endPos)
mirrorPreferred <- paste0(mirrorPreferred, "spark")
message(sprintf("Preferred mirror site found: %s", mirrorPreferred))
} else {
mirrorPreferred <- NULL
}
mirrorPreferred
}

direct_download_tar <- function(mirrorUrl, version, hadoopVersion, packageName, packageLocalPath) {
packageRemotePath <- paste0(
file.path(mirrorUrl, version, packageName), ".tgz")
fmt <- paste("Downloading Spark %s for Hadoop %s from:\n- %s")
msg <- sprintf(fmt, version, ifelse(hadoopVersion == "without", "Free build", hadoopVersion),
packageRemotePath)
message(msg)

isFail <- tryCatch(download.file(packageRemotePath, packageLocalPath),
error = function(e) {
message(sprintf("Fetch failed from %s", mirrorUrl))
Member: might want to print(e) as well, so one could debug why the download throws an exception

Contributor Author: Good point. Thanks!

print(e)
TRUE
})
!isFail
}

default_mirror_url <- function() {
"http://www-us.apache.org/dist/spark"
}

hadoop_version_name <- function(hadoopVersion) {
if (hadoopVersion == "without") {
"without-hadoop"
} else if (grepl("^[0-9]+\\.[0-9]+$", hadoopVersion, perl = TRUE)) {
paste0("hadoop", hadoopVersion)
} else {
hadoopVersion
Member: Shouldn't it fail here? I think we shouldn't let an arbitrary string through?

Contributor Author: I was thinking about the names of user-patched versions, e.g. cdh4 - wondering if there is a way to tell such names from other, invalid ones?

Member: There is usually a format?
http://archive.cloudera.com/cdh5/cdh/5/spark-1.6.0-cdh5.8.0.tar.gz
so spark-2.0.0 + -something?

Contributor Author: Are we trying to validate the -something part? Without attempting to download, we still don't know whether the input version is good or not. I might be missing something here.

Contributor: FWIW I don't think it's worth verifying the -something part, just because it could be cdh or hdp etc. If the file is not found, we should throw an appropriate error that the user can use to understand what went wrong.

}
}
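
For reference, the mapping performed by hadoop_version_name() above (return values inferred from the function body in this PR):

hadoop_version_name("2.7")      # "hadoop2.7"      -> spark-2.0.0-bin-hadoop2.7.tgz
hadoop_version_name("without")  # "without-hadoop" -> the "Hadoop free" build
hadoop_version_name("cdh4")     # "cdh4"           -> passed through for patched builds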

# The implementation refers to appdirs package: https://pypi.python.org/pypi/appdirs and
# adapt to Spark context
spark_cache_path <- function() {
Contributor: Are there references for the implementation here?

Contributor Author: Yes, this actually follows the implementation of rappdirs/R/cache.r. I should add the reference here.

if (.Platform$OS.type == "windows") {
winAppPath <- Sys.getenv("%LOCALAPPDATA%", unset = NA)
if (is.na(winAppPath)) {
msg <- paste("%LOCALAPPDATA% not found.",
"Please define the environment variable",
"or restart and enter an installation path in localDir.")
stop(msg)
} else {
path <- file.path(winAppPath, "spark", "spark", "Cache")
}
} else if (.Platform$OS.type == "unix") {
if (Sys.info()["sysname"] == "Darwin") {
path <- file.path(Sys.getenv("HOME"), "Library/Caches", "spark")
Contributor: Need to check whether the folder exists and create it if it does not. I got this error on the first run:

Error in normalizePath(path, mustWork = TRUE) : 
  path[1]="/Users/meng/Library/Caches/spark": No such file or directory

Contributor Author: I changed it to mustWork = FALSE. The check and create operations come later :)

} else {
path <- file.path(
Sys.getenv("XDG_CACHE_HOME", file.path(Sys.getenv("HOME"), ".cache")), "spark")
}
} else {
stop(sprintf("Unknown OS: %s", .Platform$OS.type))
}
normalizePath(path, mustWork = FALSE)
}
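
Taken together, a minimal usage sketch of the new install.spark() (arguments as documented above; the mirror URL and local directory below are only examples):

library(SparkR)

# Default: the Spark release matching the SparkR version, Hadoop 2.7, a mirror
# suggested by apache.org, installed under the per-OS cache directory.
install.spark()

# Explicit mirror, "Hadoop free" build, custom directory, forced re-install.
install.spark(hadoopVersion = "without",
              mirrorUrl = "http://apache.osuosl.org/spark",
              localDir = "~/spark-dist",
              overwrite = TRUE)

# With the defaults, the remote file resolved by direct_download_tar() would be
# <mirrorUrl>/spark-2.0.0/spark-2.0.0-bin-hadoop2.7.tgz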
17 changes: 17 additions & 0 deletions R/pkg/R/sparkR.R
@@ -365,6 +365,23 @@ sparkR.session <- function(
}
overrideEnvs(sparkConfigMap, paramMap)
}
# do not download if it is run in the sparkR shell
if (!nzchar(master) || is_master_local(master)) {
if (!is_sparkR_shell()) {
if (is.na(file.info(sparkHome)$isdir)) {
msg <- paste0("Spark not found in SPARK_HOME: ",
sparkHome,
" .\nTo search in the cache directory. ",
"Installation will start if not found.")
message(msg)
packageLocalDir <- install.spark()
sparkHome <- packageLocalDir
} else {
msg <- paste0("Spark package is found in SPARK_HOME: ", sparkHome)
message(msg)
}
}
}

if (!exists(".sparkRjsc", envir = .sparkREnv)) {
sparkExecutorEnvMap <- new.env()
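
A rough sketch of the user-visible effect of this check (assuming a plain R console, a local master, and no valid SPARK_HOME; messages paraphrased from the code above):

library(SparkR)
sparkR.session(master = "local[2]")
# -> message: Spark not found in SPARK_HOME, searching the cache directory
# -> install.spark() downloads and unpacks Spark if needed, sets SPARK_HOME,
#    and the session then starts against the installed package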
8 changes: 8 additions & 0 deletions R/pkg/R/utils.R
@@ -689,3 +689,11 @@ getSparkContext <- function() {
sc <- get(".sparkRjsc", envir = .sparkREnv)
sc
}

is_master_local <- function(master) {
grepl("^local(\\[([0-9]+|\\*)\\])?$", master, perl = TRUE)
}

is_sparkR_shell <- function() {
grepl(".*shell\\.R$", Sys.getenv("R_PROFILE_USER"), perl = TRUE)
}
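
A quick illustration of the two helpers added above (return values follow from the regular expressions as written; not an exhaustive test):

is_master_local("local")              # TRUE
is_master_local("local[4]")           # TRUE
is_master_local("local[*]")           # TRUE
is_master_local("spark://host:7077")  # FALSE

is_sparkR_shell()  # TRUE only when R_PROFILE_USER ends with "shell.R",
                   # i.e. when running inside the bin/sparkR shell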
4 changes: 2 additions & 2 deletions R/pkg/inst/tests/testthat/test_sparkSQL.R
@@ -1824,11 +1824,11 @@ test_that("describe() and summarize() on a DataFrame", {
expect_equal(collect(stats)[2, "age"], "24.5")
expect_equal(collect(stats)[3, "age"], "7.7781745930520225")
stats <- describe(df)
-expect_equal(collect(stats)[4, "name"], "Andy")
+expect_equal(collect(stats)[4, "summary"], "min")
expect_equal(collect(stats)[5, "age"], "30")

stats2 <- summary(df)
-expect_equal(collect(stats2)[4, "name"], "Andy")
+expect_equal(collect(stats2)[4, "summary"], "min")
expect_equal(collect(stats2)[5, "age"], "30")

# SPARK-16425: SparkR summary() fails on column of type logical