From 66cfb6c771bc94e8b6de71fa9cbc057efac64484 Mon Sep 17 00:00:00 2001 From: Junyang Qian Date: Mon, 18 Jul 2016 21:15:51 -0700 Subject: [PATCH 01/49] add install_spark --- R/pkg/DESCRIPTION | 1 + R/pkg/NAMESPACE | 2 ++ R/pkg/inst/extdata/spark_download.csv | 2 ++ 3 files changed, 5 insertions(+) create mode 100644 R/pkg/inst/extdata/spark_download.csv diff --git a/R/pkg/DESCRIPTION b/R/pkg/DESCRIPTION index ac73d6c79891..7539373296c1 100644 --- a/R/pkg/DESCRIPTION +++ b/R/pkg/DESCRIPTION @@ -31,6 +31,7 @@ Collate: 'context.R' 'deserialize.R' 'functions.R' + 'install.R' 'mllib.R' 'serialize.R' 'sparkR.R' diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE index 1d74c6d95578..6552a8bd19c5 100644 --- a/R/pkg/NAMESPACE +++ b/R/pkg/NAMESPACE @@ -352,3 +352,5 @@ S3method(structField, character) S3method(structField, jobj) S3method(structType, jobj) S3method(structType, structField) + +export("install_spark") diff --git a/R/pkg/inst/extdata/spark_download.csv b/R/pkg/inst/extdata/spark_download.csv new file mode 100644 index 000000000000..4fd3223eeaac --- /dev/null +++ b/R/pkg/inst/extdata/spark_download.csv @@ -0,0 +1,2 @@ +"url","default" +"http://apache.osuosl.org",TRUE From 9d52d191acb5a55e9173897aaf381072590d9d01 Mon Sep 17 00:00:00 2001 From: Junyang Qian Date: Tue, 19 Jul 2016 01:46:09 -0700 Subject: [PATCH 02/49] add doc for install_spark --- R/pkg/R/install.R | 84 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 84 insertions(+) create mode 100644 R/pkg/R/install.R diff --git a/R/pkg/R/install.R b/R/pkg/R/install.R new file mode 100644 index 000000000000..c32685f3bfd3 --- /dev/null +++ b/R/pkg/R/install.R @@ -0,0 +1,84 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Functions to install Spark in case the user directly downloads SparkR +# from CRAN. + +#' Download and Install Spark to Local Directory +#' +#' \code{install_spark} downloads and installs Spark to local directory if +#' it is not found. The Spark version we use is 2.0.0 (preview). Users can +#' specify a desired Hadoop version, the remote site, and the directory where +#' the package is installed locally. +#' +#' @param hadoop_version Version of Hadoop to install. 
2.3, 2.4, 2.6, +#' and 2.7 (default) +#' @param url the base URL of the repositories to use +#' @param local_dir local directory that Spark is installed to +#' @rdname install_spark +#' @name install_spark +#' @export +#' @examples +#'\dontrun{ +#' install_spark() +#'} +#' @note install_spark since 2.1.0 +install_spark <- function(hadoop_version = NULL, url = NULL, local_dir = NULL) { + version <- paste0("spark-", spark_version_default()) + hadoop_version <- hadoop_version_default() + packageName <- paste0(version, "-bin-hadoop", hadoop_version) + if (is.null(local_dir)) { + local_dir <- getOption("spark.install.dir", + rappdirs::app_dir("spark"))$cache() + } + packageLocalDir <- file.path(local_dir, packageName) + if (dir.exists(packageLocalDir)) { + fmt <- "Spark %s for Hadoop %s has been installed." + msg <- sprintf(fmt, version, hadoop_version) + message(msg) + } else { + dir.create(packageLocalDir, recursive = TRUE) + if (is.null(url)) { + mirror_sites <- read.csv(mirror_csv_url()) + url <- mirror_sites$url[1] + } + packageRemotePath <- paste0(file.path(url, "spark", version, packageName), + ".tgz") + fmt <- paste("Installing Spark %s for Hadoop %s.", + "Downloading from:\n %s", + "Installing to:\n %s", sep = "\n") + msg <- sprintf(fmt, version, hadoop_version, packageRemotePath, + packageLocalDir) + message(msg) + packageLocalPath <- paste0(packageLocalDir, ".tgz") + download.file(packageRemotePath, packageLocalPath) + untar(tarfile = packageLocalPath, exdir = local_dir) + unlink(packageLocalPath) + } +} + +mirror_csv_url <- function() { + system.file("extdata", "spark_download.csv", package = "SparkR") +} + +spark_version_default <- function() { + "2.0.0-preview" +} + +hadoop_version_default <- function() { + "2.7" +} From 89efb0438adee2cb95a4029d2779be5490b0a256 Mon Sep 17 00:00:00 2001 From: Junyang Qian Date: Tue, 19 Jul 2016 02:38:11 -0700 Subject: [PATCH 03/49] changes to conform to R code style --- R/pkg/R/install.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/pkg/R/install.R b/R/pkg/R/install.R index c32685f3bfd3..3dcfe384ceb9 100644 --- a/R/pkg/R/install.R +++ b/R/pkg/R/install.R @@ -42,7 +42,7 @@ install_spark <- function(hadoop_version = NULL, url = NULL, local_dir = NULL) { hadoop_version <- hadoop_version_default() packageName <- paste0(version, "-bin-hadoop", hadoop_version) if (is.null(local_dir)) { - local_dir <- getOption("spark.install.dir", + local_dir <- getOption("spark.install.dir", rappdirs::app_dir("spark"))$cache() } packageLocalDir <- file.path(local_dir, packageName) @@ -61,7 +61,7 @@ install_spark <- function(hadoop_version = NULL, url = NULL, local_dir = NULL) { fmt <- paste("Installing Spark %s for Hadoop %s.", "Downloading from:\n %s", "Installing to:\n %s", sep = "\n") - msg <- sprintf(fmt, version, hadoop_version, packageRemotePath, + msg <- sprintf(fmt, version, hadoop_version, packageRemotePath, packageLocalDir) message(msg) packageLocalPath <- paste0(packageLocalDir, ".tgz") From 7ba5213b0d288614313b1400190c03f8da59838f Mon Sep 17 00:00:00 2001 From: Junyang Qian Date: Tue, 19 Jul 2016 12:06:36 -0700 Subject: [PATCH 04/49] add install into sparkR.session if spark jar is not found --- R/pkg/DESCRIPTION | 4 +++- R/pkg/R/install.R | 3 +++ R/pkg/R/sparkR.R | 6 ++++++ R/pkg/R/utils.R | 4 ++++ 4 files changed, 16 insertions(+), 1 deletion(-) diff --git a/R/pkg/DESCRIPTION b/R/pkg/DESCRIPTION index 7539373296c1..c73cabbc8a66 100644 --- a/R/pkg/DESCRIPTION +++ b/R/pkg/DESCRIPTION @@ -7,7 +7,9 @@ Author: The Apache Software 
Foundation Maintainer: Shivaram Venkataraman Depends: R (>= 3.0), - methods, + methods +Imports: + rappdirs Suggests: testthat, e1071, diff --git a/R/pkg/R/install.R b/R/pkg/R/install.R index 3dcfe384ceb9..0c659bcd33fa 100644 --- a/R/pkg/R/install.R +++ b/R/pkg/R/install.R @@ -29,6 +29,8 @@ #' and 2.7 (default) #' @param url the base URL of the repositories to use #' @param local_dir local directory that Spark is installed to +#' @return \code{install_spark} returns the local directory +#' where Spark is found or installed #' @rdname install_spark #' @name install_spark #' @export @@ -69,6 +71,7 @@ install_spark <- function(hadoop_version = NULL, url = NULL, local_dir = NULL) { untar(tarfile = packageLocalPath, exdir = local_dir) unlink(packageLocalPath) } + packageLocalDir } mirror_csv_url <- function() { diff --git a/R/pkg/R/sparkR.R b/R/pkg/R/sparkR.R index ff5297ffd51c..0509253e3aa6 100644 --- a/R/pkg/R/sparkR.R +++ b/R/pkg/R/sparkR.R @@ -366,6 +366,12 @@ sparkR.session <- function( overrideEnvs(sparkConfigMap, paramMap) } +# if (master_is_local(master) && (!nzchar(sparkHome) || !dir.exists(sparkHome))) { +# message("Spark is not found in local directory. It will be installed in a cache dir.") +# packageLocalDir <- install_spark() +# sparkHome <- packageLocalDir +# } + if (!exists(".sparkRjsc", envir = .sparkREnv)) { sparkExecutorEnvMap <- new.env() sparkR.sparkContext(master, appName, sparkHome, sparkConfigMap, sparkExecutorEnvMap, diff --git a/R/pkg/R/utils.R b/R/pkg/R/utils.R index 240b9f669bdd..f0003e8891a1 100644 --- a/R/pkg/R/utils.R +++ b/R/pkg/R/utils.R @@ -689,3 +689,7 @@ getSparkContext <- function() { sc <- get(".sparkRjsc", envir = .sparkREnv) sc } + +master_is_local <- function(master) { + grepl("^local(\\[[0-9\\*]*\\])?$", master, perl = TRUE) +} From 62032233720cfbf6de1e1f954f2b6098549a8eb8 Mon Sep 17 00:00:00 2001 From: Junyang Qian Date: Tue, 19 Jul 2016 14:47:39 -0700 Subject: [PATCH 05/49] message when SPARK_HOME is non-empty --- R/pkg/R/sparkR.R | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/R/pkg/R/sparkR.R b/R/pkg/R/sparkR.R index 0509253e3aa6..1b4d3bc01542 100644 --- a/R/pkg/R/sparkR.R +++ b/R/pkg/R/sparkR.R @@ -365,12 +365,17 @@ sparkR.session <- function( } overrideEnvs(sparkConfigMap, paramMap) } - -# if (master_is_local(master) && (!nzchar(sparkHome) || !dir.exists(sparkHome))) { -# message("Spark is not found in local directory. It will be installed in a cache dir.") -# packageLocalDir <- install_spark() -# sparkHome <- packageLocalDir -# } + if (!nzchar(master) || master_is_local(master)) { + if (!nzchar(sparkHome) || !dir.exists(sparkHome)) { + message("Spark is not found in SPARK_HOME. 
Redirect to the cache directory.") + packageLocalDir <- install_spark() + sparkHome <- packageLocalDir + } else { + fmt <- "Make sure that Spark is installed in SPARK_HOME: %s" + msg <- sprintf(fmt, sparkHome) + message(msg) + } + } if (!exists(".sparkRjsc", envir = .sparkREnv)) { sparkExecutorEnvMap <- new.env() From 98087ad0d5ae21322c7cef14200d093bcdf15c0e Mon Sep 17 00:00:00 2001 From: Junyang Qian Date: Wed, 20 Jul 2016 23:07:25 -0700 Subject: [PATCH 06/49] change options of spark mirror url --- R/pkg/DESCRIPTION | 2 - R/pkg/R/install.R | 105 ++++++++++++++++++++++++++++++++++++++-------- R/pkg/R/sparkR.R | 10 +++-- 3 files changed, 95 insertions(+), 22 deletions(-) diff --git a/R/pkg/DESCRIPTION b/R/pkg/DESCRIPTION index c73cabbc8a66..357ab007931f 100644 --- a/R/pkg/DESCRIPTION +++ b/R/pkg/DESCRIPTION @@ -8,8 +8,6 @@ Maintainer: Shivaram Venkataraman Depends: R (>= 3.0), methods -Imports: - rappdirs Suggests: testthat, e1071, diff --git a/R/pkg/R/install.R b/R/pkg/R/install.R index 0c659bcd33fa..ad88843be051 100644 --- a/R/pkg/R/install.R +++ b/R/pkg/R/install.R @@ -18,7 +18,7 @@ # Functions to install Spark in case the user directly downloads SparkR # from CRAN. -#' Download and Install Spark to Local Directory +#' Download and Install Spark Core to Local Directory #' #' \code{install_spark} downloads and installs Spark to local directory if #' it is not found. The Spark version we use is 2.0.0 (preview). Users can @@ -27,7 +27,7 @@ #' #' @param hadoop_version Version of Hadoop to install. 2.3, 2.4, 2.6, #' and 2.7 (default) -#' @param url the base URL of the repositories to use +#' @param mirror_url the base URL of the repositories to use #' @param local_dir local directory that Spark is installed to #' @return \code{install_spark} returns the local directory #' where Spark is found or installed @@ -39,26 +39,42 @@ #' install_spark() #'} #' @note install_spark since 2.1.0 -install_spark <- function(hadoop_version = NULL, url = NULL, local_dir = NULL) { +install_spark <- function(hadoop_version = NULL, mirror_url = NULL, + local_dir = NULL) { version <- paste0("spark-", spark_version_default()) - hadoop_version <- hadoop_version_default() - packageName <- paste0(version, "-bin-hadoop", hadoop_version) + hadoop_version <- match.arg(hadoop_version, supported_versions_hadoop()) + packageName <- ifelse(hadoop_version == "without", + paste0(version, "-bin-without-hadoop"), + paste0(version, "-bin-hadoop", hadoop_version)) if (is.null(local_dir)) { - local_dir <- getOption("spark.install.dir", - rappdirs::app_dir("spark"))$cache() + local_dir <- getOption("spark.install.dir",spark_cache_path()) + } else { + local_dir <- normalizePath(local_dir) } + packageLocalDir <- file.path(local_dir, packageName) + if (dir.exists(packageLocalDir)) { fmt <- "Spark %s for Hadoop %s has been installed." msg <- sprintf(fmt, version, hadoop_version) message(msg) + return(invisible(packageLocalDir)) + } + + packageLocalPath <- paste0(packageLocalDir, ".tgz") + tarExists <- file.exists(packageLocalPath) + + if (tarExists) { + message("Tar file found. Installing...") } else { dir.create(packageLocalDir, recursive = TRUE) - if (is.null(url)) { - mirror_sites <- read.csv(mirror_csv_url()) - url <- mirror_sites$url[1] + if (is.null(mirror_url)) { + message("Remote URL not provided. 
Use Apache default.") + mirror_url <- mirror_url_default() } - packageRemotePath <- paste0(file.path(url, "spark", version, packageName), + # This is temporary, should be removed when released + version <- "spark-releases/spark-2.0.0-rc4-bin" + packageRemotePath <- paste0(file.path(mirror_url, version, packageName), ".tgz") fmt <- paste("Installing Spark %s for Hadoop %s.", "Downloading from:\n %s", @@ -66,20 +82,75 @@ install_spark <- function(hadoop_version = NULL, url = NULL, local_dir = NULL) { msg <- sprintf(fmt, version, hadoop_version, packageRemotePath, packageLocalDir) message(msg) - packageLocalPath <- paste0(packageLocalDir, ".tgz") - download.file(packageRemotePath, packageLocalPath) - untar(tarfile = packageLocalPath, exdir = local_dir) + + fetchFail <- tryCatch(download.file(packageRemotePath, packageLocalPath), + error = function(e) { + msg <- paste0("Fetch failed from ", mirror_url, ".") + message(msg) + TRUE + }) + if (fetchFail) { + message("Try the backup option.") + mirror_sites <- tryCatch(read.csv(mirror_url_csv()), + error = function(e) stop("No csv file found.")) + mirror_url <- mirror_sites$url[1] + packageRemotePath <- paste0(file.path(mirror_url, version, packageName), + ".tgz") + message(sprintf("Downloading from:\n %s", packageRemotePath)) + tryCatch(download.file(packageRemotePath, packageLocalPath), + error = function(e) { + stop("Download failed. Please provide a valid mirror_url.") + }) + } + } + + untar(tarfile = packageLocalPath, exdir = local_dir) + if (!tarExists) { unlink(packageLocalPath) } - packageLocalDir + message("Installation done.") + invisible(packageLocalDir) +} + +mirror_url_default <- function() { + # change to http://www.apache.org/dyn/closer.lua + # when released + + "http://people.apache.org/~pwendell" +} + +supported_versions_hadoop <- function() { + c("2.7", "2.6", "2.4", "without") +} + +spark_cache_path <- function() { + if (.Platform$OS.type == "windows") { + winAppPath <- Sys.getenv("%LOCALAPPDATA%", unset = NA) + if (is.null(winAppPath)) { + msg <- paste("%LOCALAPPDATA% not found.", + "Please define or enter an installation path in loc_dir.") + stop(msg) + } else { + path <- file.path(winAppPath, "spark", "spark", "Cache") + } + } else if (.Platform$OS.type == "unix") { + if (Sys.info()["sysname"] == "Darwin") { + path <- file.path("~/Library/Caches", "spark") + } else { + path <- file.path(Sys.getenv("XDG_CACHE_HOME", "~/.cache"), "spark") + } + } else { + stop("Unknown OS") + } + normalizePath(path, mustWork = TRUE) } -mirror_csv_url <- function() { +mirror_url_csv <- function() { system.file("extdata", "spark_download.csv", package = "SparkR") } spark_version_default <- function() { - "2.0.0-preview" + "2.0.0" } hadoop_version_default <- function() { diff --git a/R/pkg/R/sparkR.R b/R/pkg/R/sparkR.R index 1b4d3bc01542..c2583d9e3c64 100644 --- a/R/pkg/R/sparkR.R +++ b/R/pkg/R/sparkR.R @@ -366,8 +366,12 @@ sparkR.session <- function( overrideEnvs(sparkConfigMap, paramMap) } if (!nzchar(master) || master_is_local(master)) { - if (!nzchar(sparkHome) || !dir.exists(sparkHome)) { - message("Spark is not found in SPARK_HOME. 
Redirect to the cache directory.") + if (!dir.exists(sparkHome)) { + fmt <- paste("Spark not found in SPARK_HOME: %s.\n", + "Search in the cache directory.", + "It will be installed if not found.") + msg <- sprintf(fmt, sparkHome) + message(msg) packageLocalDir <- install_spark() sparkHome <- packageLocalDir } else { @@ -376,7 +380,7 @@ sparkR.session <- function( message(msg) } } - + if (!exists(".sparkRjsc", envir = .sparkREnv)) { sparkExecutorEnvMap <- new.env() sparkR.sparkContext(master, appName, sparkHome, sparkConfigMap, sparkExecutorEnvMap, From 0db89b7c98a42f4ca151f7ff07a4f49ee5ad87a6 Mon Sep 17 00:00:00 2001 From: Junyang Qian Date: Wed, 20 Jul 2016 23:54:15 -0700 Subject: [PATCH 07/49] minor changes --- R/check-cran.sh | 2 +- R/pkg/R/install.R | 8 ++++---- R/pkg/R/sparkR.R | 4 ++-- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/R/check-cran.sh b/R/check-cran.sh index b3a6860961c1..d80d4c41483a 100755 --- a/R/check-cran.sh +++ b/R/check-cran.sh @@ -47,6 +47,6 @@ $FWDIR/create-docs.sh VERSION=`grep Version $FWDIR/pkg/DESCRIPTION | awk '{print $NF}'` -"$R_SCRIPT_PATH/"R CMD check --as-cran --no-tests SparkR_"$VERSION".tar.gz +"$R_SCRIPT_PATH/"R CMD check --as-cran SparkR_"$VERSION".tar.gz popd > /dev/null diff --git a/R/pkg/R/install.R b/R/pkg/R/install.R index ad88843be051..393b6a399b17 100644 --- a/R/pkg/R/install.R +++ b/R/pkg/R/install.R @@ -47,7 +47,7 @@ install_spark <- function(hadoop_version = NULL, mirror_url = NULL, paste0(version, "-bin-without-hadoop"), paste0(version, "-bin-hadoop", hadoop_version)) if (is.null(local_dir)) { - local_dir <- getOption("spark.install.dir",spark_cache_path()) + local_dir <- getOption("spark.install.dir", spark_cache_path()) } else { local_dir <- normalizePath(local_dir) } @@ -77,8 +77,8 @@ install_spark <- function(hadoop_version = NULL, mirror_url = NULL, packageRemotePath <- paste0(file.path(mirror_url, version, packageName), ".tgz") fmt <- paste("Installing Spark %s for Hadoop %s.", - "Downloading from:\n %s", - "Installing to:\n %s", sep = "\n") + "Downloading from:\n- %s", + "Installing to:\n- %s", sep = "\n") msg <- sprintf(fmt, version, hadoop_version, packageRemotePath, packageLocalDir) message(msg) @@ -96,7 +96,7 @@ install_spark <- function(hadoop_version = NULL, mirror_url = NULL, mirror_url <- mirror_sites$url[1] packageRemotePath <- paste0(file.path(mirror_url, version, packageName), ".tgz") - message(sprintf("Downloading from:\n %s", packageRemotePath)) + message(sprintf("Downloading from:\n- %s", packageRemotePath)) tryCatch(download.file(packageRemotePath, packageLocalPath), error = function(e) { stop("Download failed. Please provide a valid mirror_url.") diff --git a/R/pkg/R/sparkR.R b/R/pkg/R/sparkR.R index c2583d9e3c64..16fe6f31bdc8 100644 --- a/R/pkg/R/sparkR.R +++ b/R/pkg/R/sparkR.R @@ -367,8 +367,8 @@ sparkR.session <- function( } if (!nzchar(master) || master_is_local(master)) { if (!dir.exists(sparkHome)) { - fmt <- paste("Spark not found in SPARK_HOME: %s.\n", - "Search in the cache directory.", + fmt <- paste0("Spark not found in SPARK_HOME: %s.\n", + "Search in the cache directory. 
", "It will be installed if not found.") msg <- sprintf(fmt, sparkHome) message(msg) From 503cb9fc2d4d54cf32defc835f568f30622e7925 Mon Sep 17 00:00:00 2001 From: Junyang Qian Date: Thu, 21 Jul 2016 00:48:20 -0700 Subject: [PATCH 08/49] fix R style issue: don't use absolute paths --- R/check-cran.sh | 2 +- R/pkg/R/install.R | 10 ++++++---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/R/check-cran.sh b/R/check-cran.sh index d80d4c41483a..b3a6860961c1 100755 --- a/R/check-cran.sh +++ b/R/check-cran.sh @@ -47,6 +47,6 @@ $FWDIR/create-docs.sh VERSION=`grep Version $FWDIR/pkg/DESCRIPTION | awk '{print $NF}'` -"$R_SCRIPT_PATH/"R CMD check --as-cran SparkR_"$VERSION".tar.gz +"$R_SCRIPT_PATH/"R CMD check --as-cran --no-tests SparkR_"$VERSION".tar.gz popd > /dev/null diff --git a/R/pkg/R/install.R b/R/pkg/R/install.R index 393b6a399b17..47f7facd16c3 100644 --- a/R/pkg/R/install.R +++ b/R/pkg/R/install.R @@ -25,8 +25,8 @@ #' specify a desired Hadoop version, the remote site, and the directory where #' the package is installed locally. #' -#' @param hadoop_version Version of Hadoop to install. 2.3, 2.4, 2.6, -#' and 2.7 (default) +#' @param hadoop_version Version of Hadoop to install, 2.4, 2.6, +#' 2.7 (default) and without #' @param mirror_url the base URL of the repositories to use #' @param local_dir local directory that Spark is installed to #' @return \code{install_spark} returns the local directory @@ -135,9 +135,11 @@ spark_cache_path <- function() { } } else if (.Platform$OS.type == "unix") { if (Sys.info()["sysname"] == "Darwin") { - path <- file.path("~/Library/Caches", "spark") + path <- file.path(Sys.getenv("HOME"), "Library/Caches", "spark") } else { - path <- file.path(Sys.getenv("XDG_CACHE_HOME", "~/.cache"), "spark") + path <- file.path( + Sys.getenv("XDG_CACHE_HOME", file.path(Sys.getenv("HOME"), ".cache")), + "spark") } } else { stop("Unknown OS") From f4522a6a103e208c2789e63a7475fec6825104c1 Mon Sep 17 00:00:00 2001 From: Junyang Qian Date: Thu, 21 Jul 2016 11:35:01 -0700 Subject: [PATCH 09/49] remove spark version function, and use return value of packageVersion instead --- R/pkg/R/install.R | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) diff --git a/R/pkg/R/install.R b/R/pkg/R/install.R index 47f7facd16c3..b6208df52bf8 100644 --- a/R/pkg/R/install.R +++ b/R/pkg/R/install.R @@ -41,7 +41,7 @@ #' @note install_spark since 2.1.0 install_spark <- function(hadoop_version = NULL, mirror_url = NULL, local_dir = NULL) { - version <- paste0("spark-", spark_version_default()) + version <- paste0("spark-", packageVersion("SparkR")) hadoop_version <- match.arg(hadoop_version, supported_versions_hadoop()) packageName <- ifelse(hadoop_version == "without", paste0(version, "-bin-without-hadoop"), @@ -72,10 +72,12 @@ install_spark <- function(hadoop_version = NULL, mirror_url = NULL, message("Remote URL not provided. 
Use Apache default.") mirror_url <- mirror_url_default() } - # This is temporary, should be removed when released - version <- "spark-releases/spark-2.0.0-rc4-bin" - packageRemotePath <- paste0(file.path(mirror_url, version, packageName), - ".tgz") + + version <- "spark-2.0.0-rc4-bin" + # When 2.0 released, remove the above line and + # change spark-releases to spark in the statement below + packageRemotePath <- paste0( + file.path(mirror_url, "spark-releases", version, packageName), ".tgz") fmt <- paste("Installing Spark %s for Hadoop %s.", "Downloading from:\n- %s", "Installing to:\n- %s", sep = "\n") @@ -138,7 +140,7 @@ spark_cache_path <- function() { path <- file.path(Sys.getenv("HOME"), "Library/Caches", "spark") } else { path <- file.path( - Sys.getenv("XDG_CACHE_HOME", file.path(Sys.getenv("HOME"), ".cache")), + Sys.getenv("XDG_CACHE_HOME", file.path(Sys.getenv("HOME"), ".cache")), "spark") } } else { @@ -150,11 +152,3 @@ spark_cache_path <- function() { mirror_url_csv <- function() { system.file("extdata", "spark_download.csv", package = "SparkR") } - -spark_version_default <- function() { - "2.0.0" -} - -hadoop_version_default <- function() { - "2.7" -} From 78d6f910b93ffb58dea15361d020b64cba6ec541 Mon Sep 17 00:00:00 2001 From: Junyang Qian Date: Thu, 21 Jul 2016 12:40:20 -0700 Subject: [PATCH 10/49] remove unnecessary dir create --- R/pkg/R/install.R | 1 - 1 file changed, 1 deletion(-) diff --git a/R/pkg/R/install.R b/R/pkg/R/install.R index b6208df52bf8..49cc84e0213a 100644 --- a/R/pkg/R/install.R +++ b/R/pkg/R/install.R @@ -67,7 +67,6 @@ install_spark <- function(hadoop_version = NULL, mirror_url = NULL, if (tarExists) { message("Tar file found. Installing...") } else { - dir.create(packageLocalDir, recursive = TRUE) if (is.null(mirror_url)) { message("Remote URL not provided. Use Apache default.") mirror_url <- mirror_url_default() From 9666e0627ace6486ada2078866ce575148485619 Mon Sep 17 00:00:00 2001 From: Junyang Qian Date: Mon, 25 Jul 2016 15:07:36 -0700 Subject: [PATCH 11/49] fix issue that dir.exists not available before 3.2.0 --- R/pkg/R/install.R | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/R/pkg/R/install.R b/R/pkg/R/install.R index 49cc84e0213a..62e6fb17d743 100644 --- a/R/pkg/R/install.R +++ b/R/pkg/R/install.R @@ -54,7 +54,9 @@ install_spark <- function(hadoop_version = NULL, mirror_url = NULL, packageLocalDir <- file.path(local_dir, packageName) - if (dir.exists(packageLocalDir)) { + + # can use dir.exists(packageLocalDir) under R 3.2.0 or later + if (!is.na(file.info(packageLocalDir)$isdir)) { fmt <- "Spark %s for Hadoop %s has been installed." msg <- sprintf(fmt, version, hadoop_version) message(msg) From e4fe0023b8847fb24da9f733948ac2f13493457c Mon Sep 17 00:00:00 2001 From: Junyang Qian Date: Mon, 25 Jul 2016 17:15:49 -0700 Subject: [PATCH 12/49] another dir.exists --- R/pkg/R/sparkR.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/pkg/R/sparkR.R b/R/pkg/R/sparkR.R index 16fe6f31bdc8..25e4f4587e74 100644 --- a/R/pkg/R/sparkR.R +++ b/R/pkg/R/sparkR.R @@ -366,7 +366,7 @@ sparkR.session <- function( overrideEnvs(sparkConfigMap, paramMap) } if (!nzchar(master) || master_is_local(master)) { - if (!dir.exists(sparkHome)) { + if (!is.na(file.info(sparkHome)$isdir)) { fmt <- paste0("Spark not found in SPARK_HOME: %s.\n", "Search in the cache directory. 
", "It will be installed if not found.") From c105d88978a6f35cf80180aef0825a1b74fc58df Mon Sep 17 00:00:00 2001 From: Junyang Qian Date: Mon, 25 Jul 2016 20:41:35 -0700 Subject: [PATCH 13/49] temporary change of test --- R/pkg/inst/tests/testthat/test_sparkSQL.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R index 3f3cb766b38f..39ed4febe54c 100644 --- a/R/pkg/inst/tests/testthat/test_sparkSQL.R +++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R @@ -1824,11 +1824,11 @@ test_that("describe() and summarize() on a DataFrame", { expect_equal(collect(stats)[2, "age"], "24.5") expect_equal(collect(stats)[3, "age"], "7.7781745930520225") stats <- describe(df) - expect_equal(collect(stats)[4, "name"], "Andy") + expect_equal(collect(stats)[4, "summary"], "min") expect_equal(collect(stats)[5, "age"], "30") stats2 <- summary(df) - expect_equal(collect(stats2)[4, "name"], "Andy") + expect_equal(collect(stats2)[4, "summary"], "min") expect_equal(collect(stats2)[5, "age"], "30") # SPARK-16425: SparkR summary() fails on column of type logical From d19853a4d6f8894cf2d6b9e8d96131d2c1bbd3f8 Mon Sep 17 00:00:00 2001 From: Junyang Qian Date: Mon, 25 Jul 2016 21:52:35 -0700 Subject: [PATCH 14/49] recover test changes --- R/pkg/inst/tests/testthat/test_sparkSQL.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R index 39ed4febe54c..3f3cb766b38f 100644 --- a/R/pkg/inst/tests/testthat/test_sparkSQL.R +++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R @@ -1824,11 +1824,11 @@ test_that("describe() and summarize() on a DataFrame", { expect_equal(collect(stats)[2, "age"], "24.5") expect_equal(collect(stats)[3, "age"], "7.7781745930520225") stats <- describe(df) - expect_equal(collect(stats)[4, "summary"], "min") + expect_equal(collect(stats)[4, "name"], "Andy") expect_equal(collect(stats)[5, "age"], "30") stats2 <- summary(df) - expect_equal(collect(stats2)[4, "summary"], "min") + expect_equal(collect(stats2)[4, "name"], "Andy") expect_equal(collect(stats2)[5, "age"], "30") # SPARK-16425: SparkR summary() fails on column of type logical From 124110a66118e6e8fcfd248aea03e54e9c4f1481 Mon Sep 17 00:00:00 2001 From: Junyang Qian Date: Tue, 26 Jul 2016 00:07:10 -0700 Subject: [PATCH 15/49] minor fix --- R/pkg/R/install.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/pkg/R/install.R b/R/pkg/R/install.R index 62e6fb17d743..210e584e7ff8 100644 --- a/R/pkg/R/install.R +++ b/R/pkg/R/install.R @@ -147,7 +147,7 @@ spark_cache_path <- function() { } else { stop("Unknown OS") } - normalizePath(path, mustWork = TRUE) + normalizePath(path, mustWork = FALSE) } mirror_url_csv <- function() { From 03b8320c988eda37af6a82bc3e375f3e51c02a3a Mon Sep 17 00:00:00 2001 From: Junyang Qian Date: Tue, 26 Jul 2016 02:51:48 -0700 Subject: [PATCH 16/49] fix docs --- R/pkg/NAMESPACE | 2 +- R/pkg/R/install.R | 82 ++++++++++++++++----------- R/pkg/R/sparkR.R | 2 +- R/pkg/inst/extdata/spark_download.csv | 2 - 4 files changed, 51 insertions(+), 37 deletions(-) delete mode 100644 R/pkg/inst/extdata/spark_download.csv diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE index 6552a8bd19c5..aaab92f5cfc7 100644 --- a/R/pkg/NAMESPACE +++ b/R/pkg/NAMESPACE @@ -353,4 +353,4 @@ S3method(structField, jobj) S3method(structType, jobj) S3method(structType, structField) -export("install_spark") +export("install.spark") diff --git 
a/R/pkg/R/install.R b/R/pkg/R/install.R index 210e584e7ff8..1e82dcfd403b 100644 --- a/R/pkg/R/install.R +++ b/R/pkg/R/install.R @@ -18,47 +18,63 @@ # Functions to install Spark in case the user directly downloads SparkR # from CRAN. -#' Download and Install Spark Core to Local Directory +#' Download and Install Apache Spark to a Local Directory #' -#' \code{install_spark} downloads and installs Spark to local directory if -#' it is not found. The Spark version we use is 2.0.0 (preview). Users can -#' specify a desired Hadoop version, the remote site, and the directory where -#' the package is installed locally. +#' \code{install.spark} downloads and installs Spark to local directory if +#' it is not found. The Spark version we use is the same as the SparkR version. +#' Users can specify a desired Hadoop version, the remote site, and +#' the directory where the package is installed locally. #' -#' @param hadoop_version Version of Hadoop to install, 2.4, 2.6, -#' 2.7 (default) and without -#' @param mirror_url the base URL of the repositories to use -#' @param local_dir local directory that Spark is installed to -#' @return \code{install_spark} returns the local directory +#' @param hadoopVersion Version of Hadoop to install. Default is without, +#' Spark's "Hadoop free" build. See +#' \href{http://spark.apache.org/docs/latest/hadoop-provided.html}{ +#' "Hadoop Free" Build} for more information. +#' @param mirrorUrl base URL of the repositories to use. The directory +#' layout should follow +#' \href{http://www.apache.org/dyn/closer.lua/spark/}{Apache mirrors}. +#' @param localDir a local directory where Spark is installed. Default path to +#' the cache directory: +#' \itemize{ +#' \item Mac OS X: \file{~/Library/Caches/spark} +#' \item Unix: \env{$XDG_CACHE_HOME} if defined, +#' otherwise \file{~/.cache/spark} +#' \item Win XP: \file{C:\\Documents and Settings\\< username +#' >\\Local Settings\\Application Data\\spark\\spark\\Cache} +#' \item Win Vista: \file{C:\\Users\\< username +#' >\\AppData\\Local\\spark\\spark\\Cache} +#' } +#' @return \code{install.spark} returns the local directory #' where Spark is found or installed -#' @rdname install_spark -#' @name install_spark +#' @rdname install.spark +#' @name install.spark #' @export #' @examples #'\dontrun{ -#' install_spark() +#' install.spark() #'} -#' @note install_spark since 2.1.0 -install_spark <- function(hadoop_version = NULL, mirror_url = NULL, - local_dir = NULL) { +#' @note install.spark since 2.1.0 +#' @seealso See available Hadoop versions: +#' \href{http://spark.apache.org/downloads.html}{Apache Spark} +install.spark <- function(hadoopVersion = NULL, mirrorUrl = NULL, + localDir = NULL) { version <- paste0("spark-", packageVersion("SparkR")) - hadoop_version <- match.arg(hadoop_version, supported_versions_hadoop()) - packageName <- ifelse(hadoop_version == "without", + hadoopVersion <- match.arg(hadoopVersion, supported_versions_hadoop()) + packageName <- ifelse(hadoopVersion == "without", paste0(version, "-bin-without-hadoop"), - paste0(version, "-bin-hadoop", hadoop_version)) - if (is.null(local_dir)) { - local_dir <- getOption("spark.install.dir", spark_cache_path()) + paste0(version, "-bin-hadoop", hadoopVersion)) + if (is.null(localDir)) { + localDir <- getOption("spark.install.dir", spark_cache_path()) } else { - local_dir <- normalizePath(local_dir) + localDir <- normalizePath(localDir) } - packageLocalDir <- file.path(local_dir, packageName) + packageLocalDir <- file.path(localDir, packageName) # can use 
dir.exists(packageLocalDir) under R 3.2.0 or later if (!is.na(file.info(packageLocalDir)$isdir)) { fmt <- "Spark %s for Hadoop %s has been installed." - msg <- sprintf(fmt, version, hadoop_version) + msg <- sprintf(fmt, version, hadoopVersion) message(msg) return(invisible(packageLocalDir)) } @@ -69,26 +85,26 @@ install_spark <- function(hadoop_version = NULL, mirror_url = NULL, if (tarExists) { message("Tar file found. Installing...") } else { - if (is.null(mirror_url)) { + if (is.null(mirrorUrl)) { message("Remote URL not provided. Use Apache default.") - mirror_url <- mirror_url_default() + mirrorUrl <- mirror_url_default() } version <- "spark-2.0.0-rc4-bin" # When 2.0 released, remove the above line and # change spark-releases to spark in the statement below packageRemotePath <- paste0( - file.path(mirror_url, "spark-releases", version, packageName), ".tgz") + file.path(mirrorUrl, "spark-releases", version, packageName), ".tgz") fmt <- paste("Installing Spark %s for Hadoop %s.", "Downloading from:\n- %s", "Installing to:\n- %s", sep = "\n") - msg <- sprintf(fmt, version, hadoop_version, packageRemotePath, + msg <- sprintf(fmt, version, hadoopVersion, packageRemotePath, packageLocalDir) message(msg) fetchFail <- tryCatch(download.file(packageRemotePath, packageLocalPath), error = function(e) { - msg <- paste0("Fetch failed from ", mirror_url, ".") + msg <- paste0("Fetch failed from ", mirrorUrl, ".") message(msg) TRUE }) @@ -96,18 +112,18 @@ install_spark <- function(hadoop_version = NULL, mirror_url = NULL, message("Try the backup option.") mirror_sites <- tryCatch(read.csv(mirror_url_csv()), error = function(e) stop("No csv file found.")) - mirror_url <- mirror_sites$url[1] - packageRemotePath <- paste0(file.path(mirror_url, version, packageName), + mirrorUrl <- mirror_sites$url[1] + packageRemotePath <- paste0(file.path(mirrorUrl, version, packageName), ".tgz") message(sprintf("Downloading from:\n- %s", packageRemotePath)) tryCatch(download.file(packageRemotePath, packageLocalPath), error = function(e) { - stop("Download failed. Please provide a valid mirror_url.") + stop("Download failed. 
Please provide a valid mirrorUrl.") }) } } - untar(tarfile = packageLocalPath, exdir = local_dir) + untar(tarfile = packageLocalPath, exdir = localDir) if (!tarExists) { unlink(packageLocalPath) } diff --git a/R/pkg/R/sparkR.R b/R/pkg/R/sparkR.R index 25e4f4587e74..932c15a93031 100644 --- a/R/pkg/R/sparkR.R +++ b/R/pkg/R/sparkR.R @@ -372,7 +372,7 @@ sparkR.session <- function( "It will be installed if not found.") msg <- sprintf(fmt, sparkHome) message(msg) - packageLocalDir <- install_spark() + packageLocalDir <- install.spark() sparkHome <- packageLocalDir } else { fmt <- "Make sure that Spark is installed in SPARK_HOME: %s" diff --git a/R/pkg/inst/extdata/spark_download.csv b/R/pkg/inst/extdata/spark_download.csv deleted file mode 100644 index 4fd3223eeaac..000000000000 --- a/R/pkg/inst/extdata/spark_download.csv +++ /dev/null @@ -1,2 +0,0 @@ -"url","default" -"http://apache.osuosl.org",TRUE From d727be8eecf7217af57f4ca3b74e056b87eaf6fc Mon Sep 17 00:00:00 2001 From: Junyang Qian Date: Tue, 26 Jul 2016 02:52:33 -0700 Subject: [PATCH 17/49] delete csv file option --- R/pkg/R/install.R | 20 +------------------- 1 file changed, 1 insertion(+), 19 deletions(-) diff --git a/R/pkg/R/install.R b/R/pkg/R/install.R index 1e82dcfd403b..c0ccf0b5181e 100644 --- a/R/pkg/R/install.R +++ b/R/pkg/R/install.R @@ -70,7 +70,6 @@ install.spark <- function(hadoopVersion = NULL, mirrorUrl = NULL, packageLocalDir <- file.path(localDir, packageName) - # can use dir.exists(packageLocalDir) under R 3.2.0 or later if (!is.na(file.info(packageLocalDir)$isdir)) { fmt <- "Spark %s for Hadoop %s has been installed." @@ -108,19 +107,6 @@ install.spark <- function(hadoopVersion = NULL, mirrorUrl = NULL, message(msg) TRUE }) - if (fetchFail) { - message("Try the backup option.") - mirror_sites <- tryCatch(read.csv(mirror_url_csv()), - error = function(e) stop("No csv file found.")) - mirrorUrl <- mirror_sites$url[1] - packageRemotePath <- paste0(file.path(mirrorUrl, version, packageName), - ".tgz") - message(sprintf("Downloading from:\n- %s", packageRemotePath)) - tryCatch(download.file(packageRemotePath, packageLocalPath), - error = function(e) { - stop("Download failed. Please provide a valid mirrorUrl.") - }) - } } untar(tarfile = packageLocalPath, exdir = localDir) @@ -139,7 +125,7 @@ mirror_url_default <- function() { } supported_versions_hadoop <- function() { - c("2.7", "2.6", "2.4", "without") + c("without", "2.7", "2.6", "2.4") } spark_cache_path <- function() { @@ -165,7 +151,3 @@ spark_cache_path <- function() { } normalizePath(path, mustWork = FALSE) } - -mirror_url_csv <- function() { - system.file("extdata", "spark_download.csv", package = "SparkR") -} From ab3789f621bfe8ca82daf954a12ec0f918673980 Mon Sep 17 00:00:00 2001 From: Junyang Qian Date: Tue, 26 Jul 2016 02:56:32 -0700 Subject: [PATCH 18/49] rename functions of default mirror and check local master --- R/pkg/R/install.R | 4 ++-- R/pkg/R/sparkR.R | 2 +- R/pkg/R/utils.R | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/R/pkg/R/install.R b/R/pkg/R/install.R index c0ccf0b5181e..fb6e72f61e72 100644 --- a/R/pkg/R/install.R +++ b/R/pkg/R/install.R @@ -86,7 +86,7 @@ install.spark <- function(hadoopVersion = NULL, mirrorUrl = NULL, } else { if (is.null(mirrorUrl)) { message("Remote URL not provided. 
Use Apache default.") - mirrorUrl <- mirror_url_default() + mirrorUrl <- default_mirror_url() } version <- "spark-2.0.0-rc4-bin" @@ -117,7 +117,7 @@ install.spark <- function(hadoopVersion = NULL, mirrorUrl = NULL, invisible(packageLocalDir) } -mirror_url_default <- function() { +default_mirror_url <- function() { # change to http://www.apache.org/dyn/closer.lua # when released diff --git a/R/pkg/R/sparkR.R b/R/pkg/R/sparkR.R index 932c15a93031..e50d55dc314e 100644 --- a/R/pkg/R/sparkR.R +++ b/R/pkg/R/sparkR.R @@ -365,7 +365,7 @@ sparkR.session <- function( } overrideEnvs(sparkConfigMap, paramMap) } - if (!nzchar(master) || master_is_local(master)) { + if (!nzchar(master) || is_master_local(master)) { if (!is.na(file.info(sparkHome)$isdir)) { fmt <- paste0("Spark not found in SPARK_HOME: %s.\n", "Search in the cache directory. ", diff --git a/R/pkg/R/utils.R b/R/pkg/R/utils.R index f0003e8891a1..5a61283c7632 100644 --- a/R/pkg/R/utils.R +++ b/R/pkg/R/utils.R @@ -690,6 +690,6 @@ getSparkContext <- function() { sc } -master_is_local <- function(master) { +is_master_local <- function(master) { grepl("^local(\\[[0-9\\*]*\\])?$", master, perl = TRUE) } From 626e4a1993839e3d1d7a5b64d85e2a40eac8dc31 Mon Sep 17 00:00:00 2001 From: Junyang Qian Date: Tue, 26 Jul 2016 03:12:19 -0700 Subject: [PATCH 19/49] fix mustWork issue in normalizePath, and create cache folder if not found --- R/pkg/R/install.R | 6 +++++- R/pkg/R/utils.R | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/R/pkg/R/install.R b/R/pkg/R/install.R index fb6e72f61e72..7ed2355e7a6a 100644 --- a/R/pkg/R/install.R +++ b/R/pkg/R/install.R @@ -65,7 +65,11 @@ install.spark <- function(hadoopVersion = NULL, mirrorUrl = NULL, if (is.null(localDir)) { localDir <- getOption("spark.install.dir", spark_cache_path()) } else { - localDir <- normalizePath(localDir) + localDir <- normalizePath(localDir, mustWork = FALSE) + } + + if (is.na(file.info(localDir)$isdir)) { + dir.create(localDir, recursive = TRUE) } packageLocalDir <- file.path(localDir, packageName) diff --git a/R/pkg/R/utils.R b/R/pkg/R/utils.R index 5a61283c7632..d638b72dd85c 100644 --- a/R/pkg/R/utils.R +++ b/R/pkg/R/utils.R @@ -691,5 +691,5 @@ getSparkContext <- function() { } is_master_local <- function(master) { - grepl("^local(\\[[0-9\\*]*\\])?$", master, perl = TRUE) + grepl("^local(\\[[0-9\\*]+\\])?$", master, perl = TRUE) } From 785de932b5173136d86b13de508955a746929375 Mon Sep 17 00:00:00 2001 From: Junyang Qian Date: Tue, 26 Jul 2016 03:16:47 -0700 Subject: [PATCH 20/49] output OS name if not matched --- R/pkg/R/install.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/pkg/R/install.R b/R/pkg/R/install.R index 7ed2355e7a6a..b6f2209bf384 100644 --- a/R/pkg/R/install.R +++ b/R/pkg/R/install.R @@ -151,7 +151,7 @@ spark_cache_path <- function() { "spark") } } else { - stop("Unknown OS") + stop(sprintf("Unknown OS: %s", .Platform$OS.type)) } normalizePath(path, mustWork = FALSE) } From e3fa259c6d07a3ea0a7717b530d36f6a522a3e69 Mon Sep 17 00:00:00 2001 From: Junyang Qian Date: Tue, 26 Jul 2016 03:31:45 -0700 Subject: [PATCH 21/49] more specific message and reference for spark_cache_path --- R/pkg/R/install.R | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/R/pkg/R/install.R b/R/pkg/R/install.R index b6f2209bf384..d5a88c63cd0d 100644 --- a/R/pkg/R/install.R +++ b/R/pkg/R/install.R @@ -132,12 +132,16 @@ supported_versions_hadoop <- function() { c("without", "2.7", "2.6", "2.4") } +# This function adapts the implementation of 
the cache function in +# https://github.com/hadley/rappdirs/blob/master/R/cache.r +# to Spark context. spark_cache_path <- function() { if (.Platform$OS.type == "windows") { winAppPath <- Sys.getenv("%LOCALAPPDATA%", unset = NA) if (is.null(winAppPath)) { msg <- paste("%LOCALAPPDATA% not found.", - "Please define or enter an installation path in loc_dir.") + "Please define the environment variable", + "or restart and enter an installation path in localDir.") stop(msg) } else { path <- file.path(winAppPath, "spark", "spark", "Cache") From cf0f66db23757d5267d4ac8a69733281b82d2bec Mon Sep 17 00:00:00 2001 From: Junyang Qian Date: Tue, 26 Jul 2016 03:44:06 -0700 Subject: [PATCH 22/49] more precise message under free build --- R/pkg/R/install.R | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/R/pkg/R/install.R b/R/pkg/R/install.R index d5a88c63cd0d..e84bc04f3334 100644 --- a/R/pkg/R/install.R +++ b/R/pkg/R/install.R @@ -77,7 +77,8 @@ install.spark <- function(hadoopVersion = NULL, mirrorUrl = NULL, # can use dir.exists(packageLocalDir) under R 3.2.0 or later if (!is.na(file.info(packageLocalDir)$isdir)) { fmt <- "Spark %s for Hadoop %s has been installed." - msg <- sprintf(fmt, version, hadoopVersion) + msg <- sprintf(fmt, ifelse(version == "without", "Free build", version), + hadoopVersion) message(msg) return(invisible(packageLocalDir)) } @@ -101,8 +102,8 @@ install.spark <- function(hadoopVersion = NULL, mirrorUrl = NULL, fmt <- paste("Installing Spark %s for Hadoop %s.", "Downloading from:\n- %s", "Installing to:\n- %s", sep = "\n") - msg <- sprintf(fmt, version, hadoopVersion, packageRemotePath, - packageLocalDir) + msg <- sprintf(fmt, ifelse(version == "without", "Free build", version), + hadoopVersion, packageRemotePath, packageLocalDir) message(msg) fetchFail <- tryCatch(download.file(packageRemotePath, packageLocalPath), From 4f4a899f3955db7f0cc004fa71c89f0d873435c6 Mon Sep 17 00:00:00 2001 From: Junyang Qian Date: Tue, 26 Jul 2016 04:21:14 -0700 Subject: [PATCH 23/49] fix typo on hadoop version message --- R/pkg/R/install.R | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/R/pkg/R/install.R b/R/pkg/R/install.R index e84bc04f3334..78655a4843b0 100644 --- a/R/pkg/R/install.R +++ b/R/pkg/R/install.R @@ -77,8 +77,9 @@ install.spark <- function(hadoopVersion = NULL, mirrorUrl = NULL, # can use dir.exists(packageLocalDir) under R 3.2.0 or later if (!is.na(file.info(packageLocalDir)$isdir)) { fmt <- "Spark %s for Hadoop %s has been installed." 
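# --- Editorial illustration (not part of the diff): what patch 23 fixes. ---
# Patch 22 put the ifelse() on the wrong sprintf() slot and tested `version`,
# which always holds a string like "spark-2.0.0" and so can never equal
# "without"; only `hadoopVersion` can. A minimal reproduction, assuming the
# variable values the surrounding code would produce:
version <- "spark-2.0.0"
hadoopVersion <- "without"
fmt <- "Spark %s for Hadoop %s has been installed."
# Patch 22's form: "Free build" can never be selected.
sprintf(fmt, ifelse(version == "without", "Free build", version), hadoopVersion)
#> "Spark spark-2.0.0 for Hadoop without has been installed."
# Patch 23's form tests hadoopVersion and fills the Hadoop slot as intended.
sprintf(fmt, version, ifelse(hadoopVersion == "without", "Free build", hadoopVersion))
#> "Spark spark-2.0.0 for Hadoop Free build has been installed."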
- msg <- sprintf(fmt, ifelse(version == "without", "Free build", version), - hadoopVersion) + msg <- sprintf(fmt, version, ifelse(hadoopVersion == "without", + "Free build", hadoopVersion) + ) message(msg) return(invisible(packageLocalDir)) } @@ -102,8 +103,9 @@ install.spark <- function(hadoopVersion = NULL, mirrorUrl = NULL, fmt <- paste("Installing Spark %s for Hadoop %s.", "Downloading from:\n- %s", "Installing to:\n- %s", sep = "\n") - msg <- sprintf(fmt, ifelse(version == "without", "Free build", version), - hadoopVersion, packageRemotePath, packageLocalDir) + msg <- sprintf(fmt, version, ifelse(hadoopVersion == "without", + "Free build", hadoopVersion), + packageRemotePath, packageLocalDir) message(msg) fetchFail <- tryCatch(download.file(packageRemotePath, packageLocalPath), From 14e49439fd9a7cf9044c5c9ee4d5b9f3434a15ec Mon Sep 17 00:00:00 2001 From: Junyang Qian Date: Tue, 26 Jul 2016 10:19:06 -0700 Subject: [PATCH 24/49] remove the setting of global spark.install.dir option --- R/pkg/R/install.R | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/R/pkg/R/install.R b/R/pkg/R/install.R index 78655a4843b0..aa7dd8131013 100644 --- a/R/pkg/R/install.R +++ b/R/pkg/R/install.R @@ -62,11 +62,8 @@ install.spark <- function(hadoopVersion = NULL, mirrorUrl = NULL, packageName <- ifelse(hadoopVersion == "without", paste0(version, "-bin-without-hadoop"), paste0(version, "-bin-hadoop", hadoopVersion)) - if (is.null(localDir)) { - localDir <- getOption("spark.install.dir", spark_cache_path()) - } else { - localDir <- normalizePath(localDir, mustWork = FALSE) - } + localDir <- ifelse(is.null(localDir), spark_cache_path(), + normalizePath(localDir, mustWork = FALSE)) if (is.na(file.info(localDir)$isdir)) { dir.create(localDir, recursive = TRUE) From 976472f30bd1741230e384b744339e94c851d225 Mon Sep 17 00:00:00 2001 From: Junyang Qian Date: Tue, 26 Jul 2016 11:21:44 -0700 Subject: [PATCH 25/49] add overwrite option and adjust layout (#chars per line) of doc --- R/pkg/R/install.R | 66 ++++++++++++++++++++++++----------------------- 1 file changed, 34 insertions(+), 32 deletions(-) diff --git a/R/pkg/R/install.R b/R/pkg/R/install.R index aa7dd8131013..edf62a951fc6 100644 --- a/R/pkg/R/install.R +++ b/R/pkg/R/install.R @@ -25,24 +25,24 @@ #' Users can specify a desired Hadoop version, the remote site, and #' the directory where the package is installed locally. #' -#' @param hadoopVersion Version of Hadoop to install. Default is without, -#' Spark's "Hadoop free" build. See -#' \href{http://spark.apache.org/docs/latest/hadoop-provided.html}{ -#' "Hadoop Free" Build} for more information. -#' @param mirrorUrl base URL of the repositories to use. The directory -#' layout should follow -#' \href{http://www.apache.org/dyn/closer.lua/spark/}{Apache mirrors}. -#' @param localDir a local directory where Spark is installed. Default path to -#' the cache directory: -#' \itemize{ -#' \item Mac OS X: \file{~/Library/Caches/spark} -#' \item Unix: \env{$XDG_CACHE_HOME} if defined, -#' otherwise \file{~/.cache/spark} -#' \item Win XP: \file{C:\\Documents and Settings\\< username -#' >\\Local Settings\\Application Data\\spark\\spark\\Cache} -#' \item Win Vista: \file{C:\\Users\\< username -#' >\\AppData\\Local\\spark\\spark\\Cache} -#' } +#' @param hadoopVersion Version of Hadoop to install. Default is without, Spark's "Hadoop free" +#' build. See +#' \href{http://spark.apache.org/docs/latest/hadoop-provided.html}{ +#' "Hadoop Free" Build} for more information. 
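# --- Editorial illustration (not a diff line): how mirrorUrl is used. ---
# install.spark() assembles the download path as
# file.path(mirrorUrl, <release directory>, <package name>) plus ".tgz".
# Example values only, assuming the released layout that patch 09's comment
# says will replace the temporary "spark-releases" RC path:
mirrorUrl <- "http://apache.osuosl.org"   # sample mirror from patch 01's CSV
version <- "spark-2.0.0"
packageName <- paste(version, "bin", "hadoop2.7", sep = "-")
paste0(file.path(mirrorUrl, "spark", version, packageName), ".tgz")
#> "http://apache.osuosl.org/spark/spark-2.0.0/spark-2.0.0-bin-hadoop2.7.tgz"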
+#' @param mirrorUrl base URL of the repositories to use. The directory layout should follow +#' \href{http://www.apache.org/dyn/closer.lua/spark/}{Apache mirrors}. +#' @param localDir a local directory where Spark is installed. Default path to the cache directory: +#' \itemize{ +#' \item Mac OS X: \file{~/Library/Caches/spark} +#' \item Unix: \env{$XDG_CACHE_HOME} if defined, otherwise \file{~/.cache/spark} +#' \item Win XP: +# \file{C:\\Documents and Settings\\\\Local Settings\\Application +#' Data\\spark\\spark\\Cache} +#' \item Win Vista: +#' \file{C:\\Users\\\\AppData\\Local\\spark\\spark\\Cache} +#' } +#' @param overwrite If \code{TRUE}, download and overwrite the existing tar file and force +#' re-install Spark (in case the local directory or file is corrupted) #' @return \code{install.spark} returns the local directory #' where Spark is found or installed #' @rdname install.spark @@ -56,7 +56,7 @@ #' @seealso See available Hadoop versions: #' \href{http://spark.apache.org/downloads.html}{Apache Spark} install.spark <- function(hadoopVersion = NULL, mirrorUrl = NULL, - localDir = NULL) { + localDir = NULL, overwrite = FALSE) { version <- paste0("spark-", packageVersion("SparkR")) hadoopVersion <- match.arg(hadoopVersion, supported_versions_hadoop()) packageName <- ifelse(hadoopVersion == "without", @@ -71,12 +71,16 @@ install.spark <- function(hadoopVersion = NULL, mirrorUrl = NULL, packageLocalDir <- file.path(localDir, packageName) + if (overwrite) { + message("Overwrite = TRUE: download and overwrite the Spark directory if + it exists.") + } + # can use dir.exists(packageLocalDir) under R 3.2.0 or later - if (!is.na(file.info(packageLocalDir)$isdir)) { + if (!is.na(file.info(packageLocalDir)$isdir) && !overwrite) { fmt <- "Spark %s for Hadoop %s has been installed." msg <- sprintf(fmt, version, ifelse(hadoopVersion == "without", - "Free build", hadoopVersion) - ) + "Free build", hadoopVersion)) message(msg) return(invisible(packageLocalDir)) } @@ -84,7 +88,7 @@ install.spark <- function(hadoopVersion = NULL, mirrorUrl = NULL, packageLocalPath <- paste0(packageLocalDir, ".tgz") tarExists <- file.exists(packageLocalPath) - if (tarExists) { + if (tarExists && !overwrite) { message("Tar file found. 
Installing...") } else { if (is.null(mirrorUrl)) { @@ -104,17 +108,16 @@ install.spark <- function(hadoopVersion = NULL, mirrorUrl = NULL, "Free build", hadoopVersion), packageRemotePath, packageLocalDir) message(msg) - - fetchFail <- tryCatch(download.file(packageRemotePath, packageLocalPath), - error = function(e) { - msg <- paste0("Fetch failed from ", mirrorUrl, ".") - message(msg) - TRUE - }) + + tryCatch(download.file(packageRemotePath, packageLocalPath), + error = function(e) { + msg <- paste0("Fetch failed from ", mirrorUrl, ".") + stop(msg) + }) } untar(tarfile = packageLocalPath, exdir = localDir) - if (!tarExists) { + if (!tarExists || overwrite) { unlink(packageLocalPath) } message("Installation done.") @@ -124,7 +127,6 @@ install.spark <- function(hadoopVersion = NULL, mirrorUrl = NULL, default_mirror_url <- function() { # change to http://www.apache.org/dyn/closer.lua # when released - "http://people.apache.org/~pwendell" } From 8821b56ff58b6337eaa21373838649be5d8c1c49 Mon Sep 17 00:00:00 2001 From: Junyang Qian Date: Tue, 26 Jul 2016 12:13:40 -0700 Subject: [PATCH 26/49] remove hardcoded hadoop versions --- R/pkg/R/install.R | 48 +++++++++++++++++++++++++---------------------- 1 file changed, 26 insertions(+), 22 deletions(-) diff --git a/R/pkg/R/install.R b/R/pkg/R/install.R index edf62a951fc6..9f4370595cdf 100644 --- a/R/pkg/R/install.R +++ b/R/pkg/R/install.R @@ -25,10 +25,11 @@ #' Users can specify a desired Hadoop version, the remote site, and #' the directory where the package is installed locally. #' -#' @param hadoopVersion Version of Hadoop to install. Default is without, Spark's "Hadoop free" +#' @param hadoopVersion Version of Hadoop to install. Default is "without", Spark's "Hadoop free" #' build. See #' \href{http://spark.apache.org/docs/latest/hadoop-provided.html}{ -#' "Hadoop Free" Build} for more information. +#' "Hadoop Free" Build} for more information. It can be in format of +#' "int.int" (for example, "2.7") or "cdh4" #' @param mirrorUrl base URL of the repositories to use. The directory layout should follow #' \href{http://www.apache.org/dyn/closer.lua/spark/}{Apache mirrors}. #' @param localDir a local directory where Spark is installed. 
Default path to the cache directory: @@ -55,13 +56,12 @@ #' @note install.spark since 2.1.0 #' @seealso See available Hadoop versions: #' \href{http://spark.apache.org/downloads.html}{Apache Spark} -install.spark <- function(hadoopVersion = NULL, mirrorUrl = NULL, +install.spark <- function(hadoopVersion = "without", mirrorUrl = NULL, localDir = NULL, overwrite = FALSE) { version <- paste0("spark-", packageVersion("SparkR")) - hadoopVersion <- match.arg(hadoopVersion, supported_versions_hadoop()) - packageName <- ifelse(hadoopVersion == "without", - paste0(version, "-bin-without-hadoop"), - paste0(version, "-bin-hadoop", hadoopVersion)) + hadoopVersion <- tolower(hadoopVersion) + hadoopVersionName <- hadoop_version_name(hadoopVersion) + packageName <- paste(version, "bin", hadoopVersionName, sep = "-") localDir <- ifelse(is.null(localDir), spark_cache_path(), normalizePath(localDir, mustWork = FALSE)) @@ -72,15 +72,13 @@ install.spark <- function(hadoopVersion = NULL, mirrorUrl = NULL, packageLocalDir <- file.path(localDir, packageName) if (overwrite) { - message("Overwrite = TRUE: download and overwrite the Spark directory if - it exists.") + message("Overwrite = TRUE: download and overwrite the Spark directory if it exists.") } # can use dir.exists(packageLocalDir) under R 3.2.0 or later if (!is.na(file.info(packageLocalDir)$isdir) && !overwrite) { fmt <- "Spark %s for Hadoop %s has been installed." - msg <- sprintf(fmt, version, ifelse(hadoopVersion == "without", - "Free build", hadoopVersion)) + msg <- sprintf(fmt, version, ifelse(hadoopVersion == "without", "Free build", hadoopVersion)) message(msg) return(invisible(packageLocalDir)) } @@ -92,8 +90,8 @@ install.spark <- function(hadoopVersion = NULL, mirrorUrl = NULL, message("Tar file found. Installing...") } else { if (is.null(mirrorUrl)) { - message("Remote URL not provided. Use Apache default.") mirrorUrl <- default_mirror_url() + message(sprintf("Remote URL not provided. Use Apache default: %s", mirrorUrl)) } version <- "spark-2.0.0-rc4-bin" @@ -104,14 +102,15 @@ install.spark <- function(hadoopVersion = NULL, mirrorUrl = NULL, fmt <- paste("Installing Spark %s for Hadoop %s.", "Downloading from:\n- %s", "Installing to:\n- %s", sep = "\n") - msg <- sprintf(fmt, version, ifelse(hadoopVersion == "without", - "Free build", hadoopVersion), + msg <- sprintf(fmt, version, ifelse(hadoopVersion == "without", "Free build", hadoopVersion), packageRemotePath, packageLocalDir) message(msg) tryCatch(download.file(packageRemotePath, packageLocalPath), error = function(e) { - msg <- paste0("Fetch failed from ", mirrorUrl, ".") + msg <- paste0("Fetch failed from ", mirrorUrl, ".", + "Please check the Hadoop version,", + "retry or provide another mirror site.") stop(msg) }) } @@ -130,13 +129,19 @@ default_mirror_url <- function() { "http://people.apache.org/~pwendell" } -supported_versions_hadoop <- function() { - c("without", "2.7", "2.6", "2.4") +hadoop_version_name <- function(hadoopVersion) { + if (hadoopVersion == "without") { + "without-hadoop" + } + if (grepl("^[0-9]+\\.[0-9]+$", hadoopVersion, perl = TRUE)) { + paste0("hadoop", hadoopVersion) + } else { + hadoopVersion + } } -# This function adapts the implementation of the cache function in -# https://github.com/hadley/rappdirs/blob/master/R/cache.r -# to Spark context. 
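# --- Editorial sketch (not part of the diff): the cache rule above, with two
# caveats worth flagging. Sys.getenv() takes the bare variable name, so
# "%LOCALAPPDATA%" (cmd.exe expansion syntax) looks up a variable that does
# not exist; and Sys.getenv(..., unset = NA) returns NA, which is.null()
# never detects. A version with those assumptions corrected:
spark_cache_path_sketch <- function() {
  if (.Platform$OS.type == "windows") {
    winAppPath <- Sys.getenv("LOCALAPPDATA", unset = NA)
    if (is.na(winAppPath)) {
      stop("LOCALAPPDATA not found. Please set it or pass localDir.")
    }
    path <- file.path(winAppPath, "spark", "spark", "Cache")
  } else if (Sys.info()["sysname"] == "Darwin") {
    path <- file.path(Sys.getenv("HOME"), "Library/Caches", "spark")
  } else {
    path <- file.path(
      Sys.getenv("XDG_CACHE_HOME", file.path(Sys.getenv("HOME"), ".cache")),
      "spark")
  }
  normalizePath(path, mustWork = FALSE)
}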
+# The implementation refers to appdirs package: https://pypi.python.org/pypi/appdirs and adapt +# to Spark context spark_cache_path <- function() { if (.Platform$OS.type == "windows") { winAppPath <- Sys.getenv("%LOCALAPPDATA%", unset = NA) @@ -153,8 +158,7 @@ spark_cache_path <- function() { path <- file.path(Sys.getenv("HOME"), "Library/Caches", "spark") } else { path <- file.path( - Sys.getenv("XDG_CACHE_HOME", file.path(Sys.getenv("HOME"), ".cache")), - "spark") + Sys.getenv("XDG_CACHE_HOME", file.path(Sys.getenv("HOME"), ".cache")), "spark") } } else { stop(sprintf("Unknown OS: %s", .Platform$OS.type)) From 328408b01a713791d9dceb8fd2d9c22a6f5071d0 Mon Sep 17 00:00:00 2001 From: Junyang Qian Date: Tue, 26 Jul 2016 13:31:23 -0700 Subject: [PATCH 27/49] edit doc of install.spark --- R/pkg/R/install.R | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/R/pkg/R/install.R b/R/pkg/R/install.R index 9f4370595cdf..54ba849dd3df 100644 --- a/R/pkg/R/install.R +++ b/R/pkg/R/install.R @@ -20,32 +20,34 @@ #' Download and Install Apache Spark to a Local Directory #' -#' \code{install.spark} downloads and installs Spark to local directory if +#' \code{install.spark} downloads and installs Spark to a local directory if #' it is not found. The Spark version we use is the same as the SparkR version. -#' Users can specify a desired Hadoop version, the remote site, and +#' Users can specify a desired Hadoop version, the remote mirror site, and #' the directory where the package is installed locally. #' -#' @param hadoopVersion Version of Hadoop to install. Default is "without", Spark's "Hadoop free" -#' build. See +#' @param hadoopVersion Version of Hadoop to install. Default is \code{"without"}, Spark's +#' "Hadoop free" build. See #' \href{http://spark.apache.org/docs/latest/hadoop-provided.html}{ -#' "Hadoop Free" Build} for more information. It can be in format of -#' "int.int" (for example, "2.7") or "cdh4" +#' "Hadoop Free" Build} for more information. It can also be version number +#' in the format of "int.int", e.g. "2.7", or other patched version names, e.g. +#' "cdh4" #' @param mirrorUrl base URL of the repositories to use. The directory layout should follow #' \href{http://www.apache.org/dyn/closer.lua/spark/}{Apache mirrors}. -#' @param localDir a local directory where Spark is installed. Default path to the cache directory: +#' @param localDir a local directory where Spark is installed. The directory contains +#' version-specific folders of Spark packages. 
Default is path to +#' the cache directory: #' \itemize{ #' \item Mac OS X: \file{~/Library/Caches/spark} #' \item Unix: \env{$XDG_CACHE_HOME} if defined, otherwise \file{~/.cache/spark} #' \item Win XP: -# \file{C:\\Documents and Settings\\\\Local Settings\\Application +#' \file{C:\\Documents and Settings\\\\Local Settings\\Application #' Data\\spark\\spark\\Cache} #' \item Win Vista: #' \file{C:\\Users\\\\AppData\\Local\\spark\\spark\\Cache} #' } -#' @param overwrite If \code{TRUE}, download and overwrite the existing tar file and force -#' re-install Spark (in case the local directory or file is corrupted) -#' @return \code{install.spark} returns the local directory -#' where Spark is found or installed +#' @param overwrite If \code{TRUE}, download and overwrite the existing tar file in localDir +#' and force re-install Spark (in case the local directory or file is corrupted) +#' @return \code{install.spark} returns the local directory where Spark is found or installed #' @rdname install.spark #' @name install.spark #' @export @@ -72,7 +74,8 @@ install.spark <- function(hadoopVersion = "without", mirrorUrl = NULL, packageLocalDir <- file.path(localDir, packageName) if (overwrite) { - message("Overwrite = TRUE: download and overwrite the Spark directory if it exists.") + message("Overwrite = TRUE: download and overwrite the tar file and Spark package directory + if they exist.") } # can use dir.exists(packageLocalDir) under R 3.2.0 or later @@ -87,7 +90,7 @@ install.spark <- function(hadoopVersion = "without", mirrorUrl = NULL, tarExists <- file.exists(packageLocalPath) if (tarExists && !overwrite) { - message("Tar file found. Installing...") + message("tar file found. Installing...") } else { if (is.null(mirrorUrl)) { mirrorUrl <- default_mirror_url() @@ -132,8 +135,7 @@ default_mirror_url <- function() { hadoop_version_name <- function(hadoopVersion) { if (hadoopVersion == "without") { "without-hadoop" - } - if (grepl("^[0-9]+\\.[0-9]+$", hadoopVersion, perl = TRUE)) { + } else if (grepl("^[0-9]+\\.[0-9]+$", hadoopVersion, perl = TRUE)) { paste0("hadoop", hadoopVersion) } else { hadoopVersion From 009161524eac3308cf992731910274810b204029 Mon Sep 17 00:00:00 2001 From: Junyang Qian Date: Tue, 26 Jul 2016 16:12:42 -0700 Subject: [PATCH 28/49] check installation of spark only when using ordinary R console --- R/pkg/R/install.R | 2 +- R/pkg/R/sparkR.R | 28 +++++++++++++++------------- 2 files changed, 16 insertions(+), 14 deletions(-) diff --git a/R/pkg/R/install.R b/R/pkg/R/install.R index 54ba849dd3df..7153ba1fc676 100644 --- a/R/pkg/R/install.R +++ b/R/pkg/R/install.R @@ -80,7 +80,7 @@ install.spark <- function(hadoopVersion = "without", mirrorUrl = NULL, # can use dir.exists(packageLocalDir) under R 3.2.0 or later if (!is.na(file.info(packageLocalDir)$isdir) && !overwrite) { - fmt <- "Spark %s for Hadoop %s has been installed." + fmt <- "Spark %s for Hadoop %s is found." msg <- sprintf(fmt, version, ifelse(hadoopVersion == "without", "Free build", hadoopVersion)) message(msg) return(invisible(packageLocalDir)) diff --git a/R/pkg/R/sparkR.R b/R/pkg/R/sparkR.R index e50d55dc314e..9b0d592503f7 100644 --- a/R/pkg/R/sparkR.R +++ b/R/pkg/R/sparkR.R @@ -365,19 +365,21 @@ sparkR.session <- function( } overrideEnvs(sparkConfigMap, paramMap) } - if (!nzchar(master) || is_master_local(master)) { - if (!is.na(file.info(sparkHome)$isdir)) { - fmt <- paste0("Spark not found in SPARK_HOME: %s.\n", - "Search in the cache directory. 
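# An aside (illustrative, not from the patch series): the grepl() guard in this
# hunk keys off R_PROFILE_USER, which the sparkR shell presumably points at a
# shell.R startup profile, so the auto-install check only runs from a plain R
# console (the path below is hypothetical):
Sys.setenv(R_PROFILE_USER = "/path/to/SparkR/profile/shell.R")
grepl(".*shell\\.R$", Sys.getenv("R_PROFILE_USER"), perl = TRUE)  # TRUE  -> skip the check
Sys.setenv(R_PROFILE_USER = "")
grepl(".*shell\\.R$", Sys.getenv("R_PROFILE_USER"), perl = TRUE)  # FALSE -> check/install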
", - "It will be installed if not found.") - msg <- sprintf(fmt, sparkHome) - message(msg) - packageLocalDir <- install.spark() - sparkHome <- packageLocalDir - } else { - fmt <- "Make sure that Spark is installed in SPARK_HOME: %s" - msg <- sprintf(fmt, sparkHome) - message(msg) + if (!grepl(".*shell\\.R$", Sys.getenv("R_PROFILE_USER"), perl = TRUE)) { + if (!nzchar(master) || is_master_local(master)) { + if (is.na(file.info(sparkHome)$isdir)) { + fmt <- paste0("Spark not found in SPARK_HOME: %s.\n", + "Search in the cache directory. ", + "It will be installed if not found.") + msg <- sprintf(fmt, sparkHome) + message(msg) + packageLocalDir <- install.spark() + sparkHome <- packageLocalDir + } else { + fmt <- "Spark package is found in SPARK_HOME: %s" + msg <- sprintf(fmt, sparkHome) + message(msg) + } } } From dbabb56768aefe754fa3e1a97f47d308fa44a3be Mon Sep 17 00:00:00 2001 From: Junyang Qian Date: Tue, 26 Jul 2016 16:37:38 -0700 Subject: [PATCH 29/49] minor fix of overwrite message --- R/pkg/R/install.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/pkg/R/install.R b/R/pkg/R/install.R index 7153ba1fc676..e577fda6c83b 100644 --- a/R/pkg/R/install.R +++ b/R/pkg/R/install.R @@ -74,8 +74,8 @@ install.spark <- function(hadoopVersion = "without", mirrorUrl = NULL, packageLocalDir <- file.path(localDir, packageName) if (overwrite) { - message("Overwrite = TRUE: download and overwrite the tar file and Spark package directory - if they exist.") + message(paste0("Overwrite = TRUE: download and overwrite the tar file", + "and Spark package directory if they exist.")) } # can use dir.exists(packageLocalDir) under R 3.2.0 or later From 6b2a8972d794b647b410efa969d24e96d32a6757 Mon Sep 17 00:00:00 2001 From: Junyang Qian Date: Tue, 26 Jul 2016 18:13:56 -0700 Subject: [PATCH 30/49] minor change of messages --- R/pkg/R/install.R | 5 +++-- R/pkg/R/sparkR.R | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/R/pkg/R/install.R b/R/pkg/R/install.R index e577fda6c83b..cc854148c890 100644 --- a/R/pkg/R/install.R +++ b/R/pkg/R/install.R @@ -80,8 +80,9 @@ install.spark <- function(hadoopVersion = "without", mirrorUrl = NULL, # can use dir.exists(packageLocalDir) under R 3.2.0 or later if (!is.na(file.info(packageLocalDir)$isdir) && !overwrite) { - fmt <- "Spark %s for Hadoop %s is found." - msg <- sprintf(fmt, version, ifelse(hadoopVersion == "without", "Free build", hadoopVersion)) + fmt <- "Spark %s for Hadoop %s is found in %s" + msg <- sprintf(fmt, version, ifelse(hadoopVersion == "without", "Free build", hadoopVersion), + packageLocalDir) message(msg) return(invisible(packageLocalDir)) } diff --git a/R/pkg/R/sparkR.R b/R/pkg/R/sparkR.R index 9b0d592503f7..df13ec7a53fc 100644 --- a/R/pkg/R/sparkR.R +++ b/R/pkg/R/sparkR.R @@ -369,7 +369,7 @@ sparkR.session <- function( if (!nzchar(master) || is_master_local(master)) { if (is.na(file.info(sparkHome)$isdir)) { fmt <- paste0("Spark not found in SPARK_HOME: %s.\n", - "Search in the cache directory. ", + "To search in the cache directory. 
", "It will be installed if not found.") msg <- sprintf(fmt, sparkHome) message(msg) From 907f37dcf6dc41a1bd862d00ad50b1964bd5c8eb Mon Sep 17 00:00:00 2001 From: Junyang Qian Date: Tue, 26 Jul 2016 18:20:46 -0700 Subject: [PATCH 31/49] set env var SPARK_HOME after installation --- R/pkg/R/install.R | 2 ++ 1 file changed, 2 insertions(+) diff --git a/R/pkg/R/install.R b/R/pkg/R/install.R index cc854148c890..2092af0ff4b1 100644 --- a/R/pkg/R/install.R +++ b/R/pkg/R/install.R @@ -84,6 +84,7 @@ install.spark <- function(hadoopVersion = "without", mirrorUrl = NULL, msg <- sprintf(fmt, version, ifelse(hadoopVersion == "without", "Free build", hadoopVersion), packageLocalDir) message(msg) + Sys.setenv(SPARK_HOME = packageLocalDir) return(invisible(packageLocalDir)) } @@ -124,6 +125,7 @@ install.spark <- function(hadoopVersion = "without", mirrorUrl = NULL, unlink(packageLocalPath) } message("Installation done.") + Sys.setenv(SPARK_HOME = packageLocalDir) invisible(packageLocalDir) } From fa94e3cc99e93aea708a609733bbe9364b904efe Mon Sep 17 00:00:00 2001 From: Junyang Qian Date: Tue, 26 Jul 2016 18:30:54 -0700 Subject: [PATCH 32/49] minor change of doc --- R/pkg/R/install.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/pkg/R/install.R b/R/pkg/R/install.R index 2092af0ff4b1..1f0f272d1e07 100644 --- a/R/pkg/R/install.R +++ b/R/pkg/R/install.R @@ -25,7 +25,7 @@ #' Users can specify a desired Hadoop version, the remote mirror site, and #' the directory where the package is installed locally. #' -#' @param hadoopVersion Version of Hadoop to install. Default is \code{"without"}, Spark's +#' @param hadoopVersion Version of Hadoop to install. Default is "without", Spark's #' "Hadoop free" build. See #' \href{http://spark.apache.org/docs/latest/hadoop-provided.html}{ #' "Hadoop Free" Build} for more information. It can also be version number From 22f2f786bceeb599645c12210e3f49e66378ba6c Mon Sep 17 00:00:00 2001 From: Junyang Qian Date: Tue, 26 Jul 2016 22:55:52 -0700 Subject: [PATCH 33/49] delete trailing space --- R/pkg/R/install.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/pkg/R/install.R b/R/pkg/R/install.R index 1f0f272d1e07..d5a16be87065 100644 --- a/R/pkg/R/install.R +++ b/R/pkg/R/install.R @@ -110,7 +110,7 @@ install.spark <- function(hadoopVersion = "without", mirrorUrl = NULL, msg <- sprintf(fmt, version, ifelse(hadoopVersion == "without", "Free build", hadoopVersion), packageRemotePath, packageLocalDir) message(msg) - + tryCatch(download.file(packageRemotePath, packageLocalPath), error = function(e) { msg <- paste0("Fetch failed from ", mirrorUrl, ".", From 7aa3239b06247357a69b1bf6c8b822f75744e9db Mon Sep 17 00:00:00 2001 From: Junyang Qian Date: Wed, 27 Jul 2016 00:01:22 -0700 Subject: [PATCH 34/49] celebrate 2.0 release --- R/pkg/R/install.R | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/R/pkg/R/install.R b/R/pkg/R/install.R index d5a16be87065..223ab5c42123 100644 --- a/R/pkg/R/install.R +++ b/R/pkg/R/install.R @@ -99,11 +99,8 @@ install.spark <- function(hadoopVersion = "without", mirrorUrl = NULL, message(sprintf("Remote URL not provided. 
Use Apache default: %s", mirrorUrl)) } - version <- "spark-2.0.0-rc4-bin" - # When 2.0 released, remove the above line and - # change spark-releases to spark in the statement below packageRemotePath <- paste0( - file.path(mirrorUrl, "spark-releases", version, packageName), ".tgz") + file.path(mirrorUrl, "spark", version, packageName), ".tgz") fmt <- paste("Installing Spark %s for Hadoop %s.", "Downloading from:\n- %s", "Installing to:\n- %s", sep = "\n") @@ -130,9 +127,7 @@ install.spark <- function(hadoopVersion = "without", mirrorUrl = NULL, } default_mirror_url <- function() { - # change to http://www.apache.org/dyn/closer.lua - # when released - "http://people.apache.org/~pwendell" + "http://www.apache.org/dyn/closer.lua" } hadoop_version_name <- function(hadoopVersion) { From 3ad99bcb5ef34161f274495472c963a7a5015df8 Mon Sep 17 00:00:00 2001 From: Junyang Qian Date: Wed, 27 Jul 2016 00:41:46 -0700 Subject: [PATCH 35/49] add explanation of remote url structure, change default hadoop version --- R/pkg/R/install.R | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/R/pkg/R/install.R b/R/pkg/R/install.R index 223ab5c42123..86f21628c2ac 100644 --- a/R/pkg/R/install.R +++ b/R/pkg/R/install.R @@ -25,12 +25,23 @@ #' Users can specify a desired Hadoop version, the remote mirror site, and #' the directory where the package is installed locally. #' -#' @param hadoopVersion Version of Hadoop to install. Default is "without", Spark's -#' "Hadoop free" build. See -#' \href{http://spark.apache.org/docs/latest/hadoop-provided.html}{ -#' "Hadoop Free" Build} for more information. It can also be version number -#' in the format of "int.int", e.g. "2.7", or other patched version names, e.g. -#' "cdh4" +#' The full url of remote file is inferred from \code{mirrorUrl} and \code{hadoopVersion}. +#' \code{mirrorUrl} specifies the remote path to a Spark folder. It is followed by a subfolder +#' named after the Spark version (that corresponds to SparkR), and then the tar filename. +#' The filename is composed of four parts, i.e. [Spark version]-bin-[Hadoop version].tgz. +#' For example, the full path for a Spark 2.0.0 package for Hadoop 2.7 from +#' \code{http://apache.osuosl.org} has path: +#' \code{http://apache.osuosl.org/spark/spark-2.0.0/spark-2.0.0-bin-hadoop2.7.tgz}. +#' For \code{hadoopVersion = "without"}, [Hadoop version] in the filename is then +#' \code{without-hadoop}. +#' +#' @param hadoopVersion Version of Hadoop to install. Default is \code{"2.7"}. It can take other +#' version number in the format of "int.int". +#' If \code{hadoopVersion = "without"}, "Hadoop free" build is installed. +#' See +#' \href{http://spark.apache.org/docs/latest/hadoop-provided.html} +#' {"Hadoop Free" Build} for more information. +#' Other patched version names can also be used, e.g. \code{"cdh4"} #' @param mirrorUrl base URL of the repositories to use. The directory layout should follow #' \href{http://www.apache.org/dyn/closer.lua/spark/}{Apache mirrors}. #' @param localDir a local directory where Spark is installed. 
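# An illustrative check (not from the patch series) of the remote-path rule the
# doc above describes, using the example mirror from the documentation; the
# variable values here are assumptions for demonstration:
mirrorUrl   <- "http://apache.osuosl.org/spark"
version     <- "spark-2.0.0"
packageName <- paste(version, "bin", "hadoop2.7", sep = "-")
paste0(file.path(mirrorUrl, version, packageName), ".tgz")
# "http://apache.osuosl.org/spark/spark-2.0.0/spark-2.0.0-bin-hadoop2.7.tgz"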
The directory contains @@ -58,7 +69,7 @@ #' @note install.spark since 2.1.0 #' @seealso See available Hadoop versions: #' \href{http://spark.apache.org/downloads.html}{Apache Spark} -install.spark <- function(hadoopVersion = "without", mirrorUrl = NULL, +install.spark <- function(hadoopVersion = "2.7", mirrorUrl = NULL, localDir = NULL, overwrite = FALSE) { version <- paste0("spark-", packageVersion("SparkR")) hadoopVersion <- tolower(hadoopVersion) @@ -100,7 +111,7 @@ install.spark <- function(hadoopVersion = "without", mirrorUrl = NULL, } packageRemotePath <- paste0( - file.path(mirrorUrl, "spark", version, packageName), ".tgz") + file.path(mirrorUrl, version, packageName), ".tgz") fmt <- paste("Installing Spark %s for Hadoop %s.", "Downloading from:\n- %s", "Installing to:\n- %s", sep = "\n") @@ -127,7 +138,7 @@ install.spark <- function(hadoopVersion = "without", mirrorUrl = NULL, } default_mirror_url <- function() { - "http://www.apache.org/dyn/closer.lua" + "http://www.apache.org/dyn/closer.lua/spark" } hadoop_version_name <- function(hadoopVersion) { From e421c30dd3dc3522467eaeacf82641bd108a02e8 Mon Sep 17 00:00:00 2001 From: Junyang Qian Date: Wed, 27 Jul 2016 03:03:52 -0700 Subject: [PATCH 36/49] change default mirror --- R/pkg/R/install.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/pkg/R/install.R b/R/pkg/R/install.R index 86f21628c2ac..b73c8c2c16cb 100644 --- a/R/pkg/R/install.R +++ b/R/pkg/R/install.R @@ -138,7 +138,7 @@ install.spark <- function(hadoopVersion = "2.7", mirrorUrl = NULL, } default_mirror_url <- function() { - "http://www.apache.org/dyn/closer.lua/spark" + "http://apache.osuosl.org/spark" } hadoop_version_name <- function(hadoopVersion) { From aa4ba4dc7f9375149d01c87bf48fe2f314691ada Mon Sep 17 00:00:00 2001 From: Junyang Qian Date: Wed, 27 Jul 2016 20:24:51 -0700 Subject: [PATCH 37/49] add additional a step of searching for suggested url and only use default if not found --- R/pkg/R/install.R | 34 +++++++++++++++++++++++++++++----- 1 file changed, 29 insertions(+), 5 deletions(-) diff --git a/R/pkg/R/install.R b/R/pkg/R/install.R index b73c8c2c16cb..8b5a8201493a 100644 --- a/R/pkg/R/install.R +++ b/R/pkg/R/install.R @@ -105,11 +105,20 @@ install.spark <- function(hadoopVersion = "2.7", mirrorUrl = NULL, if (tarExists && !overwrite) { message("tar file found. Installing...") } else { + # need to download tar file first if (is.null(mirrorUrl)) { - mirrorUrl <- default_mirror_url() - message(sprintf("Remote URL not provided. Use Apache default: %s", mirrorUrl)) + # find url among apache suggested mirror sites + message("Mirror site url not provided. Looking for one...") + mirrorUrl <- get_preferred_mirror() + message(sprintf("Found mirror site: %s", mirrorUrl)) + if (is.null(mirrorUrl)) { + # not found, take the backup option + mirrorUrl <- default_mirror_url() + message(sprintf("Preferred mirror site not found. 
Use Apache default: %s", mirrorUrl)) + } } + # construct complete remote path packageRemotePath <- paste0( file.path(mirrorUrl, version, packageName), ".tgz") fmt <- paste("Installing Spark %s for Hadoop %s.", @@ -137,8 +146,23 @@ install.spark <- function(hadoopVersion = "2.7", mirrorUrl = NULL, invisible(packageLocalDir) } +get_preferred_mirror <- function() { + jsonUrl <- "http://www.apache.org/dyn/closer.cgi?as_json=1" + textLines <- readLines(jsonUrl, warn = FALSE) + rowNum <- grep("\"preferred\"", textLines) + linePreferred <- textLines[rowNum] + matchInfo <- regexpr("\"[A-Za-z][A-Za-z0-9+-.]*://.+\"", linePreferred) + if (matchInfo != -1) { + startPos <- matchInfo + 1 + endPos <- startPos + attr(matchInfo, "match.length") - 2 + mirrorPreferred <- linePreferred[startPos:endPos]) + } else { + mirrorPreferred <- NULL + } +} + default_mirror_url <- function() { - "http://apache.osuosl.org/spark" + "http://www-us.apache.org/dist/spark" } hadoop_version_name <- function(hadoopVersion) { @@ -151,8 +175,8 @@ hadoop_version_name <- function(hadoopVersion) { } } -# The implementation refers to appdirs package: https://pypi.python.org/pypi/appdirs and adapt -# to Spark context +# The implementation refers to appdirs package: https://pypi.python.org/pypi/appdirs and +# adapt to Spark context spark_cache_path <- function() { if (.Platform$OS.type == "windows") { winAppPath <- Sys.getenv("%LOCALAPPDATA%", unset = NA) From 2bb00e1132054570e677d374faad9d3a49fac872 Mon Sep 17 00:00:00 2001 From: Junyang Qian Date: Wed, 27 Jul 2016 21:58:02 -0700 Subject: [PATCH 38/49] separate sub functions --- R/pkg/R/install.R | 90 +++++++++++++++++++++++++++++++---------------- 1 file changed, 59 insertions(+), 31 deletions(-) diff --git a/R/pkg/R/install.R b/R/pkg/R/install.R index 8b5a8201493a..2033a1f058d2 100644 --- a/R/pkg/R/install.R +++ b/R/pkg/R/install.R @@ -105,47 +105,57 @@ install.spark <- function(hadoopVersion = "2.7", mirrorUrl = NULL, if (tarExists && !overwrite) { message("tar file found. Installing...") } else { - # need to download tar file first - if (is.null(mirrorUrl)) { - # find url among apache suggested mirror sites - message("Mirror site url not provided. Looking for one...") - mirrorUrl <- get_preferred_mirror() - message(sprintf("Found mirror site: %s", mirrorUrl)) - if (is.null(mirrorUrl)) { - # not found, take the backup option - mirrorUrl <- default_mirror_url() - message(sprintf("Preferred mirror site not found. 
Use Apache default: %s", mirrorUrl)) - } - } - - # construct complete remote path - packageRemotePath <- paste0( - file.path(mirrorUrl, version, packageName), ".tgz") - fmt <- paste("Installing Spark %s for Hadoop %s.", - "Downloading from:\n- %s", - "Installing to:\n- %s", sep = "\n") - msg <- sprintf(fmt, version, ifelse(hadoopVersion == "without", "Free build", hadoopVersion), - packageRemotePath, packageLocalDir) - message(msg) - - tryCatch(download.file(packageRemotePath, packageLocalPath), - error = function(e) { - msg <- paste0("Fetch failed from ", mirrorUrl, ".", - "Please check the Hadoop version,", - "retry or provide another mirror site.") - stop(msg) - }) + robust_download_tar(mirrorUrl, version, hadoopVersion, packageName, packageLocalPath) } untar(tarfile = packageLocalPath, exdir = localDir) if (!tarExists || overwrite) { unlink(packageLocalPath) } - message("Installation done.") + message("Installation Done.") Sys.setenv(SPARK_HOME = packageLocalDir) invisible(packageLocalDir) } +robust_download_tar(mirrorUrl, version, hadoopVersion, packageName, packageLocalPath) { + # step 1: use user-provided url + if (!is.null(mirrorUrl)) { + msg <- sprintf("Use user-provided mirror site: %s.", mirrorUrl) + message(msg) + success <- direct_download_url(mirrorUrl, version, hadoopVersion, + packageName, packageLocalPath) + if (success) return() + } else { + message("Mirror site not provided.") + } + + # step 2: use url suggested from apache website + message("Looking for site suggested from apache website...") + mirrorUrl <- get_preferred_mirror() + if (!is.null(mirrorUrl)) { + success <- direct_download_url(mirrorUrl, version, hadoopVersion, + packageName, packageLocalPath) + if (success) return() + } else { + message("Unable to find suggested mirror site.") + } + + # step 3: use backup option + message("To use backup site...") + mirrorUrl <- default_mirror_url() + success <- direct_download_url(mirrorUrl, version, hadoopVersion, + packageName, packageLocalPath) + if (sucess) { + return(packageLocalPath) + } else { + msg <- sprintf(paste("Unable to download Spark %s for Hadoop %s.", + "Please network connection, Hadoop version,", + "or provide other mirror sites."), + version, ifelse(hadoopVersion == "without", "Free build", hadoopVersion)) + stop(msg) + } +} + get_preferred_mirror <- function() { jsonUrl <- "http://www.apache.org/dyn/closer.cgi?as_json=1" textLines <- readLines(jsonUrl, warn = FALSE) @@ -153,12 +163,30 @@ get_preferred_mirror <- function() { linePreferred <- textLines[rowNum] matchInfo <- regexpr("\"[A-Za-z][A-Za-z0-9+-.]*://.+\"", linePreferred) if (matchInfo != -1) { + message(sprintf("Preferred mirror site found: %s", mirrorPreferred)) startPos <- matchInfo + 1 endPos <- startPos + attr(matchInfo, "match.length") - 2 mirrorPreferred <- linePreferred[startPos:endPos]) } else { mirrorPreferred <- NULL } + mirrorPreferred +} + +direct_download_tar(mirrorUrl, version, hadoopVersion, packageName, packageLocalPath) { + packageRemotePath <- paste0( + file.path(mirrorUrl, version, packageName), ".tgz") + fmt <- paste("Downloading Spark %s for Hadoop %s from:\n- %s") + msg <- sprintf(fmt, version, ifelse(hadoopVersion == "without", "Free build", hadoopVersion), + packageRemotePath) + message(msg) + + isFail <- tryCatch(download.file(packageRemotePath, packageLocalPath), + error = function(e) { + message("Fetch failed.") + TRUE + }) + !isFail } default_mirror_url <- function() { From 699420de6cf45d7ad10b25b022bc7cc13d55734d Mon Sep 17 00:00:00 2001 From: Junyang Qian Date: 
Thu, 28 Jul 2016 00:12:06 -0700 Subject: [PATCH 39/49] fix typos in fun defs --- R/pkg/R/install.R | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/R/pkg/R/install.R b/R/pkg/R/install.R index 2033a1f058d2..a57a58808e6d 100644 --- a/R/pkg/R/install.R +++ b/R/pkg/R/install.R @@ -39,8 +39,8 @@ #' version number in the format of "int.int". #' If \code{hadoopVersion = "without"}, "Hadoop free" build is installed. #' See -#' \href{http://spark.apache.org/docs/latest/hadoop-provided.html} -#' {"Hadoop Free" Build} for more information. +#' \href{http://spark.apache.org/docs/latest/hadoop-provided.html}{ +#' "Hadoop Free" Build} for more information. #' Other patched version names can also be used, e.g. \code{"cdh4"} #' @param mirrorUrl base URL of the repositories to use. The directory layout should follow #' \href{http://www.apache.org/dyn/closer.lua/spark/}{Apache mirrors}. @@ -68,7 +68,7 @@ #'} #' @note install.spark since 2.1.0 #' @seealso See available Hadoop versions: -#' \href{http://spark.apache.org/downloads.html}{Apache Spark} +#' \href{http://spark.apache.org/downloads.html}{Apache Spark} install.spark <- function(hadoopVersion = "2.7", mirrorUrl = NULL, localDir = NULL, overwrite = FALSE) { version <- paste0("spark-", packageVersion("SparkR")) @@ -117,12 +117,12 @@ install.spark <- function(hadoopVersion = "2.7", mirrorUrl = NULL, invisible(packageLocalDir) } -robust_download_tar(mirrorUrl, version, hadoopVersion, packageName, packageLocalPath) { +robust_download_tar <- function(mirrorUrl, version, hadoopVersion, packageName, packageLocalPath) { # step 1: use user-provided url if (!is.null(mirrorUrl)) { msg <- sprintf("Use user-provided mirror site: %s.", mirrorUrl) message(msg) - success <- direct_download_url(mirrorUrl, version, hadoopVersion, + success <- direct_download_tar(mirrorUrl, version, hadoopVersion, packageName, packageLocalPath) if (success) return() } else { @@ -133,7 +133,7 @@ robust_download_tar(mirrorUrl, version, hadoopVersion, packageName, packageLocal message("Looking for site suggested from apache website...") mirrorUrl <- get_preferred_mirror() if (!is.null(mirrorUrl)) { - success <- direct_download_url(mirrorUrl, version, hadoopVersion, + success <- direct_download_tar(mirrorUrl, version, hadoopVersion, packageName, packageLocalPath) if (success) return() } else { @@ -143,7 +143,7 @@ robust_download_tar(mirrorUrl, version, hadoopVersion, packageName, packageLocal # step 3: use backup option message("To use backup site...") mirrorUrl <- default_mirror_url() - success <- direct_download_url(mirrorUrl, version, hadoopVersion, + success <- direct_download_tar(mirrorUrl, version, hadoopVersion, packageName, packageLocalPath) if (sucess) { return(packageLocalPath) @@ -166,14 +166,14 @@ get_preferred_mirror <- function() { message(sprintf("Preferred mirror site found: %s", mirrorPreferred)) startPos <- matchInfo + 1 endPos <- startPos + attr(matchInfo, "match.length") - 2 - mirrorPreferred <- linePreferred[startPos:endPos]) + mirrorPreferred <- linePreferred[startPos:endPos] } else { mirrorPreferred <- NULL } mirrorPreferred } -direct_download_tar(mirrorUrl, version, hadoopVersion, packageName, packageLocalPath) { +direct_download_tar <- function(mirrorUrl, version, hadoopVersion, packageName, packageLocalPath) { packageRemotePath <- paste0( file.path(mirrorUrl, version, packageName), ".tgz") fmt <- paste("Downloading Spark %s for Hadoop %s from:\n- %s") @@ -210,8 +210,8 @@ spark_cache_path <- function() { winAppPath <- 
Sys.getenv("%LOCALAPPDATA%", unset = NA) if (is.null(winAppPath)) { msg <- paste("%LOCALAPPDATA% not found.", - "Please define the environment variable", - "or restart and enter an installation path in localDir.") + "Please define the environment variable", + "or restart and enter an installation path in localDir.") stop(msg) } else { path <- file.path(winAppPath, "spark", "spark", "Cache") From d58e080e30968821f684bb9c6868b26a8f9c99fa Mon Sep 17 00:00:00 2001 From: Junyang Qian Date: Thu, 28 Jul 2016 00:26:15 -0700 Subject: [PATCH 40/49] fix mirror path and typo --- R/pkg/R/install.R | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/R/pkg/R/install.R b/R/pkg/R/install.R index a57a58808e6d..2a3ab6209740 100644 --- a/R/pkg/R/install.R +++ b/R/pkg/R/install.R @@ -145,7 +145,7 @@ robust_download_tar <- function(mirrorUrl, version, hadoopVersion, packageName, mirrorUrl <- default_mirror_url() success <- direct_download_tar(mirrorUrl, version, hadoopVersion, packageName, packageLocalPath) - if (sucess) { + if (success) { return(packageLocalPath) } else { msg <- sprintf(paste("Unable to download Spark %s for Hadoop %s.", @@ -163,10 +163,11 @@ get_preferred_mirror <- function() { linePreferred <- textLines[rowNum] matchInfo <- regexpr("\"[A-Za-z][A-Za-z0-9+-.]*://.+\"", linePreferred) if (matchInfo != -1) { - message(sprintf("Preferred mirror site found: %s", mirrorPreferred)) startPos <- matchInfo + 1 - endPos <- startPos + attr(matchInfo, "match.length") - 2 - mirrorPreferred <- linePreferred[startPos:endPos] + endPos <- matchInfo + attr(matchInfo, "match.length") - 2 + mirrorPreferred <- base::substr(linePreferred, startPos, endPos) + mirrorPreferred <- paste0(mirrorPreferred, "spark") + message(sprintf("Preferred mirror site found: %s", mirrorPreferred)) } else { mirrorPreferred <- NULL } From 82d24a66fe9d437d7a20e52105598ba01bab0ce7 Mon Sep 17 00:00:00 2001 From: Junyang Qian Date: Thu, 28 Jul 2016 00:45:06 -0700 Subject: [PATCH 41/49] fix message --- R/pkg/R/install.R | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/R/pkg/R/install.R b/R/pkg/R/install.R index 2a3ab6209740..a0010a3aad0a 100644 --- a/R/pkg/R/install.R +++ b/R/pkg/R/install.R @@ -103,16 +103,17 @@ install.spark <- function(hadoopVersion = "2.7", mirrorUrl = NULL, tarExists <- file.exists(packageLocalPath) if (tarExists && !overwrite) { - message("tar file found. 
Installing...") + message("tar file found.") } else { robust_download_tar(mirrorUrl, version, hadoopVersion, packageName, packageLocalPath) } + message(sprintf("Installing to %s", localDir)) untar(tarfile = packageLocalPath, exdir = localDir) if (!tarExists || overwrite) { unlink(packageLocalPath) } - message("Installation Done.") + message("DONE.") Sys.setenv(SPARK_HOME = packageLocalDir) invisible(packageLocalDir) } @@ -149,7 +150,7 @@ robust_download_tar <- function(mirrorUrl, version, hadoopVersion, packageName, return(packageLocalPath) } else { msg <- sprintf(paste("Unable to download Spark %s for Hadoop %s.", - "Please network connection, Hadoop version,", + "Please check network connection, Hadoop version,", "or provide other mirror sites."), version, ifelse(hadoopVersion == "without", "Free build", hadoopVersion)) stop(msg) From 0ebef8aa2bc8487940a1179172879109417293eb Mon Sep 17 00:00:00 2001 From: Junyang Qian Date: Thu, 28 Jul 2016 00:53:48 -0700 Subject: [PATCH 42/49] fix message --- R/pkg/R/install.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/R/pkg/R/install.R b/R/pkg/R/install.R index a0010a3aad0a..e0424166c4b6 100644 --- a/R/pkg/R/install.R +++ b/R/pkg/R/install.R @@ -126,6 +126,7 @@ robust_download_tar <- function(mirrorUrl, version, hadoopVersion, packageName, success <- direct_download_tar(mirrorUrl, version, hadoopVersion, packageName, packageLocalPath) if (success) return() + } } else { message("Mirror site not provided.") } @@ -185,7 +186,7 @@ direct_download_tar <- function(mirrorUrl, version, hadoopVersion, packageName, isFail <- tryCatch(download.file(packageRemotePath, packageLocalPath), error = function(e) { - message("Fetch failed.") + message(sprintf("Fetch failed from %s", mirrorUrl)) TRUE }) !isFail From 26d45182a1d72ebebd79a63658eecea9d778e217 Mon Sep 17 00:00:00 2001 From: Junyang Qian Date: Thu, 28 Jul 2016 00:54:24 -0700 Subject: [PATCH 43/49] fix typo --- R/pkg/R/install.R | 1 - 1 file changed, 1 deletion(-) diff --git a/R/pkg/R/install.R b/R/pkg/R/install.R index e0424166c4b6..3f3911034383 100644 --- a/R/pkg/R/install.R +++ b/R/pkg/R/install.R @@ -126,7 +126,6 @@ robust_download_tar <- function(mirrorUrl, version, hadoopVersion, packageName, success <- direct_download_tar(mirrorUrl, version, hadoopVersion, packageName, packageLocalPath) if (success) return() - } } else { message("Mirror site not provided.") } From f37a07c1fc16e508822ffb5bb051234054e3908d Mon Sep 17 00:00:00 2001 From: Junyang Qian Date: Thu, 28 Jul 2016 00:59:53 -0700 Subject: [PATCH 44/49] message update --- R/pkg/R/sparkR.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/pkg/R/sparkR.R b/R/pkg/R/sparkR.R index df13ec7a53fc..4024be7919c6 100644 --- a/R/pkg/R/sparkR.R +++ b/R/pkg/R/sparkR.R @@ -370,7 +370,7 @@ sparkR.session <- function( if (is.na(file.info(sparkHome)$isdir)) { fmt <- paste0("Spark not found in SPARK_HOME: %s.\n", "To search in the cache directory. 
", - "It will be installed if not found.") + "Installation will start if not found.") msg <- sprintf(fmt, sparkHome) message(msg) packageLocalDir <- install.spark() From 64756de02cae427be9b6795af3f40abbd30fcc06 Mon Sep 17 00:00:00 2001 From: Junyang Qian Date: Thu, 28 Jul 2016 11:58:45 -0700 Subject: [PATCH 45/49] concise message, improve doc for windows, fix regex match --- R/check-cran.sh | 2 +- R/pkg/R/install.R | 12 +++++------- R/pkg/R/sparkR.R | 10 +++++----- R/pkg/R/utils.R | 2 +- 4 files changed, 12 insertions(+), 14 deletions(-) diff --git a/R/check-cran.sh b/R/check-cran.sh index b3a6860961c1..5c90fd07f28e 100755 --- a/R/check-cran.sh +++ b/R/check-cran.sh @@ -47,6 +47,6 @@ $FWDIR/create-docs.sh VERSION=`grep Version $FWDIR/pkg/DESCRIPTION | awk '{print $NF}'` -"$R_SCRIPT_PATH/"R CMD check --as-cran --no-tests SparkR_"$VERSION".tar.gz +"$R_SCRIPT_PATH/"R CMD check --as-cran SparkR_"$VERSION".tar.gz popd > /dev/null diff --git a/R/pkg/R/install.R b/R/pkg/R/install.R index 3f3911034383..b74b79494115 100644 --- a/R/pkg/R/install.R +++ b/R/pkg/R/install.R @@ -36,7 +36,7 @@ #' \code{without-hadoop}. #' #' @param hadoopVersion Version of Hadoop to install. Default is \code{"2.7"}. It can take other -#' version number in the format of "int.int". +#' version number in the format of "x.y" where x and y are integer. #' If \code{hadoopVersion = "without"}, "Hadoop free" build is installed. #' See #' \href{http://spark.apache.org/docs/latest/hadoop-provided.html}{ @@ -50,11 +50,9 @@ #' \itemize{ #' \item Mac OS X: \file{~/Library/Caches/spark} #' \item Unix: \env{$XDG_CACHE_HOME} if defined, otherwise \file{~/.cache/spark} -#' \item Win XP: -#' \file{C:\\Documents and Settings\\\\Local Settings\\Application -#' Data\\spark\\spark\\Cache} -#' \item Win Vista: -#' \file{C:\\Users\\\\AppData\\Local\\spark\\spark\\Cache} +#' \item Windows: \file{\%LOCALAPPDATA\%\\spark\\spark\\Cache}. See +#' \href{https://www.microsoft.com/security/portal/mmpc/shared/variables.aspx}{ +#' Windows Common Folder Variables} about \%LOCALAPPDATA\% #' } #' @param overwrite If \code{TRUE}, download and overwrite the existing tar file in localDir #' and force re-install Spark (in case the local directory or file is corrupted) @@ -210,7 +208,7 @@ hadoop_version_name <- function(hadoopVersion) { spark_cache_path <- function() { if (.Platform$OS.type == "windows") { winAppPath <- Sys.getenv("%LOCALAPPDATA%", unset = NA) - if (is.null(winAppPath)) { + if (is.na(winAppPath)) { msg <- paste("%LOCALAPPDATA% not found.", "Please define the environment variable", "or restart and enter an installation path in localDir.") diff --git a/R/pkg/R/sparkR.R b/R/pkg/R/sparkR.R index 4024be7919c6..d734ad054aae 100644 --- a/R/pkg/R/sparkR.R +++ b/R/pkg/R/sparkR.R @@ -365,19 +365,19 @@ sparkR.session <- function( } overrideEnvs(sparkConfigMap, paramMap) } + # do not download if it is run in the sparkR shell if (!grepl(".*shell\\.R$", Sys.getenv("R_PROFILE_USER"), perl = TRUE)) { if (!nzchar(master) || is_master_local(master)) { if (is.na(file.info(sparkHome)$isdir)) { - fmt <- paste0("Spark not found in SPARK_HOME: %s.\n", - "To search in the cache directory. ", + msg <- paste0("Spark not found in SPARK_HOME: ", + sparkHome, + " .\nTo search in the cache directory. 
", "Installation will start if not found.") - msg <- sprintf(fmt, sparkHome) message(msg) packageLocalDir <- install.spark() sparkHome <- packageLocalDir } else { - fmt <- "Spark package is found in SPARK_HOME: %s" - msg <- sprintf(fmt, sparkHome) + msg <- paste0("Spark package is found in SPARK_HOME: ", sparkHome) message(msg) } } diff --git a/R/pkg/R/utils.R b/R/pkg/R/utils.R index d638b72dd85c..dfee71575e2a 100644 --- a/R/pkg/R/utils.R +++ b/R/pkg/R/utils.R @@ -691,5 +691,5 @@ getSparkContext <- function() { } is_master_local <- function(master) { - grepl("^local(\\[[0-9\\*]+\\])?$", master, perl = TRUE) + grepl("^local(\\[([0-9]+|\\*)\\])?$", master, perl = TRUE) } From 29bdf30acadd2eb47b7a21d28d186486c2da20d4 Mon Sep 17 00:00:00 2001 From: Junyang Qian Date: Fri, 29 Jul 2016 00:11:20 -0700 Subject: [PATCH 46/49] Disable (temporarily) some test cases of describe and summary functions This is because change of output from 2.0 to 2.1. The downloaded JARs are 2.0, while the test code in the master branch assumes new output format. --- R/pkg/inst/tests/testthat/test_sparkSQL.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R index 3f3cb766b38f..39ed4febe54c 100644 --- a/R/pkg/inst/tests/testthat/test_sparkSQL.R +++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R @@ -1824,11 +1824,11 @@ test_that("describe() and summarize() on a DataFrame", { expect_equal(collect(stats)[2, "age"], "24.5") expect_equal(collect(stats)[3, "age"], "7.7781745930520225") stats <- describe(df) - expect_equal(collect(stats)[4, "name"], "Andy") + expect_equal(collect(stats)[4, "summary"], "min") expect_equal(collect(stats)[5, "age"], "30") stats2 <- summary(df) - expect_equal(collect(stats2)[4, "name"], "Andy") + expect_equal(collect(stats2)[4, "summary"], "min") expect_equal(collect(stats2)[5, "age"], "30") # SPARK-16425: SparkR summary() fails on column of type logical From 5decac6f6219d404f8ffc0764504be0498754ad0 Mon Sep 17 00:00:00 2001 From: Junyang Qian Date: Tue, 9 Aug 2016 11:30:52 -0700 Subject: [PATCH 47/49] Send message of reset SPARK_HOME in installation --- R/pkg/R/install.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/R/pkg/R/install.R b/R/pkg/R/install.R index b74b79494115..296b8d494796 100644 --- a/R/pkg/R/install.R +++ b/R/pkg/R/install.R @@ -89,7 +89,7 @@ install.spark <- function(hadoopVersion = "2.7", mirrorUrl = NULL, # can use dir.exists(packageLocalDir) under R 3.2.0 or later if (!is.na(file.info(packageLocalDir)$isdir) && !overwrite) { - fmt <- "Spark %s for Hadoop %s is found in %s" + fmt <- "Spark %s for Hadoop %s is found, and SPARK_HOME set to %s" msg <- sprintf(fmt, version, ifelse(hadoopVersion == "without", "Free build", hadoopVersion), packageLocalDir) message(msg) @@ -113,6 +113,7 @@ install.spark <- function(hadoopVersion = "2.7", mirrorUrl = NULL, } message("DONE.") Sys.setenv(SPARK_HOME = packageLocalDir) + message(paste("SPARK_HOME set to", packageLocalDir)) invisible(packageLocalDir) } From d84ba062b752f9987a1e8be28de5edd6c57c7a8b Mon Sep 17 00:00:00 2001 From: Junyang Qian Date: Tue, 9 Aug 2016 12:40:38 -0700 Subject: [PATCH 48/49] Specify path in jsonUrl and add alias to install function doc --- R/pkg/R/install.R | 13 +++++++++---- R/pkg/R/sparkR.R | 4 ++-- R/pkg/R/utils.R | 4 ++++ 3 files changed, 15 insertions(+), 6 deletions(-) diff --git a/R/pkg/R/install.R b/R/pkg/R/install.R index 296b8d494796..4451917bb8a6 100644 --- a/R/pkg/R/install.R +++ 
b/R/pkg/R/install.R @@ -19,7 +19,7 @@ # from CRAN. #' Download and Install Apache Spark to a Local Directory -#' +#' #' \code{install.spark} downloads and installs Spark to a local directory if #' it is not found. The Spark version we use is the same as the SparkR version. #' Users can specify a desired Hadoop version, the remote mirror site, and @@ -59,6 +59,7 @@ #' @return \code{install.spark} returns the local directory where Spark is found or installed #' @rdname install.spark #' @name install.spark +#' @aliases install.spark #' @export #' @examples #'\dontrun{ @@ -131,7 +132,7 @@ robust_download_tar <- function(mirrorUrl, version, hadoopVersion, packageName, # step 2: use url suggested from apache website message("Looking for site suggested from apache website...") - mirrorUrl <- get_preferred_mirror() + mirrorUrl <- get_preferred_mirror(version, packageName) if (!is.null(mirrorUrl)) { success <- direct_download_tar(mirrorUrl, version, hadoopVersion, packageName, packageLocalPath) @@ -156,8 +157,11 @@ robust_download_tar <- function(mirrorUrl, version, hadoopVersion, packageName, } } -get_preferred_mirror <- function() { - jsonUrl <- "http://www.apache.org/dyn/closer.cgi?as_json=1" +get_preferred_mirror <- function(version, packageName) { + jsonUrl <- paste0("http://www.apache.org/dyn/closer.cgi?path=", + file.path("spark", version, packageName), + ".tgz&as_json=1") + # jsonUrl <- "http://www.apache.org/dyn/closer.cgi?as_json=1" textLines <- readLines(jsonUrl, warn = FALSE) rowNum <- grep("\"preferred\"", textLines) linePreferred <- textLines[rowNum] @@ -185,6 +189,7 @@ direct_download_tar <- function(mirrorUrl, version, hadoopVersion, packageName, isFail <- tryCatch(download.file(packageRemotePath, packageLocalPath), error = function(e) { message(sprintf("Fetch failed from %s", mirrorUrl)) + print(e) TRUE }) !isFail diff --git a/R/pkg/R/sparkR.R b/R/pkg/R/sparkR.R index d734ad054aae..e8579de265e1 100644 --- a/R/pkg/R/sparkR.R +++ b/R/pkg/R/sparkR.R @@ -366,8 +366,8 @@ sparkR.session <- function( overrideEnvs(sparkConfigMap, paramMap) } # do not download if it is run in the sparkR shell - if (!grepl(".*shell\\.R$", Sys.getenv("R_PROFILE_USER"), perl = TRUE)) { - if (!nzchar(master) || is_master_local(master)) { + if (!nzchar(master) || is_master_local(master)) { + if (!is_sparkR_shell()) { if (is.na(file.info(sparkHome)$isdir)) { msg <- paste0("Spark not found in SPARK_HOME: ", sparkHome, diff --git a/R/pkg/R/utils.R b/R/pkg/R/utils.R index dfee71575e2a..d78c0a7a539a 100644 --- a/R/pkg/R/utils.R +++ b/R/pkg/R/utils.R @@ -693,3 +693,7 @@ getSparkContext <- function() { is_master_local <- function(master) { grepl("^local(\\[([0-9]+|\\*)\\])?$", master, perl = TRUE) } + +is_sparkR_shell <- function() { + grepl(".*shell\\.R$", Sys.getenv("R_PROFILE_USER"), perl = TRUE) +} From 3aeb4ebe7b18f5b26914cdf730cced3eb536f48d Mon Sep 17 00:00:00 2001 From: Junyang Qian Date: Tue, 9 Aug 2016 12:48:31 -0700 Subject: [PATCH 49/49] remove comment --- R/pkg/R/install.R | 1 - 1 file changed, 1 deletion(-) diff --git a/R/pkg/R/install.R b/R/pkg/R/install.R index 4451917bb8a6..987bac7bebc0 100644 --- a/R/pkg/R/install.R +++ b/R/pkg/R/install.R @@ -161,7 +161,6 @@ get_preferred_mirror <- function(version, packageName) { jsonUrl <- paste0("http://www.apache.org/dyn/closer.cgi?path=", file.path("spark", version, packageName), ".tgz&as_json=1") - # jsonUrl <- "http://www.apache.org/dyn/closer.cgi?as_json=1" textLines <- readLines(jsonUrl, warn = FALSE) rowNum <- grep("\"preferred\"", textLines) linePreferred 
<- textLines[rowNum]
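# End-to-end sketch (illustrative, not from the patch series) of the behavior
# the series converges on: in a plain R console, Spark is fetched on demand,
# SPARK_HOME is set to the unpacked directory, and the suggested-mirror query
# now carries the full package path:
library(SparkR)
install.spark(hadoopVersion = "2.7", overwrite = FALSE)  # downloads only if not cached
Sys.getenv("SPARK_HOME")  # set by install.spark() to the local package directory
# The jsonUrl built by get_preferred_mirror() for such a request looks like:
version <- "spark-2.0.0"
packageName <- "spark-2.0.0-bin-hadoop2.7"
paste0("http://www.apache.org/dyn/closer.cgi?path=",
       file.path("spark", version, packageName), ".tgz&as_json=1")
# "http://www.apache.org/dyn/closer.cgi?path=spark/spark-2.0.0/spark-2.0.0-bin-hadoop2.7.tgz&as_json=1"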