Skip to content

Commit ea6356f

Browse files
committed
Merge branch 'master' into SPARK-19227
2 parents b5244ec + 18ee55d commit ea6356f

File tree

28 files changed

+760
-231
lines changed

28 files changed

+760
-231
lines changed

R/check-cran.sh

Lines changed: 3 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -20,25 +20,14 @@
2020
set -o pipefail
2121
set -e
2222

23-
FWDIR="$(cd `dirname $0`; pwd)"
23+
FWDIR="$(cd `dirname "${BASH_SOURCE[0]}"`; pwd)"
2424
pushd $FWDIR > /dev/null
2525

26-
if [ ! -z "$R_HOME" ]
27-
then
28-
R_SCRIPT_PATH="$R_HOME/bin"
29-
else
30-
# if a system-wide R_HOME is not found, then exit
31-
if [ ! `command -v R` ]; then
32-
echo "Cannot find 'R_HOME'. Please specify 'R_HOME' or make sure R is properly installed."
33-
exit 1
34-
fi
35-
R_SCRIPT_PATH="$(dirname $(which R))"
36-
fi
37-
echo "Using R_SCRIPT_PATH = ${R_SCRIPT_PATH}"
26+
. $FWDIR/find-r.sh
3827

3928
# Install the package (this is required for code in vignettes to run when building it later)
4029
# Build the latest docs, but not vignettes, which is built with the package next
41-
$FWDIR/create-docs.sh
30+
. $FWDIR/install-dev.sh
4231

4332
# Build source package with vignettes
4433
SPARK_HOME="$(cd "${FWDIR}"/..; pwd)"
@@ -84,19 +73,4 @@ else
8473
SPARK_HOME="${SPARK_HOME}" "$R_SCRIPT_PATH/"R CMD check $CRAN_CHECK_OPTIONS SparkR_"$VERSION".tar.gz
8574
fi
8675

87-
# Install source package to get it to generate vignettes rds files, etc.
88-
if [ -n "$CLEAN_INSTALL" ]
89-
then
90-
echo "Removing lib path and installing from source package"
91-
LIB_DIR="$FWDIR/lib"
92-
rm -rf $LIB_DIR
93-
mkdir -p $LIB_DIR
94-
"$R_SCRIPT_PATH/"R CMD INSTALL SparkR_"$VERSION".tar.gz --library=$LIB_DIR
95-
96-
# Zip the SparkR package so that it can be distributed to worker nodes on YARN
97-
pushd $LIB_DIR > /dev/null
98-
jar cfM "$LIB_DIR/sparkr.zip" SparkR
99-
popd > /dev/null
100-
fi
101-
10276
popd > /dev/null

R/create-docs.sh

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -29,26 +29,27 @@ set -o pipefail
2929
set -e
3030

3131
# Figure out where the script is
32-
export FWDIR="$(cd "`dirname "$0"`"; pwd)"
33-
export SPARK_HOME="$(cd "`dirname "$0"`"/..; pwd)"
32+
export FWDIR="$(cd "`dirname "${BASH_SOURCE[0]}"`"; pwd)"
33+
export SPARK_HOME="$(cd "`dirname "${BASH_SOURCE[0]}"`"/..; pwd)"
3434

3535
# Required for setting SPARK_SCALA_VERSION
3636
. "${SPARK_HOME}"/bin/load-spark-env.sh
3737

3838
echo "Using Scala $SPARK_SCALA_VERSION"
3939

40-
pushd $FWDIR
40+
pushd $FWDIR > /dev/null
41+
. $FWDIR/find-r.sh
4142

4243
# Install the package (this will also generate the Rd files)
43-
./install-dev.sh
44+
. $FWDIR/install-dev.sh
4445

4546
# Now create HTML files
4647

4748
# knit_rd puts html in current working directory
4849
mkdir -p pkg/html
4950
pushd pkg/html
5051

51-
Rscript -e 'libDir <- "../../lib"; library(SparkR, lib.loc=libDir); library(knitr); knit_rd("SparkR", links = tools::findHTMLlinks(paste(libDir, "SparkR", sep="/")))'
52+
"$R_SCRIPT_PATH/"Rscript -e 'libDir <- "../../lib"; library(SparkR, lib.loc=libDir); library(knitr); knit_rd("SparkR", links = tools::findHTMLlinks(paste(libDir, "SparkR", sep="/")))'
5253

5354
popd
5455

R/create-rd.sh

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
#!/bin/bash
2+
3+
#
4+
# Licensed to the Apache Software Foundation (ASF) under one or more
5+
# contributor license agreements. See the NOTICE file distributed with
6+
# this work for additional information regarding copyright ownership.
7+
# The ASF licenses this file to You under the Apache License, Version 2.0
8+
# (the "License"); you may not use this file except in compliance with
9+
# the License. You may obtain a copy of the License at
10+
#
11+
# http://www.apache.org/licenses/LICENSE-2.0
12+
#
13+
# Unless required by applicable law or agreed to in writing, software
14+
# distributed under the License is distributed on an "AS IS" BASIS,
15+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16+
# See the License for the specific language governing permissions and
17+
# limitations under the License.
18+
#
19+
20+
# This script packages the SparkR source files (R and C files) and
21+
# creates a package that can be loaded in R. The package is by default installed to
22+
# $FWDIR/lib and the package can be loaded by using the following command in R:
23+
#
24+
# library(SparkR, lib.loc="$FWDIR/lib")
25+
#
26+
# NOTE(shivaram): Right now we use $SPARK_HOME/R/lib to be the installation directory
27+
# to load the SparkR package on the worker nodes.
28+
29+
set -o pipefail
30+
set -e
31+
32+
FWDIR="$(cd `dirname "${BASH_SOURCE[0]}"`; pwd)"
33+
pushd $FWDIR > /dev/null
34+
. $FWDIR/find-r.sh
35+
36+
# Generate Rd files if devtools is installed
37+
"$R_SCRIPT_PATH/"Rscript -e ' if("devtools" %in% rownames(installed.packages())) { library(devtools); devtools::document(pkg="./pkg", roclets=c("rd")) }'

R/find-r.sh

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
#!/bin/bash
2+
3+
#
4+
# Licensed to the Apache Software Foundation (ASF) under one or more
5+
# contributor license agreements. See the NOTICE file distributed with
6+
# this work for additional information regarding copyright ownership.
7+
# The ASF licenses this file to You under the Apache License, Version 2.0
8+
# (the "License"); you may not use this file except in compliance with
9+
# the License. You may obtain a copy of the License at
10+
#
11+
# http://www.apache.org/licenses/LICENSE-2.0
12+
#
13+
# Unless required by applicable law or agreed to in writing, software
14+
# distributed under the License is distributed on an "AS IS" BASIS,
15+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16+
# See the License for the specific language governing permissions and
17+
# limitations under the License.
18+
#
19+
20+
if [ -z "$R_SCRIPT_PATH" ]
21+
then
22+
if [ ! -z "$R_HOME" ]
23+
then
24+
R_SCRIPT_PATH="$R_HOME/bin"
25+
else
26+
# if a system-wide R_HOME is not found, then exit
27+
if [ ! `command -v R` ]; then
28+
echo "Cannot find 'R_HOME'. Please specify 'R_HOME' or make sure R is properly installed."
29+
exit 1
30+
fi
31+
R_SCRIPT_PATH="$(dirname $(which R))"
32+
fi
33+
echo "Using R_SCRIPT_PATH = ${R_SCRIPT_PATH}"
34+
fi

R/install-dev.sh

Lines changed: 4 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -29,27 +29,15 @@
2929
set -o pipefail
3030
set -e
3131

32-
FWDIR="$(cd `dirname $0`; pwd)"
32+
FWDIR="$(cd `dirname "${BASH_SOURCE[0]}"`; pwd)"
3333
LIB_DIR="$FWDIR/lib"
3434

3535
mkdir -p $LIB_DIR
3636

3737
pushd $FWDIR > /dev/null
38-
if [ ! -z "$R_HOME" ]
39-
then
40-
R_SCRIPT_PATH="$R_HOME/bin"
41-
else
42-
# if system wide R_HOME is not found, then exit
43-
if [ ! `command -v R` ]; then
44-
echo "Cannot find 'R_HOME'. Please specify 'R_HOME' or make sure R is properly installed."
45-
exit 1
46-
fi
47-
R_SCRIPT_PATH="$(dirname $(which R))"
48-
fi
49-
echo "Using R_SCRIPT_PATH = ${R_SCRIPT_PATH}"
50-
51-
# Generate Rd files if devtools is installed
52-
"$R_SCRIPT_PATH/"Rscript -e ' if("devtools" %in% rownames(installed.packages())) { library(devtools); devtools::document(pkg="./pkg", roclets=c("rd")) }'
38+
. $FWDIR/find-r.sh
39+
40+
. $FWDIR/create-rd.sh
5341

5442
# Install SparkR to $LIB_DIR
5543
"$R_SCRIPT_PATH/"R CMD INSTALL --library=$LIB_DIR $FWDIR/pkg/

R/install-source-package.sh

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
#!/bin/bash
2+
3+
#
4+
# Licensed to the Apache Software Foundation (ASF) under one or more
5+
# contributor license agreements. See the NOTICE file distributed with
6+
# this work for additional information regarding copyright ownership.
7+
# The ASF licenses this file to You under the Apache License, Version 2.0
8+
# (the "License"); you may not use this file except in compliance with
9+
# the License. You may obtain a copy of the License at
10+
#
11+
# http://www.apache.org/licenses/LICENSE-2.0
12+
#
13+
# Unless required by applicable law or agreed to in writing, software
14+
# distributed under the License is distributed on an "AS IS" BASIS,
15+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16+
# See the License for the specific language governing permissions and
17+
# limitations under the License.
18+
#
19+
20+
# This script packages the SparkR source files (R and C files) and
21+
# creates a package that can be loaded in R. The package is by default installed to
22+
# $FWDIR/lib and the package can be loaded by using the following command in R:
23+
#
24+
# library(SparkR, lib.loc="$FWDIR/lib")
25+
#
26+
# NOTE(shivaram): Right now we use $SPARK_HOME/R/lib to be the installation directory
27+
# to load the SparkR package on the worker nodes.
28+
29+
set -o pipefail
30+
set -e
31+
32+
FWDIR="$(cd `dirname "${BASH_SOURCE[0]}"`; pwd)"
33+
pushd $FWDIR > /dev/null
34+
. $FWDIR/find-r.sh
35+
36+
if [ -z "$VERSION" ]; then
37+
VERSION=`grep Version $FWDIR/pkg/DESCRIPTION | awk '{print $NF}'`
38+
fi
39+
40+
if [ ! -f "$FWDIR"/SparkR_"$VERSION".tar.gz ]; then
41+
echo -e "R source package file $FWDIR/SparkR_$VERSION.tar.gz is not found."
42+
echo -e "Please build R source package with check-cran.sh"
43+
exit -1;
44+
fi
45+
46+
echo "Removing lib path and installing from source package"
47+
LIB_DIR="$FWDIR/lib"
48+
rm -rf $LIB_DIR
49+
mkdir -p $LIB_DIR
50+
"$R_SCRIPT_PATH/"R CMD INSTALL SparkR_"$VERSION".tar.gz --library=$LIB_DIR
51+
52+
# Zip the SparkR package so that it can be distributed to worker nodes on YARN
53+
pushd $LIB_DIR > /dev/null
54+
jar cfM "$LIB_DIR/sparkr.zip" SparkR
55+
popd > /dev/null
56+
57+
popd

R/pkg/R/install.R

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@
5050
#' \itemize{
5151
#' \item Mac OS X: \file{~/Library/Caches/spark}
5252
#' \item Unix: \env{$XDG_CACHE_HOME} if defined, otherwise \file{~/.cache/spark}
53-
#' \item Windows: \file{\%LOCALAPPDATA\%\\spark\\spark\\Cache}.
53+
#' \item Windows: \file{\%LOCALAPPDATA\%\\Apache\\Spark\\Cache}.
5454
#' }
5555
#' @param overwrite If \code{TRUE}, download and overwrite the existing tar file in localDir
5656
#' and force re-install Spark (in case the local directory or file is corrupted)
@@ -239,7 +239,7 @@ sparkCachePath <- function() {
239239
"or restart and enter an installation path in localDir.")
240240
stop(msg)
241241
} else {
242-
path <- file.path(winAppPath, "spark", "spark", "Cache")
242+
path <- file.path(winAppPath, "Apache", "Spark", "Cache")
243243
}
244244
} else if (.Platform$OS.type == "unix") {
245245
if (Sys.info()["sysname"] == "Darwin") {

R/pkg/R/mllib_clustering.R

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -397,6 +397,13 @@ setMethod("spark.lda", signature(data = "SparkDataFrame"),
397397
#' \item{\code{topics}}{top 10 terms and their weights of all topics}
398398
#' \item{\code{vocabulary}}{whole terms of the training corpus, NULL if libsvm format file
399399
#' used as training set}
400+
#' \item{\code{trainingLogLikelihood}}{Log likelihood of the observed tokens in the training set,
401+
#' given the current parameter estimates:
402+
#' log P(docs | topics, topic distributions for docs, Dirichlet hyperparameters)
403+
#' It is only for distributed LDA model (i.e., optimizer = "em")}
404+
#' \item{\code{logPrior}}{Log probability of the current parameter estimate:
405+
#' log P(topics, topic distributions for docs | Dirichlet hyperparameters)
406+
#' It is only for distributed LDA model (i.e., optimizer = "em")}
400407
#' @rdname spark.lda
401408
#' @aliases summary,LDAModel-method
402409
#' @export
@@ -413,11 +420,22 @@ setMethod("summary", signature(object = "LDAModel"),
413420
vocabSize <- callJMethod(jobj, "vocabSize")
414421
topics <- dataFrame(callJMethod(jobj, "topics", maxTermsPerTopic))
415422
vocabulary <- callJMethod(jobj, "vocabulary")
423+
trainingLogLikelihood <- if (isDistributed) {
424+
callJMethod(jobj, "trainingLogLikelihood")
425+
} else {
426+
NA
427+
}
428+
logPrior <- if (isDistributed) {
429+
callJMethod(jobj, "logPrior")
430+
} else {
431+
NA
432+
}
416433
list(docConcentration = unlist(docConcentration),
417434
topicConcentration = topicConcentration,
418435
logLikelihood = logLikelihood, logPerplexity = logPerplexity,
419436
isDistributed = isDistributed, vocabSize = vocabSize,
420-
topics = topics, vocabulary = unlist(vocabulary))
437+
topics = topics, vocabulary = unlist(vocabulary),
438+
trainingLogLikelihood = trainingLogLikelihood, logPrior = logPrior)
421439
})
422440

423441
# Returns the log perplexity of a Latent Dirichlet Allocation model produced by \code{spark.lda}

R/pkg/inst/tests/testthat/test_mllib_clustering.R

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -166,12 +166,16 @@ test_that("spark.lda with libsvm", {
166166
topics <- stats$topicTopTerms
167167
weights <- stats$topicTopTermsWeights
168168
vocabulary <- stats$vocabulary
169+
trainingLogLikelihood <- stats$trainingLogLikelihood
170+
logPrior <- stats$logPrior
169171

170-
expect_false(isDistributed)
172+
expect_true(isDistributed)
171173
expect_true(logLikelihood <= 0 & is.finite(logLikelihood))
172174
expect_true(logPerplexity >= 0 & is.finite(logPerplexity))
173175
expect_equal(vocabSize, 11)
174176
expect_true(is.null(vocabulary))
177+
expect_true(trainingLogLikelihood <= 0 & !is.na(trainingLogLikelihood))
178+
expect_true(logPrior <= 0 & !is.na(logPrior))
175179

176180
# Test model save/load
177181
modelPath <- tempfile(pattern = "spark-lda", fileext = ".tmp")
@@ -181,11 +185,13 @@ test_that("spark.lda with libsvm", {
181185
model2 <- read.ml(modelPath)
182186
stats2 <- summary(model2)
183187

184-
expect_false(stats2$isDistributed)
188+
expect_true(stats2$isDistributed)
185189
expect_equal(logLikelihood, stats2$logLikelihood)
186190
expect_equal(logPerplexity, stats2$logPerplexity)
187191
expect_equal(vocabSize, stats2$vocabSize)
188192
expect_equal(vocabulary, stats2$vocabulary)
193+
expect_equal(trainingLogLikelihood, stats2$trainingLogLikelihood)
194+
expect_equal(logPrior, stats2$logPrior)
189195

190196
unlink(modelPath)
191197
})
@@ -202,12 +208,16 @@ test_that("spark.lda with text input", {
202208
topics <- stats$topicTopTerms
203209
weights <- stats$topicTopTermsWeights
204210
vocabulary <- stats$vocabulary
211+
trainingLogLikelihood <- stats$trainingLogLikelihood
212+
logPrior <- stats$logPrior
205213

206214
expect_false(isDistributed)
207215
expect_true(logLikelihood <= 0 & is.finite(logLikelihood))
208216
expect_true(logPerplexity >= 0 & is.finite(logPerplexity))
209217
expect_equal(vocabSize, 10)
210218
expect_true(setequal(stats$vocabulary, c("0", "1", "2", "3", "4", "5", "6", "7", "8", "9")))
219+
expect_true(is.na(trainingLogLikelihood))
220+
expect_true(is.na(logPrior))
211221

212222
# Test model save/load
213223
modelPath <- tempfile(pattern = "spark-lda-text", fileext = ".tmp")
@@ -222,6 +232,8 @@ test_that("spark.lda with text input", {
222232
expect_equal(logPerplexity, stats2$logPerplexity)
223233
expect_equal(vocabSize, stats2$vocabSize)
224234
expect_true(all.equal(vocabulary, stats2$vocabulary))
235+
expect_true(is.na(stats2$trainingLogLikelihood))
236+
expect_true(is.na(stats2$logPrior))
225237

226238
unlink(modelPath)
227239
})

R/pkg/inst/tests/testthat/test_mllib_tree.R

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -126,7 +126,6 @@ test_that("spark.randomForest", {
126126
63.53160, 64.05470, 65.12710, 64.30450,
127127
66.70910, 67.86125, 68.08700, 67.21865,
128128
68.89275, 69.53180, 69.39640, 69.68250),
129-
130129
tolerance = 1e-4)
131130
stats <- summary(model)
132131
expect_equal(stats$numTrees, 20)

0 commit comments

Comments
 (0)