Skip to content

Commit ea6356f

Browse files
committed
Merge branch 'master' into SPARK-19227
2 parents b5244ec + 18ee55d commit ea6356f

File tree

28 files changed

+760
-231
lines changed

28 files changed

+760
-231
lines changed

R/check-cran.sh

Lines changed: 3 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -20,25 +20,14 @@
2020
set -o pipefail
2121
set -e
2222

23-
FWDIR="$(cd `dirname $0`; pwd)"
23+
FWDIR="$(cd `dirname "${BASH_SOURCE[0]}"`; pwd)"
2424
pushd $FWDIR > /dev/null
2525

26-
if [ ! -z "$R_HOME" ]
27-
then
28-
R_SCRIPT_PATH="$R_HOME/bin"
29-
else
30-
# if a system-wide R_HOME is not found, then exit
31-
if [ ! `command -v R` ]; then
32-
echo "Cannot find 'R_HOME'. Please specify 'R_HOME' or make sure R is properly installed."
33-
exit 1
34-
fi
35-
R_SCRIPT_PATH="$(dirname $(which R))"
36-
fi
37-
echo "Using R_SCRIPT_PATH = ${R_SCRIPT_PATH}"
26+
. $FWDIR/find-r.sh
3827

3928
# Install the package (this is required for code in vignettes to run when building it later)
4029
# Build the latest docs, but not vignettes, which is built with the package next
41-
$FWDIR/create-docs.sh
30+
. $FWDIR/install-dev.sh
4231

4332
# Build source package with vignettes
4433
SPARK_HOME="$(cd "${FWDIR}"/..; pwd)"
@@ -84,19 +73,4 @@ else
8473
SPARK_HOME="${SPARK_HOME}" "$R_SCRIPT_PATH/"R CMD check $CRAN_CHECK_OPTIONS SparkR_"$VERSION".tar.gz
8574
fi
8675

87-
# Install source package to get it to generate vignettes rds files, etc.
88-
if [ -n "$CLEAN_INSTALL" ]
89-
then
90-
echo "Removing lib path and installing from source package"
91-
LIB_DIR="$FWDIR/lib"
92-
rm -rf $LIB_DIR
93-
mkdir -p $LIB_DIR
94-
"$R_SCRIPT_PATH/"R CMD INSTALL SparkR_"$VERSION".tar.gz --library=$LIB_DIR
95-
96-
# Zip the SparkR package so that it can be distributed to worker nodes on YARN
97-
pushd $LIB_DIR > /dev/null
98-
jar cfM "$LIB_DIR/sparkr.zip" SparkR
99-
popd > /dev/null
100-
fi
101-
10276
popd > /dev/null

R/create-docs.sh

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -29,26 +29,27 @@ set -o pipefail
2929
set -e
3030

3131
# Figure out where the script is
32-
export FWDIR="$(cd "`dirname "$0"`"; pwd)"
33-
export SPARK_HOME="$(cd "`dirname "$0"`"/..; pwd)"
32+
export FWDIR="$(cd "`dirname "${BASH_SOURCE[0]}"`"; pwd)"
33+
export SPARK_HOME="$(cd "`dirname "${BASH_SOURCE[0]}"`"/..; pwd)"
3434

3535
# Required for setting SPARK_SCALA_VERSION
3636
. "${SPARK_HOME}"/bin/load-spark-env.sh
3737

3838
echo "Using Scala $SPARK_SCALA_VERSION"
3939

40-
pushd $FWDIR
40+
pushd $FWDIR > /dev/null
41+
. $FWDIR/find-r.sh
4142

4243
# Install the package (this will also generate the Rd files)
43-
./install-dev.sh
44+
. $FWDIR/install-dev.sh
4445

4546
# Now create HTML files
4647

4748
# knit_rd puts html in current working directory
4849
mkdir -p pkg/html
4950
pushd pkg/html
5051

51-
Rscript -e 'libDir <- "../../lib"; library(SparkR, lib.loc=libDir); library(knitr); knit_rd("SparkR", links = tools::findHTMLlinks(paste(libDir, "SparkR", sep="/")))'
52+
"$R_SCRIPT_PATH/"Rscript -e 'libDir <- "../../lib"; library(SparkR, lib.loc=libDir); library(knitr); knit_rd("SparkR", links = tools::findHTMLlinks(paste(libDir, "SparkR", sep="/")))'
5253

5354
popd
5455

R/create-rd.sh

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
#!/bin/bash
2+
3+
#
4+
# Licensed to the Apache Software Foundation (ASF) under one or more
5+
# contributor license agreements. See the NOTICE file distributed with
6+
# this work for additional information regarding copyright ownership.
7+
# The ASF licenses this file to You under the Apache License, Version 2.0
8+
# (the "License"); you may not use this file except in compliance with
9+
# the License. You may obtain a copy of the License at
10+
#
11+
# http://www.apache.org/licenses/LICENSE-2.0
12+
#
13+
# Unless required by applicable law or agreed to in writing, software
14+
# distributed under the License is distributed on an "AS IS" BASIS,
15+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16+
# See the License for the specific language governing permissions and
17+
# limitations under the License.
18+
#
19+
20+
# This script packages the SparkR source files (R and C files) and
21+
# creates a package that can be loaded in R. The package is by default installed to
22+
# $FWDIR/lib and the package can be loaded by using the following command in R:
23+
#
24+
# library(SparkR, lib.loc="$FWDIR/lib")
25+
#
26+
# NOTE(shivaram): Right now we use $SPARK_HOME/R/lib to be the installation directory
27+
# to load the SparkR package on the worker nodes.
28+
29+
set -o pipefail
30+
set -e
31+
32+
FWDIR="$(cd `dirname "${BASH_SOURCE[0]}"`; pwd)"
33+
pushd $FWDIR > /dev/null
34+
. $FWDIR/find-r.sh
35+
36+
# Generate Rd files if devtools is installed
37+
"$R_SCRIPT_PATH/"Rscript -e ' if("devtools" %in% rownames(installed.packages())) { library(devtools); devtools::document(pkg="./pkg", roclets=c("rd")) }'

R/find-r.sh

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
#!/bin/bash
2+
3+
#
4+
# Licensed to the Apache Software Foundation (ASF) under one or more
5+
# contributor license agreements. See the NOTICE file distributed with
6+
# this work for additional information regarding copyright ownership.
7+
# The ASF licenses this file to You under the Apache License, Version 2.0
8+
# (the "License"); you may not use this file except in compliance with
9+
# the License. You may obtain a copy of the License at
10+
#
11+
# http://www.apache.org/licenses/LICENSE-2.0
12+
#
13+
# Unless required by applicable law or agreed to in writing, software
14+
# distributed under the License is distributed on an "AS IS" BASIS,
15+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16+
# See the License for the specific language governing permissions and
17+
# limitations under the License.
18+
#
19+
20+
if [ -z "$R_SCRIPT_PATH" ]
21+
then
22+
if [ ! -z "$R_HOME" ]
23+
then
24+
R_SCRIPT_PATH="$R_HOME/bin"
25+
else
26+
# if a system-wide R_HOME is not found, then exit
27+
if [ ! `command -v R` ]; then
28+
echo "Cannot find 'R_HOME'. Please specify 'R_HOME' or make sure R is properly installed."
29+
exit 1
30+
fi
31+
R_SCRIPT_PATH="$(dirname $(which R))"
32+
fi
33+
echo "Using R_SCRIPT_PATH = ${R_SCRIPT_PATH}"
34+
fi

R/install-dev.sh

Lines changed: 4 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -29,27 +29,15 @@
2929
set -o pipefail
3030
set -e
3131

32-
FWDIR="$(cd `dirname $0`; pwd)"
32+
FWDIR="$(cd `dirname "${BASH_SOURCE[0]}"`; pwd)"
3333
LIB_DIR="$FWDIR/lib"
3434

3535
mkdir -p $LIB_DIR
3636

3737
pushd $FWDIR > /dev/null
38-
if [ ! -z "$R_HOME" ]
39-
then
40-
R_SCRIPT_PATH="$R_HOME/bin"
41-
else
42-
# if system wide R_HOME is not found, then exit
43-
if [ ! `command -v R` ]; then
44-
echo "Cannot find 'R_HOME'. Please specify 'R_HOME' or make sure R is properly installed."
45-
exit 1
46-
fi
47-
R_SCRIPT_PATH="$(dirname $(which R))"
48-
fi
49-
echo "Using R_SCRIPT_PATH = ${R_SCRIPT_PATH}"
50-
51-
# Generate Rd files if devtools is installed
52-
"$R_SCRIPT_PATH/"Rscript -e ' if("devtools" %in% rownames(installed.packages())) { library(devtools); devtools::document(pkg="./pkg", roclets=c("rd")) }'
38+
. $FWDIR/find-r.sh
39+
40+
. $FWDIR/create-rd.sh
5341

5442
# Install SparkR to $LIB_DIR
5543
"$R_SCRIPT_PATH/"R CMD INSTALL --library=$LIB_DIR $FWDIR/pkg/

R/install-source-package.sh

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
#!/bin/bash
2+
3+
#
4+
# Licensed to the Apache Software Foundation (ASF) under one or more
5+
# contributor license agreements. See the NOTICE file distributed with
6+
# this work for additional information regarding copyright ownership.
7+
# The ASF licenses this file to You under the Apache License, Version 2.0
8+
# (the "License"); you may not use this file except in compliance with
9+
# the License. You may obtain a copy of the License at
10+
#
11+
# http://www.apache.org/licenses/LICENSE-2.0
12+
#
13+
# Unless required by applicable law or agreed to in writing, software
14+
# distributed under the License is distributed on an "AS IS" BASIS,
15+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16+
# See the License for the specific language governing permissions and
17+
# limitations under the License.
18+
#
19+
20+
# This script packages the SparkR source files (R and C files) and
21+
# creates a package that can be loaded in R. The package is by default installed to
22+
# $FWDIR/lib and the package can be loaded by using the following command in R:
23+
#
24+
# library(SparkR, lib.loc="$FWDIR/lib")
25+
#
26+
# NOTE(shivaram): Right now we use $SPARK_HOME/R/lib to be the installation directory
27+
# to load the SparkR package on the worker nodes.
28+
29+
set -o pipefail
30+
set -e
31+
32+
FWDIR="$(cd `dirname "${BASH_SOURCE[0]}"`; pwd)"
33+
pushd $FWDIR > /dev/null
34+
. $FWDIR/find-r.sh
35+
36+
if [ -z "$VERSION" ]; then
37+
VERSION=`grep Version $FWDIR/pkg/DESCRIPTION | awk '{print $NF}'`
38+
fi
39+
40+
if [ ! -f "$FWDIR"/SparkR_"$VERSION".tar.gz ]; then
41+
echo -e "R source package file $FWDIR/SparkR_$VERSION.tar.gz is not found."
42+
echo -e "Please build R source package with check-cran.sh"
43+
exit -1;
44+
fi
45+
46+
echo "Removing lib path and installing from source package"
47+
LIB_DIR="$FWDIR/lib"
48+
rm -rf $LIB_DIR
49+
mkdir -p $LIB_DIR
50+
"$R_SCRIPT_PATH/"R CMD INSTALL SparkR_"$VERSION".tar.gz --library=$LIB_DIR
51+
52+
# Zip the SparkR package so that it can be distributed to worker nodes on YARN
53+
pushd $LIB_DIR > /dev/null
54+
jar cfM "$LIB_DIR/sparkr.zip" SparkR
55+
popd > /dev/null
56+
57+
popd

R/pkg/R/install.R

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@
5050
#' \itemize{
5151
#' \item Mac OS X: \file{~/Library/Caches/spark}
5252
#' \item Unix: \env{$XDG_CACHE_HOME} if defined, otherwise \file{~/.cache/spark}
53-
#' \item Windows: \file{\%LOCALAPPDATA\%\\spark\\spark\\Cache}.
53+
#' \item Windows: \file{\%LOCALAPPDATA\%\\Apache\\Spark\\Cache}.
5454
#' }
5555
#' @param overwrite If \code{TRUE}, download and overwrite the existing tar file in localDir
5656
#' and force re-install Spark (in case the local directory or file is corrupted)
@@ -239,7 +239,7 @@ sparkCachePath <- function() {
239239
"or restart and enter an installation path in localDir.")
240240
stop(msg)
241241
} else {
242-
path <- file.path(winAppPath, "spark", "spark", "Cache")
242+
path <- file.path(winAppPath, "Apache", "Spark", "Cache")
243243
}
244244
} else if (.Platform$OS.type == "unix") {
245245
if (Sys.info()["sysname"] == "Darwin") {

R/pkg/R/mllib_clustering.R

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -397,6 +397,13 @@ setMethod("spark.lda", signature(data = "SparkDataFrame"),
397397
#' \item{\code{topics}}{top 10 terms and their weights of all topics}
398398
#' \item{\code{vocabulary}}{whole terms of the training corpus, NULL if libsvm format file
399399
#' used as training set}
400+
#' \item{\code{trainingLogLikelihood}}{Log likelihood of the observed tokens in the training set,
401+
#' given the current parameter estimates:
402+
#' log P(docs | topics, topic distributions for docs, Dirichlet hyperparameters)
403+
#' It is only for distributed LDA model (i.e., optimizer = "em")}
404+
#' \item{\code{logPrior}}{Log probability of the current parameter estimate:
405+
#' log P(topics, topic distributions for docs | Dirichlet hyperparameters)
406+
#' It is only for distributed LDA model (i.e., optimizer = "em")}
400407
#' @rdname spark.lda
401408
#' @aliases summary,LDAModel-method
402409
#' @export
@@ -413,11 +420,22 @@ setMethod("summary", signature(object = "LDAModel"),
413420
vocabSize <- callJMethod(jobj, "vocabSize")
414421
topics <- dataFrame(callJMethod(jobj, "topics", maxTermsPerTopic))
415422
vocabulary <- callJMethod(jobj, "vocabulary")
423+
trainingLogLikelihood <- if (isDistributed) {
424+
callJMethod(jobj, "trainingLogLikelihood")
425+
} else {
426+
NA
427+
}
428+
logPrior <- if (isDistributed) {
429+
callJMethod(jobj, "logPrior")
430+
} else {
431+
NA
432+
}
416433
list(docConcentration = unlist(docConcentration),
417434
topicConcentration = topicConcentration,
418435
logLikelihood = logLikelihood, logPerplexity = logPerplexity,
419436
isDistributed = isDistributed, vocabSize = vocabSize,
420-
topics = topics, vocabulary = unlist(vocabulary))
437+
topics = topics, vocabulary = unlist(vocabulary),
438+
trainingLogLikelihood = trainingLogLikelihood, logPrior = logPrior)
421439
})
422440

423441
# Returns the log perplexity of a Latent Dirichlet Allocation model produced by \code{spark.lda}

R/pkg/inst/tests/testthat/test_mllib_clustering.R

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -166,12 +166,16 @@ test_that("spark.lda with libsvm", {
166166
topics <- stats$topicTopTerms
167167
weights <- stats$topicTopTermsWeights
168168
vocabulary <- stats$vocabulary
169+
trainingLogLikelihood <- stats$trainingLogLikelihood
170+
logPrior <- stats$logPrior
169171

170-
expect_false(isDistributed)
172+
expect_true(isDistributed)
171173
expect_true(logLikelihood <= 0 & is.finite(logLikelihood))
172174
expect_true(logPerplexity >= 0 & is.finite(logPerplexity))
173175
expect_equal(vocabSize, 11)
174176
expect_true(is.null(vocabulary))
177+
expect_true(trainingLogLikelihood <= 0 & !is.na(trainingLogLikelihood))
178+
expect_true(logPrior <= 0 & !is.na(logPrior))
175179

176180
# Test model save/load
177181
modelPath <- tempfile(pattern = "spark-lda", fileext = ".tmp")
@@ -181,11 +185,13 @@ test_that("spark.lda with libsvm", {
181185
model2 <- read.ml(modelPath)
182186
stats2 <- summary(model2)
183187

184-
expect_false(stats2$isDistributed)
188+
expect_true(stats2$isDistributed)
185189
expect_equal(logLikelihood, stats2$logLikelihood)
186190
expect_equal(logPerplexity, stats2$logPerplexity)
187191
expect_equal(vocabSize, stats2$vocabSize)
188192
expect_equal(vocabulary, stats2$vocabulary)
193+
expect_equal(trainingLogLikelihood, stats2$trainingLogLikelihood)
194+
expect_equal(logPrior, stats2$logPrior)
189195

190196
unlink(modelPath)
191197
})
@@ -202,12 +208,16 @@ test_that("spark.lda with text input", {
202208
topics <- stats$topicTopTerms
203209
weights <- stats$topicTopTermsWeights
204210
vocabulary <- stats$vocabulary
211+
trainingLogLikelihood <- stats$trainingLogLikelihood
212+
logPrior <- stats$logPrior
205213

206214
expect_false(isDistributed)
207215
expect_true(logLikelihood <= 0 & is.finite(logLikelihood))
208216
expect_true(logPerplexity >= 0 & is.finite(logPerplexity))
209217
expect_equal(vocabSize, 10)
210218
expect_true(setequal(stats$vocabulary, c("0", "1", "2", "3", "4", "5", "6", "7", "8", "9")))
219+
expect_true(is.na(trainingLogLikelihood))
220+
expect_true(is.na(logPrior))
211221

212222
# Test model save/load
213223
modelPath <- tempfile(pattern = "spark-lda-text", fileext = ".tmp")
@@ -222,6 +232,8 @@ test_that("spark.lda with text input", {
222232
expect_equal(logPerplexity, stats2$logPerplexity)
223233
expect_equal(vocabSize, stats2$vocabSize)
224234
expect_true(all.equal(vocabulary, stats2$vocabulary))
235+
expect_true(is.na(stats2$trainingLogLikelihood))
236+
expect_true(is.na(stats2$logPrior))
225237

226238
unlink(modelPath)
227239
})

R/pkg/inst/tests/testthat/test_mllib_tree.R

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -126,7 +126,6 @@ test_that("spark.randomForest", {
126126
63.53160, 64.05470, 65.12710, 64.30450,
127127
66.70910, 67.86125, 68.08700, 67.21865,
128128
68.89275, 69.53180, 69.39640, 69.68250),
129-
130129
tolerance = 1e-4)
131130
stats <- summary(model)
132131
expect_equal(stats$numTrees, 20)

0 commit comments

Comments
 (0)