
Commit 8921868

Merge branch 'master' into sc-4929

2 parents ddc39b8 + b06c23d

97 files changed: +2172 additions, -1244 deletions


R/pkg/R/functions.R (14 additions, 10 deletions)

@@ -1485,7 +1485,7 @@ setMethod("soundex",
 
 #' Return the partition ID as a column
 #'
-#' Return the partition ID of the Spark task as a SparkDataFrame column.
+#' Return the partition ID as a SparkDataFrame column.
 #' Note that this is nondeterministic because it depends on data partitioning and
 #' task scheduling.
 #'
@@ -2317,7 +2317,8 @@ setMethod("date_format", signature(y = "Column", x = "character"),
 
 #' from_utc_timestamp
 #'
-#' Assumes given timestamp is UTC and converts to given timezone.
+#' Given a timestamp, which corresponds to a certain time of day in UTC, returns another timestamp
+#' that corresponds to the same time of day in the given timezone.
 #'
 #' @param y Column to compute on.
 #' @param x time zone to use.
@@ -2340,7 +2341,7 @@ setMethod("from_utc_timestamp", signature(y = "Column", x = "character"),
 #' Locate the position of the first occurrence of substr column in the given string.
 #' Returns null if either of the arguments are null.
 #'
-#' NOTE: The position is not zero based, but 1 based index, returns 0 if substr
+#' NOTE: The position is not zero based, but 1 based index. Returns 0 if substr
 #' could not be found in str.
 #'
 #' @param y column to check
@@ -2391,7 +2392,8 @@ setMethod("next_day", signature(y = "Column", x = "character"),
 
 #' to_utc_timestamp
 #'
-#' Assumes given timestamp is in given timezone and converts to UTC.
+#' Given a timestamp, which corresponds to a certain time of day in the given timezone, returns
+#' another timestamp that corresponds to the same time of day in UTC.
 #'
 #' @param y Column to compute on
 #' @param x timezone to use
@@ -2539,7 +2541,7 @@ setMethod("shiftLeft", signature(y = "Column", x = "numeric"),
 
 #' shiftRight
 #'
-#' Shift the given value numBits right. If the given value is a long value, it will return
+#' (Signed) shift the given value numBits right. If the given value is a long value, it will return
 #' a long value else it will return an integer value.
 #'
 #' @param y column to compute on.
@@ -2777,7 +2779,7 @@ setMethod("window", signature(x = "Column"),
 #' locate
 #'
 #' Locate the position of the first occurrence of substr.
-#' NOTE: The position is not zero based, but 1 based index, returns 0 if substr
+#' NOTE: The position is not zero based, but 1 based index. Returns 0 if substr
 #' could not be found in str.
 #'
 #' @param substr a character string to be matched.
@@ -2823,7 +2825,8 @@ setMethod("lpad", signature(x = "Column", len = "numeric", pad = "character"),
 
 #' rand
 #'
-#' Generate a random column with i.i.d. samples from U[0.0, 1.0].
+#' Generate a random column with independent and identically distributed (i.i.d.) samples
+#' from U[0.0, 1.0].
 #'
 #' @param seed a random seed. Can be missing.
 #' @family normal_funcs
@@ -2852,7 +2855,8 @@ setMethod("rand", signature(seed = "numeric"),
 
 #' randn
 #'
-#' Generate a column with i.i.d. samples from the standard normal distribution.
+#' Generate a column with independent and identically distributed (i.i.d.) samples from
+#' the standard normal distribution.
 #'
 #' @param seed a random seed. Can be missing.
 #' @family normal_funcs
@@ -3442,8 +3446,8 @@ setMethod("size",
 
 #' sort_array
 #'
-#' Sorts the input array for the given column in ascending order,
-#' according to the natural ordering of the array elements.
+#' Sorts the input array in ascending or descending order according
+#' to the natural ordering of the array elements.
 #'
 #' @param x A Column to sort
 #' @param asc A logical flag indicating the sorting order.
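
The doc fixes above all describe runtime behavior that SparkR shares with the Scala DataFrame API. A minimal Scala sketch of the clarified semantics follows; the session setup and sample values are illustrative assumptions, not part of this commit.

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._

val spark = SparkSession.builder().master("local[*]").appName("doc-sketch").getOrCreate()
import spark.implicits._

val df = Seq("2016-08-31 09:30:00").toDF("ts")

// from_utc_timestamp: the input is a wall-clock time in UTC; the result is the
// same instant rendered as the wall-clock time in the given timezone.
df.select(from_utc_timestamp($"ts".cast("timestamp"), "PST")).show()

// to_utc_timestamp is the inverse: from the given timezone back to UTC.
df.select(to_utc_timestamp($"ts".cast("timestamp"), "PST")).show()

// instr (and locate) use a 1-based index and return 0 when substr is absent.
Seq("SparkR").toDF("s").select(instr($"s", "R"), instr($"s", "x")).show()  // 6, 0

// sort_array orders by the elements' natural ordering, ascending or descending.
Seq(Seq(3, 1, 2)).toDF("arr").select(sort_array($"arr", asc = false)).show()  // [3, 2, 1]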

R/pkg/inst/tests/testthat/test_mllib.R (17 additions, 7 deletions)

@@ -64,6 +64,16 @@ test_that("spark.glm and predict", {
   rVals <- predict(glm(Sepal.Width ~ Sepal.Length + Species, data = iris), iris)
   expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals)
 
+  # binomial family
+  binomialTraining <- training[training$Species %in% c("versicolor", "virginica"), ]
+  model <- spark.glm(binomialTraining, Species ~ Sepal_Length + Sepal_Width,
+                     family = binomial(link = "logit"))
+  prediction <- predict(model, binomialTraining)
+  expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "character")
+  expected <- c("virginica", "virginica", "virginica", "versicolor", "virginica",
+                "versicolor", "virginica", "versicolor", "virginica", "versicolor")
+  expect_equal(as.list(take(select(prediction, "prediction"), 10))[[1]], expected)
+
   # poisson family
   model <- spark.glm(training, Sepal_Width ~ Sepal_Length + Species,
                      family = poisson(link = identity))
@@ -128,12 +138,12 @@ test_that("spark.glm summary", {
   expect_equal(stats$aic, rStats$aic)
 
   # Test spark.glm works with weighted dataset
-  a1 <- c(0, 1, 2, 3)
-  a2 <- c(5, 2, 1, 3)
-  w <- c(1, 2, 3, 4)
-  b <- c(1, 0, 1, 0)
+  a1 <- c(0, 1, 2, 3, 4)
+  a2 <- c(5, 2, 1, 3, 2)
+  w <- c(1, 2, 3, 4, 5)
+  b <- c(1, 0, 1, 0, 0)
   data <- as.data.frame(cbind(a1, a2, w, b))
-  df <- suppressWarnings(createDataFrame(data))
+  df <- createDataFrame(data)
 
   stats <- summary(spark.glm(df, b ~ a1 + a2, family = "binomial", weightCol = "w"))
   rStats <- summary(glm(b ~ a1 + a2, family = "binomial", data = data, weights = w))
@@ -158,7 +168,7 @@ test_that("spark.glm summary", {
   data <- as.data.frame(cbind(a1, a2, b))
   df <- suppressWarnings(createDataFrame(data))
   regStats <- summary(spark.glm(df, b ~ a1 + a2, regParam = 1.0))
-  expect_equal(regStats$aic, 13.32836, tolerance = 1e-4) # 13.32836 is from summary() result
+  expect_equal(regStats$aic, 14.00976, tolerance = 1e-4) # 14.00976 is from summary() result
 })
 
 test_that("spark.glm save/load", {
@@ -575,7 +585,7 @@ test_that("spark.isotonicRegression", {
   feature <- c(0.0, 1.0, 2.0, 3.0, 4.0)
   weight <- c(1.0, 1.0, 1.0, 1.0, 1.0)
   data <- as.data.frame(cbind(label, feature, weight))
-  df <- suppressWarnings(createDataFrame(data))
+  df <- createDataFrame(data)
 
   model <- spark.isoreg(df, label ~ feature, isotonic = FALSE,
                         weightCol = "weight")
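
For readers more familiar with the Scala side, here is a hedged sketch of what the new weighted binomial coverage exercises through spark.glm, via the underlying ml.regression.GeneralizedLinearRegression API. The VectorAssembler step is an assumption needed because the Scala API takes a vector column rather than an R formula; the data mirrors the test's weighted dataset.

import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.regression.GeneralizedLinearRegression
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().master("local[*]").appName("glm-sketch").getOrCreate()
import spark.implicits._

// Columns match the test: features a1/a2, weight w, binary label b.
val data = Seq(
  (0.0, 5.0, 1.0, 1.0), (1.0, 2.0, 2.0, 0.0), (2.0, 1.0, 3.0, 1.0),
  (3.0, 3.0, 4.0, 0.0), (4.0, 2.0, 5.0, 0.0)
).toDF("a1", "a2", "w", "b")

val assembled = new VectorAssembler()
  .setInputCols(Array("a1", "a2"))
  .setOutputCol("features")
  .transform(data)

val model = new GeneralizedLinearRegression()
  .setFamily("binomial")
  .setLink("logit")
  .setLabelCol("b")
  .setWeightCol("w")
  .fit(assembled)

// The R test compares coefficients and AIC against R's glm(); here we only
// inspect the training summary.
println(model.summary.aic)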

core/src/main/scala/org/apache/spark/SparkContext.scala (12 additions, 0 deletions)

@@ -183,6 +183,8 @@ class SparkContext(config: SparkConf) extends Logging {
   // log out Spark Version in Spark driver log
   logInfo(s"Running Spark version $SPARK_VERSION")
 
+  warnDeprecatedVersions()
+
   /* ------------------------------------------------------------------------------------- *
    | Private variables. These variables keep the internal state of the context, and are    |
    | not accessible by the outside world. They're mutable since we want to initialize all  |
@@ -346,6 +348,16 @@ class SparkContext(config: SparkConf) extends Logging {
     value
   }
 
+  private def warnDeprecatedVersions(): Unit = {
+    val javaVersion = System.getProperty("java.version").split("[+.\\-]+", 3)
+    if (javaVersion.length >= 2 && javaVersion(1).toInt == 7) {
+      logWarning("Support for Java 7 is deprecated as of Spark 2.0.0")
+    }
+    if (scala.util.Properties.releaseVersion.exists(_.startsWith("2.10"))) {
+      logWarning("Support for Scala 2.10 is deprecated as of Spark 2.1.0")
+    }
+  }
+
   /** Control our logLevel. This overrides any user-defined log settings.
    * @param logLevel The desired log level as a string.
    * Valid log levels include: ALL, DEBUG, ERROR, FATAL, INFO, OFF, TRACE, WARN
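
As a quick illustration (not part of the diff), the split("[+.\\-]+", 3) call in warnDeprecatedVersions tokenizes typical java.version strings so that element 1 carries the major version digit on pre-Java-9 JVMs:

// Hypothetical version strings; on Java 7 the second token is "7", which
// triggers the deprecation warning above.
for (v <- Seq("1.7.0_80", "1.8.0_112")) {
  val parts = v.split("[+.\\-]+", 3)
  println(s"$v -> ${parts.mkString("[", ", ", "]")}")
}
// 1.7.0_80 -> [1, 7, 0_80]
// 1.8.0_112 -> [1, 8, 0_112]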

core/src/main/scala/org/apache/spark/api/r/RUtils.scala (0 additions, 1 deletion)

@@ -84,7 +84,6 @@ private[spark] object RUtils {
       }
     } else {
       // Otherwise, assume the package is local
-      // TODO: support this for Mesos
       val sparkRPkgPath = localSparkRPackagePath.getOrElse {
         throw new SparkException("SPARK_HOME not set. Can't locate SparkR package.")
       }

core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala (7 additions, 7 deletions)

@@ -322,17 +322,14 @@ object SparkSubmit {
     }
 
     // Require all R files to be local
-    if (args.isR && !isYarnCluster) {
+    if (args.isR && !isYarnCluster && !isMesosCluster) {
       if (Utils.nonLocalPaths(args.primaryResource).nonEmpty) {
         printErrorAndExit(s"Only local R files are supported: ${args.primaryResource}")
       }
     }
 
     // The following modes are not supported or applicable
     (clusterManager, deployMode) match {
-      case (MESOS, CLUSTER) if args.isR =>
-        printErrorAndExit("Cluster deploy mode is currently not supported for R " +
-          "applications on Mesos clusters.")
       case (STANDALONE, CLUSTER) if args.isPython =>
         printErrorAndExit("Cluster deploy mode is currently not supported for python " +
           "applications on standalone clusters.")
@@ -410,9 +407,9 @@ object SparkSubmit {
       printErrorAndExit("Distributing R packages with standalone cluster is not supported.")
     }
 
-    // TODO: Support SparkR with mesos cluster
-    if (args.isR && clusterManager == MESOS) {
-      printErrorAndExit("SparkR is not supported for Mesos cluster.")
+    // TODO: Support distributing R packages with mesos cluster
+    if (args.isR && clusterManager == MESOS && !RUtils.rPackages.isEmpty) {
+      printErrorAndExit("Distributing R packages with mesos cluster is not supported.")
     }
 
     // If we're running an R app, set the main class to our specific R runner
@@ -598,6 +595,9 @@ object SparkSubmit {
       if (args.pyFiles != null) {
         sysProps("spark.submit.pyFiles") = args.pyFiles
       }
+    } else if (args.isR) {
+      // Second argument is main class
+      childArgs += (args.primaryResource, "")
     } else {
       childArgs += (args.primaryResource, args.mainClass)
     }
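
Taken together, the SparkSubmit changes stop rejecting R applications in Mesos cluster mode and only reject the still-unsupported case of distributing R packages there. A condensed, hypothetical sketch of the combined checks; the names are simplified stand-ins for SparkSubmit's internals, not its actual signatures:

def checkRSubmission(
    isR: Boolean,
    isYarnCluster: Boolean,
    isMesosCluster: Boolean,
    hasNonLocalRFile: Boolean,
    distributesRPackages: Boolean): Option[String] = {
  if (isR && !isYarnCluster && !isMesosCluster && hasNonLocalRFile) {
    // R files may be non-local only in YARN or (now) Mesos cluster mode.
    Some("Only local R files are supported")
  } else if (isR && isMesosCluster && distributesRPackages) {
    // Mesos cluster mode still cannot distribute R packages.
    Some("Distributing R packages with mesos cluster is not supported.")
  } else {
    None // otherwise submission proceeds: R apps in Mesos cluster mode now run
  }
}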
