
Commit 8921868

Merge branch 'master' into sc-4929

2 parents ddc39b8 + b06c23d

97 files changed: +2172 additions, -1244 deletions


R/pkg/R/functions.R (14 additions, 10 deletions)

@@ -1485,7 +1485,7 @@ setMethod("soundex",
 
 #' Return the partition ID as a column
 #'
-#' Return the partition ID of the Spark task as a SparkDataFrame column.
+#' Return the partition ID as a SparkDataFrame column.
 #' Note that this is nondeterministic because it depends on data partitioning and
 #' task scheduling.
 #'
@@ -2317,7 +2317,8 @@ setMethod("date_format", signature(y = "Column", x = "character"),
 
 #' from_utc_timestamp
 #'
-#' Assumes given timestamp is UTC and converts to given timezone.
+#' Given a timestamp, which corresponds to a certain time of day in UTC, returns another timestamp
+#' that corresponds to the same time of day in the given timezone.
 #'
 #' @param y Column to compute on.
 #' @param x time zone to use.
@@ -2340,7 +2341,7 @@ setMethod("from_utc_timestamp", signature(y = "Column", x = "character"),
 #' Locate the position of the first occurrence of substr column in the given string.
 #' Returns null if either of the arguments are null.
 #'
-#' NOTE: The position is not zero based, but 1 based index, returns 0 if substr
+#' NOTE: The position is not zero based, but 1 based index. Returns 0 if substr
 #' could not be found in str.
 #'
 #' @param y column to check
@@ -2391,7 +2392,8 @@ setMethod("next_day", signature(y = "Column", x = "character"),
 
 #' to_utc_timestamp
 #'
-#' Assumes given timestamp is in given timezone and converts to UTC.
+#' Given a timestamp, which corresponds to a certain time of day in the given timezone, returns
+#' another timestamp that corresponds to the same time of day in UTC.
 #'
 #' @param y Column to compute on
 #' @param x timezone to use
@@ -2539,7 +2541,7 @@ setMethod("shiftLeft", signature(y = "Column", x = "numeric"),
 
 #' shiftRight
 #'
-#' Shift the given value numBits right. If the given value is a long value, it will return
+#' (Signed) shift the given value numBits right. If the given value is a long value, it will return
 #' a long value else it will return an integer value.
 #'
 #' @param y column to compute on.
@@ -2777,7 +2779,7 @@ setMethod("window", signature(x = "Column"),
 #' locate
 #'
 #' Locate the position of the first occurrence of substr.
-#' NOTE: The position is not zero based, but 1 based index, returns 0 if substr
+#' NOTE: The position is not zero based, but 1 based index. Returns 0 if substr
 #' could not be found in str.
 #'
 #' @param substr a character string to be matched.
@@ -2823,7 +2825,8 @@ setMethod("lpad", signature(x = "Column", len = "numeric", pad = "character"),
 
 #' rand
 #'
-#' Generate a random column with i.i.d. samples from U[0.0, 1.0].
+#' Generate a random column with independent and identically distributed (i.i.d.) samples
+#' from U[0.0, 1.0].
 #'
 #' @param seed a random seed. Can be missing.
 #' @family normal_funcs
@@ -2852,7 +2855,8 @@ setMethod("rand", signature(seed = "numeric"),
 
 #' randn
 #'
-#' Generate a column with i.i.d. samples from the standard normal distribution.
+#' Generate a column with independent and identically distributed (i.i.d.) samples from
+#' the standard normal distribution.
 #'
 #' @param seed a random seed. Can be missing.
 #' @family normal_funcs
@@ -3442,8 +3446,8 @@ setMethod("size",
 
 #' sort_array
 #'
-#' Sorts the input array for the given column in ascending order,
-#' according to the natural ordering of the array elements.
+#' Sorts the input array in ascending or descending order according
+#' to the natural ordering of the array elements.
 #'
 #' @param x A Column to sort
 #' @param asc A logical flag indicating the sorting order.
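
The doc fixes above all describe runtime behavior that SparkR shares with the Scala DataFrame API. A minimal Scala sketch of the clarified semantics follows; the session setup and sample values are illustrative assumptions, not part of this commit.

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._

val spark = SparkSession.builder().master("local[*]").appName("doc-sketch").getOrCreate()
import spark.implicits._

val df = Seq("2016-08-31 09:30:00").toDF("ts")

// from_utc_timestamp: the input is a wall-clock time in UTC; the result is the
// same instant rendered as the wall-clock time in the given timezone.
df.select(from_utc_timestamp($"ts".cast("timestamp"), "PST")).show()

// to_utc_timestamp is the inverse: from the given timezone back to UTC.
df.select(to_utc_timestamp($"ts".cast("timestamp"), "PST")).show()

// instr (and locate) use a 1-based index and return 0 when substr is absent.
Seq("SparkR").toDF("s").select(instr($"s", "R"), instr($"s", "x")).show()  // 6, 0

// sort_array orders by the elements' natural ordering, ascending or descending.
Seq(Seq(3, 1, 2)).toDF("arr").select(sort_array($"arr", asc = false)).show()  // [3, 2, 1]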

R/pkg/inst/tests/testthat/test_mllib.R (17 additions, 7 deletions)

@@ -64,6 +64,16 @@ test_that("spark.glm and predict", {
   rVals <- predict(glm(Sepal.Width ~ Sepal.Length + Species, data = iris), iris)
   expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals)
 
+  # binomial family
+  binomialTraining <- training[training$Species %in% c("versicolor", "virginica"), ]
+  model <- spark.glm(binomialTraining, Species ~ Sepal_Length + Sepal_Width,
+                     family = binomial(link = "logit"))
+  prediction <- predict(model, binomialTraining)
+  expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "character")
+  expected <- c("virginica", "virginica", "virginica", "versicolor", "virginica",
+                "versicolor", "virginica", "versicolor", "virginica", "versicolor")
+  expect_equal(as.list(take(select(prediction, "prediction"), 10))[[1]], expected)
+
   # poisson family
   model <- spark.glm(training, Sepal_Width ~ Sepal_Length + Species,
                      family = poisson(link = identity))
@@ -128,12 +138,12 @@ test_that("spark.glm summary", {
   expect_equal(stats$aic, rStats$aic)
 
   # Test spark.glm works with weighted dataset
-  a1 <- c(0, 1, 2, 3)
-  a2 <- c(5, 2, 1, 3)
-  w <- c(1, 2, 3, 4)
-  b <- c(1, 0, 1, 0)
+  a1 <- c(0, 1, 2, 3, 4)
+  a2 <- c(5, 2, 1, 3, 2)
+  w <- c(1, 2, 3, 4, 5)
+  b <- c(1, 0, 1, 0, 0)
   data <- as.data.frame(cbind(a1, a2, w, b))
-  df <- suppressWarnings(createDataFrame(data))
+  df <- createDataFrame(data)
 
   stats <- summary(spark.glm(df, b ~ a1 + a2, family = "binomial", weightCol = "w"))
   rStats <- summary(glm(b ~ a1 + a2, family = "binomial", data = data, weights = w))
@@ -158,7 +168,7 @@ test_that("spark.glm summary", {
   data <- as.data.frame(cbind(a1, a2, b))
   df <- suppressWarnings(createDataFrame(data))
   regStats <- summary(spark.glm(df, b ~ a1 + a2, regParam = 1.0))
-  expect_equal(regStats$aic, 13.32836, tolerance = 1e-4) # 13.32836 is from summary() result
+  expect_equal(regStats$aic, 14.00976, tolerance = 1e-4) # 14.00976 is from summary() result
 })
 
 test_that("spark.glm save/load", {
@@ -575,7 +585,7 @@ test_that("spark.isotonicRegression", {
   feature <- c(0.0, 1.0, 2.0, 3.0, 4.0)
   weight <- c(1.0, 1.0, 1.0, 1.0, 1.0)
   data <- as.data.frame(cbind(label, feature, weight))
-  df <- suppressWarnings(createDataFrame(data))
+  df <- createDataFrame(data)
 
   model <- spark.isoreg(df, label ~ feature, isotonic = FALSE,
                         weightCol = "weight")
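
For readers more familiar with the Scala side, here is a hedged sketch of what the new weighted binomial coverage exercises through spark.glm, via the underlying ml.regression.GeneralizedLinearRegression API. The VectorAssembler step is an assumption needed because the Scala API takes a vector column rather than an R formula; the data mirrors the test's weighted dataset.

import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.regression.GeneralizedLinearRegression
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().master("local[*]").appName("glm-sketch").getOrCreate()
import spark.implicits._

// Columns match the test: features a1/a2, weight w, binary label b.
val data = Seq(
  (0.0, 5.0, 1.0, 1.0), (1.0, 2.0, 2.0, 0.0), (2.0, 1.0, 3.0, 1.0),
  (3.0, 3.0, 4.0, 0.0), (4.0, 2.0, 5.0, 0.0)
).toDF("a1", "a2", "w", "b")

val assembled = new VectorAssembler()
  .setInputCols(Array("a1", "a2"))
  .setOutputCol("features")
  .transform(data)

val model = new GeneralizedLinearRegression()
  .setFamily("binomial")
  .setLink("logit")
  .setLabelCol("b")
  .setWeightCol("w")
  .fit(assembled)

// The R test compares coefficients and AIC against R's glm(); here we only
// inspect the training summary.
println(model.summary.aic)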

core/src/main/scala/org/apache/spark/SparkContext.scala (12 additions, 0 deletions)

@@ -183,6 +183,8 @@ class SparkContext(config: SparkConf) extends Logging {
   // log out Spark Version in Spark driver log
   logInfo(s"Running Spark version $SPARK_VERSION")
 
+  warnDeprecatedVersions()
+
   /* ------------------------------------------------------------------------------------- *
    | Private variables. These variables keep the internal state of the context, and are    |
    | not accessible by the outside world. They're mutable since we want to initialize all  |
@@ -346,6 +348,16 @@ class SparkContext(config: SparkConf) extends Logging {
     value
   }
 
+  private def warnDeprecatedVersions(): Unit = {
+    val javaVersion = System.getProperty("java.version").split("[+.\\-]+", 3)
+    if (javaVersion.length >= 2 && javaVersion(1).toInt == 7) {
+      logWarning("Support for Java 7 is deprecated as of Spark 2.0.0")
+    }
+    if (scala.util.Properties.releaseVersion.exists(_.startsWith("2.10"))) {
+      logWarning("Support for Scala 2.10 is deprecated as of Spark 2.1.0")
+    }
+  }
+
   /** Control our logLevel. This overrides any user-defined log settings.
    * @param logLevel The desired log level as a string.
    * Valid log levels include: ALL, DEBUG, ERROR, FATAL, INFO, OFF, TRACE, WARN
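
As a quick illustration (not part of the diff), the split("[+.\\-]+", 3) call in warnDeprecatedVersions tokenizes typical java.version strings so that element 1 carries the major version digit on pre-Java-9 JVMs:

// Hypothetical version strings; on Java 7 the second token is "7", which
// triggers the deprecation warning above.
for (v <- Seq("1.7.0_80", "1.8.0_112")) {
  val parts = v.split("[+.\\-]+", 3)
  println(s"$v -> ${parts.mkString("[", ", ", "]")}")
}
// 1.7.0_80 -> [1, 7, 0_80]
// 1.8.0_112 -> [1, 8, 0_112]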

core/src/main/scala/org/apache/spark/api/r/RUtils.scala (0 additions, 1 deletion)

@@ -84,7 +84,6 @@ private[spark] object RUtils {
       }
     } else {
       // Otherwise, assume the package is local
-      // TODO: support this for Mesos
       val sparkRPkgPath = localSparkRPackagePath.getOrElse {
         throw new SparkException("SPARK_HOME not set. Can't locate SparkR package.")
       }

core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala (7 additions, 7 deletions)

@@ -322,17 +322,14 @@ object SparkSubmit {
     }
 
     // Require all R files to be local
-    if (args.isR && !isYarnCluster) {
+    if (args.isR && !isYarnCluster && !isMesosCluster) {
       if (Utils.nonLocalPaths(args.primaryResource).nonEmpty) {
         printErrorAndExit(s"Only local R files are supported: ${args.primaryResource}")
       }
     }
 
     // The following modes are not supported or applicable
     (clusterManager, deployMode) match {
-      case (MESOS, CLUSTER) if args.isR =>
-        printErrorAndExit("Cluster deploy mode is currently not supported for R " +
-          "applications on Mesos clusters.")
       case (STANDALONE, CLUSTER) if args.isPython =>
         printErrorAndExit("Cluster deploy mode is currently not supported for python " +
           "applications on standalone clusters.")
@@ -410,9 +407,9 @@ object SparkSubmit {
       printErrorAndExit("Distributing R packages with standalone cluster is not supported.")
     }
 
-    // TODO: Support SparkR with mesos cluster
-    if (args.isR && clusterManager == MESOS) {
-      printErrorAndExit("SparkR is not supported for Mesos cluster.")
+    // TODO: Support distributing R packages with mesos cluster
+    if (args.isR && clusterManager == MESOS && !RUtils.rPackages.isEmpty) {
+      printErrorAndExit("Distributing R packages with mesos cluster is not supported.")
     }
 
     // If we're running an R app, set the main class to our specific R runner
@@ -598,6 +595,9 @@ object SparkSubmit {
       if (args.pyFiles != null) {
         sysProps("spark.submit.pyFiles") = args.pyFiles
       }
+    } else if (args.isR) {
+      // Second argument is main class
+      childArgs += (args.primaryResource, "")
     } else {
       childArgs += (args.primaryResource, args.mainClass)
     }
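
Taken together, the SparkSubmit changes stop rejecting R applications in Mesos cluster mode and only reject the still-unsupported case of distributing R packages there. A condensed, hypothetical sketch of the combined checks; the names are simplified stand-ins for SparkSubmit's internals, not its actual signatures:

def checkRSubmission(
    isR: Boolean,
    isYarnCluster: Boolean,
    isMesosCluster: Boolean,
    hasNonLocalRFile: Boolean,
    distributesRPackages: Boolean): Option[String] = {
  if (isR && !isYarnCluster && !isMesosCluster && hasNonLocalRFile) {
    // R files may be non-local only in YARN or (now) Mesos cluster mode.
    Some("Only local R files are supported")
  } else if (isR && isMesosCluster && distributesRPackages) {
    // Mesos cluster mode still cannot distribute R packages.
    Some("Distributing R packages with mesos cluster is not supported.")
  } else {
    None // otherwise submission proceeds: R apps in Mesos cluster mode now run
  }
}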
