
Commit 4e65e09

Merge pull request apache-spark-on-k8s#212 from palantir/rk/latest
2 parents 7e4bccd + c786340 commit 4e65e09

135 files changed: +2610 −1261 lines changed

R/README.md

Lines changed: 1 addition & 5 deletions
@@ -66,11 +66,7 @@ To run one of them, use `./bin/spark-submit <filename> <args>`. For example:
 ```bash
 ./bin/spark-submit examples/src/main/r/dataframe.R
 ```
-You can also run the unit tests for SparkR by running. You need to install the [testthat](http://cran.r-project.org/web/packages/testthat/index.html) package first:
-```bash
-R -e 'install.packages("testthat", repos="http://cran.us.r-project.org")'
-./R/run-tests.sh
-```
+You can run R unit tests by following the instructions under [Running R Tests](http://spark.apache.org/docs/latest/building-spark.html#running-r-tests).
 
 ### Running on YARN
 

R/WINDOWS.md

Lines changed: 1 addition & 2 deletions
@@ -34,10 +34,9 @@ To run the SparkR unit tests on Windows, the following steps are required —ass
 
 4. Set the environment variable `HADOOP_HOME` to the full path to the newly created `hadoop` directory.
 
-5. Run unit tests for SparkR by running the command below. You need to install the [testthat](http://cran.r-project.org/web/packages/testthat/index.html) package first:
+5. Run unit tests for SparkR by running the command below. You need to install the needed packages following the instructions under [Running R Tests](http://spark.apache.org/docs/latest/building-spark.html#running-r-tests) first:
 
 ```
-R -e "install.packages('testthat', repos='http://cran.us.r-project.org')"
 .\bin\spark-submit2.cmd --conf spark.hadoop.fs.defaultFS="file:///" R\pkg\tests\run-all.R
 ```
 

R/pkg/NAMESPACE

Lines changed: 1 addition & 0 deletions
@@ -357,6 +357,7 @@ exportMethods("%<=>%",
               "to_utc_timestamp",
               "translate",
               "trim",
+              "trunc",
               "unbase64",
               "unhex",
               "unix_timestamp",

R/pkg/R/SQLContext.R

Lines changed: 3 additions & 3 deletions
@@ -334,7 +334,7 @@ setMethod("toDF", signature(x = "RDD"),
 #'
 #' Loads a JSON file, returning the result as a SparkDataFrame
 #' By default, (\href{http://jsonlines.org/}{JSON Lines text format or newline-delimited JSON}
-#' ) is supported. For JSON (one record per file), set a named property \code{wholeFile} to
+#' ) is supported. For JSON (one record per file), set a named property \code{multiLine} to
 #' \code{TRUE}.
 #' It goes through the entire dataset once to determine the schema.
 #'
@@ -348,7 +348,7 @@ setMethod("toDF", signature(x = "RDD"),
 #' sparkR.session()
 #' path <- "path/to/file.json"
 #' df <- read.json(path)
-#' df <- read.json(path, wholeFile = TRUE)
+#' df <- read.json(path, multiLine = TRUE)
 #' df <- jsonFile(path)
 #' }
 #' @name read.json
@@ -598,7 +598,7 @@ tableToDF <- function(tableName) {
 #' df1 <- read.df("path/to/file.json", source = "json")
 #' schema <- structType(structField("name", "string"),
 #'                      structField("info", "map<string,double>"))
-#' df2 <- read.df(mapTypeJsonPath, "json", schema, wholeFile = TRUE)
+#' df2 <- read.df(mapTypeJsonPath, "json", schema, multiLine = TRUE)
 #' df3 <- loadDF("data/test_table", "parquet", mergeSchema = "true")
 #' }
 #' @name read.df

R/pkg/R/functions.R

Lines changed: 29 additions & 0 deletions
@@ -4015,3 +4015,32 @@ setMethod("input_file_name", signature("missing"),
             jc <- callJStatic("org.apache.spark.sql.functions", "input_file_name")
             column(jc)
           })
+
+#' trunc
+#'
+#' Returns date truncated to the unit specified by the format.
+#'
+#' @param x Column to compute on.
+#' @param format string used for specify the truncation method. For example, "year", "yyyy",
+#'               "yy" for truncate by year, or "month", "mon", "mm" for truncate by month.
+#'
+#' @rdname trunc
+#' @name trunc
+#' @family date time functions
+#' @aliases trunc,Column-method
+#' @export
+#' @examples
+#' \dontrun{
+#' trunc(df$c, "year")
+#' trunc(df$c, "yy")
+#' trunc(df$c, "month")
+#' trunc(df$c, "mon")
+#' }
+#' @note trunc since 2.3.0
+setMethod("trunc",
+          signature(x = "Column"),
+          function(x, format) {
+            jc <- callJStatic("org.apache.spark.sql.functions", "trunc",
+                              x@jc, as.character(format))
+            column(jc)
+          })
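The new SparkR wrapper delegates through `callJStatic("org.apache.spark.sql.functions", "trunc", ...)` to the JVM function of the same name. A minimal sketch of the equivalent call on the Java side, assuming Spark SQL is on the classpath and using a hypothetical column named `d`:

```java
import static org.apache.spark.sql.functions.col;
import static org.apache.spark.sql.functions.trunc;

import org.apache.spark.sql.Column;

public final class TruncSketch {
  // Builds the same column expression the SparkR wrapper produces:
  // the date column truncated to the unit named by the format string.
  public static Column truncatedToYear() {
    // Accepted formats (per the docs above): "year", "yyyy", "yy" for year;
    // "month", "mon", "mm" for month.
    return trunc(col("d"), "year");
  }
}
```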

R/pkg/tests/fulltests/test_sparkSQL.R

Lines changed: 2 additions & 0 deletions
@@ -1382,6 +1382,8 @@ test_that("column functions", {
   c20 <- to_timestamp(c) + to_timestamp(c, "yyyy") + to_date(c, "yyyy")
   c21 <- posexplode_outer(c) + explode_outer(c)
   c22 <- not(c)
+  c23 <- trunc(c, "year") + trunc(c, "yyyy") + trunc(c, "yy") +
+    trunc(c, "month") + trunc(c, "mon") + trunc(c, "mm")
 
   # Test if base::is.nan() is exposed
   expect_equal(is.nan(c("a", "b")), c(FALSE, FALSE))

R/pkg/tests/run-all.R

Lines changed: 1 addition & 1 deletion
@@ -30,10 +30,10 @@ if (.Platform$OS.type == "windows") {
 install.spark()
 
 sparkRDir <- file.path(Sys.getenv("SPARK_HOME"), "R")
-sparkRFilesBefore <- list.files(path = sparkRDir, all.files = TRUE)
 sparkRWhitelistSQLDirs <- c("spark-warehouse", "metastore_db")
 invisible(lapply(sparkRWhitelistSQLDirs,
                  function(x) { unlink(file.path(sparkRDir, x), recursive = TRUE, force = TRUE)}))
+sparkRFilesBefore <- list.files(path = sparkRDir, all.files = TRUE)
 
 sparkRTestMaster <- "local[1]"
 if (identical(Sys.getenv("NOT_CRAN"), "true")) {

appveyor.yml

Lines changed: 4 additions & 6 deletions
@@ -26,6 +26,8 @@ branches:
 
 only_commits:
   files:
+    - appveyor.yml
+    - dev/appveyor-install-dependencies.ps1
     - R/
     - sql/core/src/main/scala/org/apache/spark/sql/api/r/
     - core/src/main/scala/org/apache/spark/api/r/
@@ -38,12 +40,8 @@ install:
   # Install maven and dependencies
   - ps: .\dev\appveyor-install-dependencies.ps1
   # Required package for R unit tests
-  - cmd: R -e "install.packages('testthat', repos='http://cran.us.r-project.org')"
-  - cmd: R -e "packageVersion('testthat')"
-  - cmd: R -e "install.packages('e1071', repos='http://cran.us.r-project.org')"
-  - cmd: R -e "packageVersion('e1071')"
-  - cmd: R -e "install.packages('survival', repos='http://cran.us.r-project.org')"
-  - cmd: R -e "packageVersion('survival')"
+  - cmd: R -e "install.packages(c('knitr', 'rmarkdown', 'testthat', 'e1071', 'survival'), repos='http://cran.us.r-project.org')"
+  - cmd: R -e "packageVersion('knitr'); packageVersion('rmarkdown'); packageVersion('testthat'); packageVersion('e1071'); packageVersion('survival')"
 
 build_script:
   - cmd: mvn -DskipTests -Psparkr -Phive -Phive-thriftserver package

common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/ExternalShuffleBlockHandler.java

Lines changed: 49 additions & 21 deletions
@@ -44,7 +44,6 @@
 import static org.apache.spark.network.util.NettyUtils.getRemoteAddress;
 import org.apache.spark.network.util.TransportConf;
 
-
 /**
  * RPC Handler for a server which can serve shuffle blocks from outside of an Executor process.
  *
@@ -91,26 +90,8 @@ protected void handleMessage(
       try {
         OpenBlocks msg = (OpenBlocks) msgObj;
         checkAuth(client, msg.appId);
-
-        Iterator<ManagedBuffer> iter = new Iterator<ManagedBuffer>() {
-          private int index = 0;
-
-          @Override
-          public boolean hasNext() {
-            return index < msg.blockIds.length;
-          }
-
-          @Override
-          public ManagedBuffer next() {
-            final ManagedBuffer block = blockManager.getBlockData(msg.appId, msg.execId,
-              msg.blockIds[index]);
-            index++;
-            metrics.blockTransferRateBytes.mark(block != null ? block.size() : 0);
-            return block;
-          }
-        };
-
-        long streamId = streamManager.registerStream(client.getClientId(), iter);
+        long streamId = streamManager.registerStream(client.getClientId(),
+          new ManagedBufferIterator(msg.appId, msg.execId, msg.blockIds));
         if (logger.isTraceEnabled()) {
           logger.trace("Registered streamId {} with {} buffers for client {} from host {}",
             streamId,
@@ -209,4 +190,51 @@ public Map<String, Metric> getMetrics() {
     }
   }
 
+  private class ManagedBufferIterator implements Iterator<ManagedBuffer> {
+
+    private int index = 0;
+    private final String appId;
+    private final String execId;
+    private final int shuffleId;
+    // An array containing mapId and reduceId pairs.
+    private final int[] mapIdAndReduceIds;
+
+    ManagedBufferIterator(String appId, String execId, String[] blockIds) {
+      this.appId = appId;
+      this.execId = execId;
+      String[] blockId0Parts = blockIds[0].split("_");
+      if (blockId0Parts.length != 4 || !blockId0Parts[0].equals("shuffle")) {
+        throw new IllegalArgumentException("Unexpected shuffle block id format: " + blockIds[0]);
+      }
+      this.shuffleId = Integer.parseInt(blockId0Parts[1]);
+      mapIdAndReduceIds = new int[2 * blockIds.length];
+      for (int i = 0; i < blockIds.length; i++) {
+        String[] blockIdParts = blockIds[i].split("_");
+        if (blockIdParts.length != 4 || !blockIdParts[0].equals("shuffle")) {
+          throw new IllegalArgumentException("Unexpected shuffle block id format: " + blockIds[i]);
+        }
+        if (Integer.parseInt(blockIdParts[1]) != shuffleId) {
+          throw new IllegalArgumentException("Expected shuffleId=" + shuffleId +
+            ", got:" + blockIds[i]);
+        }
+        mapIdAndReduceIds[2 * i] = Integer.parseInt(blockIdParts[2]);
+        mapIdAndReduceIds[2 * i + 1] = Integer.parseInt(blockIdParts[3]);
+      }
+    }
+
+    @Override
+    public boolean hasNext() {
+      return index < mapIdAndReduceIds.length;
+    }
+
+    @Override
+    public ManagedBuffer next() {
+      final ManagedBuffer block = blockManager.getBlockData(appId, execId, shuffleId,
+        mapIdAndReduceIds[index], mapIdAndReduceIds[index + 1]);
+      index += 2;
+      metrics.blockTransferRateBytes.mark(block != null ? block.size() : 0);
+      return block;
+    }
+  }
+
 }
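For readers following the refactor: the anonymous `Iterator<ManagedBuffer>` is replaced by the named `ManagedBufferIterator`, which validates and flattens the block ids once, up front. Below is a standalone sketch of that parsing step, illustrative only and using made-up block ids; the real logic lives in the private inner class above.

```java
// Sketch of the id handling in ManagedBufferIterator: every id must look like
// "shuffle_<shuffleId>_<mapId>_<reduceId>" and share one shuffleId; the
// (mapId, reduceId) pairs are flattened into a single int array.
public final class BlockIdParsingSketch {
  public static void main(String[] args) {
    String[] blockIds = {"shuffle_3_0_7", "shuffle_3_1_7"};  // hypothetical ids

    String[] first = blockIds[0].split("_");
    if (first.length != 4 || !first[0].equals("shuffle")) {
      throw new IllegalArgumentException("Unexpected shuffle block id format: " + blockIds[0]);
    }
    int shuffleId = Integer.parseInt(first[1]);

    // Even slots hold mapIds, odd slots hold reduceIds.
    int[] mapIdAndReduceIds = new int[2 * blockIds.length];
    for (int i = 0; i < blockIds.length; i++) {
      String[] parts = blockIds[i].split("_");
      if (parts.length != 4 || !parts[0].equals("shuffle")) {
        throw new IllegalArgumentException("Unexpected shuffle block id format: " + blockIds[i]);
      }
      if (Integer.parseInt(parts[1]) != shuffleId) {
        throw new IllegalArgumentException("Expected shuffleId=" + shuffleId + ", got:" + blockIds[i]);
      }
      mapIdAndReduceIds[2 * i] = Integer.parseInt(parts[2]);
      mapIdAndReduceIds[2 * i + 1] = Integer.parseInt(parts[3]);
    }

    // The iterator then walks the array two ints at a time, one buffer per pair.
    for (int i = 0; i < mapIdAndReduceIds.length; i += 2) {
      System.out.println("shuffleId=" + shuffleId
          + " mapId=" + mapIdAndReduceIds[i]
          + " reduceId=" + mapIdAndReduceIds[i + 1]);
    }
  }
}
```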

common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/ExternalShuffleBlockResolver.java

Lines changed: 8 additions & 15 deletions
@@ -150,27 +150,20 @@ public void registerExecutor(
   }
 
   /**
-   * Obtains a FileSegmentManagedBuffer from a shuffle block id. We expect the blockId has the
-   * format "shuffle_ShuffleId_MapId_ReduceId" (from ShuffleBlockId), and additionally make
-   * assumptions about how the hash and sort based shuffles store their data.
+   * Obtains a FileSegmentManagedBuffer from (shuffleId, mapId, reduceId). We make assumptions
+   * about how the hash and sort based shuffles store their data.
    */
-  public ManagedBuffer getBlockData(String appId, String execId, String blockId) {
-    String[] blockIdParts = blockId.split("_");
-    if (blockIdParts.length < 4) {
-      throw new IllegalArgumentException("Unexpected block id format: " + blockId);
-    } else if (!blockIdParts[0].equals("shuffle")) {
-      throw new IllegalArgumentException("Expected shuffle block id, got: " + blockId);
-    }
-    int shuffleId = Integer.parseInt(blockIdParts[1]);
-    int mapId = Integer.parseInt(blockIdParts[2]);
-    int reduceId = Integer.parseInt(blockIdParts[3]);
-
+  public ManagedBuffer getBlockData(
+      String appId,
+      String execId,
+      int shuffleId,
+      int mapId,
+      int reduceId) {
     ExecutorShuffleInfo executor = executors.get(new AppExecId(appId, execId));
     if (executor == null) {
      throw new RuntimeException(
        String.format("Executor is not registered (appId=%s, execId=%s)", appId, execId));
     }
-
     return getSortBasedShuffleBlockData(executor, shuffleId, mapId, reduceId);
   }
 
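The resolver's `getBlockData` now takes the already-parsed `(shuffleId, mapId, reduceId)` triple instead of a raw block-id string, so parsing happens once in the handler rather than per lookup. A hedged sketch of how a caller holding an old-style block id could bridge to the new signature; the helper class and the well-formed-id assumption are illustrative, not part of the commit:

```java
import org.apache.spark.network.buffer.ManagedBuffer;
import org.apache.spark.network.shuffle.ExternalShuffleBlockResolver;

// Illustrative bridge from the removed string-based overload to the new one.
final class ResolverCallSketch {
  static ManagedBuffer fetch(ExternalShuffleBlockResolver resolver,
                             String appId, String execId, String blockId) {
    // Assumes blockId is well formed: "shuffle_<shuffleId>_<mapId>_<reduceId>".
    String[] parts = blockId.split("_");
    int shuffleId = Integer.parseInt(parts[1]);
    int mapId = Integer.parseInt(parts[2]);
    int reduceId = Integer.parseInt(parts[3]);
    // New signature introduced by this commit.
    return resolver.getBlockData(appId, execId, shuffleId, mapId, reduceId);
  }
}
```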