
Commit f79f12c

Merge branch 'master' into resolveDataSourceScanFilesTwice

2 parents: f1da0a4 + 2ff1467

265 files changed (+2834 / -1747 lines)


R/WINDOWS.md

Lines changed: 1 addition & 1 deletion
@@ -38,6 +38,6 @@ To run the SparkR unit tests on Windows, the following steps are required —ass
 
 ```
 R -e "install.packages('testthat', repos='http://cran.us.r-project.org')"
-.\bin\spark-submit2.cmd --conf spark.hadoop.fs.default.name="file:///" R\pkg\tests\run-all.R
+.\bin\spark-submit2.cmd --conf spark.hadoop.fs.defaultFS="file:///" R\pkg\tests\run-all.R
 ```
 
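Context for the key rename: `fs.default.name` is the deprecated Hadoop 1.x property for the default filesystem, and `fs.defaultFS` is its Hadoop 2.x replacement, so the test commands move to the newer key. A minimal sketch to check the setting from SparkR, assuming a local build and that the `spark.hadoop.*` entry is readable back via `sparkR.conf` (an assumption for illustration):

```
library(SparkR)

# Hadoop settings reach the session through the spark.hadoop.* prefix
sparkR.session(sparkConfig = list(spark.hadoop.fs.defaultFS = "file:///"))

# Read the effective value back from the session configuration
sparkR.conf("spark.hadoop.fs.defaultFS")

sparkR.session.stop()
```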

R/pkg/R/DataFrame.R

Lines changed: 2 additions & 2 deletions
@@ -280,7 +280,7 @@ setMethod("dtypes",
 
 #' Column Names of SparkDataFrame
 #'
-#' Return all column names as a list.
+#' Return a vector of column names.
 #'
 #' @param x a SparkDataFrame.
 #'
@@ -338,7 +338,7 @@ setMethod("colnames",
 })
 
 #' @param value a character vector. Must have the same length as the number
-#' of columns in the SparkDataFrame.
+#' of columns to be renamed.
 #' @rdname columns
 #' @aliases colnames<-,SparkDataFrame-method
 #' @name colnames<-
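The reworded `@param value` documents subset assignment into `colnames()`, which this commit also exercises with new tests (see `R/pkg/inst/tests/testthat/test_sparkSQL.R` below). A minimal sketch of the behavior, assuming an active SparkR session:

```
df <- createDataFrame(data.frame(a = 1:3, b = c("x", "y", "z")))

# Assigning into a subset of colnames() renames only that column;
# R expands this into a full-length replacement vector internally
colnames(df)[1] <- "id"
colnames(df)  # "id" "b"

# names() is equivalent for a SparkDataFrame and supports the same pattern
names(df)[2] <- "label"
```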

R/pkg/R/mllib_classification.R

Lines changed: 7 additions & 8 deletions
@@ -75,9 +75,9 @@ setClass("NaiveBayesModel", representation(jobj = "jobj"))
 #' @examples
 #' \dontrun{
 #' sparkR.session()
-#' df <- createDataFrame(iris)
-#' training <- df[df$Species %in% c("versicolor", "virginica"), ]
-#' model <- spark.svmLinear(training, Species ~ ., regParam = 0.5)
+#' t <- as.data.frame(Titanic)
+#' training <- createDataFrame(t)
+#' model <- spark.svmLinear(training, Survived ~ ., regParam = 0.5)
 #' summary <- summary(model)
 #'
 #' # fitted values on training data
@@ -220,9 +220,9 @@ function(object, path, overwrite = FALSE) {
 #' \dontrun{
 #' sparkR.session()
 #' # binary logistic regression
-#' df <- createDataFrame(iris)
-#' training <- df[df$Species %in% c("versicolor", "virginica"), ]
-#' model <- spark.logit(training, Species ~ ., regParam = 0.5)
+#' t <- as.data.frame(Titanic)
+#' training <- createDataFrame(t)
+#' model <- spark.logit(training, Survived ~ ., regParam = 0.5)
 #' summary <- summary(model)
 #'
 #' # fitted values on training data
@@ -239,8 +239,7 @@ function(object, path, overwrite = FALSE) {
 #'
 #' # multinomial logistic regression
 #'
-#' df <- createDataFrame(iris)
-#' model <- spark.logit(df, Species ~ ., regParam = 0.5)
+#' model <- spark.logit(training, Class ~ ., regParam = 0.5)
 #' summary <- summary(model)
 #'
 #' }
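These examples (and the rest of the commit) swap `iris` for base R's `Titanic` dataset. For reference, a short sketch of the data frame the new examples start from; this is base R's standard flattening of the 4-way contingency table:

```
# Titanic is a 4-way contingency table (Class x Sex x Age x Survived);
# as.data.frame() flattens it to one row per cell, 32 rows in all
t <- as.data.frame(Titanic)
names(t)  # "Class" "Sex" "Age" "Survived" "Freq"
nrow(t)   # 32; Freq holds the passenger count for each cell
```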

R/pkg/R/mllib_clustering.R

Lines changed: 8 additions & 7 deletions
@@ -72,8 +72,9 @@ setClass("LDAModel", representation(jobj = "jobj"))
 #' @examples
 #' \dontrun{
 #' sparkR.session()
-#' df <- createDataFrame(iris)
-#' model <- spark.bisectingKmeans(df, Sepal_Length ~ Sepal_Width, k = 4)
+#' t <- as.data.frame(Titanic)
+#' df <- createDataFrame(t)
+#' model <- spark.bisectingKmeans(df, Class ~ Survived, k = 4)
 #' summary(model)
 #'
 #' # get fitted result from a bisecting k-means model
@@ -82,7 +83,7 @@ setClass("LDAModel", representation(jobj = "jobj"))
 #'
 #' # fitted values on training data
 #' fitted <- predict(model, df)
-#' head(select(fitted, "Sepal_Length", "prediction"))
+#' head(select(fitted, "Class", "prediction"))
 #'
 #' # save fitted model to input path
 #' path <- "path/to/model"
@@ -338,14 +339,14 @@ setMethod("write.ml", signature(object = "GaussianMixtureModel", path = "charact
 #' @examples
 #' \dontrun{
 #' sparkR.session()
-#' data(iris)
-#' df <- createDataFrame(iris)
-#' model <- spark.kmeans(df, Sepal_Length ~ Sepal_Width, k = 4, initMode = "random")
+#' t <- as.data.frame(Titanic)
+#' df <- createDataFrame(t)
+#' model <- spark.kmeans(df, Class ~ Survived, k = 4, initMode = "random")
 #' summary(model)
 #'
 #' # fitted values on training data
 #' fitted <- predict(model, df)
-#' head(select(fitted, "Sepal_Length", "prediction"))
+#' head(select(fitted, "Class", "prediction"))
 #'
 #' # save fitted model to input path
 #' path <- "path/to/model"

R/pkg/R/mllib_regression.R

Lines changed: 7 additions & 7 deletions
@@ -68,14 +68,14 @@ setClass("IsotonicRegressionModel", representation(jobj = "jobj"))
 #' @examples
 #' \dontrun{
 #' sparkR.session()
-#' data(iris)
-#' df <- createDataFrame(iris)
-#' model <- spark.glm(df, Sepal_Length ~ Sepal_Width, family = "gaussian")
+#' t <- as.data.frame(Titanic)
+#' df <- createDataFrame(t)
+#' model <- spark.glm(df, Freq ~ Sex + Age, family = "gaussian")
 #' summary(model)
 #'
 #' # fitted values on training data
 #' fitted <- predict(model, df)
-#' head(select(fitted, "Sepal_Length", "prediction"))
+#' head(select(fitted, "Freq", "prediction"))
 #'
 #' # save fitted model to input path
 #' path <- "path/to/model"
@@ -137,9 +137,9 @@ setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"),
 #' @examples
 #' \dontrun{
 #' sparkR.session()
-#' data(iris)
-#' df <- createDataFrame(iris)
-#' model <- glm(Sepal_Length ~ Sepal_Width, df, family = "gaussian")
+#' t <- as.data.frame(Titanic)
+#' df <- createDataFrame(t)
+#' model <- glm(Freq ~ Sex + Age, df, family = "gaussian")
 #' summary(model)
 #' }
 #' @note glm since 1.5.0
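Both entry points updated here fit the same Gaussian GLM: `spark.glm` takes the SparkDataFrame first, while `glm` keeps base R's formula-first signature. A minimal sketch contrasting the two calls, assuming an active session:

```
t <- as.data.frame(Titanic)
df <- createDataFrame(t)

# Spark-style entry point: data first, then formula
m1 <- spark.glm(df, Freq ~ Sex + Age, family = "gaussian")

# Base-R-style entry point: formula first, data as an argument
m2 <- glm(Freq ~ Sex + Age, data = df, family = "gaussian")

summary(m1)
```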

R/pkg/R/mllib_tree.R

Lines changed: 10 additions & 8 deletions
@@ -143,14 +143,15 @@ print.summary.treeEnsemble <- function(x) {
 #'
 #' # fit a Gradient Boosted Tree Classification Model
 #' # label must be binary - Only binary classification is supported for GBT.
-#' df <- createDataFrame(iris[iris$Species != "virginica", ])
-#' model <- spark.gbt(df, Species ~ Petal_Length + Petal_Width, "classification")
+#' t <- as.data.frame(Titanic)
+#' df <- createDataFrame(t)
+#' model <- spark.gbt(df, Survived ~ Age + Freq, "classification")
 #'
 #' # numeric label is also supported
-#' iris2 <- iris[iris$Species != "virginica", ]
-#' iris2$NumericSpecies <- ifelse(iris2$Species == "setosa", 0, 1)
-#' df <- createDataFrame(iris2)
-#' model <- spark.gbt(df, NumericSpecies ~ ., type = "classification")
+#' t2 <- as.data.frame(Titanic)
+#' t2$NumericGender <- ifelse(t2$Sex == "Male", 0, 1)
+#' df <- createDataFrame(t2)
+#' model <- spark.gbt(df, NumericGender ~ ., type = "classification")
 #' }
 #' @note spark.gbt since 2.1.0
 setMethod("spark.gbt", signature(data = "SparkDataFrame", formula = "formula"),
@@ -351,8 +352,9 @@ setMethod("write.ml", signature(object = "GBTClassificationModel", path = "chara
 #' summary(savedModel)
 #'
 #' # fit a Random Forest Classification Model
-#' df <- createDataFrame(iris)
-#' model <- spark.randomForest(df, Species ~ Petal_Length + Petal_Width, "classification")
+#' t <- as.data.frame(Titanic)
+#' df <- createDataFrame(t)
+#' model <- spark.randomForest(df, Survived ~ Freq + Age, "classification")
 #' }
 #' @note spark.randomForest since 2.1.0
 setMethod("spark.randomForest", signature(data = "SparkDataFrame", formula = "formula"),

R/pkg/inst/tests/testthat/test_sparkSQL.R

Lines changed: 6 additions & 0 deletions
@@ -898,6 +898,12 @@ test_that("names() colnames() set the column names", {
   expect_equal(names(z)[3], "c")
   names(z)[3] <- "c2"
   expect_equal(names(z)[3], "c2")
+
+  # Test subset assignment
+  colnames(df)[1] <- "col5"
+  expect_equal(colnames(df)[1], "col5")
+  names(df)[2] <- "col6"
+  expect_equal(names(df)[2], "col6")
 })
 
 test_that("head() and first() return the correct data", {

R/pkg/vignettes/sparkr-vignettes.Rmd

Lines changed: 25 additions & 22 deletions
@@ -565,11 +565,10 @@ We use a simple example to demonstrate `spark.logit` usage. In general, there ar
 and 3). Obtain the coefficient matrix of the fitted model using `summary` and use the model for prediction with `predict`.
 
 Binomial logistic regression
-```{r, warning=FALSE}
-df <- createDataFrame(iris)
-# Create a DataFrame containing two classes
-training <- df[df$Species %in% c("versicolor", "virginica"), ]
-model <- spark.logit(training, Species ~ ., regParam = 0.00042)
+```{r}
+t <- as.data.frame(Titanic)
+training <- createDataFrame(t)
+model <- spark.logit(training, Survived ~ ., regParam = 0.04741301)
 summary(model)
 ```
 
@@ -579,10 +578,11 @@ fitted <- predict(model, training)
 ```
 
 Multinomial logistic regression against three classes
-```{r, warning=FALSE}
-df <- createDataFrame(iris)
+```{r}
+t <- as.data.frame(Titanic)
+training <- createDataFrame(t)
 # Note in this case, Spark infers it is multinomial logistic regression, so family = "multinomial" is optional.
-model <- spark.logit(df, Species ~ ., regParam = 0.056)
+model <- spark.logit(training, Class ~ ., regParam = 0.07815179)
 summary(model)
 ```
 
@@ -609,11 +609,12 @@ MLPC employs backpropagation for learning the model. We use the logistic loss fu
 
 `spark.mlp` requires at least two columns in `data`: one named `"label"` and the other one `"features"`. The `"features"` column should be in libSVM-format.
 
-We use iris data set to show how to use `spark.mlp` in classification.
-```{r, warning=FALSE}
-df <- createDataFrame(iris)
+We use Titanic data set to show how to use `spark.mlp` in classification.
+```{r}
+t <- as.data.frame(Titanic)
+training <- createDataFrame(t)
 # fit a Multilayer Perceptron Classification Model
-model <- spark.mlp(df, Species ~ ., blockSize = 128, layers = c(4, 3), solver = "l-bfgs", maxIter = 100, tol = 0.5, stepSize = 1, seed = 1, initialWeights = c(0, 0, 0, 0, 0, 5, 5, 5, 5, 5, 9, 9, 9, 9, 9))
+model <- spark.mlp(training, Survived ~ Age + Sex, blockSize = 128, layers = c(2, 3), solver = "l-bfgs", maxIter = 100, tol = 0.5, stepSize = 1, seed = 1, initialWeights = c(0, 0, 0, 5, 5, 5, 9, 9, 9))
 ```
 
 To avoid lengthy display, we only present partial results of the model summary. You can check the full result from your sparkR shell.
@@ -630,7 +631,7 @@ options(ops)
 ```
 ```{r}
 # make predictions use the fitted model
-predictions <- predict(model, df)
+predictions <- predict(model, training)
 head(select(predictions, predictions$prediction))
 ```
 
@@ -769,12 +770,13 @@ predictions <- predict(rfModel, df)
 
 `spark.bisectingKmeans` is a kind of [hierarchical clustering](https://en.wikipedia.org/wiki/Hierarchical_clustering) using a divisive (or "top-down") approach: all observations start in one cluster, and splits are performed recursively as one moves down the hierarchy.
 
-```{r, warning=FALSE}
-df <- createDataFrame(iris)
-model <- spark.bisectingKmeans(df, Sepal_Length ~ Sepal_Width, k = 4)
+```{r}
+t <- as.data.frame(Titanic)
+training <- createDataFrame(t)
+model <- spark.bisectingKmeans(training, Class ~ Survived, k = 4)
 summary(model)
-fitted <- predict(model, df)
-head(select(fitted, "Sepal_Length", "prediction"))
+fitted <- predict(model, training)
+head(select(fitted, "Class", "prediction"))
 ```
 
 #### Gaussian Mixture Model
@@ -912,9 +914,10 @@ testSummary
 
 ### Model Persistence
 The following example shows how to save/load an ML model by SparkR.
-```{r, warning=FALSE}
-irisDF <- createDataFrame(iris)
-gaussianGLM <- spark.glm(irisDF, Sepal_Length ~ Sepal_Width + Species, family = "gaussian")
+```{r}
+t <- as.data.frame(Titanic)
+training <- createDataFrame(t)
+gaussianGLM <- spark.glm(training, Freq ~ Sex + Age, family = "gaussian")
 
 # Save and then load a fitted MLlib model
 modelPath <- tempfile(pattern = "ml", fileext = ".tmp")
@@ -925,7 +928,7 @@ gaussianGLM2 <- read.ml(modelPath)
 summary(gaussianGLM2)
 
 # Check model prediction
-gaussianPredictions <- predict(gaussianGLM2, irisDF)
+gaussianPredictions <- predict(gaussianGLM2, training)
 head(gaussianPredictions)
 
 unlink(modelPath)
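Condensed, the persistence example the vignette now uses is the following round trip on the Titanic data (assuming an active session):

```
t <- as.data.frame(Titanic)
training <- createDataFrame(t)
gaussianGLM <- spark.glm(training, Freq ~ Sex + Age, family = "gaussian")

# Save the fitted model, reload it, and predict with the loaded copy
modelPath <- tempfile(pattern = "ml", fileext = ".tmp")
write.ml(gaussianGLM, modelPath)
gaussianGLM2 <- read.ml(modelPath)
head(predict(gaussianGLM2, training))

unlink(modelPath)
```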

R/run-tests.sh

Lines changed: 1 addition & 1 deletion
@@ -23,7 +23,7 @@ FAILED=0
 LOGFILE=$FWDIR/unit-tests.out
 rm -f $LOGFILE
 
-SPARK_TESTING=1 $FWDIR/../bin/spark-submit --driver-java-options "-Dlog4j.configuration=file:$FWDIR/log4j.properties" --conf spark.hadoop.fs.default.name="file:///" $FWDIR/pkg/tests/run-all.R 2>&1 | tee -a $LOGFILE
+SPARK_TESTING=1 $FWDIR/../bin/spark-submit --driver-java-options "-Dlog4j.configuration=file:$FWDIR/log4j.properties" --conf spark.hadoop.fs.defaultFS="file:///" $FWDIR/pkg/tests/run-all.R 2>&1 | tee -a $LOGFILE
 FAILED=$((PIPESTATUS[0]||$FAILED))
 
 NUM_TEST_WARNING="$(grep -c -e 'Warnings ----------------' $LOGFILE)"

appveyor.yml

Lines changed: 1 addition & 1 deletion
@@ -46,7 +46,7 @@ build_script:
 - cmd: mvn -DskipTests -Psparkr -Phive -Phive-thriftserver package
 
 test_script:
-- cmd: .\bin\spark-submit2.cmd --conf spark.hadoop.fs.default.name="file:///" R\pkg\tests\run-all.R
+- cmd: .\bin\spark-submit2.cmd --conf spark.hadoop.fs.defaultFS="file:///" R\pkg\tests\run-all.R
 
 notifications:
 - provider: Email
