
Commit 0bd9f66

Merge branch 'master' into multi-col-string-indexer
2 parents: 66d054a + 2a29a60

482 files changed, +20236 -7759 lines changed

.travis.yml

Lines changed: 1 addition & 1 deletion
@@ -43,7 +43,7 @@ notifications:
 # 5. Run maven install before running lint-java.
 install:
   - export MAVEN_SKIP_RC=1
-  - build/mvn -T 4 -q -DskipTests -Pmesos -Pyarn -Pkinesis-asl -Phive -Phive-thriftserver install
+  - build/mvn -T 4 -q -DskipTests -Pkubernetes -Pmesos -Pyarn -Pkinesis-asl -Phive -Phive-thriftserver install
 
 # 6. Run lint-java.
 script:

NOTICE

Lines changed: 6 additions & 0 deletions
@@ -448,6 +448,12 @@ Copyright (C) 2011 Google Inc.
 Apache Commons Pool
 Copyright 1999-2009 The Apache Software Foundation
 
+This product includes/uses Kubernetes & OpenShift 3 Java Client (https://github.com/fabric8io/kubernetes-client)
+Copyright (C) 2015 Red Hat, Inc.
+
+This product includes/uses OkHttp (https://github.com/square/okhttp)
+Copyright (C) 2012 The Android Open Source Project
+
 =========================================================================
 == NOTICE file corresponding to section 4(d) of the Apache License, ==
 == Version 2.0, in this case for the DataNucleus distribution. ==
R/pkg/tests/fulltests/test_sparkSQL.R

Lines changed: 148 additions & 119 deletions
@@ -3021,41 +3021,54 @@ test_that("dapplyCollect() on DataFrame with a binary column", {
 })
 
 test_that("repartition by columns on DataFrame", {
-  df <- createDataFrame(
-    list(list(1L, 1, "1", 0.1), list(1L, 2, "2", 0.2), list(3L, 3, "3", 0.3)),
-    c("a", "b", "c", "d"))
-
-  # no column and number of partitions specified
-  retError <- tryCatch(repartition(df), error = function(e) e)
-  expect_equal(grepl
-    ("Please, specify the number of partitions and/or a column\\(s\\)", retError), TRUE)
-
-  # repartition by column and number of partitions
-  actual <- repartition(df, 3, col = df$"a")
-
-  # Checking that at least the dimensions are identical
-  expect_identical(dim(df), dim(actual))
-  expect_equal(getNumPartitions(actual), 3L)
-
-  # repartition by number of partitions
-  actual <- repartition(df, 13L)
-  expect_identical(dim(df), dim(actual))
-  expect_equal(getNumPartitions(actual), 13L)
-
-  expect_equal(getNumPartitions(coalesce(actual, 1L)), 1L)
-
-  # a test case with a column and dapply
-  schema <- structType(structField("a", "integer"), structField("avg", "double"))
-  df <- repartition(df, col = df$"a")
-  df1 <- dapply(
-    df,
-    function(x) {
-      y <- (data.frame(x$a[1], mean(x$b)))
-    },
-    schema)
+  # The tasks here launch R workers with shuffles. So, we decrease the number of shuffle
+  # partitions to reduce the number of the tasks to speed up the test. This is particularly
+  # slow on Windows because the R workers are unable to be forked. See also SPARK-21693.
+  conf <- callJMethod(sparkSession, "conf")
+  shufflepartitionsvalue <- callJMethod(conf, "get", "spark.sql.shuffle.partitions")
+  callJMethod(conf, "set", "spark.sql.shuffle.partitions", "5")
+  tryCatch({
+    df <- createDataFrame(
+      list(list(1L, 1, "1", 0.1), list(1L, 2, "2", 0.2), list(3L, 3, "3", 0.3)),
+      c("a", "b", "c", "d"))
+
+    # no column and number of partitions specified
+    retError <- tryCatch(repartition(df), error = function(e) e)
+    expect_equal(grepl
+      ("Please, specify the number of partitions and/or a column\\(s\\)", retError), TRUE)
+
+    # repartition by column and number of partitions
+    actual <- repartition(df, 3, col = df$"a")
+
+    # Checking that at least the dimensions are identical
+    expect_identical(dim(df), dim(actual))
+    expect_equal(getNumPartitions(actual), 3L)
+
+    # repartition by number of partitions
+    actual <- repartition(df, 13L)
+    expect_identical(dim(df), dim(actual))
+    expect_equal(getNumPartitions(actual), 13L)
+
+    expect_equal(getNumPartitions(coalesce(actual, 1L)), 1L)
+
+    # a test case with a column and dapply
+    schema <- structType(structField("a", "integer"), structField("avg", "double"))
+    df <- repartition(df, col = df$"a")
+
+    df1 <- dapply(
+      df,
+      function(x) {
+        y <- (data.frame(x$a[1], mean(x$b)))
+      },
+      schema)
 
-  # Number of partitions is equal to 2
-  expect_equal(nrow(df1), 2)
+    # Number of partitions is equal to 2
+    expect_equal(nrow(df1), 2)
+  },
+  finally = {
+    # Resetting the conf back to default value
+    callJMethod(conf, "set", "spark.sql.shuffle.partitions", shufflepartitionsvalue)
+  })
 })
 
 test_that("coalesce, repartition, numPartitions", {
@@ -3078,101 +3091,117 @@ test_that("coalesce, repartition, numPartitions", {
 })
 
 test_that("gapply() and gapplyCollect() on a DataFrame", {
-  df <- createDataFrame(
-    list(list(1L, 1, "1", 0.1), list(1L, 2, "1", 0.2), list(3L, 3, "3", 0.3)),
-    c("a", "b", "c", "d"))
-  expected <- collect(df)
-  df1 <- gapply(df, "a", function(key, x) { x }, schema(df))
-  actual <- collect(df1)
-  expect_identical(actual, expected)
-
-  df1Collect <- gapplyCollect(df, list("a"), function(key, x) { x })
-  expect_identical(df1Collect, expected)
-
-  # gapply on empty grouping columns.
-  df1 <- gapply(df, c(), function(key, x) { x }, schema(df))
-  actual <- collect(df1)
-  expect_identical(actual, expected)
-
-  # Computes the sum of second column by grouping on the first and third columns
-  # and checks if the sum is larger than 2
-  schemas <- list(structType(structField("a", "integer"), structField("e", "boolean")),
-                  "a INT, e BOOLEAN")
-  for (schema in schemas) {
-    df2 <- gapply(
+  # The tasks here launch R workers with shuffles. So, we decrease the number of shuffle
+  # partitions to reduce the number of the tasks to speed up the test. This is particularly
+  # slow on Windows because the R workers are unable to be forked. See also SPARK-21693.
+  conf <- callJMethod(sparkSession, "conf")
+  shufflepartitionsvalue <- callJMethod(conf, "get", "spark.sql.shuffle.partitions")
+  # TODO: Lower number of 'spark.sql.shuffle.partitions' causes test failures
+  # for an unknown reason. Probably we should fix it.
+  callJMethod(conf, "set", "spark.sql.shuffle.partitions", "16")
+  tryCatch({
+    df <- createDataFrame(
+      list(list(1L, 1, "1", 0.1), list(1L, 2, "1", 0.2), list(3L, 3, "3", 0.3)),
+      c("a", "b", "c", "d"))
+    expected <- collect(df)
+    df1 <- gapply(df, "a", function(key, x) { x }, schema(df))
+    actual <- collect(df1)
+    expect_identical(actual, expected)
+
+    df1Collect <- gapplyCollect(df, list("a"), function(key, x) { x })
+    expect_identical(df1Collect, expected)
+
+    # gapply on empty grouping columns.
+    df1 <- gapply(df, c(), function(key, x) { x }, schema(df))
+    actual <- collect(df1)
+    expect_identical(actual, expected)
+
+    # Computes the sum of second column by grouping on the first and third columns
+    # and checks if the sum is larger than 2
+    schemas <- list(structType(structField("a", "integer"), structField("e", "boolean")),
+                    "a INT, e BOOLEAN")
+    for (schema in schemas) {
+      df2 <- gapply(
+        df,
+        c(df$"a", df$"c"),
+        function(key, x) {
+          y <- data.frame(key[1], sum(x$b) > 2)
+        },
+        schema)
+      actual <- collect(df2)$e
+      expected <- c(TRUE, TRUE)
+      expect_identical(actual, expected)
+
+      df2Collect <- gapplyCollect(
+        df,
+        c(df$"a", df$"c"),
+        function(key, x) {
+          y <- data.frame(key[1], sum(x$b) > 2)
+          colnames(y) <- c("a", "e")
+          y
+        })
+      actual <- df2Collect$e
+      expect_identical(actual, expected)
+    }
+
+    # Computes the arithmetic mean of the second column by grouping
+    # on the first and third columns. Output the groupping value and the average.
+    schema <- structType(structField("a", "integer"), structField("c", "string"),
+                         structField("avg", "double"))
+    df3 <- gapply(
       df,
-      c(df$"a", df$"c"),
+      c("a", "c"),
       function(key, x) {
-        y <- data.frame(key[1], sum(x$b) > 2)
+        y <- data.frame(key, mean(x$b), stringsAsFactors = FALSE)
       },
       schema)
-    actual <- collect(df2)$e
-    expected <- c(TRUE, TRUE)
+    actual <- collect(df3)
+    actual <- actual[order(actual$a), ]
+    rownames(actual) <- NULL
+    expected <- collect(select(df, "a", "b", "c"))
+    expected <- data.frame(aggregate(expected$b, by = list(expected$a, expected$c), FUN = mean))
+    colnames(expected) <- c("a", "c", "avg")
+    expected <- expected[order(expected$a), ]
+    rownames(expected) <- NULL
     expect_identical(actual, expected)
 
-    df2Collect <- gapplyCollect(
+    df3Collect <- gapplyCollect(
       df,
-      c(df$"a", df$"c"),
+      c("a", "c"),
       function(key, x) {
-        y <- data.frame(key[1], sum(x$b) > 2)
-        colnames(y) <- c("a", "e")
+        y <- data.frame(key, mean(x$b), stringsAsFactors = FALSE)
+        colnames(y) <- c("a", "c", "avg")
        y
       })
-    actual <- df2Collect$e
-    expect_identical(actual, expected)
-  }
-
-  # Computes the arithmetic mean of the second column by grouping
-  # on the first and third columns. Output the groupping value and the average.
-  schema <- structType(structField("a", "integer"), structField("c", "string"),
-                       structField("avg", "double"))
-  df3 <- gapply(
-    df,
-    c("a", "c"),
-    function(key, x) {
-      y <- data.frame(key, mean(x$b), stringsAsFactors = FALSE)
-    },
-    schema)
-  actual <- collect(df3)
-  actual <- actual[order(actual$a), ]
-  rownames(actual) <- NULL
-  expected <- collect(select(df, "a", "b", "c"))
-  expected <- data.frame(aggregate(expected$b, by = list(expected$a, expected$c), FUN = mean))
-  colnames(expected) <- c("a", "c", "avg")
-  expected <- expected[order(expected$a), ]
-  rownames(expected) <- NULL
-  expect_identical(actual, expected)
-
-  df3Collect <- gapplyCollect(
-    df,
-    c("a", "c"),
-    function(key, x) {
-      y <- data.frame(key, mean(x$b), stringsAsFactors = FALSE)
-      colnames(y) <- c("a", "c", "avg")
-      y
-    })
-  actual <- df3Collect[order(df3Collect$a), ]
-  expect_identical(actual$avg, expected$avg)
-
-  irisDF <- suppressWarnings(createDataFrame(iris))
-  schema <- structType(structField("Sepal_Length", "double"), structField("Avg", "double"))
-  # Groups by `Sepal_Length` and computes the average for `Sepal_Width`
-  df4 <- gapply(
-    cols = "Sepal_Length",
-    irisDF,
-    function(key, x) {
-      y <- data.frame(key, mean(x$Sepal_Width), stringsAsFactors = FALSE)
-    },
-    schema)
-  actual <- collect(df4)
-  actual <- actual[order(actual$Sepal_Length), ]
-  rownames(actual) <- NULL
-  agg_local_df <- data.frame(aggregate(iris$Sepal.Width, by = list(iris$Sepal.Length), FUN = mean),
-                             stringsAsFactors = FALSE)
-  colnames(agg_local_df) <- c("Sepal_Length", "Avg")
-  expected <- agg_local_df[order(agg_local_df$Sepal_Length), ]
-  rownames(expected) <- NULL
-  expect_identical(actual, expected)
+    actual <- df3Collect[order(df3Collect$a), ]
+    expect_identical(actual$avg, expected$avg)
+
+    irisDF <- suppressWarnings(createDataFrame(iris))
+    schema <- structType(structField("Sepal_Length", "double"), structField("Avg", "double"))
+    # Groups by `Sepal_Length` and computes the average for `Sepal_Width`
+    df4 <- gapply(
+      cols = "Sepal_Length",
+      irisDF,
+      function(key, x) {
+        y <- data.frame(key, mean(x$Sepal_Width), stringsAsFactors = FALSE)
+      },
+      schema)
+    actual <- collect(df4)
+    actual <- actual[order(actual$Sepal_Length), ]
+    rownames(actual) <- NULL
+    agg_local_df <- data.frame(aggregate(iris$Sepal.Width,
+                                         by = list(iris$Sepal.Length),
+                                         FUN = mean),
+                               stringsAsFactors = FALSE)
+    colnames(agg_local_df) <- c("Sepal_Length", "Avg")
+    expected <- agg_local_df[order(agg_local_df$Sepal_Length), ]
+    rownames(expected) <- NULL
+    expect_identical(actual, expected)
+  },
+  finally = {
+    # Resetting the conf back to default value
+    callJMethod(conf, "set", "spark.sql.shuffle.partitions", shufflepartitionsvalue)
+  })
 })
 
 test_that("Window functions on a DataFrame", {
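
The recurring change in this file is the same pattern in both tests: read the current value of spark.sql.shuffle.partitions, lower it for the duration of the test body, and restore it in a finally handler so later tests are unaffected. A minimal sketch of that pattern, assuming a running SparkR test session where the internal sparkSession JVM handle and callJMethod() are available (as they are inside these tests); old_value is an illustrative local name:

    # Save the current shuffle-partition setting via the JVM SQLConf handle.
    conf <- callJMethod(sparkSession, "conf")
    old_value <- callJMethod(conf, "get", "spark.sql.shuffle.partitions")
    # Use fewer partitions so fewer R workers are launched per shuffle.
    callJMethod(conf, "set", "spark.sql.shuffle.partitions", "5")
    tryCatch({
      # ... test body that triggers shuffles (repartition/dapply/gapply) ...
    },
    finally = {
      # Always restore the original value, even if an expectation fails.
      callJMethod(conf, "set", "spark.sql.shuffle.partitions", old_value)
    })

The tryCatch(..., finally = ...) wrapper is what guarantees the restore runs even when an expect_* call errors out mid-test.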

appveyor.yml

Lines changed: 1 addition & 0 deletions
@@ -33,6 +33,7 @@ only_commits:
     - core/src/main/scala/org/apache/spark/api/r/
     - mllib/src/main/scala/org/apache/spark/ml/r/
     - core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala
+    - bin/*.cmd
 
 cache:
   - C:\Users\appveyor\.m2

assembly/pom.xml

Lines changed: 10 additions & 0 deletions
@@ -148,6 +148,16 @@
         </dependency>
       </dependencies>
     </profile>
+    <profile>
+      <id>kubernetes</id>
+      <dependencies>
+        <dependency>
+          <groupId>org.apache.spark</groupId>
+          <artifactId>spark-kubernetes_${scala.binary.version}</artifactId>
+          <version>${project.version}</version>
+        </dependency>
+      </dependencies>
+    </profile>
     <profile>
       <id>hive</id>
       <dependencies>
bin/find-spark-home.cmd

Lines changed: 60 additions & 0 deletions
@@ -0,0 +1,60 @@
+@echo off
+
+rem
+rem Licensed to the Apache Software Foundation (ASF) under one or more
+rem contributor license agreements. See the NOTICE file distributed with
+rem this work for additional information regarding copyright ownership.
+rem The ASF licenses this file to You under the Apache License, Version 2.0
+rem (the "License"); you may not use this file except in compliance with
+rem the License. You may obtain a copy of the License at
+rem
+rem    http://www.apache.org/licenses/LICENSE-2.0
+rem
+rem Unless required by applicable law or agreed to in writing, software
+rem distributed under the License is distributed on an "AS IS" BASIS,
+rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+rem See the License for the specific language governing permissions and
+rem limitations under the License.
+rem
+
+rem Path to Python script finding SPARK_HOME
+set FIND_SPARK_HOME_PYTHON_SCRIPT=%~dp0find_spark_home.py
+
+rem Default to standard python interpreter unless told otherwise
+set PYTHON_RUNNER=python
+rem If PYSPARK_DRIVER_PYTHON is set, it overwrites the python version
+if not "x%PYSPARK_DRIVER_PYTHON%"=="x" (
+  set PYTHON_RUNNER=%PYSPARK_DRIVER_PYTHON%
+)
+rem If PYSPARK_PYTHON is set, it overwrites the python version
+if not "x%PYSPARK_PYTHON%"=="x" (
+  set PYTHON_RUNNER=%PYSPARK_PYTHON%
+)
+
+rem If there is python installed, trying to use the root dir as SPARK_HOME
+where %PYTHON_RUNNER% > nul 2>&1
+if %ERRORLEVEL% neq 0 (
+  if not exist %PYTHON_RUNNER% (
+    if "x%SPARK_HOME%"=="x" (
+      echo Missing Python executable '%PYTHON_RUNNER%', defaulting to '%~dp0..' for SPARK_HOME ^
+environment variable. Please install Python or specify the correct Python executable in ^
+PYSPARK_DRIVER_PYTHON or PYSPARK_PYTHON environment variable to detect SPARK_HOME safely.
+      set SPARK_HOME=%~dp0..
+    )
+  )
+)
+
+rem Only attempt to find SPARK_HOME if it is not set.
+if "x%SPARK_HOME%"=="x" (
+  if not exist "%FIND_SPARK_HOME_PYTHON_SCRIPT%" (
+    rem If we are not in the same directory as find_spark_home.py we are not pip installed so we don't
+    rem need to search the different Python directories for a Spark installation.
+    rem Note only that, if the user has pip installed PySpark but is directly calling pyspark-shell or
+    rem spark-submit in another directory we want to use that version of PySpark rather than the
+    rem pip installed version of PySpark.
+    set SPARK_HOME=%~dp0..
+  ) else (
+    rem We are pip installed, use the Python script to resolve a reasonable SPARK_HOME
+    for /f "delims=" %%i in ('%PYTHON_RUNNER% %FIND_SPARK_HOME_PYTHON_SCRIPT%') do set SPARK_HOME=%%i
+  )
+)
