
Commit 2d24842

Merge branch 'master' into spark-15735
2 parents: ea89fc7 + 0b8d694

270 files changed: +3304 additions, -1822 deletions


LICENSE

Lines changed: 1 addition & 0 deletions
@@ -296,3 +296,4 @@ The text of each license is also included at licenses/LICENSE-[project].txt.
 (MIT License) blockUI (http://jquery.malsup.com/block/)
 (MIT License) RowsGroup (http://datatables.net/license/mit)
 (MIT License) jsonFormatter (http://www.jqueryscript.net/other/jQuery-Plugin-For-Pretty-JSON-Formatting-jsonFormatter.html)
+(MIT License) modernizr (https://github.com/Modernizr/Modernizr/blob/master/LICENSE)

NOTICE

Lines changed: 1 addition & 1 deletion
@@ -1,5 +1,5 @@
 Apache Spark
-Copyright 2014 The Apache Software Foundation.
+Copyright 2014 and onwards The Apache Software Foundation.
 
 This product includes software developed at
 The Apache Software Foundation (http://www.apache.org/).

R/DOCUMENTATION.md

Lines changed: 6 additions & 6 deletions
@@ -1,12 +1,12 @@
 # SparkR Documentation
 
-SparkR documentation is generated using in-source comments annotated using using
-`roxygen2`. After making changes to the documentation, to generate man pages,
+SparkR documentation is generated by using in-source comments and annotated by using
+[`roxygen2`](https://cran.r-project.org/web/packages/roxygen2/index.html). After making changes to the documentation and generating man pages,
 you can run the following from an R console in the SparkR home directory
-
-library(devtools)
-devtools::document(pkg="./pkg", roclets=c("rd"))
-
+```R
+library(devtools)
+devtools::document(pkg="./pkg", roclets=c("rd"))
+```
 You can verify if your changes are good by running
 
 R CMD check pkg/

R/README.md

Lines changed: 14 additions & 16 deletions
@@ -7,8 +7,7 @@ SparkR is an R package that provides a light-weight frontend to use Spark from R
 Libraries of sparkR need to be created in `$SPARK_HOME/R/lib`. This can be done by running the script `$SPARK_HOME/R/install-dev.sh`.
 By default the above script uses the system wide installation of R. However, this can be changed to any user installed location of R by setting the environment variable `R_HOME` the full path of the base directory where R is installed, before running install-dev.sh script.
 Example:
-
-```
+```bash
 # where /home/username/R is where R is installed and /home/username/R/bin contains the files R and RScript
 export R_HOME=/home/username/R
 ./install-dev.sh
@@ -20,8 +19,8 @@ export R_HOME=/home/username/R
 
 Build Spark with [Maven](http://spark.apache.org/docs/latest/building-spark.html#building-with-buildmvn) and include the `-Psparkr` profile to build the R package. For example to use the default Hadoop versions you can run
 
-```
-build/mvn -DskipTests -Psparkr package
+```bash
+build/mvn -DskipTests -Psparkr package
 ```
 
 #### Running sparkR
@@ -40,9 +39,8 @@ To set other options like driver memory, executor memory etc. you can pass in th
 
 #### Using SparkR from RStudio
 
-If you wish to use SparkR from RStudio or other R frontends you will need to set some environment variables which point SparkR to your Spark installation. For example
-
-```
+If you wish to use SparkR from RStudio or other R frontends you will need to set some environment variables which point SparkR to your Spark installation. For example
+```R
 # Set this to where Spark is installed
 Sys.setenv(SPARK_HOME="/Users/username/spark")
 # This line loads SparkR from the installed directory
@@ -59,25 +57,25 @@ Once you have made your changes, please include unit tests for them and run exis
 
 #### Generating documentation
 
-The SparkR documentation (Rd files and HTML files) are not a part of the source repository. To generate them you can run the script `R/create-docs.sh`. This script uses `devtools` and `knitr` to generate the docs and these packages need to be installed on the machine before using the script.
+The SparkR documentation (Rd files and HTML files) are not a part of the source repository. To generate them you can run the script `R/create-docs.sh`. This script uses `devtools` and `knitr` to generate the docs and these packages need to be installed on the machine before using the script. Also, you may need to install these [prerequisites](https://github.com/apache/spark/tree/master/docs#prerequisites). See also, `R/DOCUMENTATION.md`
 
 ### Examples, Unit tests
 
 SparkR comes with several sample programs in the `examples/src/main/r` directory.
 To run one of them, use `./bin/spark-submit <filename> <args>`. For example:
-
-./bin/spark-submit examples/src/main/r/dataframe.R
-
+```bash
+./bin/spark-submit examples/src/main/r/dataframe.R
+```
 You can also run the unit tests for SparkR by running. You need to install the [testthat](http://cran.r-project.org/web/packages/testthat/index.html) package first:
-
-R -e 'install.packages("testthat", repos="http://cran.us.r-project.org")'
-./R/run-tests.sh
+```bash
+R -e 'install.packages("testthat", repos="http://cran.us.r-project.org")'
+./R/run-tests.sh
+```
 
 ### Running on YARN
 
 The `./bin/spark-submit` can also be used to submit jobs to YARN clusters. You will need to set YARN conf dir before doing so. For example on CDH you can run
-
-```
+```bash
 export YARN_CONF_DIR=/etc/hadoop/conf
 ./bin/spark-submit --master yarn examples/src/main/r/dataframe.R
 ```

R/pkg/R/utils.R

Lines changed: 1 addition & 1 deletion
@@ -489,7 +489,7 @@ processClosure <- function(node, oldEnv, defVars, checkedFuncs, newEnv) {
 # checkedFunc An environment of function objects examined during cleanClosure. It can be
 # considered as a "name"-to-"list of functions" mapping.
 # return value
-# a new version of func that has an correct environment (closure).
+# a new version of func that has a correct environment (closure).
 cleanClosure <- function(func, checkedFuncs = new.env()) {
   if (is.function(func)) {
     newEnv <- new.env(parent = .GlobalEnv)

build/spark-build-info

Lines changed: 38 additions & 0 deletions
@@ -0,0 +1,38 @@
+#!/usr/bin/env bash
+
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# This script generates the build info for spark and places it into the spark-version-info.properties file.
+# Arguments:
+#   build_tgt_directory - The target directory where properties file would be created. [./core/target/extra-resources]
+#   spark_version - The current version of spark
+
+RESOURCE_DIR="$1"
+mkdir -p "$RESOURCE_DIR"
+SPARK_BUILD_INFO="${RESOURCE_DIR}"/spark-version-info.properties
+
+echo_build_properties() {
+  echo version=$1
+  echo user=$USER
+  echo revision=$(git rev-parse HEAD)
+  echo branch=$(git rev-parse --abbrev-ref HEAD)
+  echo date=$(date -u +%Y-%m-%dT%H:%M:%SZ)
+  echo url=$(git config --get remote.origin.url)
+}
+
+echo_build_properties $2 > "$SPARK_BUILD_INFO"
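For reference, the keys the script emits (version, user, revision, branch, date, url) can be read back from the generated properties file. The sketch below is illustrative only: it assumes the file ends up on the runtime classpath (which the core/pom.xml change further down arranges by registering target/extra-resources as a resource directory); the code Spark itself uses to consume the file is not part of this commit, and the class name here is made up.

```java
import java.io.InputStream;
import java.util.Properties;

public class ReadBuildInfo {
    public static void main(String[] args) throws Exception {
        Properties props = new Properties();
        // The file name matches what build/spark-build-info writes.
        try (InputStream in = ReadBuildInfo.class
                .getResourceAsStream("/spark-version-info.properties")) {
            if (in == null) {
                System.err.println("spark-version-info.properties not on classpath");
                return;
            }
            props.load(in);
        }
        // Keys emitted by echo_build_properties in the script above.
        for (String key : new String[] {"version", "user", "revision", "branch", "date", "url"}) {
            System.out.println(key + " = " + props.getProperty(key));
        }
    }
}
```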

common/unsafe/src/main/java/org/apache/spark/unsafe/memory/MemoryBlock.java

Lines changed: 1 addition & 1 deletion
@@ -51,6 +51,6 @@ public long size() {
    * Creates a memory block pointing to the memory used by the long array.
    */
   public static MemoryBlock fromLongArray(final long[] array) {
-    return new MemoryBlock(array, Platform.LONG_ARRAY_OFFSET, array.length * 8);
+    return new MemoryBlock(array, Platform.LONG_ARRAY_OFFSET, array.length * 8L);
   }
 }
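The only change here is the `8L` suffix, which forces the size computation into `long` arithmetic. A small standalone sketch (the array length is a made-up value, chosen only to push the product past the `int` range) shows why the suffix matters:

```java
public class SizeOverflowDemo {
    public static void main(String[] args) {
        int length = 300_000_000; // hypothetical long[] length (~2.4 GB of data)

        // int * int overflows before the result is widened to long.
        long wrong = length * 8;   // wraps around: -1894967296
        // Multiplying by a long literal promotes the expression to long first.
        long right = length * 8L;  // 2400000000

        System.out.println("wrong = " + wrong + ", right = " + right);
    }
}
```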

core/pom.xml

Lines changed: 31 additions & 0 deletions
@@ -337,7 +337,38 @@
   <build>
     <outputDirectory>target/scala-${scala.binary.version}/classes</outputDirectory>
     <testOutputDirectory>target/scala-${scala.binary.version}/test-classes</testOutputDirectory>
+    <resources>
+      <resource>
+        <directory>${project.basedir}/src/main/resources</directory>
+      </resource>
+      <resource>
+        <!-- Include the properties file to provide the build information. -->
+        <directory>${project.build.directory}/extra-resources</directory>
+        <filtering>true</filtering>
+      </resource>
+    </resources>
     <plugins>
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-antrun-plugin</artifactId>
+        <executions>
+          <execution>
+            <phase>generate-resources</phase>
+            <configuration>
+              <!-- Execute the shell script to generate the spark build information. -->
+              <tasks>
+                <exec executable="${project.basedir}/../build/spark-build-info">
+                  <arg value="${project.build.directory}/extra-resources"/>
+                  <arg value="${pom.version}"/>
+                </exec>
+              </tasks>
+            </configuration>
+            <goals>
+              <goal>run</goal>
+            </goals>
+          </execution>
+        </executions>
+      </plugin>
       <plugin>
         <groupId>org.apache.maven.plugins</groupId>
         <artifactId>maven-dependency-plugin</artifactId>

core/src/main/java/org/apache/spark/shuffle/sort/ShuffleInMemorySorter.java

Lines changed: 28 additions & 10 deletions
@@ -22,12 +22,12 @@
 import org.apache.spark.memory.MemoryConsumer;
 import org.apache.spark.unsafe.Platform;
 import org.apache.spark.unsafe.array.LongArray;
+import org.apache.spark.unsafe.memory.MemoryBlock;
 import org.apache.spark.util.collection.Sorter;
 import org.apache.spark.util.collection.unsafe.sort.RadixSort;
 
 final class ShuffleInMemorySorter {
 
-  private final Sorter<PackedRecordPointer, LongArray> sorter;
   private static final class SortComparator implements Comparator<PackedRecordPointer> {
     @Override
     public int compare(PackedRecordPointer left, PackedRecordPointer right) {
@@ -44,6 +44,9 @@ public int compare(PackedRecordPointer left, PackedRecordPointer right) {
    * An array of record pointers and partition ids that have been encoded by
    * {@link PackedRecordPointer}. The sort operates on this array instead of directly manipulating
    * records.
+   *
+   * Only part of the array will be used to store the pointers, the rest part is preserved as
+   * temporary buffer for sorting.
    */
   private LongArray array;
 
@@ -54,14 +57,14 @@ public int compare(PackedRecordPointer left, PackedRecordPointer right) {
   private final boolean useRadixSort;
 
   /**
-   * Set to 2x for radix sort to reserve extra memory for sorting, otherwise 1x.
+   * The position in the pointer array where new records can be inserted.
    */
-  private final int memoryAllocationFactor;
+  private int pos = 0;
 
   /**
-   * The position in the pointer array where new records can be inserted.
+   * How many records could be inserted, because part of the array should be left for sorting.
    */
-  private int pos = 0;
+  private int usableCapacity = 0;
 
   private int initialSize;
 
@@ -70,9 +73,14 @@ public int compare(PackedRecordPointer left, PackedRecordPointer right) {
     assert (initialSize > 0);
     this.initialSize = initialSize;
     this.useRadixSort = useRadixSort;
-    this.memoryAllocationFactor = useRadixSort ? 2 : 1;
     this.array = consumer.allocateArray(initialSize);
-    this.sorter = new Sorter<>(ShuffleSortDataFormat.INSTANCE);
+    this.usableCapacity = getUsableCapacity();
+  }
+
+  private int getUsableCapacity() {
+    // Radix sort requires same amount of used memory as buffer, Tim sort requires
+    // half of the used memory as buffer.
+    return (int) (array.size() / (useRadixSort ? 2 : 1.5));
   }
 
   public void free() {
@@ -89,7 +97,8 @@ public int numRecords() {
   public void reset() {
     if (consumer != null) {
       consumer.freeArray(array);
-      this.array = consumer.allocateArray(initialSize);
+      array = consumer.allocateArray(initialSize);
+      usableCapacity = getUsableCapacity();
     }
     pos = 0;
   }
@@ -101,14 +110,15 @@ public void expandPointerArray(LongArray newArray) {
       array.getBaseOffset(),
       newArray.getBaseObject(),
       newArray.getBaseOffset(),
-      array.size() * (8 / memoryAllocationFactor)
+      pos * 8L
     );
     consumer.freeArray(array);
     array = newArray;
+    usableCapacity = getUsableCapacity();
   }
 
   public boolean hasSpaceForAnotherRecord() {
-    return pos < array.size() / memoryAllocationFactor;
+    return pos < usableCapacity;
   }
 
   public long getMemoryUsage() {
@@ -170,6 +180,14 @@ public ShuffleSorterIterator getSortedIterator() {
         PackedRecordPointer.PARTITION_ID_START_BYTE_INDEX,
         PackedRecordPointer.PARTITION_ID_END_BYTE_INDEX, false, false);
     } else {
+      MemoryBlock unused = new MemoryBlock(
+        array.getBaseObject(),
+        array.getBaseOffset() + pos * 8L,
+        (array.size() - pos) * 8L);
+      LongArray buffer = new LongArray(unused);
+      Sorter<PackedRecordPointer, LongArray> sorter =
+        new Sorter<>(new ShuffleSortDataFormat(buffer));
+
       sorter.sort(array, 0, pos, SORT_COMPARATOR);
     }
     return new ShuffleSorterIterator(pos, array, offset);
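To make the reserved-buffer arithmetic concrete, here is a minimal sketch of the capacity rule introduced in `getUsableCapacity()` above; the 1200-entry array size is an arbitrary example, not a value from the commit:

```java
public class UsableCapacityDemo {
    // Mirrors getUsableCapacity(): radix sort needs a scratch region as large as
    // the used part of the array, Tim sort needs half of the used memory as buffer.
    static int usableCapacity(long arraySize, boolean useRadixSort) {
        return (int) (arraySize / (useRadixSort ? 2 : 1.5));
    }

    public static void main(String[] args) {
        long arraySize = 1200; // hypothetical LongArray size, in 8-byte entries
        System.out.println(usableCapacity(arraySize, true));  // 600 usable, 600 reserved
        System.out.println(usableCapacity(arraySize, false)); // 800 usable, 400 reserved (half of 800)
    }
}
```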

core/src/main/java/org/apache/spark/shuffle/sort/ShuffleSortDataFormat.java

Lines changed: 7 additions & 6 deletions
@@ -19,14 +19,15 @@
 
 import org.apache.spark.unsafe.Platform;
 import org.apache.spark.unsafe.array.LongArray;
-import org.apache.spark.unsafe.memory.MemoryBlock;
 import org.apache.spark.util.collection.SortDataFormat;
 
 final class ShuffleSortDataFormat extends SortDataFormat<PackedRecordPointer, LongArray> {
 
-  public static final ShuffleSortDataFormat INSTANCE = new ShuffleSortDataFormat();
+  private final LongArray buffer;
 
-  private ShuffleSortDataFormat() { }
+  ShuffleSortDataFormat(LongArray buffer) {
+    this.buffer = buffer;
+  }
 
   @Override
   public PackedRecordPointer getKey(LongArray data, int pos) {
@@ -70,8 +71,8 @@ public void copyRange(LongArray src, int srcPos, LongArray dst, int dstPos, int
 
   @Override
   public LongArray allocate(int length) {
-    // This buffer is used temporary (usually small), so it's fine to allocated from JVM heap.
-    return new LongArray(MemoryBlock.fromLongArray(new long[length]));
+    assert (length <= buffer.size()) :
+      "the buffer is smaller than required: " + buffer.size() + " < " + length;
+    return buffer;
   }
-
 }
