
Commit a802ca8

Merge remote branch 'upstream/master' into patch-5
2 parents: 5e374c7 + 5fa0a05

File tree: 15 files changed (173 additions, 123 deletions)

bin/compute-classpath.sh

Lines changed: 4 additions & 4 deletions
@@ -81,10 +81,10 @@ ASSEMBLY_JAR=$(ls "$assembly_folder"/spark-assembly*hadoop*.jar 2>/dev/null)
 # Verify that versions of java used to build the jars and run Spark are compatible
 jar_error_check=$("$JAR_CMD" -tf "$ASSEMBLY_JAR" nonexistent/class/path 2>&1)
 if [[ "$jar_error_check" =~ "invalid CEN header" ]]; then
-  echo "Loading Spark jar with '$JAR_CMD' failed. "
-  echo "This is likely because Spark was compiled with Java 7 and run "
-  echo "with Java 6. (see SPARK-1703). Please use Java 7 to run Spark "
-  echo "or build Spark with Java 6."
+  echo "Loading Spark jar with '$JAR_CMD' failed. " 1>&2
+  echo "This is likely because Spark was compiled with Java 7 and run " 1>&2
+  echo "with Java 6. (see SPARK-1703). Please use Java 7 to run Spark " 1>&2
+  echo "or build Spark with Java 6." 1>&2
   exit 1
 fi

bin/pyspark

Lines changed: 3 additions & 3 deletions
@@ -26,7 +26,7 @@ export SPARK_HOME="$FWDIR"
 SCALA_VERSION=2.10
 
 if [[ "$@" = *--help ]] || [[ "$@" = *-h ]]; then
-  echo "Usage: ./bin/pyspark [options]"
+  echo "Usage: ./bin/pyspark [options]" 1>&2
   $FWDIR/bin/spark-submit --help 2>&1 | grep -v Usage 1>&2
   exit 0
 fi
@@ -36,8 +36,8 @@ if [ ! -f "$FWDIR/RELEASE" ]; then
   # Exit if the user hasn't compiled Spark
   ls "$FWDIR"/assembly/target/scala-$SCALA_VERSION/spark-assembly*hadoop*.jar >& /dev/null
   if [[ $? != 0 ]]; then
-    echo "Failed to find Spark assembly in $FWDIR/assembly/target" >&2
-    echo "You need to build Spark before running this program" >&2
+    echo "Failed to find Spark assembly in $FWDIR/assembly/target" 1>&2
+    echo "You need to build Spark before running this program" 1>&2
     exit 1
   fi
 fi

bin/run-example

Lines changed: 5 additions & 5 deletions
@@ -27,9 +27,9 @@ if [ -n "$1" ]; then
   EXAMPLE_CLASS="$1"
   shift
 else
-  echo "Usage: ./bin/run-example <example-class> [example-args]"
-  echo "  - set MASTER=XX to use a specific master"
-  echo "  - can use abbreviated example class name (e.g. SparkPi, mllib.LinearRegression)"
+  echo "Usage: ./bin/run-example <example-class> [example-args]" 1>&2
+  echo "  - set MASTER=XX to use a specific master" 1>&2
+  echo "  - can use abbreviated example class name (e.g. SparkPi, mllib.LinearRegression)" 1>&2
   exit 1
 fi
@@ -40,8 +40,8 @@ elif [ -e "$EXAMPLES_DIR"/target/scala-$SCALA_VERSION/spark-examples-*hadoop*.ja
 fi
 
 if [[ -z $SPARK_EXAMPLES_JAR ]]; then
-  echo "Failed to find Spark examples assembly in $FWDIR/lib or $FWDIR/examples/target" >&2
-  echo "You need to build Spark before running this program" >&2
+  echo "Failed to find Spark examples assembly in $FWDIR/lib or $FWDIR/examples/target" 1>&2
+  echo "You need to build Spark before running this program" 1>&2
   exit 1
 fi

bin/spark-class

Lines changed: 6 additions & 7 deletions
@@ -33,13 +33,13 @@ export SPARK_HOME="$FWDIR"
 . $FWDIR/bin/load-spark-env.sh
 
 if [ -z "$1" ]; then
-  echo "Usage: spark-class <class> [<args>]" >&2
+  echo "Usage: spark-class <class> [<args>]" 1>&2
   exit 1
 fi
 
 if [ -n "$SPARK_MEM" ]; then
-  echo "Warning: SPARK_MEM is deprecated, please use a more specific config option"
-  echo "(e.g., spark.executor.memory or SPARK_DRIVER_MEMORY)."
+  echo -e "Warning: SPARK_MEM is deprecated, please use a more specific config option" 1>&2
+  echo -e "(e.g., spark.executor.memory or SPARK_DRIVER_MEMORY)." 1>&2
 fi
 
 # Use SPARK_MEM or 512m as the default memory, to be overridden by specific options
@@ -147,10 +147,9 @@ fi
 export CLASSPATH
 
 if [ "$SPARK_PRINT_LAUNCH_COMMAND" == "1" ]; then
-  echo -n "Spark Command: "
-  echo "$RUNNER" -cp "$CLASSPATH" $JAVA_OPTS "$@"
-  echo "========================================"
-  echo
+  echo -n "Spark Command: " 1>&2
+  echo "$RUNNER" -cp "$CLASSPATH" $JAVA_OPTS "$@" 1>&2
+  echo -e "========================================\n" 1>&2
 fi
 
 exec "$RUNNER" -cp "$CLASSPATH" $JAVA_OPTS "$@"
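The common thread in these four script changes is that usage messages, warnings, and the printed launch command now go to stderr (`1>&2`), leaving stdout for output that callers capture (for example, the classpath that spark-class reads from compute-classpath.sh). A minimal Scala sketch of the same stdout/stderr separation; the tool name and paths are invented for illustration:

object StdErrSketch {
  def main(args: Array[String]): Unit = {
    // Human-facing diagnostics go to stderr so they never pollute captured stdout.
    Console.err.println("Usage: my-tool <class> [<args>]")
    // Machine-consumable output goes to stdout, e.g. for OUT=$(my-tool ...) in a shell.
    println("/tmp/example:classpath")
  }
}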

core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala

Lines changed: 2 additions & 2 deletions
@@ -141,8 +141,8 @@ class HadoopRDD[K, V](
     // local process. The local cache is accessed through HadoopRDD.putCachedMetadata().
     // The caching helps minimize GC, since a JobConf can contain ~10KB of temporary objects.
     // synchronize to prevent ConcurrentModificationException (Spark-1097, Hadoop-10456)
-    broadcastedConf.synchronized {
-      val newJobConf = new JobConf(broadcastedConf.value.value)
+    conf.synchronized {
+      val newJobConf = new JobConf(conf)
       initLocalJobConfFuncOpt.map(f => f(newJobConf))
       HadoopRDD.putCachedMetadata(jobConfCacheKey, newJobConf)
       newJobConf
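The HadoopRDD change takes the lock on the `conf` object itself while copying it into a fresh `JobConf`, because a Hadoop `Configuration` is not safe to iterate while another thread mutates it (the ConcurrentModificationException referenced above as SPARK-1097 / HADOOP-10456). A minimal sketch of that locking pattern, assuming Hadoop on the classpath and a hypothetical `sharedConf` shared across tasks in one JVM:

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.mapred.JobConf

object JobConfCloneSketch {
  // Hypothetical Configuration shared by many tasks running in the same JVM.
  val sharedConf = new Configuration()

  def cloneJobConf(): JobConf =
    // The JobConf copy constructor iterates over sharedConf's entries, so take
    // the same lock that any writer of sharedConf would hold while mutating it.
    sharedConf.synchronized {
      new JobConf(sharedConf)
    }
}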

core/src/main/scala/org/apache/spark/storage/DiskBlockManager.scala

Lines changed: 11 additions & 5 deletions
@@ -44,6 +44,10 @@ private[spark] class DiskBlockManager(shuffleManager: ShuffleBlockManager, rootD
    * directory, create multiple subdirectories that we will hash files into, in order to avoid
    * having really large inodes at the top level. */
   private val localDirs: Array[File] = createLocalDirs()
+  if (localDirs.isEmpty) {
+    logError("Failed to create any local dir.")
+    System.exit(ExecutorExitCode.DISK_STORE_FAILED_TO_CREATE_DIR)
+  }
   private val subDirs = Array.fill(localDirs.length)(new Array[File](subDirsPerLocalDir))
   private var shuffleSender : ShuffleSender = null
@@ -116,7 +120,7 @@ private[spark] class DiskBlockManager(shuffleManager: ShuffleBlockManager, rootD
   private def createLocalDirs(): Array[File] = {
     logDebug(s"Creating local directories at root dirs '$rootDirs'")
     val dateFormat = new SimpleDateFormat("yyyyMMddHHmmss")
-    rootDirs.split(",").map { rootDir =>
+    rootDirs.split(",").flatMap { rootDir =>
       var foundLocalDir = false
       var localDir: File = null
       var localDirId: String = null
@@ -136,11 +140,13 @@ private[spark] class DiskBlockManager(shuffleManager: ShuffleBlockManager, rootD
         }
       }
       if (!foundLocalDir) {
-        logError(s"Failed $MAX_DIR_CREATION_ATTEMPTS attempts to create local dir in $rootDir")
-        System.exit(ExecutorExitCode.DISK_STORE_FAILED_TO_CREATE_DIR)
+        logError(s"Failed $MAX_DIR_CREATION_ATTEMPTS attempts to create local dir in $rootDir." +
+          " Ignoring this directory.")
+        None
+      } else {
+        logInfo(s"Created local directory at $localDir")
+        Some(localDir)
       }
-      logInfo(s"Created local directory at $localDir")
-      localDir
     }
   }
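With `map` replaced by `flatMap`, a root directory that cannot be created is now logged and skipped instead of terminating the executor; the hard failure is reserved for the case where `localDirs` ends up empty. A self-contained sketch of this `Option`/`flatMap` pattern, using a hypothetical `tryCreateDir` helper in place of the retry loop:

import java.io.File

object LocalDirSketch {
  // Hypothetical helper: try to create one directory under the given root,
  // returning None on failure instead of exiting the JVM.
  def tryCreateDir(root: String): Option[File] = {
    val dir = new File(root, "example-local-dir")
    if (dir.isDirectory || dir.mkdirs()) Some(dir) else None
  }

  def createLocalDirs(rootDirs: String): Array[File] =
    // flatMap drops the Nones, so one unusable root is skipped rather than fatal;
    // the caller can still fail fast if the resulting array is empty.
    rootDirs.split(",").flatMap(root => tryCreateDir(root))
}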

core/src/main/scala/org/apache/spark/util/collection/ExternalAppendOnlyMap.scala

Lines changed: 22 additions & 8 deletions
@@ -252,7 +252,7 @@ class ExternalAppendOnlyMap[K, V, C](
       if (it.hasNext) {
         var kc = it.next()
         kcPairs += kc
-        val minHash = kc._1.hashCode()
+        val minHash = getKeyHashCode(kc)
         while (it.hasNext && it.head._1.hashCode() == minHash) {
           kc = it.next()
           kcPairs += kc
@@ -294,8 +294,9 @@ class ExternalAppendOnlyMap[K, V, C](
       // Select a key from the StreamBuffer that holds the lowest key hash
       val minBuffer = mergeHeap.dequeue()
       val (minPairs, minHash) = (minBuffer.pairs, minBuffer.minKeyHash)
-      var (minKey, minCombiner) = minPairs.remove(0)
-      assert(minKey.hashCode() == minHash)
+      val minPair = minPairs.remove(0)
+      var (minKey, minCombiner) = minPair
+      assert(getKeyHashCode(minPair) == minHash)
 
       // For all other streams that may have this key (i.e. have the same minimum key hash),
       // merge in the corresponding value (if any) from that stream
@@ -327,15 +328,16 @@ class ExternalAppendOnlyMap[K, V, C](
    * StreamBuffers are ordered by the minimum key hash found across all of their own pairs.
    */
   private class StreamBuffer(
-      val iterator: BufferedIterator[(K, C)], val pairs: ArrayBuffer[(K, C)])
+      val iterator: BufferedIterator[(K, C)],
+      val pairs: ArrayBuffer[(K, C)])
     extends Comparable[StreamBuffer] {
 
     def isEmpty = pairs.length == 0
 
     // Invalid if there are no more pairs in this stream
-    def minKeyHash = {
+    def minKeyHash: Int = {
       assert(pairs.length > 0)
-      pairs.head._1.hashCode()
+      getKeyHashCode(pairs.head)
     }
 
     override def compareTo(other: StreamBuffer): Int = {
@@ -422,10 +424,22 @@ class ExternalAppendOnlyMap[K, V, C](
 }
 
 private[spark] object ExternalAppendOnlyMap {
+
+  /**
+   * Return the key hash code of the given (key, combiner) pair.
+   * If the key is null, return a special hash code.
+   */
+  private def getKeyHashCode[K, C](kc: (K, C)): Int = {
+    if (kc._1 == null) 0 else kc._1.hashCode()
+  }
+
+  /**
+   * A comparator for (key, combiner) pairs based on their key hash codes.
+   */
   private class KCComparator[K, C] extends Comparator[(K, C)] {
     def compare(kc1: (K, C), kc2: (K, C)): Int = {
-      val hash1 = kc1._1.hashCode()
-      val hash2 = kc2._1.hashCode()
+      val hash1 = getKeyHashCode(kc1)
+      val hash2 = getKeyHashCode(kc2)
       if (hash1 < hash2) -1 else if (hash1 == hash2) 0 else 1
     }
   }
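The `getKeyHashCode` helper exists because a spilled (key, combiner) pair may now carry a null key, and calling `hashCode()` on it directly would raise a NullPointerException while merging spilled streams. A tiny standalone illustration of the same null-safe hashing; the object and method names are invented for the example:

object NullSafeHashSketch {
  // Mirrors the helper added above: a null key maps to the sentinel hash 0.
  def keyHash[K, C](kc: (K, C)): Int =
    if (kc._1 == null) 0 else kc._1.hashCode()

  def main(args: Array[String]): Unit = {
    val pairs = Seq(("b", 2), (null.asInstanceOf[String], 3), ("a", 1))
    // Ordering by the null-safe hash tolerates the null key, where
    // kc._1.hashCode() would throw a NullPointerException.
    println(pairs.sortBy(kc => keyHash(kc)))
  }
}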

core/src/test/scala/org/apache/spark/util/collection/ExternalAppendOnlyMapSuite.scala

Lines changed: 24 additions & 3 deletions
@@ -334,8 +334,8 @@ class ExternalAppendOnlyMapSuite extends FunSuite with LocalSparkContext {
     conf.set("spark.shuffle.memoryFraction", "0.001")
     sc = new SparkContext("local-cluster[1,1,512]", "test", conf)
 
-    val map = new ExternalAppendOnlyMap[Int, Int, ArrayBuffer[Int]](createCombiner,
-      mergeValue, mergeCombiners)
+    val map = new ExternalAppendOnlyMap[Int, Int, ArrayBuffer[Int]](
+      createCombiner, mergeValue, mergeCombiners)
 
     (1 to 100000).foreach { i => map.insert(i, i) }
     map.insert(Int.MaxValue, Int.MaxValue)
@@ -346,11 +346,32 @@ class ExternalAppendOnlyMapSuite extends FunSuite with LocalSparkContext {
       it.next()
     }
   }
+
+  test("spilling with null keys and values") {
+    val conf = new SparkConf(true)
+    conf.set("spark.shuffle.memoryFraction", "0.001")
+    sc = new SparkContext("local-cluster[1,1,512]", "test", conf)
+
+    val map = new ExternalAppendOnlyMap[Int, Int, ArrayBuffer[Int]](
+      createCombiner, mergeValue, mergeCombiners)
+
+    (1 to 100000).foreach { i => map.insert(i, i) }
+    map.insert(null.asInstanceOf[Int], 1)
+    map.insert(1, null.asInstanceOf[Int])
+    map.insert(null.asInstanceOf[Int], null.asInstanceOf[Int])
+
+    val it = map.iterator
+    while (it.hasNext) {
+      // Should not throw NullPointerException
+      it.next()
+    }
+  }
+
 }
 
 /**
  * A dummy class that always returns the same hash code, to easily test hash collisions
  */
-case class FixedHashObject(val v: Int, val h: Int) extends Serializable {
+case class FixedHashObject(v: Int, h: Int) extends Serializable {
   override def hashCode(): Int = h
 }

docs/streaming-programming-guide.md

Lines changed: 2 additions & 2 deletions
@@ -148,7 +148,7 @@ import org.apache.spark.streaming.*;
 import org.apache.spark.streaming.api.java.*;
 import scala.Tuple2;
 // Create a StreamingContext with a local master
-JavaStreamingContext jssc = new JavaStreamingContext("local", "JavaNetworkWordCount", new Duration(1000))
+JavaStreamingContext jssc = new JavaStreamingContext("local[2]", "JavaNetworkWordCount", new Duration(1000))
 {% endhighlight %}
 
 Using this context, we then create a new DStream
@@ -216,7 +216,7 @@ jssc.awaitTermination(); // Wait for the computation to terminate
 {% endhighlight %}
 
 The complete code can be found in the Spark Streaming example
-[JavaNetworkWordCount]({{site.SPARK_GITHUB_URL}}/blob/master/examples/src/main/java/index.html?org/apache/spark/examples/streaming/JavaNetworkWordCount.java).
+[JavaNetworkWordCount]({{site.SPARK_GITHUB_URL}}/blob/master/examples/src/main/java/org/apache/spark/examples/streaming/JavaNetworkWordCount.java).
 <br>
 
 </div>
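The switch from "local" to "local[2]" matters because a streaming receiver occupies one of the local threads; with a single thread there is nothing left to process the received data. A rough Scala equivalent of the guide's Java snippet (host and port are placeholders):

import org.apache.spark.streaming.{Seconds, StreamingContext}

object LocalStreamingSketch {
  def main(args: Array[String]): Unit = {
    // "local[2]": one thread for the receiver, at least one left for processing.
    val ssc = new StreamingContext("local[2]", "NetworkWordCount", Seconds(1))
    val lines = ssc.socketTextStream("localhost", 9999)
    lines.flatMap(_.split(" ")).count().print()
    ssc.start()
    ssc.awaitTermination()
  }
}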

mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala

Lines changed: 2 additions & 0 deletions
@@ -347,6 +347,8 @@ class RowMatrix(
    * The principal components are stored a local matrix of size n-by-k.
    * Each column corresponds for one principal component,
    * and the columns are in descending order of component variance.
+   * The row data do not need to be "centered" first; it is not necessary for
+   * the mean of each column to be 0.
    *
    * @param k number of top principal components.
    * @return a matrix of size n-by-k, whose columns are principal components
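The added doc sentence tells callers they can hand raw, uncentered rows to PCA. A small usage sketch against the `RowMatrix` API this comment documents; the data values are arbitrary:

import org.apache.spark.SparkContext
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.linalg.distributed.RowMatrix

object PcaSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext("local[2]", "PcaSketch")
    // Rows are deliberately not mean-centered: per the doc comment above, it is
    // not necessary for the mean of each column to be 0.
    val rows = sc.parallelize(Seq(
      Vectors.dense(10.0, 2.0, 1.0),
      Vectors.dense(11.0, 0.5, 2.5),
      Vectors.dense(12.5, 1.5, 0.0)))
    val pc = new RowMatrix(rows).computePrincipalComponents(2) // n-by-k local matrix
    println(pc)
    sc.stop()
  }
}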
