
Commit a802ca8

Merge remote branch 'upstream/master' into patch-5
2 parents: 5e374c7 + 5fa0a05

File tree: 15 files changed (173 additions, 123 deletions)

bin/compute-classpath.sh

Lines changed: 4 additions & 4 deletions
@@ -81,10 +81,10 @@ ASSEMBLY_JAR=$(ls "$assembly_folder"/spark-assembly*hadoop*.jar 2>/dev/null)
 # Verify that versions of java used to build the jars and run Spark are compatible
 jar_error_check=$("$JAR_CMD" -tf "$ASSEMBLY_JAR" nonexistent/class/path 2>&1)
 if [[ "$jar_error_check" =~ "invalid CEN header" ]]; then
-  echo "Loading Spark jar with '$JAR_CMD' failed. "
-  echo "This is likely because Spark was compiled with Java 7 and run "
-  echo "with Java 6. (see SPARK-1703). Please use Java 7 to run Spark "
-  echo "or build Spark with Java 6."
+  echo "Loading Spark jar with '$JAR_CMD' failed. " 1>&2
+  echo "This is likely because Spark was compiled with Java 7 and run " 1>&2
+  echo "with Java 6. (see SPARK-1703). Please use Java 7 to run Spark " 1>&2
+  echo "or build Spark with Java 6." 1>&2
   exit 1
 fi

bin/pyspark

Lines changed: 3 additions & 3 deletions
@@ -26,7 +26,7 @@ export SPARK_HOME="$FWDIR"
 SCALA_VERSION=2.10
 
 if [[ "$@" = *--help ]] || [[ "$@" = *-h ]]; then
-  echo "Usage: ./bin/pyspark [options]"
+  echo "Usage: ./bin/pyspark [options]" 1>&2
   $FWDIR/bin/spark-submit --help 2>&1 | grep -v Usage 1>&2
   exit 0
 fi
@@ -36,8 +36,8 @@ if [ ! -f "$FWDIR/RELEASE" ]; then
   # Exit if the user hasn't compiled Spark
   ls "$FWDIR"/assembly/target/scala-$SCALA_VERSION/spark-assembly*hadoop*.jar >& /dev/null
   if [[ $? != 0 ]]; then
-    echo "Failed to find Spark assembly in $FWDIR/assembly/target" >&2
-    echo "You need to build Spark before running this program" >&2
+    echo "Failed to find Spark assembly in $FWDIR/assembly/target" 1>&2
+    echo "You need to build Spark before running this program" 1>&2
     exit 1
   fi
 fi

bin/run-example

Lines changed: 5 additions & 5 deletions
@@ -27,9 +27,9 @@ if [ -n "$1" ]; then
   EXAMPLE_CLASS="$1"
   shift
 else
-  echo "Usage: ./bin/run-example <example-class> [example-args]"
-  echo "  - set MASTER=XX to use a specific master"
-  echo "  - can use abbreviated example class name (e.g. SparkPi, mllib.LinearRegression)"
+  echo "Usage: ./bin/run-example <example-class> [example-args]" 1>&2
+  echo "  - set MASTER=XX to use a specific master" 1>&2
+  echo "  - can use abbreviated example class name (e.g. SparkPi, mllib.LinearRegression)" 1>&2
   exit 1
 fi
@@ -40,8 +40,8 @@ elif [ -e "$EXAMPLES_DIR"/target/scala-$SCALA_VERSION/spark-examples-*hadoop*.ja
 fi
 
 if [[ -z $SPARK_EXAMPLES_JAR ]]; then
-  echo "Failed to find Spark examples assembly in $FWDIR/lib or $FWDIR/examples/target" >&2
-  echo "You need to build Spark before running this program" >&2
+  echo "Failed to find Spark examples assembly in $FWDIR/lib or $FWDIR/examples/target" 1>&2
+  echo "You need to build Spark before running this program" 1>&2
   exit 1
 fi

bin/spark-class

Lines changed: 6 additions & 7 deletions
@@ -33,13 +33,13 @@ export SPARK_HOME="$FWDIR"
 . $FWDIR/bin/load-spark-env.sh
 
 if [ -z "$1" ]; then
-  echo "Usage: spark-class <class> [<args>]" >&2
+  echo "Usage: spark-class <class> [<args>]" 1>&2
   exit 1
 fi
 
 if [ -n "$SPARK_MEM" ]; then
-  echo "Warning: SPARK_MEM is deprecated, please use a more specific config option"
-  echo "(e.g., spark.executor.memory or SPARK_DRIVER_MEMORY)."
+  echo -e "Warning: SPARK_MEM is deprecated, please use a more specific config option" 1>&2
+  echo -e "(e.g., spark.executor.memory or SPARK_DRIVER_MEMORY)." 1>&2
 fi
 
 # Use SPARK_MEM or 512m as the default memory, to be overridden by specific options
@@ -147,10 +147,9 @@ fi
 export CLASSPATH
 
 if [ "$SPARK_PRINT_LAUNCH_COMMAND" == "1" ]; then
-  echo -n "Spark Command: "
-  echo "$RUNNER" -cp "$CLASSPATH" $JAVA_OPTS "$@"
-  echo "========================================"
-  echo
+  echo -n "Spark Command: " 1>&2
+  echo "$RUNNER" -cp "$CLASSPATH" $JAVA_OPTS "$@" 1>&2
+  echo -e "========================================\n" 1>&2
 fi
 
 exec "$RUNNER" -cp "$CLASSPATH" $JAVA_OPTS "$@"
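The common thread in these four script changes is that usage messages, warnings, and the printed launch command now go to stderr (`1>&2`), leaving stdout for output that callers capture (for example, the classpath that spark-class reads from compute-classpath.sh). A minimal Scala sketch of the same stdout/stderr separation; the tool name and paths are invented for illustration:

object StdErrSketch {
  def main(args: Array[String]): Unit = {
    // Human-facing diagnostics go to stderr so they never pollute captured stdout.
    Console.err.println("Usage: my-tool <class> [<args>]")
    // Machine-consumable output goes to stdout, e.g. for OUT=$(my-tool ...) in a shell.
    println("/tmp/example:classpath")
  }
}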

core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala

Lines changed: 2 additions & 2 deletions
@@ -141,8 +141,8 @@ class HadoopRDD[K, V](
     // local process. The local cache is accessed through HadoopRDD.putCachedMetadata().
     // The caching helps minimize GC, since a JobConf can contain ~10KB of temporary objects.
     // synchronize to prevent ConcurrentModificationException (Spark-1097, Hadoop-10456)
-    broadcastedConf.synchronized {
-      val newJobConf = new JobConf(broadcastedConf.value.value)
+    conf.synchronized {
+      val newJobConf = new JobConf(conf)
       initLocalJobConfFuncOpt.map(f => f(newJobConf))
       HadoopRDD.putCachedMetadata(jobConfCacheKey, newJobConf)
       newJobConf
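The HadoopRDD change takes the lock on the `conf` object itself while copying it into a fresh `JobConf`, because a Hadoop `Configuration` is not safe to iterate while another thread mutates it (the ConcurrentModificationException referenced above as SPARK-1097 / HADOOP-10456). A minimal sketch of that locking pattern, assuming Hadoop on the classpath and a hypothetical `sharedConf` shared across tasks in one JVM:

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.mapred.JobConf

object JobConfCloneSketch {
  // Hypothetical Configuration shared by many tasks running in the same JVM.
  val sharedConf = new Configuration()

  def cloneJobConf(): JobConf =
    // The JobConf copy constructor iterates over sharedConf's entries, so take
    // the same lock that any writer of sharedConf would hold while mutating it.
    sharedConf.synchronized {
      new JobConf(sharedConf)
    }
}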

core/src/main/scala/org/apache/spark/storage/DiskBlockManager.scala

Lines changed: 11 additions & 5 deletions
@@ -44,6 +44,10 @@ private[spark] class DiskBlockManager(shuffleManager: ShuffleBlockManager, rootD
    * directory, create multiple subdirectories that we will hash files into, in order to avoid
    * having really large inodes at the top level. */
   private val localDirs: Array[File] = createLocalDirs()
+  if (localDirs.isEmpty) {
+    logError("Failed to create any local dir.")
+    System.exit(ExecutorExitCode.DISK_STORE_FAILED_TO_CREATE_DIR)
+  }
   private val subDirs = Array.fill(localDirs.length)(new Array[File](subDirsPerLocalDir))
   private var shuffleSender : ShuffleSender = null
@@ -116,7 +120,7 @@ private[spark] class DiskBlockManager(shuffleManager: ShuffleBlockManager, rootD
   private def createLocalDirs(): Array[File] = {
     logDebug(s"Creating local directories at root dirs '$rootDirs'")
     val dateFormat = new SimpleDateFormat("yyyyMMddHHmmss")
-    rootDirs.split(",").map { rootDir =>
+    rootDirs.split(",").flatMap { rootDir =>
       var foundLocalDir = false
       var localDir: File = null
       var localDirId: String = null
@@ -136,11 +140,13 @@ private[spark] class DiskBlockManager(shuffleManager: ShuffleBlockManager, rootD
         }
       }
       if (!foundLocalDir) {
-        logError(s"Failed $MAX_DIR_CREATION_ATTEMPTS attempts to create local dir in $rootDir")
-        System.exit(ExecutorExitCode.DISK_STORE_FAILED_TO_CREATE_DIR)
+        logError(s"Failed $MAX_DIR_CREATION_ATTEMPTS attempts to create local dir in $rootDir." +
+          " Ignoring this directory.")
+        None
+      } else {
+        logInfo(s"Created local directory at $localDir")
+        Some(localDir)
       }
-      logInfo(s"Created local directory at $localDir")
-      localDir
     }
   }
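With `map` replaced by `flatMap`, a root directory that cannot be created is now logged and skipped instead of terminating the executor; the hard failure is reserved for the case where `localDirs` ends up empty. A self-contained sketch of this `Option`/`flatMap` pattern, using a hypothetical `tryCreateDir` helper in place of the retry loop:

import java.io.File

object LocalDirSketch {
  // Hypothetical helper: try to create one directory under the given root,
  // returning None on failure instead of exiting the JVM.
  def tryCreateDir(root: String): Option[File] = {
    val dir = new File(root, "example-local-dir")
    if (dir.isDirectory || dir.mkdirs()) Some(dir) else None
  }

  def createLocalDirs(rootDirs: String): Array[File] =
    // flatMap drops the Nones, so one unusable root is skipped rather than fatal;
    // the caller can still fail fast if the resulting array is empty.
    rootDirs.split(",").flatMap(root => tryCreateDir(root))
}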

core/src/main/scala/org/apache/spark/util/collection/ExternalAppendOnlyMap.scala

Lines changed: 22 additions & 8 deletions
@@ -252,7 +252,7 @@ class ExternalAppendOnlyMap[K, V, C](
       if (it.hasNext) {
         var kc = it.next()
         kcPairs += kc
-        val minHash = kc._1.hashCode()
+        val minHash = getKeyHashCode(kc)
         while (it.hasNext && it.head._1.hashCode() == minHash) {
           kc = it.next()
           kcPairs += kc
@@ -294,8 +294,9 @@ class ExternalAppendOnlyMap[K, V, C](
       // Select a key from the StreamBuffer that holds the lowest key hash
       val minBuffer = mergeHeap.dequeue()
       val (minPairs, minHash) = (minBuffer.pairs, minBuffer.minKeyHash)
-      var (minKey, minCombiner) = minPairs.remove(0)
-      assert(minKey.hashCode() == minHash)
+      val minPair = minPairs.remove(0)
+      var (minKey, minCombiner) = minPair
+      assert(getKeyHashCode(minPair) == minHash)
 
       // For all other streams that may have this key (i.e. have the same minimum key hash),
       // merge in the corresponding value (if any) from that stream
@@ -327,15 +328,16 @@ class ExternalAppendOnlyMap[K, V, C](
    * StreamBuffers are ordered by the minimum key hash found across all of their own pairs.
    */
   private class StreamBuffer(
-      val iterator: BufferedIterator[(K, C)], val pairs: ArrayBuffer[(K, C)])
+      val iterator: BufferedIterator[(K, C)],
+      val pairs: ArrayBuffer[(K, C)])
     extends Comparable[StreamBuffer] {
 
     def isEmpty = pairs.length == 0
 
     // Invalid if there are no more pairs in this stream
-    def minKeyHash = {
+    def minKeyHash: Int = {
       assert(pairs.length > 0)
-      pairs.head._1.hashCode()
+      getKeyHashCode(pairs.head)
     }
 
     override def compareTo(other: StreamBuffer): Int = {
@@ -422,10 +424,22 @@ class ExternalAppendOnlyMap[K, V, C](
 }
 
 private[spark] object ExternalAppendOnlyMap {
+
+  /**
+   * Return the key hash code of the given (key, combiner) pair.
+   * If the key is null, return a special hash code.
+   */
+  private def getKeyHashCode[K, C](kc: (K, C)): Int = {
+    if (kc._1 == null) 0 else kc._1.hashCode()
+  }
+
+  /**
+   * A comparator for (key, combiner) pairs based on their key hash codes.
+   */
   private class KCComparator[K, C] extends Comparator[(K, C)] {
     def compare(kc1: (K, C), kc2: (K, C)): Int = {
-      val hash1 = kc1._1.hashCode()
-      val hash2 = kc2._1.hashCode()
+      val hash1 = getKeyHashCode(kc1)
+      val hash2 = getKeyHashCode(kc2)
       if (hash1 < hash2) -1 else if (hash1 == hash2) 0 else 1
     }
   }
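The `getKeyHashCode` helper exists because a spilled (key, combiner) pair may now carry a null key, and calling `hashCode()` on it directly would raise a NullPointerException while merging spilled streams. A tiny standalone illustration of the same null-safe hashing; the object and method names are invented for the example:

object NullSafeHashSketch {
  // Mirrors the helper added above: a null key maps to the sentinel hash 0.
  def keyHash[K, C](kc: (K, C)): Int =
    if (kc._1 == null) 0 else kc._1.hashCode()

  def main(args: Array[String]): Unit = {
    val pairs = Seq(("b", 2), (null.asInstanceOf[String], 3), ("a", 1))
    // Ordering by the null-safe hash tolerates the null key, where
    // kc._1.hashCode() would throw a NullPointerException.
    println(pairs.sortBy(kc => keyHash(kc)))
  }
}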

core/src/test/scala/org/apache/spark/util/collection/ExternalAppendOnlyMapSuite.scala

Lines changed: 24 additions & 3 deletions
@@ -334,8 +334,8 @@ class ExternalAppendOnlyMapSuite extends FunSuite with LocalSparkContext {
     conf.set("spark.shuffle.memoryFraction", "0.001")
     sc = new SparkContext("local-cluster[1,1,512]", "test", conf)
 
-    val map = new ExternalAppendOnlyMap[Int, Int, ArrayBuffer[Int]](createCombiner,
-      mergeValue, mergeCombiners)
+    val map = new ExternalAppendOnlyMap[Int, Int, ArrayBuffer[Int]](
+      createCombiner, mergeValue, mergeCombiners)
 
     (1 to 100000).foreach { i => map.insert(i, i) }
     map.insert(Int.MaxValue, Int.MaxValue)
@@ -346,11 +346,32 @@ class ExternalAppendOnlyMapSuite extends FunSuite with LocalSparkContext {
       it.next()
     }
   }
+
+  test("spilling with null keys and values") {
+    val conf = new SparkConf(true)
+    conf.set("spark.shuffle.memoryFraction", "0.001")
+    sc = new SparkContext("local-cluster[1,1,512]", "test", conf)
+
+    val map = new ExternalAppendOnlyMap[Int, Int, ArrayBuffer[Int]](
+      createCombiner, mergeValue, mergeCombiners)
+
+    (1 to 100000).foreach { i => map.insert(i, i) }
+    map.insert(null.asInstanceOf[Int], 1)
+    map.insert(1, null.asInstanceOf[Int])
+    map.insert(null.asInstanceOf[Int], null.asInstanceOf[Int])
+
+    val it = map.iterator
+    while (it.hasNext) {
+      // Should not throw NullPointerException
+      it.next()
+    }
+  }
+
 }
 
 /**
  * A dummy class that always returns the same hash code, to easily test hash collisions
  */
-case class FixedHashObject(val v: Int, val h: Int) extends Serializable {
+case class FixedHashObject(v: Int, h: Int) extends Serializable {
   override def hashCode(): Int = h
 }

docs/streaming-programming-guide.md

Lines changed: 2 additions & 2 deletions
@@ -148,7 +148,7 @@ import org.apache.spark.streaming.*;
 import org.apache.spark.streaming.api.java.*;
 import scala.Tuple2;
 // Create a StreamingContext with a local master
-JavaStreamingContext jssc = new JavaStreamingContext("local", "JavaNetworkWordCount", new Duration(1000))
+JavaStreamingContext jssc = new JavaStreamingContext("local[2]", "JavaNetworkWordCount", new Duration(1000))
 {% endhighlight %}
 
 Using this context, we then create a new DStream
@@ -216,7 +216,7 @@ jssc.awaitTermination(); // Wait for the computation to terminate
 {% endhighlight %}
 
 The complete code can be found in the Spark Streaming example
-[JavaNetworkWordCount]({{site.SPARK_GITHUB_URL}}/blob/master/examples/src/main/java/index.html?org/apache/spark/examples/streaming/JavaNetworkWordCount.java).
+[JavaNetworkWordCount]({{site.SPARK_GITHUB_URL}}/blob/master/examples/src/main/java/org/apache/spark/examples/streaming/JavaNetworkWordCount.java).
 <br>
 
 </div>
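The switch from "local" to "local[2]" matters because a streaming receiver occupies one of the local threads; with a single thread there is nothing left to process the received data. A rough Scala equivalent of the guide's Java snippet (host and port are placeholders):

import org.apache.spark.streaming.{Seconds, StreamingContext}

object LocalStreamingSketch {
  def main(args: Array[String]): Unit = {
    // "local[2]": one thread for the receiver, at least one left for processing.
    val ssc = new StreamingContext("local[2]", "NetworkWordCount", Seconds(1))
    val lines = ssc.socketTextStream("localhost", 9999)
    lines.flatMap(_.split(" ")).count().print()
    ssc.start()
    ssc.awaitTermination()
  }
}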

mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala

Lines changed: 2 additions & 0 deletions
@@ -347,6 +347,8 @@ class RowMatrix(
    * The principal components are stored a local matrix of size n-by-k.
    * Each column corresponds for one principal component,
    * and the columns are in descending order of component variance.
+   * The row data do not need to be "centered" first; it is not necessary for
+   * the mean of each column to be 0.
    *
    * @param k number of top principal components.
    * @return a matrix of size n-by-k, whose columns are principal components
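The added doc sentence tells callers they can hand raw, uncentered rows to PCA. A small usage sketch against the `RowMatrix` API this comment documents; the data values are arbitrary:

import org.apache.spark.SparkContext
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.linalg.distributed.RowMatrix

object PcaSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext("local[2]", "PcaSketch")
    // Rows are deliberately not mean-centered: per the doc comment above, it is
    // not necessary for the mean of each column to be 0.
    val rows = sc.parallelize(Seq(
      Vectors.dense(10.0, 2.0, 1.0),
      Vectors.dense(11.0, 0.5, 2.5),
      Vectors.dense(12.5, 1.5, 0.0)))
    val pc = new RowMatrix(rows).computePrincipalComponents(2) // n-by-k local matrix
    println(pc)
    sc.stop()
  }
}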
