
Commit 04c6f7e

Merge branch 'master' into allocateExecutors
2 parents: aff827c + 219dc00

File tree: 63 files changed (+326 / -123 lines)


assembly/pom.xml

Lines changed: 1 addition & 1 deletion
@@ -21,7 +21,7 @@
   <parent>
     <groupId>org.apache.spark</groupId>
     <artifactId>spark-parent</artifactId>
-    <version>1.0.0-SNAPSHOT</version>
+    <version>1.1.0-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>

bagel/pom.xml

Lines changed: 1 addition & 1 deletion
@@ -21,7 +21,7 @@
   <parent>
     <groupId>org.apache.spark</groupId>
     <artifactId>spark-parent</artifactId>
-    <version>1.0.0-SNAPSHOT</version>
+    <version>1.1.0-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>

bin/run-example

Lines changed: 1 addition & 1 deletion
@@ -51,7 +51,7 @@ if [[ ! $EXAMPLE_CLASS == org.apache.spark.examples* ]]; then
   EXAMPLE_CLASS="org.apache.spark.examples.$EXAMPLE_CLASS"
 fi
 
-./bin/spark-submit \
+"$FWDIR"/bin/spark-submit \
   --master $EXAMPLE_MASTER \
   --class $EXAMPLE_CLASS \
   "$SPARK_EXAMPLES_JAR" \

core/pom.xml

Lines changed: 2 additions & 2 deletions
@@ -21,7 +21,7 @@
   <parent>
     <groupId>org.apache.spark</groupId>
     <artifactId>spark-parent</artifactId>
-    <version>1.0.0-SNAPSHOT</version>
+    <version>1.1.0-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>

@@ -235,7 +235,7 @@
     </dependency>
     <dependency>
       <groupId>org.easymock</groupId>
-      <artifactId>easymock</artifactId>
+      <artifactId>easymockclassextension</artifactId>
       <scope>test</scope>
     </dependency>
     <dependency>

core/src/main/scala/org/apache/spark/Partitioner.scala

Lines changed: 19 additions & 1 deletion
@@ -83,11 +83,17 @@ class HashPartitioner(partitions: Int) extends Partitioner {
     case _ =>
       false
   }
+
+  override def hashCode: Int = numPartitions
 }
 
 /**
  * A [[org.apache.spark.Partitioner]] that partitions sortable records by range into roughly
  * equal ranges. The ranges are determined by sampling the content of the RDD passed in.
+ *
+ * Note that the actual number of partitions created by the RangePartitioner might not be the same
+ * as the `partitions` parameter, in the case where the number of sampled records is less than
+ * the value of `partitions`.
  */
 class RangePartitioner[K : Ordering : ClassTag, V](
     partitions: Int,
@@ -119,7 +125,7 @@ class RangePartitioner[K : Ordering : ClassTag, V](
     }
   }
 
-  def numPartitions = partitions
+  def numPartitions = rangeBounds.length + 1
 
   private val binarySearch: ((Array[K], K) => Int) = CollectionsUtils.makeBinarySearch[K]
 
@@ -155,4 +161,16 @@ class RangePartitioner[K : Ordering : ClassTag, V](
     case _ =>
       false
   }
+
+  override def hashCode(): Int = {
+    val prime = 31
+    var result = 1
+    var i = 0
+    while (i < rangeBounds.length) {
+      result = prime * result + rangeBounds(i).hashCode
+      i += 1
+    }
+    result = prime * result + ascending.hashCode
+    result
+  }
 }
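Why the new hashCode overrides matter: Spark compares partitioners (for example, to decide whether a shuffle can be avoided) and may place them in hash-based collections, and the equals/hashCode contract requires that equal partitioners hash identically. A minimal sketch of that contract, assuming only Spark core on the classpath (the demo object name is made up):

import org.apache.spark.HashPartitioner

object PartitionerHashCodeCheck {
  def main(args: Array[String]): Unit = {
    val a = new HashPartitioner(4)
    val b = new HashPartitioner(4)
    assert(a == b)                    // equals already compared numPartitions
    assert(a.hashCode == b.hashCode)  // now holds thanks to the hashCode override
    assert(Set(a, b).size == 1)       // equal partitioners collapse in hash-based sets
  }
}

The RangePartitioner override in the same file follows the same reasoning, folding each range bound and the sort direction into a prime-31 accumulation so it stays consistent with its equals.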

core/src/main/scala/org/apache/spark/SparkContext.scala

Lines changed: 4 additions & 4 deletions
@@ -455,7 +455,7 @@ class SparkContext(config: SparkConf) extends Logging {
    */
   def textFile(path: String, minPartitions: Int = defaultMinPartitions): RDD[String] = {
     hadoopFile(path, classOf[TextInputFormat], classOf[LongWritable], classOf[Text],
-      minPartitions).map(pair => pair._2.toString)
+      minPartitions).map(pair => pair._2.toString).setName(path)
   }
 
   /**
@@ -496,7 +496,7 @@ class SparkContext(config: SparkConf) extends Logging {
       classOf[String],
       classOf[String],
       updateConf,
-      minPartitions)
+      minPartitions).setName(path)
   }
 
   /**
@@ -551,7 +551,7 @@ class SparkContext(config: SparkConf) extends Logging {
       inputFormatClass,
       keyClass,
       valueClass,
-      minPartitions)
+      minPartitions).setName(path)
   }
 
   /**
@@ -623,7 +623,7 @@ class SparkContext(config: SparkConf) extends Logging {
     val job = new NewHadoopJob(conf)
     NewFileInputFormat.addInputPath(job, new Path(path))
     val updatedConf = job.getConfiguration
-    new NewHadoopRDD(this, fClass, kClass, vClass, updatedConf)
+    new NewHadoopRDD(this, fClass, kClass, vClass, updatedConf).setName(path)
   }
 
   /**
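These setName(path) calls make the input path the RDD's display name, so the path appears wherever the RDD is rendered (its toString, debug output, the storage UI) instead of an anonymous HadoopRDD. A minimal usage sketch, assuming a local master; the input path and app name below are hypothetical:

import org.apache.spark.{SparkConf, SparkContext}

object RddNameCheck {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("rdd-name-check").setMaster("local[*]"))
    val lines = sc.textFile("/tmp/input.txt")  // hypothetical path
    println(lines.name)                        // after this change: "/tmp/input.txt"
    lines.setName("my input")                  // setName can still rename any RDD explicitly
    println(lines.name)
    sc.stop()
  }
}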

core/src/main/scala/org/apache/spark/api/python/PythonPartitioner.scala

Lines changed: 2 additions & 0 deletions
@@ -50,4 +50,6 @@ private[spark] class PythonPartitioner(
     case _ =>
       false
   }
+
+  override def hashCode: Int = 31 * numPartitions + pyPartitionFunctionId.hashCode
 }
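The override folds both fields that equals compares (numPartitions and pyPartitionFunctionId) with the usual prime-31 scheme. Purely as a comparison, here is a sketch of an equivalent combination using the JDK's Objects.hash helper; the demo names are made up, and the commit itself keeps the explicit 31 * ... form:

import java.util.Objects

object HashCombineSketch {
  // Order-sensitive combination of the partition count and the Python
  // partition-function id, same shape as the new override.
  def combine(numPartitions: Int, pyPartitionFunctionId: Long): Int =
    Objects.hash(Int.box(numPartitions), Long.box(pyPartitionFunctionId))

  def main(args: Array[String]): Unit =
    println(combine(8, 42L))
}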

core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala

Lines changed: 4 additions & 2 deletions
@@ -737,7 +737,8 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)])
     val outfmt = job.getOutputFormatClass
     val jobFormat = outfmt.newInstance
 
-    if (jobFormat.isInstanceOf[NewFileOutputFormat[_, _]]) {
+    if (self.conf.getBoolean("spark.hadoop.validateOutputSpecs", true) &&
+        jobFormat.isInstanceOf[NewFileOutputFormat[_, _]]) {
       // FileOutputFormat ignores the filesystem parameter
       jobFormat.checkOutputSpecs(job)
     }
@@ -803,7 +804,8 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)])
     logDebug("Saving as hadoop file of type (" + keyClass.getSimpleName + ", " +
       valueClass.getSimpleName + ")")
 
-    if (outputFormatInstance.isInstanceOf[FileOutputFormat[_, _]]) {
+    if (self.conf.getBoolean("spark.hadoop.validateOutputSpecs", true) &&
+        outputFormatInstance.isInstanceOf[FileOutputFormat[_, _]]) {
       // FileOutputFormat ignores the filesystem parameter
       val ignoredFs = FileSystem.get(conf)
       conf.getOutputFormat.checkOutputSpecs(ignoredFs, conf)
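The new spark.hadoop.validateOutputSpecs setting (default true, as the getBoolean call shows) lets users opt out of the pre-write checkOutputSpecs call, which is what normally rejects a save into an already-existing output directory. A minimal sketch of opting out; the output path and app name are hypothetical:

import org.apache.spark.{SparkConf, SparkContext}

object OutputSpecsDemo {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setAppName("output-specs-demo")
      .setMaster("local[*]")
      // Skip the pre-write output check so an existing directory is not a hard error.
      .set("spark.hadoop.validateOutputSpecs", "false")
    val sc = new SparkContext(conf)
    sc.parallelize(1 to 10).saveAsTextFile("/tmp/demo-output")  // hypothetical path
    sc.stop()
  }
}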

core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala

Lines changed: 7 additions & 6 deletions
@@ -38,7 +38,7 @@ import org.apache.spark.executor.TaskMetrics
 import org.apache.spark.partial.{ApproximateActionListener, ApproximateEvaluator, PartialResult}
 import org.apache.spark.rdd.RDD
 import org.apache.spark.storage.{BlockId, BlockManager, BlockManagerMaster, RDDBlockId}
-import org.apache.spark.util.Utils
+import org.apache.spark.util.{SystemClock, Clock, Utils}
 
 /**
  * The high-level scheduling layer that implements stage-oriented scheduling. It computes a DAG of
@@ -61,7 +61,8 @@ class DAGScheduler(
     listenerBus: LiveListenerBus,
     mapOutputTracker: MapOutputTrackerMaster,
     blockManagerMaster: BlockManagerMaster,
-    env: SparkEnv)
+    env: SparkEnv,
+    clock: Clock = SystemClock)
   extends Logging {
 
   import DAGScheduler._
@@ -781,7 +782,7 @@ class DAGScheduler(
       logDebug("New pending tasks: " + myPending)
       taskScheduler.submitTasks(
         new TaskSet(tasks.toArray, stage.id, stage.newAttemptId(), stage.jobId, properties))
-      stageToInfos(stage).submissionTime = Some(System.currentTimeMillis())
+      stageToInfos(stage).submissionTime = Some(clock.getTime())
     } else {
       logDebug("Stage " + stage + " is actually done; %b %d %d".format(
         stage.isAvailable, stage.numAvailableOutputs, stage.numPartitions))
@@ -807,11 +808,11 @@ class DAGScheduler(
 
   def markStageAsFinished(stage: Stage) = {
     val serviceTime = stageToInfos(stage).submissionTime match {
-      case Some(t) => "%.03f".format((System.currentTimeMillis() - t) / 1000.0)
+      case Some(t) => "%.03f".format((clock.getTime() - t) / 1000.0)
       case _ => "Unknown"
     }
     logInfo("%s (%s) finished in %s s".format(stage, stage.name, serviceTime))
-    stageToInfos(stage).completionTime = Some(System.currentTimeMillis())
+    stageToInfos(stage).completionTime = Some(clock.getTime())
     listenerBus.post(SparkListenerStageCompleted(stageToInfos(stage)))
     runningStages -= stage
   }
@@ -1015,7 +1016,7 @@ class DAGScheduler(
       return
     }
     val dependentStages = resultStageToJob.keys.filter(x => stageDependsOn(x, failedStage)).toSeq
-    stageToInfos(failedStage).completionTime = Some(System.currentTimeMillis())
+    stageToInfos(failedStage).completionTime = Some(clock.getTime())
     for (resultStage <- dependentStages) {
       val job = resultStageToJob(resultStage)
       failJobAndIndependentStages(job, s"Job aborted due to stage failure: $reason",
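Threading a Clock through the constructor (defaulting to SystemClock) is a testability change: stage submission and completion timestamps now come from an injectable source instead of calling System.currentTimeMillis() directly. Spark's own Clock trait is internal, so the sketch below uses stand-in names (DemoClock, FakeClock, StageTimer are made up) to show the pattern of swapping in a deterministic clock for tests:

trait DemoClock {
  def getTime(): Long
}

object DemoSystemClock extends DemoClock {
  override def getTime(): Long = System.currentTimeMillis()
}

class FakeClock(private var now: Long = 0L) extends DemoClock {
  override def getTime(): Long = now
  def advance(ms: Long): Unit = { now += ms }
}

object ClockInjectionDemo {
  // A component that timestamps work against whichever clock it is given,
  // analogous to how the scheduler records submission/completion times.
  class StageTimer(clock: DemoClock) {
    def time[A](body: => A): Long = {
      val start = clock.getTime()
      body
      clock.getTime() - start
    }
  }

  def main(args: Array[String]): Unit = {
    val clock = new FakeClock()
    val timer = new StageTimer(clock)
    val elapsed = timer.time { clock.advance(1000) }
    assert(elapsed == 1000L)  // deterministic in tests, unlike wall-clock time
    println(s"elapsed = $elapsed ms")
  }
}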

core/src/main/scala/org/apache/spark/util/collection/ExternalAppendOnlyMap.scala

Lines changed: 6 additions & 4 deletions
@@ -20,6 +20,7 @@ package org.apache.spark.util.collection
 import java.io.{InputStream, BufferedInputStream, FileInputStream, File, Serializable, EOFException}
 import java.util.Comparator
 
+import scala.collection.BufferedIterator
 import scala.collection.mutable
 import scala.collection.mutable.ArrayBuffer
 
@@ -231,7 +232,7 @@ class ExternalAppendOnlyMap[K, V, C](
     // Input streams are derived both from the in-memory map and spilled maps on disk
     // The in-memory map is sorted in place, while the spilled maps are already in sorted order
     private val sortedMap = currentMap.destructiveSortedIterator(comparator)
-    private val inputStreams = Seq(sortedMap) ++ spilledMaps
+    private val inputStreams = (Seq(sortedMap) ++ spilledMaps).map(it => it.buffered)
 
     inputStreams.foreach { it =>
       val kcPairs = getMorePairs(it)
@@ -246,13 +247,13 @@ class ExternalAppendOnlyMap[K, V, C](
      * In the event of key hash collisions, this ensures no pairs are hidden from being merged.
      * Assume the given iterator is in sorted order.
      */
-    private def getMorePairs(it: Iterator[(K, C)]): ArrayBuffer[(K, C)] = {
+    private def getMorePairs(it: BufferedIterator[(K, C)]): ArrayBuffer[(K, C)] = {
       val kcPairs = new ArrayBuffer[(K, C)]
       if (it.hasNext) {
         var kc = it.next()
         kcPairs += kc
         val minHash = kc._1.hashCode()
-        while (it.hasNext && kc._1.hashCode() == minHash) {
+        while (it.hasNext && it.head._1.hashCode() == minHash) {
           kc = it.next()
           kcPairs += kc
         }
@@ -325,7 +326,8 @@ class ExternalAppendOnlyMap[K, V, C](
      *
      * StreamBuffers are ordered by the minimum key hash found across all of their own pairs.
      */
-    private case class StreamBuffer(iterator: Iterator[(K, C)], pairs: ArrayBuffer[(K, C)])
+    private class StreamBuffer(
+        val iterator: BufferedIterator[(K, C)], val pairs: ArrayBuffer[(K, C)])
       extends Comparable[StreamBuffer] {
 
       def isEmpty = pairs.length == 0
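The switch from Iterator to BufferedIterator is what makes the corrected loop condition possible: head peeks at the next pair without consuming it, so grouping by minimum key hash stops before a pair from the next hash group is pulled into the current buffer (the old loop tested the pair it had already consumed). A small self-contained sketch of the pattern, with made-up example data:

object BufferedIteratorPeek {
  def main(args: Array[String]): Unit = {
    // .buffered wraps any Iterator so that head peeks without consuming.
    val pairs = Iterator(("a", 1), ("a", 2), ("b", 3)).buffered

    val group = scala.collection.mutable.ArrayBuffer[(String, Int)]()
    val minHash = pairs.head._1.hashCode
    // Stop before consuming a pair whose key hash differs from the group's hash.
    while (pairs.hasNext && pairs.head._1.hashCode == minHash) {
      group += pairs.next()
    }

    println(group)        // ArrayBuffer((a,1), (a,2))
    println(pairs.next()) // (b,3) is still available for the next group
  }
}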
