Skip to content

Commit d00a85b

Browse files
pwendell and darabos
authored and committed
Make call sites for stages non-optional and well defined
1 parent b9eba24 commit d00a85b

File tree

2 files changed

+12
-7
lines changed

2 files changed

+12
-7
lines changed

core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -195,7 +195,7 @@ class DAGScheduler(
195195
case Some(stage) => stage
196196
case None =>
197197
val stage =
198-
newOrUsedStage(shuffleDep.rdd, shuffleDep.rdd.partitions.size, shuffleDep, jobId)
198+
newOrUsedStage(shuffleDep.rdd, shuffleDep.rdd.partitions.size, shuffleDep, jobId, shuffleDep.rdd.creationSite)
199199
shuffleToMapStage(shuffleDep.shuffleId) = stage
200200
stage
201201
}
@@ -212,7 +212,7 @@ class DAGScheduler(
212212
numTasks: Int,
213213
shuffleDep: Option[ShuffleDependency[_,_]],
214214
jobId: Int,
215-
callSite: Option[CallSite] = None)
215+
callSite: CallSite)
216216
: Stage =
217217
{
218218
val id = nextStageId.getAndIncrement()
@@ -235,7 +235,7 @@ class DAGScheduler(
235235
numTasks: Int,
236236
shuffleDep: ShuffleDependency[_,_],
237237
jobId: Int,
238-
callSite: Option[CallSite] = None)
238+
callSite: CallSite)
239239
: Stage =
240240
{
241241
val stage = newStage(rdd, numTasks, Some(shuffleDep), jobId, callSite)
@@ -674,7 +674,7 @@ class DAGScheduler(
674674
try {
675675
// New stage creation may throw an exception if, for example, jobs are run on a
676676
// HadoopRDD whose underlying HDFS files have been deleted.
677-
finalStage = newStage(finalRDD, partitions.size, None, jobId, Some(callSite))
677+
finalStage = newStage(finalRDD, partitions.size, None, jobId, callSite)
678678
} catch {
679679
case e: Exception =>
680680
logWarning("Creating new stage failed due to exception - job: " + jobId, e)

core/src/main/scala/org/apache/spark/scheduler/Stage.scala

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,11 @@ import org.apache.spark.util.CallSite
3636
* Each Stage also has a jobId, identifying the job that first submitted the stage. When FIFO
3737
* scheduling is used, this allows Stages from earlier jobs to be computed first or recovered
3838
* faster on failure.
39+
*
40+
* The callSite provides a location in user code which relates to the stage. For a shuffle map
41+
* stage, the callSite gives the user code that created the RDD being shuffled. For a result
42+
* stage, the callSite gives the user code that executes the associated action (e.g. count()).
43+
*
3944
*/
4045
private[spark] class Stage(
4146
val id: Int,
@@ -44,7 +49,7 @@ private[spark] class Stage(
4449
val shuffleDep: Option[ShuffleDependency[_,_]], // Output shuffle if stage is a map stage
4550
val parents: List[Stage],
4651
val jobId: Int,
47-
callSite: Option[CallSite])
52+
val callSite: CallSite)
4853
extends Logging {
4954

5055
val isShuffleMap = shuffleDep.isDefined
@@ -101,8 +106,8 @@ private[spark] class Stage(
101106
id
102107
}
103108

104-
val name = callSite.map(_.short).getOrElse(rdd.getCreationSite)
105-
val details = callSite.map(_.long).getOrElse("")
109+
val name = callSite.short
110+
val details = callSite.long
106111

107112
override def toString = "Stage " + id
108113

0 commit comments

Comments (0)