[SPARK-1853] Show Streaming application code context (file, line number) in Spark Stages UI #2464
Changes from all commits: d466d75, 9d38d3c, 1500deb, 70f494f, 5f3105a, 1d90cc3, 2a09ad6, ccde038, a207eb7, 5051c58, f51fd9f, c26d933, 33a7295, 491a1eb, 196121b, 8c5d443, ceb43da, c461cf4, b9ed945, 7baa427, 904cd92, 390b45d, dc54c71
In `streaming/src/main/scala/org/apache/spark/streaming/dstream/DStream.scala`:

```diff
@@ -23,14 +23,15 @@ import java.io.{IOException, ObjectInputStream, ObjectOutputStream}
 import scala.deprecated
 import scala.collection.mutable.HashMap
 import scala.reflect.ClassTag
+import scala.util.matching.Regex
 
 import org.apache.spark.{Logging, SparkException}
 import org.apache.spark.rdd.{BlockRDD, RDD}
 import org.apache.spark.storage.StorageLevel
 import org.apache.spark.streaming._
 import org.apache.spark.streaming.StreamingContext._
 import org.apache.spark.streaming.scheduler.Job
-import org.apache.spark.util.MetadataCleaner
+import org.apache.spark.util.{CallSite, MetadataCleaner}
 
 /**
  * A Discretized Stream (DStream), the basic abstraction in Spark Streaming, is a continuous
```
```diff
@@ -106,6 +107,9 @@ abstract class DStream[T: ClassTag] (
   /** Return the StreamingContext associated with this DStream */
   def context = ssc
 
+  /* Set the creation call site */
+  private[streaming] val creationSite = DStream.getCreationSite()
+
   /** Persist the RDDs of this DStream with the given storage level */
   def persist(level: StorageLevel): DStream[T] = {
     if (this.isInitialized) {
```
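For context on the `CallSite` type imported above: in Spark it pairs a short and a long description of where an operation was defined. A minimal standalone sketch follows; the sample values are invented, and only the `shortForm`/`longForm` field names are taken from the real class:

```scala
// Sketch of the CallSite shape this patch threads through (in Spark it is
// a small case class in org.apache.spark.util, imported in the hunk above).
case class CallSite(shortForm: String, longForm: String)

// Each DStream captures its creation site once, at construction, so every
// RDD it later generates can be tagged with the same site in the Stages UI.
val site = CallSite(
  shortForm = "receiverStream at MyApp.scala:42",  // invented example value
  longForm = "org.apache.spark.streaming.StreamingContext.receiverStream(StreamingContext.scala)\nMyApp.main(MyApp.scala:42)"
)
println(site.shortForm)
```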
```diff
@@ -272,43 +276,41 @@ abstract class DStream[T: ClassTag] (
   }
 
   /**
-   * Retrieve a precomputed RDD of this DStream, or computes the RDD. This is an internal
-   * method that should not be called directly.
+   * Get the RDD corresponding to the given time; either retrieve it from cache
+   * or compute-and-cache it.
    */
   private[streaming] def getOrCompute(time: Time): Option[RDD[T]] = {
-    // If this DStream was not initialized (i.e., zeroTime not set), then do it
-    // If RDD was already generated, then retrieve it from HashMap
-    generatedRDDs.get(time) match {
-
-      // If an RDD was already generated and is being reused, then
-      // probably all RDDs in this DStream will be reused and hence should be cached
-      case Some(oldRDD) => Some(oldRDD)
-
-      // if RDD was not generated, and if the time is valid
-      // (based on sliding time of this DStream), then generate the RDD
-      case None => {
-        if (isTimeValid(time)) {
-          compute(time) match {
-            case Some(newRDD) =>
-              if (storageLevel != StorageLevel.NONE) {
-                newRDD.persist(storageLevel)
-                logInfo("Persisting RDD " + newRDD.id + " for time " +
-                  time + " to " + storageLevel + " at time " + time)
-              }
-              if (checkpointDuration != null &&
-                  (time - zeroTime).isMultipleOf(checkpointDuration)) {
-                newRDD.checkpoint()
-                logInfo("Marking RDD " + newRDD.id + " for time " + time +
-                  " for checkpointing at time " + time)
-              }
-              generatedRDDs.put(time, newRDD)
-              Some(newRDD)
-            case None =>
-              None
-          }
-        } else {
-          None
-        }
-      }
-    }
+    // If RDD was already generated, then retrieve it from HashMap,
+    // or else compute the RDD
+    generatedRDDs.get(time).orElse {
+      // Compute the RDD if time is valid (e.g. correct time in a sliding window)
+      // of RDD generation, else generate nothing.
+      if (isTimeValid(time)) {
+        // Set the thread-local property for call sites to this DStream's creation site
+        // such that RDDs generated by compute gets that as their creation site.
+        // Note that this `getOrCompute` may get called from another DStream which may have
+        // set its own call site. So we store its call site in a temporary variable,
+        // set this DStream's creation site, generate RDDs and then restore the previous call site.
```
> **Contributor:** How about explaining this from the top down: start with what we're trying to do (set RDD call sites properly), then how we're doing it (using a thread-local property). Right now it's easy to get lost if the reader isn't familiar with how call sites are set through `SparkContext`.
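To make the thread-local mechanics concrete, here is a minimal, self-contained sketch of the save/set/restore pattern the new code uses. `CallSiteHolder` and `withCallSite` are hypothetical stand-ins for illustration, not Spark API:

```scala
// Hypothetical stand-in for SparkContext's thread-local call-site property.
object CallSiteHolder {
  private val site = new ThreadLocal[String] { override def initialValue(): String = "" }
  def get: String = site.get()
  def set(s: String): Unit = site.set(s)
}

// The pattern used in getOrCompute: remember the caller's site, install our
// own while computing, then restore it, so nested calls see the right value.
def withCallSite[T](creationSite: String)(body: => T): T = {
  val prev = CallSiteHolder.get
  CallSiteHolder.set(creationSite)
  try body finally CallSiteHolder.set(prev)
}
```

Note the sketch restores the previous site in a `finally` block; the diff itself does it with straight-line calls.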
```diff
+        val prevCallSite = ssc.sparkContext.getCallSite()
+        ssc.sparkContext.setCallSite(creationSite)
+        val rddOption = compute(time)
+        ssc.sparkContext.setCallSite(prevCallSite)
+
+        rddOption.foreach { case newRDD =>
+          // Register the generated RDD for caching and checkpointing
+          if (storageLevel != StorageLevel.NONE) {
+            newRDD.persist(storageLevel)
+            logDebug(s"Persisting RDD ${newRDD.id} for time $time to $storageLevel")
```
> **Contributor:** Did you mean to change this from info to debug?
>
> **Author:** Yes, I did. It was too verbose.
```diff
+          }
+          if (checkpointDuration != null && (time - zeroTime).isMultipleOf(checkpointDuration)) {
+            newRDD.checkpoint()
+            logInfo(s"Marking RDD ${newRDD.id} for time $time for checkpointing")
+          }
+          generatedRDDs.put(time, newRDD)
+        }
+        rddOption
+      } else {
+        None
+      }
+    }
   }
```
```diff
@@ -799,3 +801,29 @@ abstract class DStream[T: ClassTag] (
     this
   }
 }
+
+private[streaming] object DStream {
+
+  /** Get the creation site of a DStream from the stack trace of when the DStream is created. */
+  def getCreationSite(): CallSite = {
+    val SPARK_CLASS_REGEX = """^org\.apache\.spark""".r
+    val SPARK_STREAMING_TESTCLASS_REGEX = """^org\.apache\.spark\.streaming\.test""".r
+    val SPARK_EXAMPLES_CLASS_REGEX = """^org\.apache\.spark\.examples""".r
+    val SCALA_CLASS_REGEX = """^scala""".r
+
+    /** Filtering function that excludes non-user classes for a streaming application */
+    def streamingExclustionFunction(className: String): Boolean = {
+      def doesMatch(r: Regex) = r.findFirstIn(className).isDefined
+      val isSparkClass = doesMatch(SPARK_CLASS_REGEX)
+      val isSparkExampleClass = doesMatch(SPARK_EXAMPLES_CLASS_REGEX)
+      val isSparkStreamingTestClass = doesMatch(SPARK_STREAMING_TESTCLASS_REGEX)
+      val isScalaClass = doesMatch(SCALA_CLASS_REGEX)
+
+      // If the class is a Spark example class or a streaming test class, then it is
+      // considered a streaming application class and is not excluded. Otherwise, exclude
+      // any Spark or Scala class, as the rest would be streaming application classes.
+      (isSparkClass || isScalaClass) && !isSparkExampleClass && !isSparkStreamingTestClass
+    }
+    org.apache.spark.util.Utils.getCallSite(streamingExclustionFunction)
+  }
+}
```
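For illustration, here is a standalone sketch of how a stack-frame filter like this resolves a call site. `firstUserClass` is a simplified stand-in for `org.apache.spark.util.Utils.getCallSite`, and the stack frames are invented:

```scala
// Simplified stand-in for Utils.getCallSite: walk the stack from the inside
// out and report the first class the exclusion function does not exclude.
def firstUserClass(stack: Seq[String], exclude: String => Boolean): Option[String] =
  stack.find(className => !exclude(className))

// Exclusion logic mirroring streamingExclustionFunction in the diff above,
// with startsWith in place of the anchored regexes.
def exclude(className: String): Boolean = {
  val isSpark = className.startsWith("org.apache.spark")
  val isScala = className.startsWith("scala")
  val isExample = className.startsWith("org.apache.spark.examples")
  val isStreamingTest = className.startsWith("org.apache.spark.streaming.test")
  (isSpark || isScala) && !isExample && !isStreamingTest
}

// Hypothetical stack for a user app creating a DStream: the Spark frames are
// skipped and the user's own class is reported as the creation site.
val stack = Seq(
  "org.apache.spark.streaming.dstream.DStream",
  "org.apache.spark.streaming.StreamingContext",
  "com.example.MyStreamingApp"
)
println(firstUserClass(stack, exclude)) // Some(com.example.MyStreamingApp)
```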
In `streaming/src/test/scala/org/apache/spark/streaming/StreamingContextSuite.scala`:
```diff
@@ -19,13 +19,16 @@ package org.apache.spark.streaming
 
 import java.util.concurrent.atomic.AtomicInteger
 
+import scala.language.postfixOps
+
 import org.apache.spark.{Logging, SparkConf, SparkContext, SparkException}
 import org.apache.spark.storage.StorageLevel
 import org.apache.spark.streaming.dstream.DStream
 import org.apache.spark.streaming.receiver.Receiver
-import org.apache.spark.util.{MetadataCleaner, Utils}
-import org.scalatest.{BeforeAndAfter, FunSuite}
+import org.apache.spark.util.Utils
+import org.scalatest.{Assertions, BeforeAndAfter, FunSuite}
 import org.scalatest.concurrent.Timeouts
 import org.scalatest.concurrent.Eventually._
+import org.scalatest.exceptions.TestFailedDueToTimeoutException
 import org.scalatest.time.SpanSugar._
```
```diff
@@ -257,6 +260,10 @@ class StreamingContextSuite extends FunSuite with BeforeAndAfter with Timeouts w
     assert(exception.getMessage.contains("transform"), "Expected exception not thrown")
   }
 
+  test("DStream and generated RDD creation sites") {
+    testPackage.test()
+  }
+
   def addInputStream(s: StreamingContext): DStream[Int] = {
     val input = (1 to 100).map(i => (1 to i))
     val inputStream = new TestInputStream(s, input, 1)
```
```diff
@@ -293,3 +300,37 @@ class TestReceiver extends Receiver[Int](StorageLevel.MEMORY_ONLY) with Logging
 object TestReceiver {
   val counter = new AtomicInteger(1)
 }
+
+/** Streaming application for testing DStream and RDD creation sites */
+package object testPackage extends Assertions {
```
> **Contributor:** minor: why a package? Also, classes / objects should be capitalized. Actually I would just put this in …
>
> **Author:** This is a package so that I can exclude … Regarding naming strategy, besides the example in …
>
> **Contributor:** Ah, I did not realize there's already another suite that does that. It's OK to keep it then.
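As a quick check of that reasoning: class names generated under this package object fall under `org.apache.spark.streaming.testPackage`, which matches the streaming-test regex from `DStream.getCreationSite()` above, so those frames are treated as application code rather than framework code. A small sketch (the anonymous class name is invented):

```scala
import scala.util.matching.Regex

val SPARK_STREAMING_TESTCLASS_REGEX: Regex = """^org\.apache\.spark\.streaming\.test""".r

// Classes compiled inside `package object testPackage` get names under
// org.apache.spark.streaming.testPackage, so the test regex matches them
// and the exclusion function does NOT filter them out of the call site.
val inTestPackage = "org.apache.spark.streaming.testPackage.package$$anonfun$test$1"
val inCore = "org.apache.spark.streaming.dstream.DStream"

println(SPARK_STREAMING_TESTCLASS_REGEX.findFirstIn(inTestPackage).isDefined) // true
println(SPARK_STREAMING_TESTCLASS_REGEX.findFirstIn(inCore).isDefined)        // false
```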
```diff
+  def test() {
+    val conf = new SparkConf().setMaster("local").setAppName("CreationSite test")
+    val ssc = new StreamingContext(conf, Milliseconds(100))
+    try {
+      val inputStream = ssc.receiverStream(new TestReceiver)
+
+      // Verify creation site of DStream
+      val creationSite = inputStream.creationSite
+      assert(creationSite.shortForm.contains("receiverStream") &&
+        creationSite.shortForm.contains("StreamingContextSuite")
+      )
+      assert(creationSite.longForm.contains("testPackage"))
+
+      // Verify creation site of generated RDDs
+      var rddGenerated = false
+      var rddCreationSiteCorrect = true
+
+      inputStream.foreachRDD { rdd =>
+        rddCreationSiteCorrect = rdd.creationSite == creationSite
+        rddGenerated = true
+      }
+      ssc.start()
+
+      eventually(timeout(10000 millis), interval(10 millis)) {
+        assert(rddGenerated && rddCreationSiteCorrect, "RDD creation site was not correct")
+      }
+    } finally {
+      ssc.stop()
+    }
+  }
+}
```
> **Comment:** These changes are just a refactoring of the code (from `case Some` and `case None` to `Option.orElse`), with no change in the logic.
>
> **Comment:** I confirmed that this logic is the same as before.
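For readers verifying that claim, the equivalence relied on is that matching on an `Option` and using `Option.orElse` agree whenever the `Some` branch returns the value unchanged. A minimal sketch:

```scala
// match-based form, as in the old getOrCompute
def viaMatch[T](cached: Option[T])(compute: => Option[T]): Option[T] =
  cached match {
    case Some(v) => Some(v)
    case None    => compute
  }

// orElse-based form, as in the new getOrCompute
def viaOrElse[T](cached: Option[T])(compute: => Option[T]): Option[T] =
  cached.orElse(compute)

// The two agree for both cache hits and cache misses:
assert(viaMatch(Some(1))(Some(2)) == viaOrElse(Some(1))(Some(2)))                       // Some(1)
assert(viaMatch(None: Option[Int])(Some(2)) == viaOrElse(None: Option[Int])(Some(2)))   // Some(2)
```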