-
Notifications
You must be signed in to change notification settings - Fork 3
[SPARK-8979][Streaming] Implements a PIDRateEstimator #17
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,69 @@ | ||
| /* | ||
| * Licensed to the Apache Software Foundation (ASF) under one or more | ||
| * contributor license agreements. See the NOTICE file distributed with | ||
| * this work for additional information regarding copyright ownership. | ||
| * The ASF licenses this file to You under the Apache License, Version 2.0 | ||
| * (the "License"); you may not use this file except in compliance with | ||
| * the License. You may obtain a copy of the License at | ||
| * | ||
| * http://www.apache.org/licenses/LICENSE-2.0 | ||
| * | ||
| * Unless required by applicable law or agreed to in writing, software | ||
| * distributed under the License is distributed on an "AS IS" BASIS, | ||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| * See the License for the specific language governing permissions and | ||
| * limitations under the License. | ||
| */ | ||
|
|
||
| package org.apache.spark.streaming.scheduler | ||
|
|
||
| import java.util.concurrent.atomic.AtomicLong | ||
|
|
||
| import org.apache.spark.annotation.DeveloperApi | ||
| import org.apache.spark.streaming.scheduler.rate.RateEstimator | ||
| import org.apache.spark.util.ThreadUtils | ||
|
|
||
| import scala.concurrent.{ExecutionContext, Future} | ||
|
|
||
| /** | ||
| * :: DeveloperApi :: | ||
| * A StreamingListener that receives batch completion updates, and maintains | ||
| * an estimate of the speed at which this stream should ingest messages, | ||
| * given an estimate computation from a `RateEstimator`. | ||
| * | ||
| * @param streamUID key under which this stream's input info is looked up in each completed batch | ||
| * @param rateEstimator asked for a new rate after every completed batch that carries all figures | ||
| */ | ||
| @DeveloperApi | ||
| private [streaming] abstract class RateController(val streamUID: Int, rateEstimator: RateEstimator) | ||
| extends StreamingListener with Serializable { | ||
|
|
||
| /** Receives each newly computed rate limit; invoked from the rate-update thread. */ | ||
| protected def publish(rate: Long): Unit | ||
|
|
||
| // Used to compute & publish the rate update asynchronously. | ||
| // Lazy as well as transient: a strict transient val would come back as null once this | ||
| // Serializable controller is deserialized, whereas a transient lazy val is rebuilt on first use. | ||
| @transient private lazy val executionContext = ExecutionContext.fromExecutorService( | ||
| ThreadUtils.newDaemonSingleThreadExecutor("stream-rate-update")) | ||
|
|
||
| // Holds -1 until the estimator has produced a first rate. | ||
| private val rateLimit : AtomicLong = new AtomicLong(-1L) | ||
|
|
||
| // Asynchronous computation of the rate update. | ||
| // Nothing is stored or published when the estimator answers None. | ||
| private def computeAndPublish(time: Long, elems: Long, workDelay: Long, waitDelay: Long): Unit = | ||
| Future[Unit] { | ||
| val newSpeed = rateEstimator.compute(time, elems, workDelay, waitDelay) | ||
| newSpeed foreach { s => | ||
| rateLimit.set(s.toLong) | ||
| publish(getLatestRate()) | ||
| } | ||
| } (executionContext) | ||
|
|
||
| /** Latest rate stored by this controller, or -1 if the estimator has not produced one yet. */ | ||
| def getLatestRate(): Long = rateLimit.get() | ||
|
|
||
| /** | ||
| * Schedules a rate update for the completed batch. The update is skipped unless the batch | ||
| * reports a processing end time, a processing delay, a scheduling delay and input info | ||
| * for `streamUID`. | ||
| */ | ||
| override def onBatchCompleted(batchCompleted: StreamingListenerBatchCompleted): Unit = { | ||
| val elements = batchCompleted.batchInfo.streamIdToInputInfo | ||
|
|
||
| for { | ||
| processingEnd <- batchCompleted.batchInfo.processingEndTime | ||
| workDelay <- batchCompleted.batchInfo.processingDelay | ||
| waitDelay <- batchCompleted.batchInfo.schedulingDelay | ||
| elems <- elements.get(streamUID).map(_.numRecords) | ||
| } computeAndPublish(processingEnd, elems, workDelay, waitDelay) | ||
| } | ||
|
|
||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,92 @@ | ||
| /* | ||
| * Licensed to the Apache Software Foundation (ASF) under one or more | ||
| * contributor license agreements. See the NOTICE file distributed with | ||
| * this work for additional information regarding copyright ownership. | ||
| * The ASF licenses this file to You under the Apache License, Version 2.0 | ||
| * (the "License"); you may not use this file except in compliance with | ||
| * the License. You may obtain a copy of the License at | ||
| * | ||
| * http://www.apache.org/licenses/LICENSE-2.0 | ||
| * | ||
| * Unless required by applicable law or agreed to in writing, software | ||
| * distributed under the License is distributed on an "AS IS" BASIS, | ||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| * See the License for the specific language governing permissions and | ||
| * limitations under the License. | ||
| */ | ||
|
|
||
| package org.apache.spark.streaming.scheduler.rate | ||
|
|
||
| import org.apache.spark.Logging | ||
|
|
||
| /** | ||
| * Implements a proportional-integral-derivative (PID) controller which acts on | ||
| * the speed of ingestion of elements into Spark Streaming. | ||
| * | ||
| * @param batchIntervalMillis the batch interval, in milliseconds | ||
| * @param proportional how much the correction should depend on the current | ||
| * error, | ||
| * @param integral how much the correction should depend on the accumulation | ||
| * of past errors, | ||
| * @param derivative how much the correction should depend on a prediction | ||
| * of future errors, based on current rate of change | ||
| */ | ||
| private[streaming] class PIDRateEstimator(batchIntervalMillis: Long, | ||
| proportional: Double = -1D, | ||
| integral: Double = -.2D, | ||
| derivative: Double = 0D) | ||
| extends RateEstimator with Logging { | ||
|
|
||
| // True until the first accepted update has seeded the state below. | ||
| private var init: Boolean = true | ||
| // Timestamp of the last accepted update; -1 before any update. | ||
| private var latestTime : Long = -1L | ||
| // Speed remembered from the last accepted update; -1 before any update. | ||
| private var latestSpeed : Double = -1D | ||
| // Error remembered from the last accepted update; -1 before any update. | ||
| private var latestError : Double = -1D | ||
|
|
||
| // A non-positive interval is only logged here; compute() then answers None for every update. | ||
| // The s prefix is required, otherwise the placeholder is logged literally. | ||
| if (batchIntervalMillis <= 0) { | ||
| logError(s"Specified batch interval ${batchIntervalMillis} in PIDRateEstimator is invalid.") | ||
| } | ||
|
|
||
| /** | ||
| * Computes a new ingestion rate from the figures of the latest batch update. | ||
| * | ||
| * Answers None when `time` is not later than the last accepted update, when | ||
| * `processingDelay` or the batch interval is not positive, and for the first | ||
| * accepted update, which only seeds the internal state. | ||
| * | ||
| * @param time timestamp of this update, in milliseconds | ||
| * @param elements element count, divided by `processingDelay` to obtain the processing speed | ||
| * @param processingDelay in milliseconds | ||
| * @param schedulingDelay in milliseconds | ||
| * @return the new rate in elements per second, never negative, or None as described above | ||
| */ | ||
| def compute(time: Long, // in milliseconds | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: incorrect indentation |
||
| elements: Long, | ||
| processingDelay: Long, // in milliseconds | ||
| schedulingDelay: Long // in milliseconds | ||
| ): Option[Double] = { | ||
|
|
||
| // the whole update runs under this instance's lock because it reads and writes the state vars | ||
| this.synchronized { | ||
| if (time > latestTime && processingDelay > 0 && batchIntervalMillis > 0) { | ||
|
|
||
| // in seconds, should be close to batchDuration | ||
| val delaySinceUpdate = (time - latestTime).toDouble / 1000 | ||
|
|
||
| // in elements/second | ||
| val processingSpeed = elements.toDouble / processingDelay * 1000 | ||
|
|
||
| // in elements/second | ||
| val error = latestSpeed - processingSpeed | ||
|
|
||
| // in elements/second | ||
| // weighted by the integral coefficient below: scheduling delay scaled by the | ||
| // processing speed, per batch interval | ||
| val sumError = schedulingDelay.toDouble * processingSpeed / batchIntervalMillis | ||
|
|
||
| // in elements/(second ^ 2) | ||
| val dError = (error - latestError) / delaySinceUpdate | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I wonder if
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. No, |
||
|
|
||
| // clamped so the resulting rate is never negative | ||
| val newSpeed = (latestSpeed + proportional * error + | ||
| integral * sumError + | ||
| derivative * dError) max 0D | ||
| latestTime = time | ||
| // first accepted update: newSpeed was derived from placeholder state, so it is | ||
| // discarded and the measured speed becomes the starting point | ||
| if (init) { | ||
| latestSpeed = processingSpeed | ||
| latestError = 0D | ||
| init = false | ||
|
|
||
| None | ||
| } else { | ||
| latestSpeed = newSpeed | ||
| latestError = error | ||
|
|
||
| Some(newSpeed) | ||
| } | ||
| } else None | ||
| } | ||
| } | ||
|
|
||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,46 @@ | ||
| /* | ||
| * Licensed to the Apache Software Foundation (ASF) under one or more | ||
| * contributor license agreements. See the NOTICE file distributed with | ||
| * this work for additional information regarding copyright ownership. | ||
| * The ASF licenses this file to You under the Apache License, Version 2.0 | ||
| * (the "License"); you may not use this file except in compliance with | ||
| * the License. You may obtain a copy of the License at | ||
| * | ||
| * http://www.apache.org/licenses/LICENSE-2.0 | ||
| * | ||
| * Unless required by applicable law or agreed to in writing, software | ||
| * distributed under the License is distributed on an "AS IS" BASIS, | ||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| * See the License for the specific language governing permissions and | ||
| * limitations under the License. | ||
| */ | ||
|
|
||
| package org.apache.spark.streaming.scheduler.rate | ||
|
|
||
| import org.apache.spark.annotation.DeveloperApi | ||
|
|
||
| /** | ||
| * :: DeveloperApi :: | ||
| * A component that estimates the rate at which an InputDStream should ingest | ||
| * elements, based on updates at every batch completion. | ||
| */ | ||
| @DeveloperApi | ||
| private[streaming] trait RateEstimator extends Serializable { | ||
|
|
||
| /** | ||
| * Computes the number of elements the stream attached to this `RateEstimator` | ||
| * should ingest per second, given an update on the size and completion | ||
| * times of the latest batch. | ||
| * | ||
| * NOTE(review): PIDRateEstimator treats `time` and both delays as milliseconds; | ||
| * confirm that this is the contract for every implementation. | ||
| * | ||
| * @param time timestamp of the update | ||
| * @param elements number of elements in the latest batch | ||
| * @param processingDelay processing delay reported for the batch | ||
| * @param schedulingDelay scheduling delay reported for the batch | ||
| * @return the new rate, or None when the estimator has no update to send | ||
| */ | ||
| def compute(time: Long, elements: Long, | ||
| processingDelay: Long, schedulingDelay: Long): Option[Double] | ||
| } | ||
|
|
||
| /** | ||
| * A `RateEstimator` that disregards every batch update and never proposes a rate. | ||
| */ | ||
| private[streaming] class NoopRateEstimator extends RateEstimator { | ||
|
|
||
| /** Always answers None, whatever figures the batch reports. */ | ||
| def compute( | ||
| time: Long, | ||
| elements: Long, | ||
| processingDelay: Long, | ||
| schedulingDelay: Long): Option[Double] = { | ||
| Option.empty[Double] | ||
| } | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -21,15 +21,16 @@ import scala.collection.mutable.{ArrayBuffer, SynchronizedBuffer} | |
| import scala.concurrent.Future | ||
| import scala.concurrent.ExecutionContext.Implicits.global | ||
|
|
||
| import org.apache.spark.Logging | ||
| import org.apache.spark.storage.StorageLevel | ||
| import org.apache.spark.streaming.dstream.DStream | ||
| import org.apache.spark.streaming.receiver.Receiver | ||
| import org.apache.spark.streaming.scheduler._ | ||
| import org.apache.spark.streaming.scheduler.rate._ | ||
|
|
||
| import org.scalatest.Matchers | ||
| import org.scalatest.concurrent.Eventually._ | ||
| import org.scalatest.time.SpanSugar._ | ||
| import org.apache.spark.Logging | ||
|
|
||
| class StreamingListenerSuite extends TestSuiteBase with Matchers { | ||
|
|
||
|
|
@@ -131,6 +132,37 @@ class StreamingListenerSuite extends TestSuiteBase with Matchers { | |
| } | ||
| } | ||
|
|
||
| // This test is long to run and may be dependent on your machine's | ||
| // characteristics (high variance in estimating processing speed on a | ||
| // small batch) | ||
| ignore("latest speed reporting") { | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This test should not be |
||
| val operation = (d: DStream[Int]) => d.map(Thread.sleep(_)) | ||
|
|
||
| // Feeds `input` through a fresh streaming context with a rate controller attached | ||
| // and hands back the last rate that controller recorded. | ||
| def latestRateFor(input: Seq[Seq[Int]]): Long = { | ||
| val ssc = setupStreams(input, operation) | ||
| val controller = new RateController(0, | ||
| new PIDRateEstimator(batchDuration.milliseconds, -1, 0, 0)){ | ||
| def publish(r: Long): Unit = () | ||
| } | ||
| ssc.addStreamingListener(controller) | ||
| runStreams(ssc, input.size, input.size) | ||
| controller.getLatestRate() | ||
| } | ||
|
|
||
| val midSp = latestRateFor(Seq.fill(10)(Seq.fill(100)(1))) | ||
|
|
||
| // both batch sizes stay below the system's limits, so the two estimates | ||
| // of elements processed per batch are expected to be comparable | ||
| val bigSp = latestRateFor(Seq.fill(10)(Seq.fill(500)(1))) | ||
|
|
||
| bigSp should (be >= (midSp / 2) and be <= (midSp * 2)) | ||
| } | ||
|
|
||
| /** Check if a sequence of numbers is in increasing order */ | ||
| def isInIncreasingOrder(seq: Seq[Long]): Boolean = { | ||
| for (i <- 1 until seq.size) { | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This must be private[streaming]
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I assume the same goes for
RateEstimator, NoopRateEstimator, and PIDRateEstimator, at least for now, right? There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yep.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done.