
Commit 2189247

Merge branch 'master' of github.com:apache/spark into pool-npe
2 parents 05ad9e9 + c33b8dc

File tree

18 files changed: +432 -44 lines changed

core/src/main/scala/org/apache/spark/Partitioner.scala

Lines changed: 61 additions & 0 deletions
@@ -156,3 +156,64 @@ class RangePartitioner[K : Ordering : ClassTag, V](
       false
   }
 }
+
+/**
+ * A [[org.apache.spark.Partitioner]] that partitions records into specified bounds
+ * Default value is 1000. Once all partitions have bounds elements, the partitioner
+ * allocates 1 element per partition so eventually the smaller partitions are at most
+ * off by 1 key compared to the larger partitions.
+ */
+class BoundaryPartitioner[K : Ordering : ClassTag, V](
+    partitions: Int,
+    @transient rdd: RDD[_ <: Product2[K,V]],
+    private val boundary: Int = 1000)
+  extends Partitioner {
+
+  // this array keeps track of keys assigned to a partition
+  // counts[0] refers to # of keys in partition 0 and so on
+  private val counts: Array[Int] = {
+    new Array[Int](numPartitions)
+  }
+
+  def numPartitions = math.abs(partitions)
+
+  /*
+   * Ideally, this should've been calculated based on # partitions and total keys
+   * But we are not calling count on RDD here to avoid calling an action.
+   * User has the flexibility of calling count and passing in any appropriate boundary
+   */
+  def keysPerPartition = boundary
+
+  var currPartition = 0
+
+  /*
+   * Pick current partition for the key until we hit the bound for keys / partition,
+   * start allocating to next partition at that time.
+   *
+   * NOTE: In case where we have lets say 2000 keys and user says 3 partitions with 500
+   * passed in as boundary, the first 500 will goto P1, 501-1000 go to P2, 1001-1500 go to P3,
+   * after that, next keys go to one partition at a time. So 1501 goes to P1, 1502 goes to P2,
+   * 1503 goes to P3 and so on.
+   */
+  def getPartition(key: Any): Int = {
+    val partition = currPartition
+    counts(partition) = counts(partition) + 1
+    /*
+     * Since we are filling up a partition before moving to next one (this helps in maintaining
+     * order of keys, in certain cases, it is possible to end up with empty partitions, like
+     * 3 partitions, 500 keys / partition and if rdd has 700 keys, 1 partition will be entirely
+     * empty.
+     */
+    if(counts(currPartition) >= keysPerPartition)
+      currPartition = (currPartition + 1) % numPartitions
+    partition
+  }
+
+  override def equals(other: Any): Boolean = other match {
+    case r: BoundaryPartitioner[_,_] =>
+      (r.counts.sameElements(counts) && r.boundary == boundary
+        && r.currPartition == currPartition)
+    case _ =>
+      false
+  }
+}
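For context, a minimal sketch of how the new partitioner could be driven from user code. The BoundaryPartitionerExample object, the local master setting, and the chosen element counts are illustrative assumptions, not part of this commit; only BoundaryPartitioner itself comes from the diff above.

import org.apache.spark.{BoundaryPartitioner, SparkConf, SparkContext}
import org.apache.spark.SparkContext._  // brings the PairRDDFunctions implicits into scope

object BoundaryPartitionerExample {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("boundary-example").setMaster("local[2]"))
    val pairs = sc.parallelize(1 to 2000).map(x => (x, x))
    // Fill each of the 4 partitions with up to 500 keys before moving on to the next one.
    val partitioned = pairs.partitionBy(new BoundaryPartitioner(4, pairs, 500))
    // glom() turns each partition into an array so the resulting partition sizes can be inspected.
    println(partitioned.glom().map(_.length).collect().mkString(", "))
    sc.stop()
  }
}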

core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala

Lines changed: 3 additions & 3 deletions
@@ -217,7 +217,7 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)])
    * Return approximate number of distinct values for each key in this RDD.
    * The accuracy of approximation can be controlled through the relative standard deviation
    * (relativeSD) parameter, which also controls the amount of memory used. Lower values result in
-   * more accurate counts but increase the memory footprint and vise versa. Uses the provided
+   * more accurate counts but increase the memory footprint and vice versa. Uses the provided
    * Partitioner to partition the output RDD.
    */
   def countApproxDistinctByKey(relativeSD: Double, partitioner: Partitioner): RDD[(K, Long)] = {
@@ -232,7 +232,7 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)])
    * Return approximate number of distinct values for each key in this RDD.
    * The accuracy of approximation can be controlled through the relative standard deviation
    * (relativeSD) parameter, which also controls the amount of memory used. Lower values result in
-   * more accurate counts but increase the memory footprint and vise versa. HashPartitions the
+   * more accurate counts but increase the memory footprint and vice versa. HashPartitions the
    * output RDD into numPartitions.
    *
    */
@@ -244,7 +244,7 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)])
    * Return approximate number of distinct values for each key this RDD.
    * The accuracy of approximation can be controlled through the relative standard deviation
    * (relativeSD) parameter, which also controls the amount of memory used. Lower values result in
-   * more accurate counts but increase the memory footprint and vise versa. The default value of
+   * more accurate counts but increase the memory footprint and vice versa. The default value of
    * relativeSD is 0.05. Hash-partitions the output RDD using the existing partitioner/parallelism
    * level.
    */
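A short usage sketch of the API whose docs are corrected above, illustrating the relativeSD trade-off the comments describe. The sample data and the specific relativeSD values are illustrative assumptions; sc is assumed to be an existing SparkContext.

import org.apache.spark.SparkContext._  // PairRDDFunctions implicits

// Lower relativeSD => more accurate per-key distinct counts, at the cost of more memory.
val pairs = sc.parallelize(1 to 10000).map(x => (x % 10, x % 100))
val rough   = pairs.countApproxDistinctByKey(0.1)      // cheaper, less precise
val precise = pairs.countApproxDistinctByKey(0.01, 4)  // more memory, hash-partitioned into 4 partitions
precise.collect().foreach { case (k, count) => println(s"key $k has roughly $count distinct values") }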

core/src/main/scala/org/apache/spark/scheduler/LiveListenerBus.scala

Lines changed: 28 additions & 8 deletions
@@ -20,6 +20,7 @@ package org.apache.spark.scheduler
 import java.util.concurrent.{LinkedBlockingQueue, Semaphore}
 
 import org.apache.spark.Logging
+import org.apache.spark.util.Utils
 
 /**
  * Asynchronously passes SparkListenerEvents to registered SparkListeners.
@@ -42,7 +43,7 @@ private[spark] class LiveListenerBus extends SparkListenerBus with Logging {
 
   private val listenerThread = new Thread("SparkListenerBus") {
     setDaemon(true)
-    override def run() {
+    override def run(): Unit = Utils.logUncaughtExceptions {
       while (true) {
         eventLock.acquire()
         // Atomically remove and process this event
@@ -77,11 +78,8 @@ private[spark] class LiveListenerBus extends SparkListenerBus with Logging {
     val eventAdded = eventQueue.offer(event)
     if (eventAdded) {
       eventLock.release()
-    } else if (!queueFullErrorMessageLogged) {
-      logError("Dropping SparkListenerEvent because no remaining room in event queue. " +
-        "This likely means one of the SparkListeners is too slow and cannot keep up with the " +
-        "rate at which tasks are being started by the scheduler.")
-      queueFullErrorMessageLogged = true
+    } else {
+      logQueueFullErrorMessage()
     }
   }
 
@@ -96,13 +94,18 @@ private[spark] class LiveListenerBus extends SparkListenerBus with Logging {
       if (System.currentTimeMillis > finishTime) {
        return false
       }
-      /* Sleep rather than using wait/notify, because this is used only for testing and wait/notify
-       * add overhead in the general case. */
+      /* Sleep rather than using wait/notify, because this is used only for testing and
+       * wait/notify add overhead in the general case. */
       Thread.sleep(10)
     }
     true
   }
 
+  /**
+   * For testing only. Return whether the listener daemon thread is still alive.
+   */
+  def listenerThreadIsAlive: Boolean = synchronized { listenerThread.isAlive }
+
   /**
    * Return whether the event queue is empty.
    *
@@ -111,6 +114,23 @@ private[spark] class LiveListenerBus extends SparkListenerBus with Logging {
    */
   def queueIsEmpty: Boolean = synchronized { eventQueue.isEmpty }
 
+  /**
+   * Log an error message to indicate that the event queue is full. Do this only once.
+   */
+  private def logQueueFullErrorMessage(): Unit = {
+    if (!queueFullErrorMessageLogged) {
+      if (listenerThread.isAlive) {
+        logError("Dropping SparkListenerEvent because no remaining room in event queue. " +
+          "This likely means one of the SparkListeners is too slow and cannot keep up with" +
+          "the rate at which tasks are being started by the scheduler.")
+      } else {
+        logError("SparkListenerBus thread is dead! This means SparkListenerEvents have not" +
+          "been (and will no longer be) propagated to listeners for some time.")
+      }
+      queueFullErrorMessageLogged = true
+    }
+  }
+
   def stop() {
     if (!started) {
       throw new IllegalStateException("Attempted to stop a listener bus that has not yet started!")
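The change above routes the queue-full case through a single helper so the error is logged only once, and logs a different message if the listener thread itself has died. A standalone sketch of the underlying bounded-queue pattern, stripped down from the class above; the names here are illustrative and this is not Spark's actual implementation.

import java.util.concurrent.{LinkedBlockingQueue, Semaphore}

// offer() returns false instead of blocking once the queue is full, so the producer
// can log a "dropping event" error exactly once rather than stalling the caller.
class DroppingEventQueue[E](capacity: Int) {
  private val queue = new LinkedBlockingQueue[E](capacity)
  private val eventLock = new Semaphore(0)
  @volatile private var fullErrorLogged = false

  def post(event: E): Unit = {
    if (queue.offer(event)) {
      eventLock.release()   // wake up the consumer thread
    } else if (!fullErrorLogged) {
      System.err.println("Dropping event because the queue is full")
      fullErrorLogged = true
    }
  }

  def take(): E = { eventLock.acquire(); queue.poll() }
}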

core/src/main/scala/org/apache/spark/scheduler/SparkListenerBus.scala

Lines changed: 34 additions & 16 deletions
@@ -20,10 +20,13 @@ package org.apache.spark.scheduler
 import scala.collection.mutable
 import scala.collection.mutable.ArrayBuffer
 
+import org.apache.spark.Logging
+import org.apache.spark.util.Utils
+
 /**
  * A SparkListenerEvent bus that relays events to its listeners
  */
-private[spark] trait SparkListenerBus {
+private[spark] trait SparkListenerBus extends Logging {
 
   // SparkListeners attached to this event bus
   protected val sparkListeners = new ArrayBuffer[SparkListener]
@@ -34,38 +37,53 @@ private[spark] trait SparkListenerBus {
   }
 
   /**
-   * Post an event to all attached listeners. This does nothing if the event is
-   * SparkListenerShutdown.
+   * Post an event to all attached listeners.
+   * This does nothing if the event is SparkListenerShutdown.
    */
   def postToAll(event: SparkListenerEvent) {
     event match {
       case stageSubmitted: SparkListenerStageSubmitted =>
-        sparkListeners.foreach(_.onStageSubmitted(stageSubmitted))
+        foreachListener(_.onStageSubmitted(stageSubmitted))
       case stageCompleted: SparkListenerStageCompleted =>
-        sparkListeners.foreach(_.onStageCompleted(stageCompleted))
+        foreachListener(_.onStageCompleted(stageCompleted))
       case jobStart: SparkListenerJobStart =>
-        sparkListeners.foreach(_.onJobStart(jobStart))
+        foreachListener(_.onJobStart(jobStart))
       case jobEnd: SparkListenerJobEnd =>
-        sparkListeners.foreach(_.onJobEnd(jobEnd))
+        foreachListener(_.onJobEnd(jobEnd))
       case taskStart: SparkListenerTaskStart =>
-        sparkListeners.foreach(_.onTaskStart(taskStart))
+        foreachListener(_.onTaskStart(taskStart))
       case taskGettingResult: SparkListenerTaskGettingResult =>
-        sparkListeners.foreach(_.onTaskGettingResult(taskGettingResult))
+        foreachListener(_.onTaskGettingResult(taskGettingResult))
      case taskEnd: SparkListenerTaskEnd =>
-        sparkListeners.foreach(_.onTaskEnd(taskEnd))
+        foreachListener(_.onTaskEnd(taskEnd))
       case environmentUpdate: SparkListenerEnvironmentUpdate =>
-        sparkListeners.foreach(_.onEnvironmentUpdate(environmentUpdate))
+        foreachListener(_.onEnvironmentUpdate(environmentUpdate))
       case blockManagerAdded: SparkListenerBlockManagerAdded =>
-        sparkListeners.foreach(_.onBlockManagerAdded(blockManagerAdded))
+        foreachListener(_.onBlockManagerAdded(blockManagerAdded))
       case blockManagerRemoved: SparkListenerBlockManagerRemoved =>
-        sparkListeners.foreach(_.onBlockManagerRemoved(blockManagerRemoved))
+        foreachListener(_.onBlockManagerRemoved(blockManagerRemoved))
       case unpersistRDD: SparkListenerUnpersistRDD =>
-        sparkListeners.foreach(_.onUnpersistRDD(unpersistRDD))
+        foreachListener(_.onUnpersistRDD(unpersistRDD))
       case applicationStart: SparkListenerApplicationStart =>
-        sparkListeners.foreach(_.onApplicationStart(applicationStart))
+        foreachListener(_.onApplicationStart(applicationStart))
       case applicationEnd: SparkListenerApplicationEnd =>
-        sparkListeners.foreach(_.onApplicationEnd(applicationEnd))
+        foreachListener(_.onApplicationEnd(applicationEnd))
       case SparkListenerShutdown =>
     }
   }
+
+  /**
+   * Apply the given function to all attached listeners, catching and logging any exception.
+   */
+  private def foreachListener(f: SparkListener => Unit): Unit = {
+    sparkListeners.foreach { listener =>
+      try {
+        f(listener)
+      } catch {
+        case e: Exception =>
+          logError(s"Listener ${Utils.getFormattedClassName(listener)} threw an exception", e)
+      }
+    }
+  }
+
 }
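To illustrate what the new foreachListener helper buys: a listener that throws no longer prevents later listeners from receiving the same event. A REPL-style sketch of that isolation pattern; the Listener trait and notifyAll helper are stand-ins rather than Spark types.

trait Listener { def onEvent(msg: String): Unit }

def notifyAll(listeners: Seq[Listener], msg: String): Unit =
  listeners.foreach { l =>
    try {
      l.onEvent(msg)
    } catch {
      case e: Exception => println(s"Listener ${l.getClass.getName} threw: $e")
    }
  }

notifyAll(Seq(
  new Listener { def onEvent(m: String) = throw new RuntimeException("boom") },
  new Listener { def onEvent(m: String) = println(s"got $m") }
), "stage submitted")
// The second listener still prints "got stage submitted" even though the first one failed.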

core/src/main/scala/org/apache/spark/util/ClosureCleaner.scala

Lines changed: 22 additions & 1 deletion
@@ -25,7 +25,7 @@ import scala.collection.mutable.Set
 import com.esotericsoftware.reflectasm.shaded.org.objectweb.asm.{ClassReader, ClassVisitor, MethodVisitor, Type}
 import com.esotericsoftware.reflectasm.shaded.org.objectweb.asm.Opcodes._
 
-import org.apache.spark.Logging
+import org.apache.spark.{Logging, SparkException}
 
 private[spark] object ClosureCleaner extends Logging {
   // Get an ASM class reader for a given class from the JAR that loaded it
@@ -108,6 +108,9 @@ private[spark] object ClosureCleaner extends Logging {
     val outerObjects = getOuterObjects(func)
 
     val accessedFields = Map[Class[_], Set[String]]()
+
+    getClassReader(func.getClass).accept(new ReturnStatementFinder(), 0)
+
     for (cls <- outerClasses)
       accessedFields(cls) = Set[String]()
     for (cls <- func.getClass :: innerClasses)
@@ -180,6 +183,24 @@ private[spark] object ClosureCleaner extends Logging {
     }
   }
 
+private[spark]
+class ReturnStatementFinder extends ClassVisitor(ASM4) {
+  override def visitMethod(access: Int, name: String, desc: String,
+      sig: String, exceptions: Array[String]): MethodVisitor = {
+    if (name.contains("apply")) {
+      new MethodVisitor(ASM4) {
+        override def visitTypeInsn(op: Int, tp: String) {
+          if (op == NEW && tp.contains("scala/runtime/NonLocalReturnControl")) {
+            throw new SparkException("Return statements aren't allowed in Spark closures")
+          }
+        }
+      }
+    } else {
+      new MethodVisitor(ASM4) {}
+    }
+  }
+}
+
 private[spark]
 class FieldAccessFinder(output: Map[Class[_], Set[String]]) extends ClassVisitor(ASM4) {
   override def visitMethod(access: Int, name: String, desc: String,
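For context, the new ReturnStatementFinder rejects closures containing a return statement, which the Scala compiler implements by constructing a scala.runtime.NonLocalReturnControl inside the closure's apply method. A sketch of the kind of user code this now fails fast on; the firstEven helper is illustrative, not from the codebase.

import org.apache.spark.rdd.RDD

def firstEven(rdd: RDD[Int]): Int =
  rdd.map { x =>
    if (x % 2 == 0) return x   // non-local return: compiles to a NonLocalReturnControl throw
    x
  }.first()
// With this change, such a closure is rejected when it is cleaned, with
// SparkException("Return statements aren't allowed in Spark closures").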

core/src/main/scala/org/apache/spark/util/Utils.scala

Lines changed: 1 addition & 1 deletion
@@ -1128,7 +1128,7 @@ private[spark] object Utils extends Logging {
   }
 
   /**
-   * Executes the given block, printing and re-throwing any uncaught exceptions.
+   * Execute the given block, logging and re-throwing any uncaught exception.
    * This is particularly useful for wrapping code that runs in a thread, to ensure
    * that exceptions are printed, and to avoid having to catch Throwable.
    */
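Only the doc comment changes here; the method body is not part of this diff. For readers unfamiliar with the helper, a wrapper with this contract could look roughly like the following. This is an illustrative sketch, not the actual implementation.

def logUncaughtExceptions[T](f: => T): T = {
  try {
    f
  } catch {
    case t: Throwable =>
      // In Spark this would go through logError; stderr keeps the sketch self-contained.
      System.err.println(s"Uncaught exception in thread ${Thread.currentThread().getName}: $t")
      throw t   // re-throw so the failure is still visible to the caller / thread
  }
}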

core/src/test/scala/org/apache/spark/PartitioningSuite.scala

Lines changed: 34 additions & 0 deletions
@@ -66,6 +66,40 @@ class PartitioningSuite extends FunSuite with SharedSparkContext with PrivateMet
     assert(descendingP4 != p4)
   }
 
+  test("BoundaryPartitioner equality") {
+    // Make an RDD where all the elements are the same so that the partition range bounds
+    // are deterministically all the same.
+    val rdd = sc.parallelize(1.to(4000)).map(x => (x, x))
+
+    val p2 = new BoundaryPartitioner(2, rdd, 1000)
+    val p4 = new BoundaryPartitioner(4, rdd, 1000)
+    val anotherP4 = new BoundaryPartitioner(4, rdd)
+
+    assert(p2 === p2)
+    assert(p4 === p4)
+    assert(p2 != p4)
+    assert(p4 != p2)
+    assert(p4 === anotherP4)
+    assert(anotherP4 === p4)
+  }
+
+  test("BoundaryPartitioner getPartition") {
+    val rdd = sc.parallelize(1.to(2000)).map(x => (x, x))
+    val partitioner = new BoundaryPartitioner(4, rdd, 500)
+    1.to(2000).map { element => {
+      val partition = partitioner.getPartition(element)
+      if (element <= 500) {
+        assert(partition === 0)
+      } else if (element > 501 && element <= 1000) {
+        assert(partition === 1)
+      } else if (element > 1001 && element <= 1500) {
+        assert(partition === 2)
+      } else if (element > 1501 && element <= 2000) {
+        assert(partition === 3)
+      }
+    }}
+  }
+
   test("RangePartitioner getPartition") {
     val rdd = sc.parallelize(1.to(2000)).map(x => (x, x))
     // We have different behaviour of getPartition for partitions with less than 1000 and more than
