
Commit c2ce1be

brkyvz authored and zsxwing committed
[SPARK-18475] Be able to increase parallelism in StructuredStreaming Kafka source
## What changes were proposed in this pull request?

This PR adds the configuration `numPartitions` to the Structured Streaming Kafka source. Setting it higher than the number of `TopicPartitions` being consumed allows Spark to run multiple tasks reading from the same `TopicPartition`, which lets users handle skewed partitions. The number of `TopicPartitions` can change from batch to batch, e.g. when topics are created or deleted, but in ETL use cases the set of TopicPartitions is generally static, and there this configuration has been very useful. If the `TopicPartitions` are dynamic, the parallelism is always `max(topicPartitions.length, numPartitions)`.

## How was this patch tested?

Unit tests. I used this on production data and it certainly helped in handling peak loads and skewed partitions.

Author: Burak Yavuz <[email protected]>

Closes apache#166 from brkyvz/kafka-par-split.
1 parent f8bf2b0 commit c2ce1be
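
For context, a minimal usage sketch (not part of this commit): it assumes an active SparkSession named `spark`, placeholder broker addresses and topic name, and uses the option key `minNumParitions` exactly as it is spelled in the KafkaSource change below (the commit message refers to the setting as `numPartitions`).

```scala
import org.apache.spark.sql.SparkSession

// Hypothetical session, brokers, and topic; option key spelled as in this patch.
val spark = SparkSession.builder().appName("kafka-par-split-example").getOrCreate()

val df = spark.readStream
  .format("kafka")
  .option("kafka.bootstrap.servers", "host1:9092,host2:9092")
  .option("subscribe", "events")
  // Request roughly 40 read tasks even if the topic has fewer TopicPartitions;
  // per the commit message, parallelism becomes max(topicPartitions.length, 40).
  .option("minNumParitions", "40")
  .load()
```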

File tree

5 files changed: +307 -44 lines changed

external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/CachedKafkaConsumer.scala

Lines changed: 8 additions & 3 deletions
@@ -271,7 +271,7 @@ private[kafka010] case class CachedKafkaConsumer private(
     }
   }
 
-  private def close(): Unit = consumer.close()
+  private[kafka010] def close(): Unit = consumer.close()
 
   private def seek(offset: Long): Unit = {
     logDebug(s"Seeking to $groupId $topicPartition $offset")
@@ -334,22 +334,27 @@ private[kafka010] object CachedKafkaConsumer extends Logging {
   def getOrCreate(
       topic: String,
       partition: Int,
-      kafkaParams: ju.Map[String, Object]): CachedKafkaConsumer = synchronized {
+      kafkaParams: ju.Map[String, Object],
+      reuse: Boolean): CachedKafkaConsumer = synchronized {
     val groupId = kafkaParams.get(ConsumerConfig.GROUP_ID_CONFIG).asInstanceOf[String]
     val topicPartition = new TopicPartition(topic, partition)
     val key = CacheKey(groupId, topicPartition)
 
     // If this is reattempt at running the task, then invalidate cache and start with
     // a new consumer
-    if (TaskContext.get != null && TaskContext.get.attemptNumber > 1) {
+    if (!reuse || TaskContext.get != null && TaskContext.get.attemptNumber > 1) {
+      logDebug("Creating new CachedKafkaConsumer")
       val removedConsumer = cache.remove(key)
       if (removedConsumer != null) {
         removedConsumer.close()
       }
       new CachedKafkaConsumer(topicPartition, kafkaParams)
     } else {
       if (!cache.containsKey(key)) {
+        logDebug("Creating new CachedKafkaConsumer")
         cache.put(key, new CachedKafkaConsumer(topicPartition, kafkaParams))
+      } else {
+        logDebug(s"CachedKafkaConsumer exists for key: $key. Reusing.")
       }
       cache.get(key)
     }
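
The getOrCreate change above reduces consumer lookup to one decision: reuse a cached consumer keyed by (groupId, TopicPartition), or invalidate and create a fresh one when reuse is disallowed or the task is a retry. A condensed, self-contained sketch of that decision (simplified names, not the actual Spark classes):

```scala
import java.util.concurrent.ConcurrentHashMap

object ConsumerCacheSketch {
  final case class Key(groupId: String, topic: String, partition: Int)
  final class FakeConsumer { def close(): Unit = () } // stand-in for a Kafka consumer

  private val cache = new ConcurrentHashMap[Key, FakeConsumer]()

  def getOrCreate(key: Key, reuse: Boolean, isRetry: Boolean): FakeConsumer = synchronized {
    if (!reuse || isRetry) {
      // Drop any cached consumer and hand back a brand-new, uncached one.
      Option(cache.remove(key)).foreach(_.close())
      new FakeConsumer
    } else {
      // Reuse path: create once, then keep serving the cached instance.
      if (!cache.containsKey(key)) {
        cache.put(key, new FakeConsumer)
      }
      cache.get(key)
    }
  }
}
```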

external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSource.scala

Lines changed: 93 additions & 31 deletions
@@ -119,6 +119,13 @@ private[kafka010] class KafkaSource(
     groupId
   }
 
+  /**
+   * Number of partitions to read from Kafka. If this value is greater than the number of Kafka
+   * topicPartitions, we will not use the CachedConsumer.
+   */
+  private val minNumParitions =
+    sourceOptions.getOrElse("minNumParitions", "0").toInt
+
   /**
    * A KafkaConsumer used in the driver to query the latest Kafka offsets. This only queries the
    * offsets and never commits them.
@@ -279,39 +286,15 @@ private[kafka010] class KafkaSource(
     }.toSeq
     logDebug("TopicPartitions: " + topicPartitions.mkString(", "))
 
-    val sortedExecutors = getSortedExecutorList(sc)
-    val numExecutors = sortedExecutors.length
-    logDebug("Sorted executors: " + sortedExecutors.mkString(", "))
-
-    // Calculate offset ranges
-    val offsetRanges = topicPartitions.map { tp =>
-      val fromOffset = fromPartitionOffsets.get(tp).getOrElse {
-        newPartitionOffsets.getOrElse(tp, {
-          // This should not happen since newPartitionOffsets contains all partitions not in
-          // fromPartitionOffsets
-          throw new IllegalStateException(s"$tp doesn't have a from offset")
-        })
-      }
-      val untilOffset = untilPartitionOffsets(tp)
-      val preferredLoc = if (numExecutors > 0) {
-        // This allows cached KafkaConsumers in the executors to be re-used to read the same
-        // partition in every batch.
-        Some(sortedExecutors(floorMod(tp.hashCode, numExecutors)))
-      } else None
-      KafkaSourceRDDOffsetRange(tp, fromOffset, untilOffset, preferredLoc)
-    }.filter { range =>
-      if (range.untilOffset < range.fromOffset) {
-        reportDataLoss(s"Partition ${range.topicPartition}'s offset was changed from " +
-          s"${range.fromOffset} to ${range.untilOffset}, some data may have been missed")
-        false
-      } else {
-        true
-      }
-    }.toArray
+    val offsetRanges = getOffsetRanges(topicPartitions, fromPartitionOffsets, newPartitionOffsets,
+      untilPartitionOffsets)
+    // We can't re-use CachedConsumers if we are using multiple partitions to read from a
+    // single Kafka TopicPartition
+    val reuseCachedConsumers = canReuseCachedConsumers(topicPartitions.length)
 
     // Create an RDD that reads from Kafka and get the (key, value) pair as byte arrays.
-    val rdd = new KafkaSourceRDD(
-      sc, executorKafkaParams, offsetRanges, pollTimeoutMs, failOnDataLoss).map { cr =>
+    val rdd = new KafkaSourceRDD(sc, executorKafkaParams, offsetRanges, pollTimeoutMs,
+      failOnDataLoss, reuseCachedConsumers).map { cr =>
       InternalRow(
         cr.key,
         cr.value,
@@ -393,6 +376,85 @@ private[kafka010] class KafkaSource(
     partitionOffsets
   }
 
+  /**
+   * If we divide topic partitions into multiple read tasks, we can't re-use CachedConsumers on
+   * the executors.
+   */
+  private def canReuseCachedConsumers(numTopicPartitions: Int): Boolean = {
+    math.max(minNumParitions, numTopicPartitions) == numTopicPartitions
+  }
+
+  /**
+   * Calculate the offset ranges that we are going to process this batch. If `numPartitions`
+   * is not set or is set less than or equal the number of `topicPartitions` that we're going to
+   * consume, then we fall back to a 1-1 mapping of Spark tasks to Kafka partitions. If
+   * `numPartitions` is set higher than the number of our `topicPartitions`, then we will split up
+   * the read tasks of the skewed partitions to multiple Spark tasks.
+   * The number of Spark tasks will be *approximately* `numPartitions`. It can be less or more
+   * depending on rounding errors or Kafka partitions that didn't receive any new data.
+   */
+  private def getOffsetRanges(
+      topicPartitions: Seq[TopicPartition],
+      fromPartitionOffsets: Map[TopicPartition, Long],
+      newPartitionOffsets: Map[TopicPartition, Long],
+      untilPartitionOffsets: Map[TopicPartition, Long]): Seq[KafkaSourceRDDOffsetRange] = {
+    val numPartitionsToRead = math.max(minNumParitions, topicPartitions.length)
+
+    val offsets = topicPartitions.flatMap { tp =>
+      val fromOffset = fromPartitionOffsets.get(tp).getOrElse {
+        newPartitionOffsets.getOrElse(tp, {
+          // This should not happen since newPartitionOffsets contains all partitions not in
+          // fromPartitionOffsets
+          throw new IllegalStateException(s"$tp doesn't have a from offset")
+        })
+      }
+      val untilOffset = untilPartitionOffsets(tp)
+      if (untilOffset < fromOffset) {
+        reportDataLoss(s"Partition $tp's offset was changed from " +
+          s"$fromOffset to $untilOffset, some data may have been missed")
+        None
+      } else {
+        Some(KafkaSourceRDDOffsetRange(tp, fromOffset, untilOffset, None))
+      }
+    }
+
+    if (numPartitionsToRead == topicPartitions.length) {
+      val sortedExecutors = getSortedExecutorList(sc)
+      val numExecutors = sortedExecutors.length
+      logDebug("Sorted executors: " + sortedExecutors.mkString(", "))
+
+      // One-to-One mapping
+      offsets.map { case KafkaSourceRDDOffsetRange(tp, fromOffset, untilOffset, _) =>
+        val preferredLoc = if (numExecutors > 0) {
+          // This allows cached KafkaConsumers in the executors to be re-used to read the same
+          // partition in every batch.
+          Some(sortedExecutors(floorMod(tp.hashCode, numExecutors)))
+        } else None
+        KafkaSourceRDDOffsetRange(tp, fromOffset, untilOffset, preferredLoc)
+      }.toList
+    } else {
+      // one-to-many mapping. We can't re-use CachedConsumers in this instance.
+      val totalSize = offsets.map(o => o.untilOffset - o.fromOffset).sum
+      offsets.flatMap { offsetRange =>
+        val tp = offsetRange.topicPartition
+        val size = offsetRange.untilOffset - offsetRange.fromOffset
+        // number of partitions to divvy up this topic partition to
+        val parts = math.max(math.round(size * 1.0 / totalSize * numPartitionsToRead), 1).toInt
+        var remaining = size
+        var startOffset = offsetRange.fromOffset
+        (0 until parts).map { part =>
+          // Fine to do integer division. Last partition will consume all the round off errors
+          val thisPartition = remaining / (parts - part)
+          remaining -= thisPartition
+          val endOffset = startOffset + thisPartition
+          val offsetRange = KafkaSourceRDDOffsetRange(tp, startOffset, endOffset, None)
+          startOffset = endOffset
+          offsetRange
        }
+      }.toList
+    }
+  }
+
   /**
    * Fetch the latest offset of partitions.
    */
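
To make the one-to-many split in getOffsetRanges concrete, here is a self-contained sketch of the same arithmetic with hypothetical numbers, using plain (topic, from, until) tuples instead of KafkaSourceRDDOffsetRange: with a requested parallelism of 6 and two topic partitions holding 1000 and 200 new records, the skewed partition is split into 5 ranges and the small one stays whole.

```scala
object SplitExample extends App {
  // Hypothetical offset ranges: (topicPartition, fromOffset, untilOffset)
  val ranges = Seq(("events-0", 0L, 1000L), ("events-1", 0L, 200L))
  val numPartitionsToRead = 6 // stands in for max(minNumParitions, topicPartitions.length)

  val totalSize = ranges.map { case (_, from, until) => until - from }.sum
  val split = ranges.flatMap { case (tp, from, until) =>
    val size = until - from
    // proportional share of the requested parallelism, but at least one range
    val parts = math.max(math.round(size * 1.0 / totalSize * numPartitionsToRead), 1).toInt
    var remaining = size
    var start = from
    (0 until parts).map { part =>
      val step = remaining / (parts - part) // last range absorbs the rounding error
      remaining -= step
      val r = (tp, start, start + step)
      start += step
      r
    }
  }
  split.foreach(println)
  // events-0 becomes five ranges of 200 offsets each; events-1 stays a single range
}
```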

external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceProvider.scala

Lines changed: 1 addition & 1 deletion
@@ -212,7 +212,7 @@ private[kafka010] class KafkaSourceProvider extends StreamSourceProvider
         |Instead set the source option '$STARTING_OFFSETS_OPTION_KEY' to 'earliest' or 'latest'
         |to specify where to start. Structured Streaming manages which offsets are consumed
         |internally, rather than relying on the kafkaConsumer to do it. This will ensure that no
-        |data is missed when when new topics/partitions are dynamically subscribed. Note that
+        |data is missed when new topics/partitions are dynamically subscribed. Note that
         |'$STARTING_OFFSETS_OPTION_KEY' only applies when a new Streaming query is started, and
         |that resuming will always pick up from where the query left off. See the docs for more
         |details.
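
As a side note, a minimal sketch of the guidance in that error message (assuming an active SparkSession named `spark` and placeholder broker/topic names): control the starting position with the source option rather than the consumer's auto.offset.reset.

```scala
// Hypothetical stream definition; "startingOffsets" only applies when the
// query is first started, resuming always picks up from the checkpoint.
val stream = spark.readStream
  .format("kafka")
  .option("kafka.bootstrap.servers", "host1:9092")
  .option("subscribe", "events")
  .option("startingOffsets", "earliest")
  .load()
```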

external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceRDD.scala

Lines changed: 37 additions & 6 deletions
@@ -21,13 +21,14 @@ import java.{util => ju}
 
 import scala.collection.mutable.ArrayBuffer
 
-import org.apache.kafka.clients.consumer.ConsumerRecord
+import org.apache.kafka.clients.consumer.{ConsumerConfig, ConsumerRecord}
 import org.apache.kafka.common.TopicPartition
 
 import org.apache.spark.{Partition, SparkContext, TaskContext}
 import org.apache.spark.partial.{BoundedDouble, PartialResult}
 import org.apache.spark.rdd.RDD
 import org.apache.spark.storage.StorageLevel
+import org.apache.spark.util.CompletionIterator
 import org.apache.spark.util.NextIterator
 
 
@@ -63,7 +64,8 @@ private[kafka010] class KafkaSourceRDD(
     executorKafkaParams: ju.Map[String, Object],
     offsetRanges: Seq[KafkaSourceRDDOffsetRange],
     pollTimeoutMs: Long,
-    failOnDataLoss: Boolean)
+    failOnDataLoss: Boolean,
+    reuseCachedConsumers: Boolean = true)
   extends RDD[ConsumerRecord[Array[Byte], Array[Byte]]](sc, Nil) {
 
   override def persist(newLevel: StorageLevel): this.type = {
@@ -119,6 +121,15 @@ private[kafka010] class KafkaSourceRDD(
     part.offsetRange.preferredLoc.map(Seq(_)).getOrElse(Seq.empty)
   }
 
+  /** Pulled out for mockability in testing. */
+  protected def getOrCreateKafkaConsumer(
+      topic: String,
+      partition: Int,
+      kafkaParams: ju.Map[String, Object],
+      reuseCachedConsumers: Boolean): CachedKafkaConsumer = {
+    CachedKafkaConsumer.getOrCreate(topic, partition, executorKafkaParams, reuseCachedConsumers)
+  }
+
   override def compute(
       thePart: Partition,
       context: TaskContext): Iterator[ConsumerRecord[Array[Byte], Array[Byte]]] = {
@@ -133,9 +144,18 @@ private[kafka010] class KafkaSourceRDD(
         s"skipping ${range.topic} ${range.partition}")
       Iterator.empty
     } else {
-      new NextIterator[ConsumerRecord[Array[Byte], Array[Byte]]]() {
-        val consumer = CachedKafkaConsumer.getOrCreate(
-          range.topic, range.partition, executorKafkaParams)
+      if (!reuseCachedConsumers) {
+        // if we can't reuse CachedKafkaConsumers, let's reset the groupId, because we will have
+        // multiple tasks reading from the same topic partitions
+        val old = executorKafkaParams.get(ConsumerConfig.GROUP_ID_CONFIG).asInstanceOf[String]
+        executorKafkaParams.put(ConsumerConfig.GROUP_ID_CONFIG, old + "-" + thePart.index.toString)
+      }
+
+      logDebug(s"Creating iterator for $range")
+
+      val underlying = new NextIterator[ConsumerRecord[Array[Byte], Array[Byte]]]() {
+        val consumer = getOrCreateKafkaConsumer(range.topic, range.partition, executorKafkaParams,
+          reuseCachedConsumers)
         var requestOffset = range.fromOffset
 
         override def getNext(): ConsumerRecord[Array[Byte], Array[Byte]] = {
@@ -156,8 +176,19 @@ private[kafka010] class KafkaSourceRDD(
           }
         }
 
-        override protected def close(): Unit = {}
+        override protected def close(): Unit = {
+          if (!reuseCachedConsumers) {
+            consumer.close()
+          }
+        }
+      }
+      if (!reuseCachedConsumers) {
+        // Don't forget to close consumers! You may take down your Kafka cluster.
+        context.addTaskCompletionListener { _ =>
+          underlying.closeIfNeeded()
+        }
       }
+      underlying
     }
   }
 }
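
A minimal sketch, without Spark classes, of the two safeguards added above for the non-reusable case: a unique consumer group per task, and a cleanup hook that always closes the task's private consumer (the real code registers this via the task completion listener).

```scala
object CloseOnCompletionSketch extends App {
  final class FakeConsumer(val groupId: String) {
    def close(): Unit = println(s"closed consumer for group $groupId")
  }

  def runTask(partitionIndex: Int, baseGroupId: String = "spark-kafka-source-42"): Unit = {
    // One consumer group per Spark partition index, so several tasks can read
    // the same TopicPartition without sharing cached consumers.
    val consumer = new FakeConsumer(s"$baseGroupId-$partitionIndex")
    try {
      // ... poll and emit records with `consumer` ...
    } finally {
      consumer.close() // stand-in for the task-completion cleanup in the RDD
    }
  }

  runTask(0) // prints: closed consumer for group spark-kafka-source-42-0
}
```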
