
Commit 2e67117

[SPARK-4964] one potential way of hiding most of the implementation, while still allowing access to offsets (but not subclassing)
1 parent: bb80bbe

6 files changed: 210 additions and 15 deletions
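
The design described in the commit message: the concrete Kafka classes become private[spark] / private[streaming], and user code touches only the small OffsetRange and HasOffsetRanges traits added in this commit. A minimal sketch of that access pattern under those assumptions (the helper name printOffsets is made up):

import org.apache.spark.rdd.RDD
import org.apache.spark.rdd.kafka.{HasOffsetRanges, OffsetRange}

// Hypothetical helper: read per-partition offsets from an RDD produced by
// KafkaUtils.createRDD or by a batch of the exactly-once stream. The concrete
// KafkaRDD is hidden, so the trait is the only public handle on its offsets.
def printOffsets(rdd: RDD[_]): Unit = rdd match {
  case r: HasOffsetRanges =>
    r.offsetRanges.foreach { o =>
      println(s"${o.topic}-${o.partition}: [${o.fromOffset}, ${o.untilOffset})")
    }
  case _ => println("not a Kafka-backed RDD")
}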

external/kafka/src/main/scala/org/apache/spark/rdd/kafka/KafkaCluster.scala

Lines changed: 2 additions & 0 deletions
@@ -32,6 +32,7 @@ import kafka.consumer.{ConsumerConfig, SimpleConsumer}
  * Requires "metadata.broker.list" or "bootstrap.servers" to be set with Kafka broker(s),
  * NOT zookeeper servers, specified in host1:port1,host2:port2 form
  */
+private[spark]
 class KafkaCluster(val kafkaParams: Map[String, String]) extends Serializable {
   import KafkaCluster.{Err, LeaderOffset}
 
@@ -297,6 +298,7 @@ class KafkaCluster(val kafkaParams: Map[String, String]) extends Serializable {
   }
 }
 
+private[spark]
 object KafkaCluster {
   type Err = ArrayBuffer[Throwable]
 

external/kafka/src/main/scala/org/apache/spark/rdd/kafka/KafkaRDD.scala

Lines changed: 8 additions & 6 deletions
@@ -34,8 +34,6 @@ import kafka.utils.VerifiableProperties
 /** A batch-oriented interface for consuming from Kafka.
  * Starting and ending offsets are specified in advance,
  * so that you can control exactly-once semantics.
- * For an easy interface to Kafka-managed offsets,
- * see {@link org.apache.spark.rdd.kafka.KafkaCluster}
  * @param kafkaParams Kafka <a href="http://kafka.apache.org/documentation.html#configuration">
  * configuration parameters</a>.
  * Requires "metadata.broker.list" or "bootstrap.servers" to be set with Kafka broker(s),
@@ -44,17 +42,20 @@ import kafka.utils.VerifiableProperties
  * range of offsets for a given Kafka topic/partition
  * @param messageHandler function for translating each message into the desired type
  */
+private[spark]
 class KafkaRDD[
   K: ClassTag,
   V: ClassTag,
   U <: Decoder[_]: ClassTag,
   T <: Decoder[_]: ClassTag,
-  R: ClassTag](
+  R: ClassTag] private[spark] (
   sc: SparkContext,
-  val kafkaParams: Map[String, String],
-  val batch: Array[KafkaRDDPartition],
+  kafkaParams: Map[String, String],
+  private[spark] val batch: Array[KafkaRDDPartition],
   messageHandler: MessageAndMetadata[K, V] => R
-) extends RDD[R](sc, Nil) with Logging {
+) extends RDD[R](sc, Nil) with Logging with HasOffsetRanges {
+
+  def offsetRanges: Array[OffsetRange] = batch.asInstanceOf[Array[OffsetRange]]
 
   override def getPartitions: Array[Partition] = batch.asInstanceOf[Array[Partition]]
 
@@ -160,6 +161,7 @@ class KafkaRDD[
 
 }
 
+private[spark]
 object KafkaRDD {
   import KafkaCluster.LeaderOffset

external/kafka/src/main/scala/org/apache/spark/rdd/kafka/KafkaRDDPartition.scala

Lines changed: 9 additions & 7 deletions
@@ -26,15 +26,16 @@ import org.apache.spark.Partition
  * @param host preferred kafka host, i.e. the leader at the time the rdd was created
  * @param port preferred kafka host's port
  */
+private[spark]
 class KafkaRDDPartition(
   override val index: Int,
-  val topic: String,
-  val partition: Int,
-  val fromOffset: Long,
-  val untilOffset: Long,
-  val host: String,
-  val port: Int
-) extends Partition {
+  override val topic: String,
+  override val partition: Int,
+  override val fromOffset: Long,
+  override val untilOffset: Long,
+  override val host: String,
+  override val port: Int
+) extends Partition with OffsetRange {
   def toTuple: (Int, String, Int, Long, Long, String, Int) = (
     index,
     topic,
@@ -47,6 +48,7 @@ class KafkaRDDPartition(
 
 }
 
+private[spark]
 object KafkaRDDPartition {
   def apply(
     index: Int,
external/kafka/src/main/scala/org/apache/spark/rdd/kafka/OffsetRange.scala

Lines changed: 70 additions & 0 deletions
@@ -0,0 +1,70 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.rdd.kafka
+
+/** Represents a range of offsets from a single Kafka TopicAndPartition */
+trait OffsetRange {
+  /** kafka topic name */
+  def topic: String
+
+  /** kafka partition id */
+  def partition: Int
+
+  /** inclusive starting offset */
+  def fromOffset: Long
+
+  /** exclusive ending offset */
+  def untilOffset: Long
+
+  /** preferred kafka host, i.e. the leader at the time of creation */
+  def host: String
+
+  /** preferred kafka host's port */
+  def port: Int
+}
+
+/** Something that has a collection of OffsetRanges */
+trait HasOffsetRanges {
+  def offsetRanges: Array[OffsetRange]
+}
+
+private class OffsetRangeImpl(
+  override val topic: String,
+  override val partition: Int,
+  override val fromOffset: Long,
+  override val untilOffset: Long,
+  override val host: String,
+  override val port: Int
+) extends OffsetRange
+
+object OffsetRange {
+  def apply(
+    topic: String,
+    partition: Int,
+    fromOffset: Long,
+    untilOffset: Long,
+    host: String,
+    port: Int): OffsetRange =
+      new OffsetRangeImpl(
+        topic,
+        partition,
+        fromOffset,
+        untilOffset,
+        host,
+        port)
+}
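
Because OffsetRangeImpl is private, callers can construct ranges only through the companion apply and cannot subclass the implementation. A small sketch, with made-up topic names and offsets, of building the Array[OffsetRange] accepted by KafkaUtils.createRDD (shown below in KafkaUtils.scala):

import org.apache.spark.rdd.kafka.OffsetRange

// Offsets recovered from a previous run's output store (values are illustrative).
val ranges: Array[OffsetRange] = Array(
  OffsetRange("events", 0, 100L, 200L, "broker1.example.com", 9092),
  OffsetRange("events", 1, 150L, 250L, "broker2.example.com", 9092)
)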

external/kafka/src/main/scala/org/apache/spark/streaming/kafka/DeterministicKafkaInputDStream.scala

Lines changed: 2 additions & 1 deletion
@@ -51,6 +51,7 @@ import org.apache.spark.streaming.dstream._
  * @param messageHandler function for translating each message into the desired type
  * @param maxRetries maximum number of times in a row to retry getting leaders' offsets
  */
+private[streaming]
 class DeterministicKafkaInputDStream[
   K: ClassTag,
   V: ClassTag,
@@ -61,7 +62,7 @@ class DeterministicKafkaInputDStream[
   val kafkaParams: Map[String, String],
   val fromOffsets: Map[TopicAndPartition, Long],
   messageHandler: MessageAndMetadata[K, V] => R,
-  maxRetries: Int = 1
+  maxRetries: Int
 ) extends InputDStream[R](ssc_) with Logging {
 
   protected[streaming] override val checkpointData =

external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaUtils.scala

Lines changed: 119 additions & 1 deletion
@@ -23,12 +23,18 @@ import java.util.{Map => JMap}
 import scala.reflect.ClassTag
 import scala.collection.JavaConversions._
 
+import kafka.common.TopicAndPartition
+import kafka.message.MessageAndMetadata
 import kafka.serializer.{Decoder, StringDecoder}
 
+
+import org.apache.spark.SparkContext
+import org.apache.spark.rdd.RDD
 import org.apache.spark.storage.StorageLevel
 import org.apache.spark.streaming.StreamingContext
 import org.apache.spark.streaming.api.java.{JavaPairReceiverInputDStream, JavaStreamingContext}
-import org.apache.spark.streaming.dstream.ReceiverInputDStream
+import org.apache.spark.streaming.dstream.{InputDStream, ReceiverInputDStream}
+import org.apache.spark.rdd.kafka.{KafkaCluster, KafkaRDD, KafkaRDDPartition, OffsetRange}
 
 object KafkaUtils {
   /**
@@ -144,4 +150,116 @@
     createStream[K, V, U, T](
       jssc.ssc, kafkaParams.toMap, Map(topics.mapValues(_.intValue()).toSeq: _*), storageLevel)
   }
+
+  /** A batch-oriented interface for consuming from Kafka.
+   * Starting and ending offsets are specified in advance,
+   * so that you can control exactly-once semantics.
+   * @param sc SparkContext object
+   * @param kafkaParams Kafka <a href="http://kafka.apache.org/documentation.html#configuration">
+   * configuration parameters</a>.
+   * Requires "metadata.broker.list" or "bootstrap.servers" to be set with Kafka broker(s),
+   * NOT zookeeper servers, specified in host1:port1,host2:port2 form.
+   * @param batch Each OffsetRange in the batch corresponds to a
+   * range of offsets for a given Kafka topic/partition
+   * @param messageHandler function for translating each message into the desired type
+   */
+  def createRDD[
+    K: ClassTag,
+    V: ClassTag,
+    U <: Decoder[_]: ClassTag,
+    T <: Decoder[_]: ClassTag,
+    R: ClassTag] (
+      sc: SparkContext,
+      kafkaParams: Map[String, String],
+      batch: Array[OffsetRange],
+      messageHandler: MessageAndMetadata[K, V] => R
+  ): RDD[R] = {
+    val parts = batch.zipWithIndex.map { case (o, i) =>
+      new KafkaRDDPartition(i, o.topic, o.partition, o.fromOffset, o.untilOffset, o.host, o.port)
+    }.toArray
+    new KafkaRDD[K, V, U, T, R](sc, kafkaParams, parts, messageHandler)
+  }
+
+  /**
+   * This DOES NOT guarantee that side-effects of an action will see each message exactly once.
+   * If you need that guarantee, get the offsets from this stream and store them with your output.
+   * Nor does this store offsets in Kafka / Zookeeper.
+   * If checkpointed, it will store offset ranges in the checkpoint, such that each message
+   * will be transformed effectively exactly once even after failure,
+   * provided you have sufficient Kafka log retention.
+   *
+   * @param ssc StreamingContext object
+   * @param kafkaParams Kafka <a href="http://kafka.apache.org/documentation.html#configuration">
+   * configuration parameters</a>.
+   * Requires "metadata.broker.list" or "bootstrap.servers" to be set with Kafka broker(s),
+   * NOT zookeeper servers, specified in host1:port1,host2:port2 form.
+   * @param messageHandler function for translating each message into the desired type
+   * @param fromOffsets per-topic/partition Kafka offsets defining the (inclusive)
+   * starting point of the stream
+   * @param maxRetries maximum number of times in a row to retry getting leaders' offsets
+   */
+  def createExactlyOnceStream[
+    K: ClassTag,
+    V: ClassTag,
+    U <: Decoder[_]: ClassTag,
+    T <: Decoder[_]: ClassTag,
+    R: ClassTag] (
+      ssc: StreamingContext,
+      kafkaParams: Map[String, String],
+      fromOffsets: Map[TopicAndPartition, Long],
+      messageHandler: MessageAndMetadata[K, V] => R,
+      maxRetries: Int
+  ): InputDStream[R] = {
+    new DeterministicKafkaInputDStream[K, V, U, T, R](
+      ssc, kafkaParams, fromOffsets, messageHandler, maxRetries)
+  }
+
+  /**
+   * This DOES NOT guarantee that side-effects of an action will see each message exactly once.
+   * If you need that guarantee, get the offsets from this stream and store them with your output.
+   * Nor does this store offsets in Kafka / Zookeeper.
+   * If checkpointed, it will store offset ranges in the checkpoint, such that each message
+   * will be transformed effectively exactly once even after failure,
+   * provided you have sufficient Kafka log retention.
+   *
+   * @param ssc StreamingContext object
+   * @param kafkaParams Kafka <a href="http://kafka.apache.org/documentation.html#configuration">
+   * configuration parameters</a>.
+   * Requires "metadata.broker.list" or "bootstrap.servers" to be set with Kafka broker(s),
+   * NOT zookeeper servers, specified in host1:port1,host2:port2 form.
+   * If starting without a checkpoint, "auto.offset.reset" may be set to "largest" or "smallest"
+   * to determine where the stream starts (defaults to "largest")
+   * @param topics names of the topics to consume
+   */
+  def createExactlyOnceStream[
+    K: ClassTag,
+    V: ClassTag,
+    U <: Decoder[_]: ClassTag,
+    T <: Decoder[_]: ClassTag] (
+      ssc: StreamingContext,
+      kafkaParams: Map[String, String],
+      topics: Set[String]
+  ): InputDStream[(K, V)] = {
+    val messageHandler = (mmd: MessageAndMetadata[K, V]) => (mmd.key, mmd.message)
+    val kc = new KafkaCluster(kafkaParams)
+    val reset = kafkaParams.get("auto.offset.reset").map(_.toLowerCase)
+
+    (for {
+      topicPartitions <- kc.getPartitions(topics).right
+      leaderOffsets <- (if (reset == Some("smallest")) {
+        kc.getEarliestLeaderOffsets(topicPartitions)
+      } else {
+        kc.getLatestLeaderOffsets(topicPartitions)
+      }).right
+    } yield {
+      val fromOffsets = leaderOffsets.map { case (tp, lo) =>
+          (tp, lo.offset)
+      }
+      new DeterministicKafkaInputDStream[K, V, U, T, (K, V)](
+        ssc, kafkaParams, fromOffsets, messageHandler, 1)
+    }).fold(
+      errs => throw new Exception(errs.mkString("\n")),
+      ok => ok
+    )
+  }
 }
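
A hedged end-to-end sketch of the two new entry points, assuming a local StreamingContext and made-up broker and topic names: createExactlyOnceStream builds a DStream whose batches carry their offset ranges, and createRDD replays a stored set of ranges as a batch.

import kafka.serializer.StringDecoder
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.rdd.kafka.{HasOffsetRanges, OffsetRange}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.kafka.KafkaUtils

val kafkaParams = Map("metadata.broker.list" -> "broker1.example.com:9092")

// Streaming: starting offsets come from "auto.offset.reset" (or the checkpoint).
val ssc = new StreamingContext("local[2]", "kafka-example", Seconds(5))
val stream = KafkaUtils.createExactlyOnceStream[
  String, String, StringDecoder, StringDecoder](ssc, kafkaParams, Set("events"))

stream.foreachRDD { rdd =>
  // Each batch is a KafkaRDD under the hood, so its offsets are visible
  // through HasOffsetRanges; store them alongside the batch's output.
  val ranges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
  ranges.foreach(o => println(s"${o.topic}-${o.partition}: ${o.fromOffset}..${o.untilOffset}"))
}

// Batch: replay a known set of ranges, e.g. recovered from the output store.
def replay(sc: SparkContext, ranges: Array[OffsetRange]): RDD[(String, String)] =
  KafkaUtils.createRDD[String, String, StringDecoder, StringDecoder, (String, String)](
    sc, kafkaParams, ranges, mmd => (mmd.key, mmd.message))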
