@@ -23,12 +23,18 @@ import java.util.{Map => JMap}
 import scala.reflect.ClassTag
 import scala.collection.JavaConversions._

+import kafka.common.TopicAndPartition
+import kafka.message.MessageAndMetadata
 import kafka.serializer.{Decoder, StringDecoder}

+
+import org.apache.spark.SparkContext
+import org.apache.spark.rdd.RDD
 import org.apache.spark.storage.StorageLevel
 import org.apache.spark.streaming.StreamingContext
 import org.apache.spark.streaming.api.java.{JavaPairReceiverInputDStream, JavaStreamingContext}
-import org.apache.spark.streaming.dstream.ReceiverInputDStream
+import org.apache.spark.streaming.dstream.{InputDStream, ReceiverInputDStream}
+import org.apache.spark.rdd.kafka.{KafkaCluster, KafkaRDD, KafkaRDDPartition, OffsetRange}

 object KafkaUtils {
   /**
@@ -144,4 +150,116 @@ object KafkaUtils {
     createStream[K, V, U, T](
       jssc.ssc, kafkaParams.toMap, Map(topics.mapValues(_.intValue()).toSeq: _*), storageLevel)
   }
+
+  /**
+   * A batch-oriented interface for consuming from Kafka.
+   * Starting and ending offsets are specified in advance,
+   * so that you can control exactly-once semantics.
+   * @param sc SparkContext object
+   * @param kafkaParams Kafka <a href="http://kafka.apache.org/documentation.html#configuration">
+   * configuration parameters</a>.
+   * Requires "metadata.broker.list" or "bootstrap.servers" to be set with Kafka broker(s),
+   * NOT zookeeper servers, specified in host1:port1,host2:port2 form.
+   * @param batch Each OffsetRange in the batch corresponds to a
+   * range of offsets for a given Kafka topic/partition
+   * @param messageHandler function for translating each message into the desired type
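+   *
+   * A hypothetical usage sketch. The broker address, topic, and offsets are
+   * illustrative, and the OffsetRange constructor arguments are assumed from
+   * its accessors (topic, partition, fromOffset, untilOffset, host, port):
+   * {{{
+   * val rdd = KafkaUtils.createRDD[String, String, StringDecoder, StringDecoder, String](
+   *   sc,
+   *   Map("metadata.broker.list" -> "broker1:9092"),
+   *   Array(OffsetRange("someTopic", 0, 100L, 200L, "broker1", 9092)),
+   *   (mmd: MessageAndMetadata[String, String]) => mmd.message)
+   * }}}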
+   */
+  def createRDD[
+    K: ClassTag,
+    V: ClassTag,
+    U <: Decoder[_]: ClassTag,
+    T <: Decoder[_]: ClassTag,
+    R: ClassTag] (
+      sc: SparkContext,
+      kafkaParams: Map[String, String],
+      batch: Array[OffsetRange],
+      messageHandler: MessageAndMetadata[K, V] => R
+  ): RDD[R] = {
+    val parts = batch.zipWithIndex.map { case (o, i) =>
+      new KafkaRDDPartition(i, o.topic, o.partition, o.fromOffset, o.untilOffset, o.host, o.port)
+    }.toArray
+    new KafkaRDD[K, V, U, T, R](sc, kafkaParams, parts, messageHandler)
+  }
+
+  /**
+   * Create an input stream where each given Kafka topic/partition begins at the
+   * specified starting offset, so that you can control exactly-once semantics.
+   *
+   * This DOES NOT guarantee that side-effects of an action will see each message exactly once.
+   * If you need that guarantee, get the offsets from this stream and store them with your output.
+   * Nor does it store offsets in Kafka / Zookeeper.
+   * If checkpointed, it will store offset ranges in the checkpoint, so that each message
+   * will be transformed effectively exactly once even after a failure,
+   * provided you have sufficient Kafka log retention.
+   *
+   * @param ssc StreamingContext object
+   * @param kafkaParams Kafka <a href="http://kafka.apache.org/documentation.html#configuration">
+   * configuration parameters</a>.
+   * Requires "metadata.broker.list" or "bootstrap.servers" to be set with Kafka broker(s),
+   * NOT zookeeper servers, specified in host1:port1,host2:port2 form.
+   * @param messageHandler function for translating each message into the desired type
+   * @param fromOffsets per-topic/partition Kafka offsets defining the (inclusive)
+   * starting point of the stream
+   * @param maxRetries maximum number of times in a row to retry getting leaders' offsets
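+   *
+   * A hypothetical usage sketch (the broker address, topic, and offsets are illustrative):
+   * {{{
+   * val stream = KafkaUtils.createExactlyOnceStream[
+   *   String, String, StringDecoder, StringDecoder, (String, String)](
+   *   ssc,
+   *   Map("metadata.broker.list" -> "broker1:9092"),
+   *   Map(TopicAndPartition("someTopic", 0) -> 100L),
+   *   (mmd: MessageAndMetadata[String, String]) => (mmd.key, mmd.message),
+   *   maxRetries = 1)
+   * }}}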
+   */
+  def createExactlyOnceStream[
+    K: ClassTag,
+    V: ClassTag,
+    U <: Decoder[_]: ClassTag,
+    T <: Decoder[_]: ClassTag,
+    R: ClassTag] (
+      ssc: StreamingContext,
+      kafkaParams: Map[String, String],
+      fromOffsets: Map[TopicAndPartition, Long],
+      messageHandler: MessageAndMetadata[K, V] => R,
+      maxRetries: Int
+  ): InputDStream[R] = {
+    new DeterministicKafkaInputDStream[K, V, U, T, R](
+      ssc, kafkaParams, fromOffsets, messageHandler, maxRetries)
+  }
+
+  /**
+   * Create an input stream that consumes the given set of topics, starting at the
+   * offsets determined by the "auto.offset.reset" parameter.
+   *
+   * This DOES NOT guarantee that side-effects of an action will see each message exactly once.
+   * If you need that guarantee, get the offsets from this stream and store them with your output.
+   * Nor does it store offsets in Kafka / Zookeeper.
+   * If checkpointed, it will store offset ranges in the checkpoint, so that each message
+   * will be transformed effectively exactly once even after a failure,
+   * provided you have sufficient Kafka log retention.
+   *
+   * @param ssc StreamingContext object
+   * @param kafkaParams Kafka <a href="http://kafka.apache.org/documentation.html#configuration">
+   * configuration parameters</a>.
+   * Requires "metadata.broker.list" or "bootstrap.servers" to be set with Kafka broker(s),
+   * NOT zookeeper servers, specified in host1:port1,host2:port2 form.
+   * If starting without a checkpoint, "auto.offset.reset" may be set to "largest" or "smallest"
+   * to determine where the stream starts (defaults to "largest")
+   * @param topics names of the topics to consume
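+   *
+   * A hypothetical usage sketch (the broker address and topic are illustrative):
+   * {{{
+   * val stream = KafkaUtils.createExactlyOnceStream[String, String, StringDecoder, StringDecoder](
+   *   ssc,
+   *   Map("metadata.broker.list" -> "broker1:9092", "auto.offset.reset" -> "smallest"),
+   *   Set("someTopic"))
+   * }}}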
+   */
+  def createExactlyOnceStream[
+    K: ClassTag,
+    V: ClassTag,
+    U <: Decoder[_]: ClassTag,
+    T <: Decoder[_]: ClassTag] (
+      ssc: StreamingContext,
+      kafkaParams: Map[String, String],
+      topics: Set[String]
+  ): InputDStream[(K, V)] = {
+    val messageHandler = (mmd: MessageAndMetadata[K, V]) => (mmd.key, mmd.message)
+    val kc = new KafkaCluster(kafkaParams)
+    val reset = kafkaParams.get("auto.offset.reset").map(_.toLowerCase)
+
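+    // KafkaCluster calls return Either values whose Left side collects errors.
+    // The .right projections let this for comprehension short-circuit on the
+    // first failure, and the fold below either rethrows the accumulated errors
+    // or returns the constructed stream.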
+    (for {
+      topicPartitions <- kc.getPartitions(topics).right
+      leaderOffsets <- (if (reset == Some("smallest")) {
+        kc.getEarliestLeaderOffsets(topicPartitions)
+      } else {
+        kc.getLatestLeaderOffsets(topicPartitions)
+      }).right
+    } yield {
+      val fromOffsets = leaderOffsets.map { case (tp, lo) =>
+        (tp, lo.offset)
+      }
+      new DeterministicKafkaInputDStream[K, V, U, T, (K, V)](
+        ssc, kafkaParams, fromOffsets, messageHandler, 1)
+    }).fold(
+      errs => throw new Exception(errs.mkString("\n")),
+      ok => ok
+    )
+  }
 }