
Commit e642a07

Tyson Condie authored and zsxwing committed
[SPARK-18682][SS] Batch Source for Kafka
Today, you can start a stream that reads from Kafka. However, given Kafka's configurable retention period, sometimes you might just want to read all of the data that is available now. As such, we should add a version that works with spark.read as well.

The options should be the same as the streaming Kafka source, with the following differences:
- startingOffsets should default to earliest, and should not allow latest (which would always be empty).
- endingOffsets should also be allowed and should default to latest.
- The same assign JSON format as startingOffsets should also be accepted.

It would be really good if things like .limit(n) were enough to prevent all the data from being read (this might just work).

KafkaRelationSuite was added for testing batch queries via KafkaUtils.

Author: Tyson Condie <[email protected]>

Closes #16686 from tcondie/SPARK-18682.

(cherry picked from commit 8df4444)
Signed-off-by: Shixiong Zhu <[email protected]>
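For illustration, a minimal sketch of the batch read path described above, in spark-shell style. The broker address and topic name are placeholders, and it assumes the spark-sql-kafka-0-10 artifact is on the classpath; with the defaults described above, both offset options could be omitted.

    import org.apache.spark.sql.SparkSession

    val spark = SparkSession.builder().appName("kafka-batch-read").getOrCreate()

    // Batch query: spark.read instead of spark.readStream.
    val df = spark.read
      .format("kafka")
      .option("kafka.bootstrap.servers", "host1:9092")  // placeholder broker list
      .option("subscribe", "topicA")                     // placeholder topic
      .option("startingOffsets", "earliest")             // batch default; "latest" is not allowed here
      .option("endingOffsets", "latest")                 // batch default
      .load()

    df.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)").show()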
1 parent dd1abef commit e642a07

File tree: 12 files changed, +1180 -430 lines


external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/CachedKafkaConsumer.scala

Lines changed: 73 additions & 29 deletions
@@ -44,6 +44,9 @@ private[kafka010] case class CachedKafkaConsumer private(
 
   private var consumer = createConsumer
 
+  /** indicates whether this consumer is in use or not */
+  private var inuse = true
+
   /** Iterator to the already fetch data */
   private var fetchedData = ju.Collections.emptyIterator[ConsumerRecord[Array[Byte], Array[Byte]]]
   private var nextOffsetInFetchedData = UNKNOWN_OFFSET
@@ -57,6 +60,20 @@ private[kafka010] case class CachedKafkaConsumer private(
     c
   }
 
+  case class AvailableOffsetRange(earliest: Long, latest: Long)
+
+  /**
+   * Return the available offset range of the current partition. It's a pair of the earliest offset
+   * and the latest offset.
+   */
+  def getAvailableOffsetRange(): AvailableOffsetRange = {
+    consumer.seekToBeginning(Set(topicPartition).asJava)
+    val earliestOffset = consumer.position(topicPartition)
+    consumer.seekToEnd(Set(topicPartition).asJava)
+    val latestOffset = consumer.position(topicPartition)
+    AvailableOffsetRange(earliestOffset, latestOffset)
+  }
+
   /**
    * Get the record for the given offset if available. Otherwise it will either throw error
    * (if failOnDataLoss = true), or return the next available offset within [offset, untilOffset),
@@ -107,9 +124,9 @@ private[kafka010] case class CachedKafkaConsumer private(
    * `UNKNOWN_OFFSET`.
    */
   private def getEarliestAvailableOffsetBetween(offset: Long, untilOffset: Long): Long = {
-    val (earliestOffset, latestOffset) = getAvailableOffsetRange()
-    logWarning(s"Some data may be lost. Recovering from the earliest offset: $earliestOffset")
-    if (offset >= latestOffset || earliestOffset >= untilOffset) {
+    val range = getAvailableOffsetRange()
+    logWarning(s"Some data may be lost. Recovering from the earliest offset: ${range.earliest}")
+    if (offset >= range.latest || range.earliest >= untilOffset) {
       // [offset, untilOffset) and [earliestOffset, latestOffset) have no overlap,
       // either
       // --------------------------------------------------------
@@ -124,13 +141,13 @@ private[kafka010] case class CachedKafkaConsumer private(
       // offset untilOffset earliestOffset latestOffset
       val warningMessage =
         s"""
-          |The current available offset range is [$earliestOffset, $latestOffset).
+          |The current available offset range is $range.
           | Offset ${offset} is out of range, and records in [$offset, $untilOffset) will be
           | skipped ${additionalMessage(failOnDataLoss = false)}
         """.stripMargin
       logWarning(warningMessage)
       UNKNOWN_OFFSET
-    } else if (offset >= earliestOffset) {
+    } else if (offset >= range.earliest) {
       // -----------------------------------------------------------------------------
       // ^ ^ ^ ^
       // | | | |
@@ -149,12 +166,12 @@ private[kafka010] case class CachedKafkaConsumer private(
       // offset earliestOffset min(untilOffset,latestOffset) max(untilOffset, latestOffset)
       val warningMessage =
         s"""
-          |The current available offset range is [$earliestOffset, $latestOffset).
-          | Offset ${offset} is out of range, and records in [$offset, $earliestOffset) will be
+          |The current available offset range is $range.
+          | Offset ${offset} is out of range, and records in [$offset, ${range.earliest}) will be
           | skipped ${additionalMessage(failOnDataLoss = false)}
         """.stripMargin
       logWarning(warningMessage)
-      earliestOffset
+      range.earliest
     }
   }
 
@@ -183,8 +200,8 @@ private[kafka010] case class CachedKafkaConsumer private(
     //  - `offset` is out of range so that Kafka returns nothing. Just throw
     //    `OffsetOutOfRangeException` to let the caller handle it.
     //  - Cannot fetch any data before timeout. TimeoutException will be thrown.
-    val (earliestOffset, latestOffset) = getAvailableOffsetRange()
-    if (offset < earliestOffset || offset >= latestOffset) {
+    val range = getAvailableOffsetRange()
+    if (offset < range.earliest || offset >= range.latest) {
       throw new OffsetOutOfRangeException(
         Map(topicPartition -> java.lang.Long.valueOf(offset)).asJava)
     } else {
@@ -284,18 +301,6 @@ private[kafka010] case class CachedKafkaConsumer private(
     logDebug(s"Polled $groupId ${p.partitions()} ${r.size}")
     fetchedData = r.iterator
   }
-
-  /**
-   * Return the available offset range of the current partition. It's a pair of the earliest offset
-   * and the latest offset.
-   */
-  private def getAvailableOffsetRange(): (Long, Long) = {
-    consumer.seekToBeginning(Set(topicPartition).asJava)
-    val earliestOffset = consumer.position(topicPartition)
-    consumer.seekToEnd(Set(topicPartition).asJava)
-    val latestOffset = consumer.position(topicPartition)
-    (earliestOffset, latestOffset)
-  }
 }
 
 private[kafka010] object CachedKafkaConsumer extends Logging {
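Before the object-level cache changes below, a side note on the class-level change above: the seek-to-beginning/seek-to-end pattern used by getAvailableOffsetRange can be reproduced with a bare KafkaConsumer. A standalone sketch, with placeholder broker, group id, and topic:

    import java.{util => ju}
    import scala.collection.JavaConverters._

    import org.apache.kafka.clients.consumer.KafkaConsumer
    import org.apache.kafka.common.TopicPartition

    object OffsetRangeProbe {
      def main(args: Array[String]): Unit = {
        val params = new ju.HashMap[String, Object]()
        params.put("bootstrap.servers", "host1:9092")  // placeholder
        params.put("group.id", "offset-probe")         // placeholder
        params.put("key.deserializer",
          "org.apache.kafka.common.serialization.ByteArrayDeserializer")
        params.put("value.deserializer",
          "org.apache.kafka.common.serialization.ByteArrayDeserializer")

        val tp = new TopicPartition("topicA", 0)       // placeholder
        val consumer = new KafkaConsumer[Array[Byte], Array[Byte]](params)
        consumer.assign(ju.Arrays.asList(tp))

        // Same pattern as getAvailableOffsetRange: seek to each end, then read the position.
        consumer.seekToBeginning(Set(tp).asJava)
        val earliest = consumer.position(tp)
        consumer.seekToEnd(Set(tp).asJava)
        val latest = consumer.position(tp)
        println(s"available offset range: [$earliest, $latest)")

        consumer.close()
      }
    }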
@@ -310,7 +315,7 @@ private[kafka010] object CachedKafkaConsumer extends Logging {
     new ju.LinkedHashMap[CacheKey, CachedKafkaConsumer](capacity, 0.75f, true) {
       override def removeEldestEntry(
           entry: ju.Map.Entry[CacheKey, CachedKafkaConsumer]): Boolean = {
-        if (this.size > capacity) {
+        if (entry.getValue.inuse == false && this.size > capacity) {
           logWarning(s"KafkaConsumer cache hitting max capacity of $capacity, " +
             s"removing consumer for ${entry.getKey}")
           try {
@@ -327,6 +332,43 @@ private[kafka010] object CachedKafkaConsumer extends Logging {
       }
     }
 
+  def releaseKafkaConsumer(
+      topic: String,
+      partition: Int,
+      kafkaParams: ju.Map[String, Object]): Unit = {
+    val groupId = kafkaParams.get(ConsumerConfig.GROUP_ID_CONFIG).asInstanceOf[String]
+    val topicPartition = new TopicPartition(topic, partition)
+    val key = CacheKey(groupId, topicPartition)
+
+    synchronized {
+      val consumer = cache.get(key)
+      if (consumer != null) {
+        consumer.inuse = false
+      } else {
+        logWarning(s"Attempting to release consumer that does not exist")
+      }
+    }
+  }
+
+  /**
+   * Removes (and closes) the Kafka Consumer for the given topic, partition and group id.
+   */
+  def removeKafkaConsumer(
+      topic: String,
+      partition: Int,
+      kafkaParams: ju.Map[String, Object]): Unit = {
+    val groupId = kafkaParams.get(ConsumerConfig.GROUP_ID_CONFIG).asInstanceOf[String]
+    val topicPartition = new TopicPartition(topic, partition)
+    val key = CacheKey(groupId, topicPartition)
+
+    synchronized {
+      val removedConsumer = cache.remove(key)
+      if (removedConsumer != null) {
+        removedConsumer.close()
+      }
+    }
+  }
+
   /**
    * Get a cached consumer for groupId, assigned to topic and partition.
    * If matching consumer doesn't already exist, will be created using kafkaParams.
@@ -342,16 +384,18 @@ private[kafka010] object CachedKafkaConsumer extends Logging {
     // If this is reattempt at running the task, then invalidate cache and start with
     // a new consumer
     if (TaskContext.get != null && TaskContext.get.attemptNumber > 1) {
-      val removedConsumer = cache.remove(key)
-      if (removedConsumer != null) {
-        removedConsumer.close()
-      }
-      new CachedKafkaConsumer(topicPartition, kafkaParams)
+      removeKafkaConsumer(topic, partition, kafkaParams)
+      val consumer = new CachedKafkaConsumer(topicPartition, kafkaParams)
+      consumer.inuse = true
+      cache.put(key, consumer)
+      consumer
     } else {
       if (!cache.containsKey(key)) {
        cache.put(key, new CachedKafkaConsumer(topicPartition, kafkaParams))
       }
-      cache.get(key)
+      val consumer = cache.get(key)
+      consumer.inuse = true
+      consumer
     }
   }
 }
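Taken together, the cache changes above amount to a borrow/release protocol: handing out a consumer marks it inuse, releaseKafkaConsumer clears the flag, and removeEldestEntry only evicts an entry that is both past capacity and no longer in use. A stripped-down, hypothetical sketch of that eviction rule (Key and Entry are stand-ins, not the real classes):

    import java.{util => ju}

    object InUseAwareCacheSketch {
      // Hypothetical stand-ins for the real cache key and cached consumer.
      case class Key(groupId: String, topic: String, partition: Int)
      final class Entry { @volatile var inuse: Boolean = true; def close(): Unit = () }

      val capacity = 64
      val cache = new ju.LinkedHashMap[Key, Entry](capacity, 0.75f, true) {
        override def removeEldestEntry(eldest: ju.Map.Entry[Key, Entry]): Boolean = {
          // Evict only when over capacity AND the eldest entry has been released;
          // an entry still borrowed by a task stays in the cache.
          if (!eldest.getValue.inuse && this.size > capacity) {
            eldest.getValue.close()
            true
          } else {
            false
          }
        }
      }

      def main(args: Array[String]): Unit = {
        val k = Key("group", "topicA", 0)
        cache.put(k, new Entry)     // borrowed: inuse = true
        cache.get(k).inuse = false  // released: now eligible for eviction
      }
    }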
external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/ConsumerStrategy.scala

Lines changed: 84 additions & 0 deletions
@@ -0,0 +1,84 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.kafka010
+
+import java.{util => ju}
+
+import scala.collection.JavaConverters._
+
+import org.apache.kafka.clients.consumer.{Consumer, KafkaConsumer}
+import org.apache.kafka.clients.consumer.internals.NoOpConsumerRebalanceListener
+import org.apache.kafka.common.TopicPartition
+
+/**
+ * Subscribe allows you to subscribe to a fixed collection of topics.
+ * SubscribePattern allows you to use a regex to specify topics of interest.
+ * Note that unlike the 0.8 integration, using Subscribe or SubscribePattern
+ * should respond to adding partitions during a running stream.
+ * Finally, Assign allows you to specify a fixed collection of partitions.
+ * All three strategies have overloaded constructors that allow you to specify
+ * the starting offset for a particular partition.
+ */
+sealed trait ConsumerStrategy {
+  /** Create a [[KafkaConsumer]] and subscribe to topics according to a desired strategy */
+  def createConsumer(kafkaParams: ju.Map[String, Object]): Consumer[Array[Byte], Array[Byte]]
+}
+
+/**
+ * Specify a fixed collection of partitions.
+ */
+case class AssignStrategy(partitions: Array[TopicPartition]) extends ConsumerStrategy {
+  override def createConsumer(
+      kafkaParams: ju.Map[String, Object]): Consumer[Array[Byte], Array[Byte]] = {
+    val consumer = new KafkaConsumer[Array[Byte], Array[Byte]](kafkaParams)
+    consumer.assign(ju.Arrays.asList(partitions: _*))
+    consumer
+  }
+
+  override def toString: String = s"Assign[${partitions.mkString(", ")}]"
+}
+
+/**
+ * Subscribe to a fixed collection of topics.
+ */
+case class SubscribeStrategy(topics: Seq[String]) extends ConsumerStrategy {
+  override def createConsumer(
+      kafkaParams: ju.Map[String, Object]): Consumer[Array[Byte], Array[Byte]] = {
+    val consumer = new KafkaConsumer[Array[Byte], Array[Byte]](kafkaParams)
+    consumer.subscribe(topics.asJava)
+    consumer
+  }
+
+  override def toString: String = s"Subscribe[${topics.mkString(", ")}]"
+}
+
+/**
+ * Use a regex to specify topics of interest.
+ */
+case class SubscribePatternStrategy(topicPattern: String) extends ConsumerStrategy {
+  override def createConsumer(
+      kafkaParams: ju.Map[String, Object]): Consumer[Array[Byte], Array[Byte]] = {
+    val consumer = new KafkaConsumer[Array[Byte], Array[Byte]](kafkaParams)
+    consumer.subscribe(
+      ju.regex.Pattern.compile(topicPattern),
+      new NoOpConsumerRebalanceListener())
+    consumer
+  }
+
+  override def toString: String = s"SubscribePattern[$topicPattern]"
+}
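For reference, the three strategies above differ only in how the consumer is pointed at data. A hypothetical sketch exercising them with the same kafkaParams (broker, group id, and topics are placeholders; these classes are internal to org.apache.spark.sql.kafka010, so this is illustrative rather than a public API):

    package org.apache.spark.sql.kafka010

    import java.{util => ju}

    import org.apache.kafka.common.TopicPartition

    object ConsumerStrategyExample {
      def main(args: Array[String]): Unit = {
        val kafkaParams = new ju.HashMap[String, Object]()
        kafkaParams.put("bootstrap.servers", "host1:9092")  // placeholder
        kafkaParams.put("group.id", "example-group")        // placeholder
        kafkaParams.put("key.deserializer",
          "org.apache.kafka.common.serialization.ByteArrayDeserializer")
        kafkaParams.put("value.deserializer",
          "org.apache.kafka.common.serialization.ByteArrayDeserializer")

        val strategies = Seq(
          AssignStrategy(Array(new TopicPartition("topicA", 0))),  // pin exact partitions
          SubscribeStrategy(Seq("topicA", "topicB")),              // fixed list of topics
          SubscribePatternStrategy("topic.*"))                     // every topic matching a regex

        strategies.foreach { strategy =>
          val consumer = strategy.createConsumer(kafkaParams)
          println(s"$strategy -> assignment after creation: ${consumer.assignment()}")
          consumer.close()
        }
      }
    }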
external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaOffsetRangeLimit.scala

Lines changed: 51 additions & 0 deletions
@@ -0,0 +1,51 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.kafka010
+
+import org.apache.kafka.common.TopicPartition
+
+/**
+ * Objects that represent desired offset range limits for starting,
+ * ending, and specific offsets.
+ */
+private[kafka010] sealed trait KafkaOffsetRangeLimit
+
+/**
+ * Represents the desire to bind to the earliest offsets in Kafka
+ */
+private[kafka010] case object EarliestOffsetRangeLimit extends KafkaOffsetRangeLimit
+
+/**
+ * Represents the desire to bind to the latest offsets in Kafka
+ */
+private[kafka010] case object LatestOffsetRangeLimit extends KafkaOffsetRangeLimit
+
+/**
+ * Represents the desire to bind to specific offsets. A offset == -1 binds to the
+ * latest offset, and offset == -2 binds to the earliest offset.
+ */
+private[kafka010] case class SpecificOffsetRangeLimit(
+    partitionOffsets: Map[TopicPartition, Long]) extends KafkaOffsetRangeLimit
+
+private[kafka010] object KafkaOffsetRangeLimit {
+  /**
+   * Used to denote offset range limits that are resolved via Kafka
+   */
+  val LATEST = -1L // indicates resolution to the latest offset
+  val EARLIEST = -2L // indicates resolution to the earliest offset
+}
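These sentinels are what the user-facing JSON offset options resolve to: in the assign-style JSON accepted by startingOffsets and endingOffsets, -2 requests the earliest offset of a partition and -1 the latest. Continuing the spark session from the earlier sketch, a hedged example of a batch read pinned to explicit offsets (broker, topic, and partitions are placeholders):

    val pinned = spark.read
      .format("kafka")
      .option("kafka.bootstrap.servers", "host1:9092")            // placeholder
      .option("assign", """{"topicA":[0,1]}""")                   // placeholder partitions
      // -2 == KafkaOffsetRangeLimit.EARLIEST, -1 == KafkaOffsetRangeLimit.LATEST
      .option("startingOffsets", """{"topicA":{"0":-2,"1":10}}""")
      .option("endingOffsets", """{"topicA":{"0":-1,"1":500}}""")
      .load()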
