@@ -21,6 +21,7 @@ import java.{util => ju}
 import java.util.concurrent.{Executors, ThreadFactory}

 import scala.collection.JavaConverters._
+import scala.collection.mutable.ArrayBuffer
 import scala.concurrent.{ExecutionContext, Future}
 import scala.concurrent.duration.Duration
 import scala.util.control.NonFatal
@@ -137,6 +138,12 @@ private[kafka010] class KafkaOffsetReader(
         // Poll to get the latest assigned partitions
         consumer.poll(0)
         val partitions = consumer.assignment()
+
+        // Call `position` to wait until the potential offset request triggered by `poll(0)` is
+        // done. This is a workaround for KAFKA-7703, in which an async `seekToBeginning` triggered
+        // by `poll(0)` may reset offsets that should have been set by another request.
+        partitions.asScala.map(p => p -> consumer.position(p)).foreach(_ => {})
+
         consumer.pause(partitions)
         assert(partitions.asScala == partitionOffsets.keySet,
           "If startingOffsets contains specific offsets, you must specify all TopicPartitions.\n" +
@@ -192,19 +199,82 @@ private[kafka010] class KafkaOffsetReader(
   /**
    * Fetch the latest offsets for the topic partitions that are indicated
    * in the [[ConsumerStrategy]].
+   *
+   * Kafka may return earliest offsets when we are requesting latest offsets if `poll` is called
+   * right before `seekToEnd` (KAFKA-7703). As a workaround, we will call `position` right after
+   * `poll` to wait until the potential offset request triggered by `poll(0)` is done.
+   *
+   * In addition, to avoid other unknown issues, we also use the given `knownOffsets` to audit the
+   * latest offsets returned by Kafka. If we find some incorrect offsets (a latest offset is less
+   * than an offset in `knownOffsets`), we will retry at most `maxOffsetFetchAttempts` times. When
+   * a topic is recreated, the latest offsets may be less than offsets in `knownOffsets`. We cannot
+   * distinguish this from KAFKA-7703, so we just return whatever we get from Kafka after retrying.
    */
-  def fetchLatestOffsets(): Map[TopicPartition, Long] = runUninterruptibly {
+  def fetchLatestOffsets(
+      knownOffsets: Option[PartitionOffsetMap]): PartitionOffsetMap = runUninterruptibly {
     withRetriesWithoutInterrupt {
       // Poll to get the latest assigned partitions
       consumer.poll(0)
       val partitions = consumer.assignment()
+
+      // Call `position` to wait until the potential offset request triggered by `poll(0)` is
+      // done. This is a workaround for KAFKA-7703, in which an async `seekToBeginning` triggered
+      // by `poll(0)` may reset offsets that should have been set by another request.
+      partitions.asScala.map(p => p -> consumer.position(p)).foreach(_ => {})
+
       consumer.pause(partitions)
       logDebug(s"Partitions assigned to consumer: $partitions. Seeking to the end.")

-      consumer.seekToEnd(partitions)
-      val partitionOffsets = partitions.asScala.map(p => p -> consumer.position(p)).toMap
-      logDebug(s"Got latest offsets for partition: $partitionOffsets")
-      partitionOffsets
+      if (knownOffsets.isEmpty) {
+        consumer.seekToEnd(partitions)
+        partitions.asScala.map(p => p -> consumer.position(p)).toMap
+      } else {
+        var partitionOffsets: PartitionOffsetMap = Map.empty
+
+        /**
+         * Compare `knownOffsets` and `partitionOffsets`. Returns all partitions that have an incorrect
+         * latest offset (the offset in `knownOffsets` is greater than the one in `partitionOffsets`).
+         */
+        def findIncorrectOffsets(): Seq[(TopicPartition, Long, Long)] = {
+          val incorrectOffsets = ArrayBuffer[(TopicPartition, Long, Long)]()
+          partitionOffsets.foreach { case (tp, offset) =>
+            knownOffsets.foreach(_.get(tp).foreach { knownOffset =>
+              if (knownOffset > offset) {
+                val incorrectOffset = (tp, knownOffset, offset)
+                incorrectOffsets += incorrectOffset
+              }
+            })
+          }
+          incorrectOffsets
+        }
+
+        // Retry to fetch latest offsets when detecting incorrect offsets. We don't use
+        // `withRetriesWithoutInterrupt` to retry because:
+        //
+        // - `withRetriesWithoutInterrupt` will reset the consumer for each attempt but a fresh
+        //   consumer has a much bigger chance to hit KAFKA-7703.
+        // - Avoid calling `consumer.poll(0)` which may cause KAFKA-7703.
+        var incorrectOffsets: Seq[(TopicPartition, Long, Long)] = Nil
+        var attempt = 0
+        do {
+          consumer.seekToEnd(partitions)
+          partitionOffsets = partitions.asScala.map(p => p -> consumer.position(p)).toMap
+          attempt += 1
+
+          incorrectOffsets = findIncorrectOffsets()
+          if (incorrectOffsets.nonEmpty) {
+            logWarning("Found incorrect offsets in some partitions " +
+              s"(partition, previous offset, fetched offset): $incorrectOffsets")
+            if (attempt < maxOffsetFetchAttempts) {
+              logWarning("Retrying to fetch latest offsets because of incorrect offsets")
+              Thread.sleep(offsetFetchAttemptIntervalMs)
+            }
+          }
+        } while (incorrectOffsets.nonEmpty && attempt < maxOffsetFetchAttempts)
+
+        logDebug(s"Got latest offsets for partition: $partitionOffsets")
+        partitionOffsets
+      }
     }
   }

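For readers unfamiliar with KAFKA-7703, here is a minimal standalone sketch (not part of the patch) of the ordering problem this change works around, written against the plain Kafka consumer API. The broker address, group id, topic, and object name are made up; the point is the `position` call sandwiched between `poll(0)` and `seekToEnd`.

```scala
import java.util.{Arrays, Properties}

import scala.collection.JavaConverters._

import org.apache.kafka.clients.consumer.KafkaConsumer
import org.apache.kafka.common.TopicPartition
import org.apache.kafka.common.serialization.ByteArrayDeserializer

object LatestOffsetsSketch {
  def main(args: Array[String]): Unit = {
    val props = new Properties()
    props.put("bootstrap.servers", "localhost:9092")  // hypothetical broker
    props.put("group.id", "latest-offsets-sketch")    // hypothetical group
    props.put("key.deserializer", classOf[ByteArrayDeserializer].getName)
    props.put("value.deserializer", classOf[ByteArrayDeserializer].getName)

    val consumer = new KafkaConsumer[Array[Byte], Array[Byte]](props)
    try {
      // Hypothetical topic with two partitions.
      consumer.assign(Arrays.asList(new TopicPartition("t", 0), new TopicPartition("t", 1)))

      // `poll(0)` may kick off an asynchronous offset reset for the assigned partitions.
      consumer.poll(0)
      val partitions = consumer.assignment()

      // KAFKA-7703 workaround: `position` blocks until any offset request started by `poll(0)`
      // has completed, so a late reset response cannot overwrite the `seekToEnd` below.
      partitions.asScala.foreach(consumer.position)

      consumer.seekToEnd(partitions)
      val latest = partitions.asScala.map(p => p -> consumer.position(p)).toMap
      println(s"Latest offsets: $latest")
    } finally {
      consumer.close()
    }
  }
}
```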
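And a small self-contained illustration (again not part of the patch; the partition names and offset values are invented) of the audit that the new `knownOffsets` parameter enables: a freshly fetched "latest" offset is flagged only when it is smaller than an offset we already knew about, in which case the fetch is retried up to `maxOffsetFetchAttempts` times.

```scala
import org.apache.kafka.common.TopicPartition

object OffsetAuditSketch {
  def main(args: Array[String]): Unit = {
    val p0 = new TopicPartition("t", 0)
    val p1 = new TopicPartition("t", 1)

    // Offsets known from the previous batch vs. what `seekToEnd` + `position` just returned.
    val knownOffsets = Map(p0 -> 10L, p1 -> 5L)
    val fetchedOffsets = Map(p0 -> 8L, p1 -> 7L)

    // Same rule as `findIncorrectOffsets`: report (partition, known, fetched) when known > fetched.
    val incorrect = fetchedOffsets.toSeq.collect {
      case (tp, fetched) if knownOffsets.get(tp).exists(_ > fetched) =>
        (tp, knownOffsets(tp), fetched)
    }

    // p0 appears to have moved backwards (10 -> 8), so the fetch would be retried.
    println(incorrect)
  }
}
```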