Commit 88057d8

HBASE-25539: Add age of oldest wal metric (#2945)
Signed-off-by: Bharath Vissapragada <[email protected]>
1 parent 1beda0d commit 88057d8

16 files changed, +506 -129 lines changed

hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/replication/regionserver/MetricsReplicationGlobalSourceSourceImpl.java

Lines changed: 12 additions & 0 deletions
@@ -186,6 +186,18 @@ public void incrCompletedRecoveryQueue() {
   public void incrFailedRecoveryQueue() {
     failedRecoveryQueue.incr(1L);
   }
+
+  @Override
+  public void setOldestWalAge(long age) {
+    // Not implemented
+  }
+
+  @Override
+  public long getOldestWalAge() {
+    // Not implemented
+    return 0;
+  }
+
   @Override
   public void init() {
     rms.init();

hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/replication/regionserver/MetricsReplicationSourceSource.java

Lines changed: 4 additions & 0 deletions
@@ -49,6 +49,8 @@ public interface MetricsReplicationSourceSource extends BaseSource {
   public static final String SOURCE_COMPLETED_LOGS = "source.completedLogs";
   public static final String SOURCE_COMPLETED_RECOVERY_QUEUES = "source.completedRecoverQueues";
   public static final String SOURCE_FAILED_RECOVERY_QUEUES = "source.failedRecoverQueues";
+  /* Used to track the age of oldest wal in ms since its creation time */
+  String OLDEST_WAL_AGE = "source.oldestWalAge";
 
   void setLastShippedAge(long age);
   void incrSizeOfLogQueue(int size);
@@ -76,4 +78,6 @@ public interface MetricsReplicationSourceSource extends BaseSource {
   long getWALEditsRead();
   long getShippedOps();
   long getEditsFiltered();
+  void setOldestWalAge(long age);
+  long getOldestWalAge();
 }
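
For context, here is a minimal usage sketch (not part of this diff) of how a source could publish the new gauge. The parameter oldestWalCreationTs is a hypothetical value holding the creation time, in epoch milliseconds, of the oldest WAL still queued for the source:

import org.apache.hadoop.hbase.replication.regionserver.MetricsReplicationSourceSource;

final class OldestWalAgePublisher {
  private final MetricsReplicationSourceSource metrics;

  OldestWalAgePublisher(MetricsReplicationSourceSource metrics) {
    this.metrics = metrics;
  }

  // Publishes "now - creation time" of the oldest queued WAL, in milliseconds.
  void publish(long oldestWalCreationTs) {
    long ageMs = System.currentTimeMillis() - oldestWalCreationTs;
    metrics.setOldestWalAge(ageMs);
  }
}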

hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/replication/regionserver/MetricsReplicationSourceSourceImpl.java

Lines changed: 14 additions & 0 deletions
@@ -39,6 +39,7 @@ public class MetricsReplicationSourceSourceImpl implements MetricsReplicationSou
   private final String logReadInBytesKey;
   private final String shippedHFilesKey;
   private final String sizeOfHFileRefsQueueKey;
+  private final String oldestWalAgeKey;
 
   private final MutableHistogram ageOfLastShippedOpHist;
   private final MutableGaugeLong sizeOfLogQueueGauge;
@@ -65,6 +66,7 @@ public class MetricsReplicationSourceSourceImpl implements MetricsReplicationSou
   private final MutableFastCounter repeatedFileBytes;
   private final MutableFastCounter completedWAL;
   private final MutableFastCounter completedRecoveryQueue;
+  private final MutableGaugeLong oldestWalAge;
 
   public MetricsReplicationSourceSourceImpl(MetricsReplicationSourceImpl rms, String id) {
     this.rms = rms;
@@ -121,6 +123,9 @@ public MetricsReplicationSourceSourceImpl(MetricsReplicationSourceImpl rms, Stri
 
     completedRecoveryKey = this.keyPrefix + "completedRecoverQueues";
     completedRecoveryQueue = rms.getMetricsRegistry().getCounter(completedRecoveryKey, 0L);
+
+    oldestWalAgeKey = this.keyPrefix + "oldestWalAge";
+    oldestWalAge = rms.getMetricsRegistry().getGauge(oldestWalAgeKey, 0L);
   }
 
   @Override public void setLastShippedAge(long age) {
@@ -183,6 +188,7 @@ public MetricsReplicationSourceSourceImpl(MetricsReplicationSourceImpl rms, Stri
     rms.removeMetric(repeatedBytesKey);
     rms.removeMetric(completedLogsKey);
     rms.removeMetric(completedRecoveryKey);
+    rms.removeMetric(oldestWalAgeKey);
   }
 
   @Override
@@ -248,6 +254,14 @@ public void incrCompletedRecoveryQueue() {
   @Override
   public void incrFailedRecoveryQueue() {/*no op*/}
 
+  @Override public void setOldestWalAge(long age) {
+    oldestWalAge.set(age);
+  }
+
+  @Override public long getOldestWalAge() {
+    return oldestWalAge.value();
+  }
+
   @Override
   public void init() {
     rms.init();
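
A short, illustrative snippet (not from the commit) showing the per-source gauge in use; rms is assumed to be an already constructed MetricsReplicationSourceImpl and "1" a sample source id:

MetricsReplicationSourceSourceImpl source = new MetricsReplicationSourceSourceImpl(rms, "1");
source.setOldestWalAge(5000L);        // record an age of five seconds
long age = source.getOldestWalAge();  // reads back 5000L from the underlying MutableGaugeLong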

hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/MetricsSource.java

Lines changed: 11 additions & 0 deletions
@@ -386,6 +386,17 @@ public void incrFailedRecoveryQueue() {
     globalSourceSource.incrFailedRecoveryQueue();
   }
 
+  /*
+   Sets the age of oldest log file just for source.
+   */
+  public void setOldestWalAge(long age) {
+    singleSourceSource.setOldestWalAge(age);
+  }
+
+  public long getOldestWalAge() {
+    return singleSourceSource.getOldestWalAge();
+  }
+
   @Override
   public void init() {
     singleSourceSource.init();
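
As a hedged illustration (not part of the diff): because MetricsSource delegates both the setter and the getter to the per-source metrics only, a caller that just set the age can read the same value straight back, which is convenient in tests:

// 'metrics' is an existing MetricsSource instance for one replication source.
metrics.setOldestWalAge(2000L);
assert metrics.getOldestWalAge() == 2000L; // the per-source gauge reflects the last value set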

hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/RecoveredReplicationSource.java

Lines changed: 6 additions & 6 deletions
@@ -57,13 +57,13 @@ public void init(Configuration conf, FileSystem fs, ReplicationSourceManager man
   }
 
   @Override
-  protected RecoveredReplicationSourceShipper createNewShipper(String walGroupId,
-      PriorityBlockingQueue<Path> queue) {
-    return new RecoveredReplicationSourceShipper(conf, walGroupId, queue, this, queueStorage);
+  protected RecoveredReplicationSourceShipper createNewShipper(String walGroupId) {
+    return new RecoveredReplicationSourceShipper(conf, walGroupId, logQueue, this, queueStorage);
   }
 
-  public void locateRecoveredPaths(PriorityBlockingQueue<Path> queue) throws IOException {
+  public void locateRecoveredPaths(String walGroupId) throws IOException {
     boolean hasPathChanged = false;
+    PriorityBlockingQueue<Path> queue = logQueue.getQueue(walGroupId);
     PriorityBlockingQueue<Path> newPaths = new PriorityBlockingQueue<Path>(queueSizePerGroup,
         new AbstractFSWALProvider.WALStartTimeComparator());
     pathsLoop: for (Path path : queue) {
@@ -116,9 +116,9 @@ public void locateRecoveredPaths(PriorityBlockingQueue<Path> queue) throws IOExc
       // put the correct locations in the queue
       // since this is a recovered queue with no new incoming logs,
       // there shouldn't be any concurrency issues
-      queue.clear();
+      logQueue.clear(walGroupId);
       for (Path path : newPaths) {
-        queue.add(path);
+        logQueue.enqueueLog(path, walGroupId);
       }
     }
   }

hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/RecoveredReplicationSourceShipper.java

Lines changed: 6 additions & 8 deletions
@@ -18,9 +18,7 @@
 package org.apache.hadoop.hbase.replication.regionserver;
 
 import java.io.IOException;
-import java.util.concurrent.PriorityBlockingQueue;
 import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.hbase.replication.ReplicationException;
 import org.apache.hadoop.hbase.replication.ReplicationQueueStorage;
 import org.apache.hadoop.hbase.util.Threads;
@@ -40,9 +38,9 @@ public class RecoveredReplicationSourceShipper extends ReplicationSourceShipper
   private final ReplicationQueueStorage replicationQueues;
 
   public RecoveredReplicationSourceShipper(Configuration conf, String walGroupId,
-      PriorityBlockingQueue<Path> queue, RecoveredReplicationSource source,
+      ReplicationSourceLogQueue logQueue, RecoveredReplicationSource source,
       ReplicationQueueStorage queueStorage) {
-    super(conf, walGroupId, queue, source);
+    super(conf, walGroupId, logQueue, source);
     this.source = source;
     this.replicationQueues = queueStorage;
   }
@@ -58,7 +56,7 @@ public long getStartPosition() {
     int numRetries = 0;
     while (numRetries <= maxRetriesMultiplier) {
       try {
-        source.locateRecoveredPaths(queue);
+        source.locateRecoveredPaths(walGroupId);
         break;
       } catch (IOException e) {
         LOG.error("Error while locating recovered queue paths, attempt #" + numRetries);
@@ -75,9 +73,9 @@ private long getRecoveredQueueStartPos() {
     String peerClusterZNode = source.getQueueId();
     try {
       startPosition = this.replicationQueues.getWALPosition(source.getServer().getServerName(),
-        peerClusterZNode, this.queue.peek().getName());
-      LOG.trace("Recovered queue started with log {} at position {}", this.queue.peek(),
-        startPosition);
+        peerClusterZNode, this.logQueue.getQueue(walGroupId).peek().getName());
+      LOG.trace("Recovered queue started with log {} at position {}",
+        this.logQueue.getQueue(walGroupId).peek(), startPosition);
     } catch (ReplicationException e) {
       terminate("Couldn't get the position of this recovered queue " + peerClusterZNode, e);
     }

hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSource.java

Lines changed: 21 additions & 44 deletions
@@ -24,14 +24,12 @@
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.Collections;
-import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
 import java.util.Set;
 import java.util.TreeMap;
 import java.util.UUID;
 import java.util.concurrent.ConcurrentHashMap;
-import java.util.concurrent.PriorityBlockingQueue;
 import java.util.concurrent.TimeUnit;
 import java.util.concurrent.TimeoutException;
 import java.util.concurrent.atomic.AtomicBoolean;
@@ -85,11 +83,9 @@
 public class ReplicationSource implements ReplicationSourceInterface {
 
   private static final Logger LOG = LoggerFactory.getLogger(ReplicationSource.class);
-  // Queues of logs to process, entry in format of walGroupId->queue,
-  // each presents a queue for one wal group
-  private Map<String, PriorityBlockingQueue<Path>> queues = new HashMap<>();
   // per group queue size, keep no more than this number of logs in each wal group
   protected int queueSizePerGroup;
+  protected ReplicationSourceLogQueue logQueue;
   protected ReplicationQueueStorage queueStorage;
   protected ReplicationPeer replicationPeer;
 
@@ -115,8 +111,6 @@ public class ReplicationSource implements ReplicationSourceInterface {
   volatile boolean sourceRunning = false;
   // Metrics for this source
   private MetricsSource metrics;
-  // WARN threshold for the number of queued logs, defaults to 2
-  private int logQueueWarnThreshold;
   // ReplicationEndpoint which will handle the actual replication
   private volatile ReplicationEndpoint replicationEndpoint;
 
@@ -210,6 +204,7 @@ public void init(Configuration conf, FileSystem fs, ReplicationSourceManager man
     this.maxRetriesMultiplier =
       this.conf.getInt("replication.source.maxretriesmultiplier", 300); // 5 minutes @ 1 sec per
     this.queueSizePerGroup = this.conf.getInt("hbase.regionserver.maxlogs", 32);
+    this.logQueue = new ReplicationSourceLogQueue(conf, metrics, this);
     this.queueStorage = queueStorage;
     this.replicationPeer = replicationPeer;
     this.manager = manager;
@@ -219,7 +214,6 @@ public void init(Configuration conf, FileSystem fs, ReplicationSourceManager man
 
     this.queueId = queueId;
     this.replicationQueueInfo = new ReplicationQueueInfo(queueId);
-    this.logQueueWarnThreshold = this.conf.getInt("replication.source.log.queue.warn", 2);
 
     // A defaultBandwidth of '0' means no bandwidth; i.e. no throttling.
     defaultBandwidth = this.conf.getLong("replication.source.per.peer.node.bandwidth", 0);
@@ -250,35 +244,20 @@ public void enqueueLog(Path wal) {
     }
     // Use WAL prefix as the WALGroupId for this peer.
     String walPrefix = AbstractFSWALProvider.getWALPrefixFromWALName(wal.getName());
-    PriorityBlockingQueue<Path> queue = queues.get(walPrefix);
-    if (queue == null) {
-      queue = new PriorityBlockingQueue<>(queueSizePerGroup,
-        new AbstractFSWALProvider.WALStartTimeComparator());
-      // make sure that we do not use an empty queue when setting up a ReplicationSource, otherwise
-      // the shipper may quit immediately
-      queue.put(wal);
-      queues.put(walPrefix, queue);
+    boolean queueExists = logQueue.enqueueLog(wal, walPrefix);
+
+    if (!queueExists) {
       if (this.isSourceActive() && this.walEntryFilter != null) {
         // new wal group observed after source startup, start a new worker thread to track it
         // notice: it's possible that wal enqueued when this.running is set but worker thread
        // still not launched, so it's necessary to check workerThreads before start the worker
-        tryStartNewShipper(walPrefix, queue);
+        tryStartNewShipper(walPrefix);
       }
-    } else {
-      queue.put(wal);
     }
     if (LOG.isTraceEnabled()) {
       LOG.trace("{} Added wal {} to queue of source {}.", logPeerId(), walPrefix,
         this.replicationQueueInfo.getQueueId());
     }
-    this.metrics.incrSizeOfLogQueue();
-    // This will wal a warning for each new wal that gets created above the warn threshold
-    int queueSize = queue.size();
-    if (queueSize > this.logQueueWarnThreshold) {
-      LOG.warn("{} WAL group {} queue size: {} exceeds value of " +
-        "replication.source.log.queue.warn {}", logPeerId(), walPrefix, queueSize,
-        logQueueWarnThreshold);
-    }
   }
 
   @Override
@@ -370,16 +349,16 @@ private void initializeWALEntryFilter(UUID peerClusterId) {
     this.walEntryFilter = new ChainWALEntryFilter(filters);
   }
 
-  private void tryStartNewShipper(String walGroupId, PriorityBlockingQueue<Path> queue) {
+  private void tryStartNewShipper(String walGroupId) {
     workerThreads.compute(walGroupId, (key, value) -> {
       if (value != null) {
         LOG.debug("{} preempted start of shipping worker walGroupId={}", logPeerId(), walGroupId);
         return value;
       } else {
         LOG.debug("{} starting shipping worker for walGroupId={}", logPeerId(), walGroupId);
-        ReplicationSourceShipper worker = createNewShipper(walGroupId, queue);
+        ReplicationSourceShipper worker = createNewShipper(walGroupId);
         ReplicationSourceWALReader walReader =
-          createNewWALReader(walGroupId, queue, worker.getStartPosition());
+          createNewWALReader(walGroupId, worker.getStartPosition());
         Threads.setDaemonThreadRunning(
             walReader, Thread.currentThread().getName()
                 + ".replicationSource.wal-reader." + walGroupId + "," + queueId,
@@ -399,7 +378,7 @@ public Map<String, ReplicationStatus> getWalGroupStatus() {
       String walGroupId = walGroupShipper.getKey();
       ReplicationSourceShipper shipper = walGroupShipper.getValue();
       ageOfLastShippedOp = metrics.getAgeOfLastShippedOp(walGroupId);
-      int queueSize = queues.get(walGroupId).size();
+      int queueSize = logQueue.getQueueSize(walGroupId);
       replicationDelay = metrics.getReplicationDelay();
       Path currentPath = shipper.getCurrentPath();
       fileSize = -1;
@@ -438,16 +417,16 @@ private long getFileSize(Path currentPath) throws IOException {
     return fileSize;
   }
 
-  protected ReplicationSourceShipper createNewShipper(String walGroupId,
-      PriorityBlockingQueue<Path> queue) {
-    return new ReplicationSourceShipper(conf, walGroupId, queue, this);
+  protected ReplicationSourceShipper createNewShipper(String walGroupId) {
+    return new ReplicationSourceShipper(conf, walGroupId, logQueue, this);
   }
 
-  private ReplicationSourceWALReader createNewWALReader(String walGroupId,
-      PriorityBlockingQueue<Path> queue, long startPosition) {
+  private ReplicationSourceWALReader createNewWALReader(String walGroupId, long startPosition) {
     return replicationPeer.getPeerConfig().isSerial()
-      ? new SerialReplicationSourceWALReader(fs, conf, queue, startPosition, walEntryFilter, this)
-      : new ReplicationSourceWALReader(fs, conf, queue, startPosition, walEntryFilter, this);
+      ? new SerialReplicationSourceWALReader(fs, conf, logQueue, startPosition, walEntryFilter,
+        this, walGroupId)
+      : new ReplicationSourceWALReader(fs, conf, logQueue, startPosition, walEntryFilter,
+        this, walGroupId);
   }
 
   /**
@@ -607,14 +586,12 @@ private void initialize() {
       throw new IllegalStateException("Source should be active.");
     }
     LOG.info("{} queueId={} (queues={}) is replicating from cluster={} to cluster={}",
-      logPeerId(), this.replicationQueueInfo.getQueueId(), this.queues.size(), clusterId,
+      logPeerId(), this.replicationQueueInfo.getQueueId(), logQueue.getNumQueues(), clusterId,
      peerClusterId);
     initializeWALEntryFilter(peerClusterId);
     // Start workers
-    for (Map.Entry<String, PriorityBlockingQueue<Path>> entry : queues.entrySet()) {
-      String walGroupId = entry.getKey();
-      PriorityBlockingQueue<Path> queue = entry.getValue();
-      tryStartNewShipper(walGroupId, queue);
+    for (String walGroupId: logQueue.getQueues().keySet()) {
+      tryStartNewShipper(walGroupId);
     }
     this.startupOngoing.set(false);
   }
@@ -844,7 +821,7 @@ void removeWorker(ReplicationSourceShipper worker) {
     workerThreads.remove(worker.walGroupId, worker);
   }
 
-  private String logPeerId(){
+  public String logPeerId(){
     return "peerId=" + this.getPeerId() + ",";
   }
 }
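
The new ReplicationSourceLogQueue class that these files now depend on is added elsewhere in this commit and does not appear above. Purely as a reading aid, the sketch below reconstructs the rough shape of its API from the call sites in these diffs; anything not visible at a call site (for example the value type of getQueues() and the return type of clear()) is an assumption, and the actual class may differ:

import java.util.Map;
import java.util.concurrent.PriorityBlockingQueue;
import org.apache.hadoop.fs.Path;

// Inferred shape only -- reconstructed from usages such as enqueueLog(wal, walPrefix),
// getQueue(walGroupId), getQueueSize(walGroupId), getNumQueues(), getQueues() and
// clear(walGroupId); not copied from the class added by this commit.
interface InferredReplicationSourceLogQueue {
  // Enqueue a WAL under its wal group; returns whether that group's queue already existed.
  boolean enqueueLog(Path wal, String walGroupId);
  // Per-group queue access used by shippers, readers and recovered sources.
  PriorityBlockingQueue<Path> getQueue(String walGroupId);
  int getQueueSize(String walGroupId);
  // Aggregate views used when starting workers and logging.
  int getNumQueues();
  Map<String, PriorityBlockingQueue<Path>> getQueues();
  // Drop a whole wal group queue (used when recovered paths are relocated).
  void clear(String walGroupId);
}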
