Skip to content

Commit 8e52339

Browse files
committed
HBASE-23307 Add running of ReplicationBarrierCleaner to hbck2 fixMeta invocation (#859)
Signed-off-by: Lijin Bin <[email protected]>
1 parent 3b0c276 commit 8e52339

File tree

7 files changed

+41
-17
lines changed

7 files changed

+41
-17
lines changed

hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3869,4 +3869,11 @@ public String getClusterId() {
38693869
return cachedClusterId.getFromCacheOrFetch();
38703870
}
38713871

3872+
@Override
3873+
public void runReplicationBarrierCleaner() {
3874+
ReplicationBarrierCleaner rbc = this.replicationBarrierCleaner;
3875+
if (rbc != null) {
3876+
rbc.chore();
3877+
}
3878+
}
38723879
}

hbase-server/src/main/java/org/apache/hadoop/hbase/master/HbckChore.java

Lines changed: 16 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
/**
1+
/*
22
* Licensed to the Apache Software Foundation (ASF) under one
33
* or more contributor license agreements. See the NOTICE file
44
* distributed with this work for additional information
@@ -122,22 +122,26 @@ protected synchronized void chore() {
122122
LOG.warn("hbckChore is either disabled or is already running. Can't run the chore");
123123
return;
124124
}
125-
running = true;
126125
regionInfoMap.clear();
127126
disabledTableRegions.clear();
128127
splitParentRegions.clear();
129128
orphanRegionsOnRS.clear();
130129
orphanRegionsOnFS.clear();
131130
inconsistentRegions.clear();
132131
checkingStartTimestamp = EnvironmentEdgeManager.currentTime();
133-
loadRegionsFromInMemoryState();
134-
loadRegionsFromRSReport();
132+
running = true;
135133
try {
136-
loadRegionsFromFS();
137-
} catch (IOException e) {
138-
LOG.warn("Failed to load the regions from filesystem", e);
134+
loadRegionsFromInMemoryState();
135+
loadRegionsFromRSReport();
136+
try {
137+
loadRegionsFromFS();
138+
} catch (IOException e) {
139+
LOG.warn("Failed to load the regions from filesystem", e);
140+
}
141+
saveCheckResultToSnapshot();
142+
} catch (Throwable t) {
143+
LOG.warn("Unexpected", t);
139144
}
140-
saveCheckResultToSnapshot();
141145
running = false;
142146
}
143147

@@ -262,6 +266,10 @@ private void loadRegionsFromFS() throws IOException {
262266
List<Path> regionDirs = FSUtils.getRegionDirs(fs, tableDir);
263267
for (Path regionDir : regionDirs) {
264268
String encodedRegionName = regionDir.getName();
269+
if (encodedRegionName == null) {
270+
LOG.warn("Failed get of encoded name from {}", regionDir);
271+
continue;
272+
}
265273
HbckRegionInfo hri = regionInfoMap.get(encodedRegionName);
266274
if (hri == null) {
267275
orphanRegionsOnFS.put(encodedRegionName, regionDir);

hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterServices.java

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -537,4 +537,8 @@ default SplitWALManager getSplitWALManager(){
537537
*/
538538
List<RegionPlan> executeRegionPlansWithThrottling(List<RegionPlan> plans);
539539

540+
/**
541+
* Run the ReplicationBarrierCleaner chore.
542+
*/
543+
void runReplicationBarrierCleaner();
540544
}

hbase-server/src/main/java/org/apache/hadoop/hbase/master/MetaFixer.java

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,9 @@ void fix() throws IOException {
7777
}
7878
fixHoles(report);
7979
fixOverlaps(report);
80+
// Run the ReplicationBarrierCleaner here; it may clear out rep_barrier rows which
81+
// can help clean up a damaged hbase:meta.
82+
this.masterServices.runReplicationBarrierCleaner();
8083
}
8184

8285
/**

hbase-server/src/main/java/org/apache/hadoop/hbase/master/cleaner/ReplicationBarrierCleaner.java

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,6 @@
4848
*/
4949
@InterfaceAudience.Private
5050
public class ReplicationBarrierCleaner extends ScheduledChore {
51-
5251
private static final Logger LOG = LoggerFactory.getLogger(ReplicationBarrierCleaner.class);
5352

5453
private static final String REPLICATION_BARRIER_CLEANER_INTERVAL =
@@ -71,7 +70,9 @@ public ReplicationBarrierCleaner(Configuration conf, Stoppable stopper, Connecti
7170
}
7271

7372
@Override
74-
protected void chore() {
73+
// Public so it can be run out of MasterRpcServices. Synchronized so only one
74+
// running instance at a time.
75+
public synchronized void chore() {
7576
long totalRows = 0;
7677
long cleanedRows = 0;
7778
long deletedRows = 0;
@@ -168,11 +169,9 @@ protected void chore() {
168169
LOG.warn("Failed to clean up replication barrier", e);
169170
}
170171
if (totalRows > 0) {
171-
LOG.info(
172-
"Cleanup replication barriers: totalRows {}, " +
173-
"cleanedRows {}, deletedRows {}, deletedBarriers {}, deletedLastPushedSeqIds {}",
174-
totalRows, cleanedRows, deletedRows, deletedBarriers, deletedLastPushedSeqIds);
172+
LOG.info("TotalRows={}, cleanedRows={}, deletedRows={}, deletedBarriers={}, " +
173+
"deletedLastPushedSeqIds={}", totalRows, cleanedRows, deletedRows,
174+
deletedBarriers, deletedLastPushedSeqIds);
175175
}
176176
}
177-
178177
}

hbase-server/src/test/java/org/apache/hadoop/hbase/master/MockNoopMasterServices.java

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -490,4 +490,7 @@ public List<RegionPlan> executeRegionPlansWithThrottling(List<RegionPlan> plans)
490490
public AsyncClusterConnection getAsyncClusterConnection() {
491491
return null;
492492
}
493-
}
493+
494+
@Override
495+
public void runReplicationBarrierCleaner() {}
496+
}

hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestClusterRestartFailover.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,7 @@ public void test() throws Exception {
108108
.filter(p -> (p instanceof ServerCrashProcedure) &&
109109
((ServerCrashProcedure) p).getServerName().equals(SERVER_FOR_TEST)).findAny();
110110
assertTrue("Should have one SCP for " + SERVER_FOR_TEST, procedure.isPresent());
111-
assertFalse("Submit the SCP for the same serverName " + SERVER_FOR_TEST + " which should fail",
111+
assertTrue("Submit the SCP for the same serverName " + SERVER_FOR_TEST + " which should fail",
112112
UTIL.getHBaseCluster().getMaster().getServerManager().expireServer(SERVER_FOR_TEST) ==
113113
Procedure.NO_PROC_ID);
114114

0 commit comments

Comments
 (0)