Skip to content

Commit 7eace84

Browse files
committed
HBASE-22737 Add a new admin method and shell cmd to trigger the hbck chore to run
1 parent 4829c18 commit 7eace84

File tree

13 files changed

+172
-62
lines changed

13 files changed

+172
-62
lines changed

hbase-client/src/main/java/org/apache/hadoop/hbase/client/HBaseHbck.java

Lines changed: 29 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -34,10 +34,15 @@
3434

3535
import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil;
3636
import org.apache.hadoop.hbase.shaded.protobuf.RequestConverter;
37-
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos;
37+
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.AssignsResponse;
38+
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.BypassProcedureRequest;
39+
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.BypassProcedureResponse;
3840
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.GetTableStateResponse;
3941
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.HbckService.BlockingInterface;
40-
42+
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.RunHbckChoreRequest;
43+
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.RunHbckChoreResponse;
44+
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.ScheduleServerCrashProcedureResponse;
45+
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.UnassignsResponse;
4146

4247
/**
4348
* Use {@link Connection#getHbck()} to obtain an instance of {@link Hbck} instead of
@@ -105,9 +110,8 @@ public TableState setTableStateInMeta(TableState state) throws IOException {
105110
public List<Long> assigns(List<String> encodedRegionNames, boolean override)
106111
throws IOException {
107112
try {
108-
MasterProtos.AssignsResponse response =
109-
this.hbck.assigns(rpcControllerFactory.newController(),
110-
RequestConverter.toAssignRegionsRequest(encodedRegionNames, override));
113+
AssignsResponse response = this.hbck.assigns(rpcControllerFactory.newController(),
114+
RequestConverter.toAssignRegionsRequest(encodedRegionNames, override));
111115
return response.getPidList();
112116
} catch (ServiceException se) {
113117
LOG.debug(toCommaDelimitedString(encodedRegionNames), se);
@@ -119,9 +123,8 @@ public List<Long> assigns(List<String> encodedRegionNames, boolean override)
119123
public List<Long> unassigns(List<String> encodedRegionNames, boolean override)
120124
throws IOException {
121125
try {
122-
MasterProtos.UnassignsResponse response =
123-
this.hbck.unassigns(rpcControllerFactory.newController(),
124-
RequestConverter.toUnassignRegionsRequest(encodedRegionNames, override));
126+
UnassignsResponse response = this.hbck.unassigns(rpcControllerFactory.newController(),
127+
RequestConverter.toUnassignRegionsRequest(encodedRegionNames, override));
125128
return response.getPidList();
126129
} catch (ServiceException se) {
127130
LOG.debug(toCommaDelimitedString(encodedRegionNames), se);
@@ -137,13 +140,13 @@ private static String toCommaDelimitedString(List<String> list) {
137140
public List<Boolean> bypassProcedure(List<Long> pids, long waitTime, boolean override,
138141
boolean recursive)
139142
throws IOException {
140-
MasterProtos.BypassProcedureResponse response = ProtobufUtil.call(
141-
new Callable<MasterProtos.BypassProcedureResponse>() {
143+
BypassProcedureResponse response = ProtobufUtil.call(
144+
new Callable<BypassProcedureResponse>() {
142145
@Override
143-
public MasterProtos.BypassProcedureResponse call() throws Exception {
146+
public BypassProcedureResponse call() throws Exception {
144147
try {
145148
return hbck.bypassProcedure(rpcControllerFactory.newController(),
146-
MasterProtos.BypassProcedureRequest.newBuilder().addAllProcId(pids).
149+
BypassProcedureRequest.newBuilder().addAllProcId(pids).
147150
setWaitTime(waitTime).setOverride(override).setRecursive(recursive).build());
148151
} catch (Throwable t) {
149152
LOG.error(pids.stream().map(i -> i.toString()).
@@ -159,7 +162,7 @@ public MasterProtos.BypassProcedureResponse call() throws Exception {
159162
public List<Long> scheduleServerCrashProcedures(List<ServerName> serverNames)
160163
throws IOException {
161164
try {
162-
MasterProtos.ScheduleServerCrashProcedureResponse response =
165+
ScheduleServerCrashProcedureResponse response =
163166
this.hbck.scheduleServerCrashProcedure(rpcControllerFactory.newController(),
164167
RequestConverter.toScheduleServerCrashProcedureRequest(serverNames));
165168
return response.getPidList();
@@ -171,4 +174,16 @@ public List<Long> scheduleServerCrashProcedures(List<ServerName> serverNames)
171174
throw new IOException(se);
172175
}
173176
}
174-
}
177+
178+
/**
 * Sends a RunHbckChore request to the master through the hbck RPC stub.
 *
 * @return the <code>ran</code> flag carried by the master's response
 * @throws IOException wrapping the ServiceException raised by the RPC call
 */
@Override
179+
public boolean runHbckChore() throws IOException {
180+
try {
181+
RunHbckChoreResponse response = this.hbck.runHbckChore(rpcControllerFactory.newController(),
182+
RunHbckChoreRequest.newBuilder().build());
183+
return response.getRan();
184+
} catch (ServiceException se) {
185+
// Logged at debug only; the failure is surfaced to the caller as an IOException.
LOG.debug("Failed to run HBCK chore", se);
186+
throw new IOException(se);
187+
}
188+
}
189+
}

hbase-client/src/main/java/org/apache/hadoop/hbase/client/Hbck.java

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -121,4 +121,12 @@ default List<Long> scheduleServerCrashProcedure(List<HBaseProtos.ServerName> ser
121121
}
122122

123123
List<Long> scheduleServerCrashProcedures(List<ServerName> serverNames) throws IOException;
124+
125+
/**
126+
* Requests that the master run its HBCK chore.
127+
*
128+
* @return <code>true</code> if the HBCK chore ran, <code>false</code> if it was already running
129+
* @throws IOException if a remote or network exception occurs
130+
*/
131+
boolean runHbckChore() throws IOException;
124132
}

hbase-protocol-shaded/src/main/protobuf/Master.proto

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -358,6 +358,13 @@ message IsNormalizerEnabledResponse {
358358
required bool enabled = 1;
359359
}
360360

361+
// Request for the HbckService RunHbckChore rpc; carries no fields.
message RunHbckChoreRequest {
362+
}
363+
364+
// Response for the HbckService RunHbckChore rpc.
message RunHbckChoreResponse {
365+
// Whether the HBCK chore was run in response to the request.
required bool ran = 1;
366+
}
367+
361368
message RunCatalogScanRequest {
362369
}
363370

@@ -1138,4 +1145,10 @@ service HbckService {
11381145
/** Schedule a ServerCrashProcedure to help recover a crash server */
11391146
rpc ScheduleServerCrashProcedure(ScheduleServerCrashProcedureRequest)
11401147
returns(ScheduleServerCrashProcedureResponse);
1148+
1149+
/**
1150+
* Request HBCK chore to run at master side.
1151+
*/
1152+
rpc RunHbckChore(RunHbckChoreRequest)
1153+
returns(RunHbckChoreResponse);
11411154
}

hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -385,7 +385,7 @@ public void run() {
385385
private ClusterStatusPublisher clusterStatusPublisherChore = null;
386386
private SnapshotCleanerChore snapshotCleanerChore = null;
387387

388-
private HbckChecker hbckChecker;
388+
private HbckChore hbckChore;
389389
CatalogJanitor catalogJanitorChore;
390390
private LogCleaner logCleaner;
391391
private HFileCleaner hfileCleaner;
@@ -1109,8 +1109,8 @@ private void finishActiveMasterInitialization(MonitoredTask status) throws IOExc
11091109
getChoreService().scheduleChore(normalizerChore);
11101110
this.catalogJanitorChore = new CatalogJanitor(this);
11111111
getChoreService().scheduleChore(catalogJanitorChore);
1112-
this.hbckChecker = new HbckChecker(this);
1113-
getChoreService().scheduleChore(hbckChecker);
1112+
this.hbckChore = new HbckChore(this);
1113+
getChoreService().scheduleChore(hbckChore);
11141114
this.serverManager.startChore();
11151115

11161116
// Only for rolling upgrade, where we need to migrate the data in namespace table to meta table.
@@ -1590,7 +1590,7 @@ private void stopChores() {
15901590
choreService.cancelChore(this.hfileCleaner);
15911591
choreService.cancelChore(this.replicationBarrierCleaner);
15921592
choreService.cancelChore(this.snapshotCleanerChore);
1593-
choreService.cancelChore(this.hbckChecker);
1593+
choreService.cancelChore(this.hbckChore);
15941594
}
15951595
}
15961596

@@ -3761,7 +3761,7 @@ public Map<String, ReplicationStatus> getWalGroupsReplicationStatus() {
37613761
return super.getWalGroupsReplicationStatus();
37623762
}
37633763

3764-
public HbckChecker getHbckChecker() {
3765-
return this.hbckChecker;
3764+
public HbckChore getHbckChore() {
3765+
return this.hbckChore;
37663766
}
37673767
}

hbase-server/src/main/java/org/apache/hadoop/hbase/master/HbckChecker.java renamed to hbase-server/src/main/java/org/apache/hadoop/hbase/master/HbckChore.java

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -47,11 +47,11 @@
4747
*/
4848
@InterfaceAudience.Private
4949
@InterfaceStability.Evolving
50-
public class HbckChecker extends ScheduledChore {
51-
private static final Logger LOG = LoggerFactory.getLogger(HbckChecker.class.getName());
50+
public class HbckChore extends ScheduledChore {
51+
private static final Logger LOG = LoggerFactory.getLogger(HbckChore.class.getName());
5252

53-
private static final String HBCK_CHECKER_INTERVAL = "hbase.master.hbck.checker.interval";
54-
private static final int DEFAULT_HBCK_CHECKER_INTERVAL = 60 * 60 * 1000;
53+
private static final String HBCK_CHORE_INTERVAL = "hbase.master.hbck.chore.interval";
54+
private static final int DEFAULT_HBCK_CHORE_INTERVAL = 60 * 60 * 1000;
5555

5656
private final MasterServices master;
5757

@@ -100,14 +100,14 @@ public class HbckChecker extends ScheduledChore {
100100
private volatile long checkingStartTimestamp = 0;
101101
private volatile long checkingEndTimestamp = 0;
102102

103-
public HbckChecker(MasterServices master) {
104-
super("HbckChecker-", master,
105-
master.getConfiguration().getInt(HBCK_CHECKER_INTERVAL, DEFAULT_HBCK_CHECKER_INTERVAL));
103+
public HbckChore(MasterServices master) {
104+
super("HbckChore-", master,
105+
master.getConfiguration().getInt(HBCK_CHORE_INTERVAL, DEFAULT_HBCK_CHORE_INTERVAL));
106106
this.master = master;
107107
}
108108

109109
@Override
110-
protected void chore() {
110+
protected synchronized void chore() {
111111
running = true;
112112
regionInfoMap.clear();
113113
orphanRegionsOnRS.clear();
@@ -277,6 +277,6 @@ public long getCheckingStartTimestamp() {
277277
* Used by the web UI to show when the HBCK checking report was generated.
278278
*/
279279
public long getCheckingEndTimestamp() {
280-
return this.checkingStartTimestamp;
280+
return this.checkingEndTimestamp;
281281
}
282282
}

hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterRpcServices.java

Lines changed: 16 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,6 @@
4949
import org.apache.hadoop.hbase.client.MasterSwitchType;
5050
import org.apache.hadoop.hbase.client.RegionInfo;
5151
import org.apache.hadoop.hbase.client.RegionInfoBuilder;
52-
import org.apache.hadoop.hbase.client.Result;
5352
import org.apache.hadoop.hbase.client.Table;
5453
import org.apache.hadoop.hbase.client.TableDescriptor;
5554
import org.apache.hadoop.hbase.client.TableState;
@@ -68,10 +67,7 @@
6867
import org.apache.hadoop.hbase.ipc.RpcServerFactory;
6968
import org.apache.hadoop.hbase.ipc.RpcServerInterface;
7069
import org.apache.hadoop.hbase.ipc.ServerRpcController;
71-
import org.apache.hadoop.hbase.master.assignment.MergeTableRegionsProcedure;
72-
import org.apache.hadoop.hbase.master.assignment.RegionStateStore;
7370
import org.apache.hadoop.hbase.master.assignment.RegionStates;
74-
import org.apache.hadoop.hbase.master.assignment.SplitTableRegionProcedure;
7571
import org.apache.hadoop.hbase.master.locking.LockProcedure;
7672
import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv;
7773
import org.apache.hadoop.hbase.master.procedure.MasterProcedureUtil;
@@ -91,7 +87,6 @@
9187
import org.apache.hadoop.hbase.quotas.QuotaObserverChore;
9288
import org.apache.hadoop.hbase.quotas.QuotaUtil;
9389
import org.apache.hadoop.hbase.quotas.SpaceQuotaSnapshot;
94-
import org.apache.hadoop.hbase.regionserver.HRegionFileSystem;
9590
import org.apache.hadoop.hbase.regionserver.RSRpcServices;
9691
import org.apache.hadoop.hbase.regionserver.RpcSchedulerFactory;
9792
import org.apache.hadoop.hbase.replication.ReplicationException;
@@ -112,10 +107,8 @@
112107
import org.apache.hadoop.hbase.snapshot.SnapshotDescriptionUtils;
113108
import org.apache.hadoop.hbase.util.Bytes;
114109
import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
115-
import org.apache.hadoop.hbase.util.FSUtils;
116110
import org.apache.hadoop.hbase.util.ForeignExceptionUtil;
117111
import org.apache.hadoop.hbase.util.Pair;
118-
import org.apache.hadoop.hbase.util.PairOfSameType;
119112
import org.apache.hadoop.hbase.wal.AbstractFSWALProvider;
120113
import org.apache.hadoop.hbase.zookeeper.MetaTableLocator;
121114
import org.apache.yetus.audience.InterfaceAudience;
@@ -264,6 +257,8 @@
264257
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.RunCatalogScanResponse;
265258
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.RunCleanerChoreRequest;
266259
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.RunCleanerChoreResponse;
260+
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.RunHbckChoreRequest;
261+
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.RunHbckChoreResponse;
267262
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.SecurityCapabilitiesRequest;
268263
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.SecurityCapabilitiesResponse;
269264
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.SetBalancerRunningRequest;
@@ -2371,6 +2366,20 @@ public FileArchiveNotificationResponse reportFileArchival(RpcController controll
23712366

23722367
// HBCK Services
23732368

2369+
/**
 * Runs the master's HBCK chore on request, unless the chore reports that it is already running.
 *
 * @return a response whose <code>ran</code> flag is true only when this call invoked the chore
 * @throws ServiceException declared by the RPC interface; presumably raised by rpcPreCheck
 */
@Override
2370+
public RunHbckChoreResponse runHbckChore(RpcController c, RunHbckChoreRequest req)
2371+
throws ServiceException {
2372+
rpcPreCheck("runHbckChore");
2373+
LOG.info("{} request HBCK chore to run", master.getClientIdAuditPrefix());
2374+
HbckChore hbckChore = master.getHbckChore();
2375+
boolean ran = false;
2376+
// NOTE(review): isRunning() and chore() are two separate calls, so this check is not atomic
// with the run; two overlapping requests could both pass it - confirm whether that matters.
if (!hbckChore.isRunning()) {
2377+
// chore() is invoked directly here, so this handler returns only after the chore finishes.
hbckChore.chore();
2378+
ran = true;
2379+
}
2380+
return RunHbckChoreResponse.newBuilder().setRan(ran).build();
2381+
}
2382+
23742383
/**
23752384
* Update state of the table in meta only. This is required by hbck in some situations to cleanup
23762385
* stuck assign/ unassign regions procedures for the table.

hbase-server/src/main/resources/hbase-webapps/master/hbck.jsp

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@
2727
import="java.time.ZonedDateTime"
2828
import="java.time.format.DateTimeFormatter"
2929
%>
30-
<%@ page import="org.apache.hadoop.hbase.master.HbckChecker" %>
30+
<%@ page import="org.apache.hadoop.hbase.master.HbckChore" %>
3131
<%@ page import="org.apache.hadoop.hbase.master.HMaster" %>
3232
<%@ page import="org.apache.hadoop.hbase.ServerName" %>
3333
<%@ page import="org.apache.hadoop.hbase.util.Bytes" %>
@@ -38,18 +38,18 @@
3838
<%
3939
HMaster master = (HMaster) getServletContext().getAttribute(HMaster.MASTER);
4040
pageContext.setAttribute("pageTitle", "HBase Master HBCK Report: " + master.getServerName());
41-
HbckChecker hbckChecker = master.getHbckChecker();
41+
HbckChore hbckChore = master.getHbckChore();
4242
Map<String, Pair<ServerName, List<ServerName>>> inconsistentRegions = null;
4343
Map<String, ServerName> orphanRegionsOnRS = null;
4444
List<String> orphanRegionsOnFS = null;
4545
long startTimestamp = 0;
4646
long endTimestamp = 0;
47-
if (hbckChecker != null) {
48-
inconsistentRegions = hbckChecker.getInconsistentRegions();
49-
orphanRegionsOnRS = hbckChecker.getOrphanRegionsOnRS();
50-
orphanRegionsOnFS = hbckChecker.getOrphanRegionsOnFS();
51-
startTimestamp = hbckChecker.getCheckingStartTimestamp();
52-
endTimestamp = hbckChecker.getCheckingEndTimestamp();
47+
if (hbckChore != null) {
48+
inconsistentRegions = hbckChore.getInconsistentRegions();
49+
orphanRegionsOnRS = hbckChore.getOrphanRegionsOnRS();
50+
orphanRegionsOnFS = hbckChore.getOrphanRegionsOnFS();
51+
startTimestamp = hbckChore.getCheckingStartTimestamp();
52+
endTimestamp = hbckChore.getCheckingEndTimestamp();
5353
}
5454
ZonedDateTime zdt = ZonedDateTime.ofInstant(Instant.ofEpochMilli(startTimestamp),
5555
ZoneId.systemDefault());

hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestHbck.java

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@
3636
import org.apache.hadoop.hbase.coprocessor.MasterCoprocessorEnvironment;
3737
import org.apache.hadoop.hbase.coprocessor.MasterObserver;
3838
import org.apache.hadoop.hbase.coprocessor.ObserverContext;
39+
import org.apache.hadoop.hbase.master.HMaster;
3940
import org.apache.hadoop.hbase.master.RegionState;
4041
import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv;
4142
import org.apache.hadoop.hbase.master.procedure.TableProcedureInterface;
@@ -237,6 +238,20 @@ public void testScheduleSCP() throws Exception {
237238
waitOnPids(pids);
238239
}
239240

241+
/**
 * Requests an HBCK chore run through the Hbck client and checks that the master's chore
 * reports a later checking-end timestamp once a run has actually happened.
 */
@Test
242+
public void testRunHbckChore() throws Exception {
243+
HMaster master = TEST_UTIL.getMiniHBaseCluster().getMaster();
244+
// Baseline end timestamp taken before any run is requested.
long endTimestamp = master.getHbckChore().getCheckingEndTimestamp();
245+
Hbck hbck = getHbck();
246+
boolean ran = false;
247+
// runHbckChore() returns false while the chore is already running, so retry until a request
// actually triggers a run.
while (!ran) {
248+
ran = hbck.runHbckChore();
249+
if (ran) {
250+
assertTrue(master.getHbckChore().getCheckingEndTimestamp() > endTimestamp);
251+
}
252+
}
253+
}
254+
240255
public static class FailingSplitAfterMetaUpdatedMasterObserver
241256
implements MasterCoprocessor, MasterObserver {
242257
public volatile CountDownLatch latch;

0 commit comments

Comments
 (0)