Skip to content

Commit d2a1f19

Browse files
HBASE-28690 Aborting Active HMaster is not rejecting reportRegionStateTransition if procedure is initialised by next Active master (#6129)
Added masterActiveTime as fencing token for remote procedures Signed-off-by: Duo Zhang <[email protected]> Reviewed-by: Aman Poonia <[email protected]>
1 parent 0646151 commit d2a1f19

31 files changed

+192
-78
lines changed

hbase-client/src/main/java/org/apache/hadoop/hbase/shaded/protobuf/ProtobufUtil.java

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3097,10 +3097,12 @@ public static CloseRegionRequest buildCloseRegionRequest(ServerName server, byte
30973097
}
30983098

30993099
public static CloseRegionRequest buildCloseRegionRequest(ServerName server, byte[] regionName,
3100-
ServerName destinationServer, long closeProcId, boolean evictCache) {
3100+
ServerName destinationServer, long closeProcId, boolean evictCache,
3101+
long initiatingMasterActiveTime) {
31013102
CloseRegionRequest.Builder builder =
31023103
getBuilder(server, regionName, destinationServer, closeProcId);
31033104
builder.setEvictCache(evictCache);
3105+
builder.setInitiatingMasterActiveTime(initiatingMasterActiveTime);
31043106
return builder.build();
31053107
}
31063108

hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/RemoteProcedureDispatcher.java

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -222,14 +222,22 @@ protected final void submitTask(Runnable task, long delay, TimeUnit unit) {
222222
*/
223223
public static abstract class RemoteOperation {
224224
private final RemoteProcedure remoteProcedure;
225+
// active time of the master that sent this request, used for fencing
226+
private final long initiatingMasterActiveTime;
225227

226-
protected RemoteOperation(final RemoteProcedure remoteProcedure) {
228+
protected RemoteOperation(final RemoteProcedure remoteProcedure,
229+
long initiatingMasterActiveTime) {
227230
this.remoteProcedure = remoteProcedure;
231+
this.initiatingMasterActiveTime = initiatingMasterActiveTime;
228232
}
229233

230234
public RemoteProcedure getRemoteProcedure() {
231235
return remoteProcedure;
232236
}
237+
238+
public long getInitiatingMasterActiveTime() {
239+
return initiatingMasterActiveTime;
240+
}
233241
}
234242

235243
/**

hbase-protocol-shaded/src/main/protobuf/server/master/RegionServerStatus.proto

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,9 @@ message RegionStateTransition {
9797
optional uint64 open_seq_num = 3;
9898

9999
repeated int64 proc_id = 4;
100+
101+
// Master active time as fencing token
102+
optional int64 initiating_master_active_time = 5;
100103
enum TransitionCode {
101104
OPENED = 0;
102105
FAILED_OPEN = 1;
@@ -155,6 +158,8 @@ message RemoteProcedureResult {
155158
}
156159
required Status status = 2;
157160
optional ForeignExceptionMessage error = 3;
161+
// Master active time as fencing token
162+
optional int64 initiating_master_active_time = 4;
158163
}
159164
message ReportProcedureDoneRequest {
160165
repeated RemoteProcedureResult result = 1;

hbase-protocol-shaded/src/main/protobuf/server/region/Admin.proto

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,8 @@ message OpenRegionRequest {
8080
repeated RegionOpenInfo open_info = 1;
8181
// the intended server for this RPC.
8282
optional uint64 serverStartCode = 2;
83+
// Master active time as fencing token
84+
optional int64 initiating_master_active_time = 3;
8385
// wall clock time from master
8486
optional uint64 master_system_time = 5;
8587

@@ -123,6 +125,8 @@ message CloseRegionRequest {
123125
optional uint64 serverStartCode = 5;
124126
optional int64 close_proc_id = 6 [default = -1];
125127
optional bool evict_cache = 7 [default = false];
128+
// Master active time as fencing token
129+
optional int64 initiating_master_active_time = 8;
126130
}
127131

128132
message CloseRegionResponse {
@@ -272,6 +276,8 @@ message RemoteProcedureRequest {
272276
required uint64 proc_id = 1;
273277
required string proc_class = 2;
274278
optional bytes proc_data = 3;
279+
// Master active time as fencing token
280+
optional int64 initiating_master_active_time = 4;
275281
}
276282

277283
message ExecuteProceduresRequest {

hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3153,6 +3153,7 @@ public long getMasterStartTime() {
31533153
}
31543154

31553155
/** Returns timestamp in millis when HMaster became the active master. */
3156+
@Override
31563157
public long getMasterActiveTime() {
31573158
return masterActiveTime;
31583159
}

hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterRpcServices.java

Lines changed: 31 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@
3939
import org.apache.hadoop.hbase.DoNotRetryIOException;
4040
import org.apache.hadoop.hbase.HBaseRpcServicesBase;
4141
import org.apache.hadoop.hbase.HConstants;
42+
import org.apache.hadoop.hbase.MasterNotRunningException;
4243
import org.apache.hadoop.hbase.MetaTableAccessor;
4344
import org.apache.hadoop.hbase.NamespaceDescriptor;
4445
import org.apache.hadoop.hbase.ServerMetrics;
@@ -64,7 +65,6 @@
6465
import org.apache.hadoop.hbase.ipc.QosPriority;
6566
import org.apache.hadoop.hbase.ipc.RpcServer;
6667
import org.apache.hadoop.hbase.ipc.RpcServer.BlockingServiceAndInterface;
67-
import org.apache.hadoop.hbase.ipc.ServerNotRunningYetException;
6868
import org.apache.hadoop.hbase.ipc.ServerRpcController;
6969
import org.apache.hadoop.hbase.master.assignment.AssignmentManager;
7070
import org.apache.hadoop.hbase.master.assignment.RegionStateNode;
@@ -396,6 +396,7 @@
396396
import org.apache.hadoop.hbase.shaded.protobuf.generated.RSGroupAdminProtos.UpdateRSGroupConfigRequest;
397397
import org.apache.hadoop.hbase.shaded.protobuf.generated.RSGroupAdminProtos.UpdateRSGroupConfigResponse;
398398
import org.apache.hadoop.hbase.shaded.protobuf.generated.RecentLogs;
399+
import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos;
399400
import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.FileArchiveNotificationRequest;
400401
import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.FileArchiveNotificationResponse;
401402
import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.GetLastFlushedSequenceIdRequest;
@@ -1854,6 +1855,15 @@ public ReportRegionStateTransitionResponse reportRegionStateTransition(RpcContro
18541855
ReportRegionStateTransitionRequest req) throws ServiceException {
18551856
try {
18561857
server.checkServiceStarted();
1858+
for (RegionServerStatusProtos.RegionStateTransition transition : req.getTransitionList()) {
1859+
long procId =
1860+
transition.getProcIdCount() > 0 ? transition.getProcId(0) : Procedure.NO_PROC_ID;
1861+
// -1 is less than any possible master active time, so a report without the field is never rejected
1862+
long initiatingMasterActiveTime = transition.hasInitiatingMasterActiveTime()
1863+
? transition.getInitiatingMasterActiveTime()
1864+
: -1;
1865+
throwOnOldMaster(procId, initiatingMasterActiveTime);
1866+
}
18571867
return server.getAssignmentManager().reportRegionStateTransition(req);
18581868
} catch (IOException ioe) {
18591869
throw new ServiceException(ioe);
@@ -2553,8 +2563,14 @@ public ReportProcedureDoneResponse reportProcedureDone(RpcController controller,
25532563
// Check Master is up and ready for duty before progressing. Remote side will keep trying.
25542564
try {
25552565
this.server.checkServiceStarted();
2556-
} catch (ServerNotRunningYetException snrye) {
2557-
throw new ServiceException(snrye);
2566+
for (RemoteProcedureResult result : request.getResultList()) {
2567+
// -1 is less than any possible master active time, so a result without the field is never rejected
2568+
long initiatingMasterActiveTime =
2569+
result.hasInitiatingMasterActiveTime() ? result.getInitiatingMasterActiveTime() : -1;
2570+
throwOnOldMaster(result.getProcId(), initiatingMasterActiveTime);
2571+
}
2572+
} catch (IOException ioe) {
2573+
throw new ServiceException(ioe);
25582574
}
25592575
request.getResultList().forEach(result -> {
25602576
if (result.getStatus() == RemoteProcedureResult.Status.SUCCESS) {
@@ -2567,6 +2583,18 @@ public ReportProcedureDoneResponse reportProcedureDone(RpcController controller,
25672583
return ReportProcedureDoneResponse.getDefaultInstance();
25682584
}
25692585

2586+
private void throwOnOldMaster(long procId, long initiatingMasterActiveTime)
2587+
throws MasterNotRunningException {
2588+
if (initiatingMasterActiveTime > server.getMasterActiveTime()) {
2589+
// procedure is initiated by new active master but report received on master with older active
2590+
// time
2591+
LOG.warn(
2592+
"Report for procId: {} and initiatingMasterAT {} received on master with activeTime {}",
2593+
procId, initiatingMasterActiveTime, server.getMasterActiveTime());
2594+
throw new MasterNotRunningException("Another master is active");
2595+
}
2596+
}
2597+
25702598
@Override
25712599
public FileArchiveNotificationResponse reportFileArchival(RpcController controller,
25722600
FileArchiveNotificationRequest request) throws ServiceException {

hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterServices.java

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -267,6 +267,9 @@ long splitRegion(final RegionInfo regionInfo, final byte[] splitRow, final long
267267
/** Returns true if master is the active one */
268268
boolean isActiveMaster();
269269

270+
/** Returns timestamp in millis when this master became the active one. */
271+
long getMasterActiveTime();
272+
270273
/** Returns true if master is initialized */
271274
boolean isInitialized();
272275

hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/CloseRegionProcedure.java

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -64,8 +64,9 @@ public TableOperationType getTableOperationType() {
6464
}
6565

6666
@Override
67-
public RemoteOperation newRemoteOperation() {
68-
return new RegionCloseOperation(this, region, getProcId(), assignCandidate, evictCache);
67+
public RemoteOperation newRemoteOperation(MasterProcedureEnv env) {
68+
return new RegionCloseOperation(this, region, getProcId(), assignCandidate, evictCache,
69+
env.getMasterServices().getMasterActiveTime());
6970
}
7071

7172
@Override

hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/OpenRegionProcedure.java

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -57,8 +57,9 @@ public TableOperationType getTableOperationType() {
5757
}
5858

5959
@Override
60-
public RemoteOperation newRemoteOperation() {
61-
return new RegionOpenOperation(this, region, getProcId());
60+
public RemoteOperation newRemoteOperation(MasterProcedureEnv env) {
61+
return new RegionOpenOperation(this, region, getProcId(),
62+
env.getMasterServices().getMasterActiveTime());
6263
}
6364

6465
@Override

hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionRemoteProcedureBase.java

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -96,10 +96,11 @@ public Optional<RemoteProcedureDispatcher.RemoteOperation> remoteCallBuild(Maste
9696
if (state == RegionRemoteProcedureBaseState.REGION_REMOTE_PROCEDURE_REPORT_SUCCEED) {
9797
return Optional.empty();
9898
}
99-
return Optional.of(newRemoteOperation());
99+
return Optional.of(newRemoteOperation(env));
100100
}
101101

102-
protected abstract RemoteProcedureDispatcher.RemoteOperation newRemoteOperation();
102+
protected abstract RemoteProcedureDispatcher.RemoteOperation
103+
newRemoteOperation(MasterProcedureEnv env);
103104

104105
@Override
105106
public void remoteOperationCompleted(MasterProcedureEnv env) {

0 commit comments

Comments
 (0)