Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -204,7 +204,8 @@ public boolean unregisterListener(final ServerListener listener) {
}

/**
* Let the server manager know a new regionserver has come online
* Let the server manager know a regionserver is requesting configurations.
* Regionserver will not be added here, but in its first report.
* @param request the startup request
* @param versionNumber the version number of the new regionserver
* @param version the version of the new regionserver, could contain strings like "SNAPSHOT"
Expand All @@ -227,10 +228,6 @@ ServerName regionServerStartup(RegionServerStartupRequest request, int versionNu
ServerName sn = ServerName.valueOf(hostname, request.getPort(), request.getServerStartCode());
checkClockSkew(sn, request.getServerCurrentTime());
checkIsDead(sn, "STARTUP");
if (!checkAndRecordNewServer(sn, ServerMetricsBuilder.of(sn, versionNumber, version))) {
LOG.warn(
"THIS SHOULD NOT HAPPEN, RegionServerStartup" + " could not record the server: " + sn);
}
return sn;
}

Expand Down Expand Up @@ -281,7 +278,6 @@ public void regionServerReport(ServerName sn,
if (null == this.onlineServers.replace(sn, sl)) {
// Already have this host+port combo and its just different start code?
// Just let the server in. Presume master joining a running cluster.
// recordNewServer is what happens at the end of reportServerStartup.
// The only thing we are skipping is passing back to the regionserver
// the ServerName to use. Here we presume a master has already done
// that so we'll press on with whatever it gave us for ServerName.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1014,10 +1014,9 @@ public void run() {
// node was created, in case any coprocessors want to use ZooKeeper
this.rsHost = new RegionServerCoprocessorHost(this, this.conf);

// Try and register with the Master; tell it we are here. Break if server is stopped or
// the clusterup flag is down or hdfs went wacky. Once registered successfully, go ahead and
// start up all Services. Use RetryCounter to get backoff in case Master is struggling to
// come up.
// Get configurations from the Master. Break if server is stopped or
// the clusterup flag is down or hdfs went wacky. Then start up all Services.
// Use RetryCounter to get backoff in case Master is struggling to come up.
LOG.debug("About to register with Master.");
RetryCounterFactory rcf =
new RetryCounterFactory(Integer.MAX_VALUE, this.sleeper.getPeriod(), 1000 * 60 * 5);
Expand Down Expand Up @@ -1050,7 +1049,7 @@ public void run() {
}
}

// We registered with the Master. Go into run mode.
// Run mode.
long lastMsg = System.currentTimeMillis();
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How about setting this lastMsg here should be set to Long.MAX_LONG so we heartbeat immediately after the report-for-duty... so no lag before the RS is 'online'?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think you mean set it to LONG.MIN_LONG (or 0), since the line that checks it says if ((now - lastMsg) >= msgInterval), but yes good one 👍

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You are right.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So, do in a follow-on @caroliney14 ?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@saintstack do we have to do this one? asking because the lastMsg timestamp ultimately ends up getting stored here, not sure if it'll have any impact we care about. also, one stage of the PR build kept failing for some unknown reason when I had pushed up the change long lastMsg = 0; which is why I ended up undoing it

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No. We not have to do it. It was suggestion. You've done a mountain of great work in here already.

I'll keep an eye on it. Was thinking we heartbeat immediately but that might not be a good idea on a big cluster....

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

sounds good, thank you @saintstack

long oldRequestCount = -1;
// The main run loop.
Expand Down Expand Up @@ -1084,7 +1083,14 @@ public void run() {
}
long now = System.currentTimeMillis();
if ((now - lastMsg) >= msgInterval) {
tryRegionServerReport(lastMsg, now);
// Register with the Master now that our setup is complete.
if (tryRegionServerReport(lastMsg, now) && !online.get()) {
// Wake up anyone waiting for this server to online
synchronized (online) {
online.set(true);
online.notifyAll();
}
}
lastMsg = System.currentTimeMillis();
}
if (!isStopped() && !isAborted()) {
Expand Down Expand Up @@ -1253,12 +1259,12 @@ private long getWriteRequestCount() {
}

@InterfaceAudience.Private
protected void tryRegionServerReport(long reportStartTime, long reportEndTime)
protected boolean tryRegionServerReport(long reportStartTime, long reportEndTime)
throws IOException {
RegionServerStatusService.BlockingInterface rss = rssStub;
if (rss == null) {
// the current server could be stopping.
return;
return false;
}
ClusterStatusProtos.ServerLoad sl = buildServerLoad(reportStartTime, reportEndTime);
try {
Expand All @@ -1278,7 +1284,9 @@ protected void tryRegionServerReport(long reportStartTime, long reportEndTime)
// Couldn't connect to the master, get location from zk and reconnect
// Method blocks until new master is found or we are stopped
createRegionServerStatusStub(true);
return false;
}
return true;
}

/**
Expand Down Expand Up @@ -1653,11 +1661,6 @@ protected void handleReportForDutyResponse(final RegionServerStartupResponse c)
", sessionid=0x" +
Long.toHexString(this.zooKeeper.getRecoverableZooKeeper().getSessionId()));

// Wake up anyone waiting for this server to online
synchronized (online) {
online.set(true);
online.notifyAll();
}
} catch (Throwable e) {
stop("Failed initialization");
throw convertThrowableToIOE(cleanup(e, "Failed init"),
Expand Down Expand Up @@ -2836,10 +2839,9 @@ private boolean keepLooping() {
}

/*
* Let the master know we're here Run initialization using parameters passed
* us by the master.
* Run initialization using parameters passed us by the master.
* @return A Map of key/value configurations we got from the Master else
* null if we failed to register.
* null if we failed during report.
* @throws IOException
*/
private RegionServerStartupResponse reportForDuty() throws IOException {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -446,8 +446,9 @@ public JVMClusterUtil.RegionServerThread startRegionServerAndWait(long timeout)
ServerName rsServerName = t.getRegionServer().getServerName();

long start = System.currentTimeMillis();
ClusterMetrics clusterStatus = getClusterMetrics();
ClusterMetrics clusterStatus;
while ((System.currentTimeMillis() - start) < timeout) {
clusterStatus = getClusterMetrics();
if (clusterStatus != null && clusterStatus.getLiveServerMetrics().containsKey(rsServerName)) {
return t;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -58,8 +58,9 @@ public MyMaster(Configuration conf) throws IOException, KeeperException, Interru
}

@Override
protected void tryRegionServerReport(long reportStartTime, long reportEndTime) {
protected boolean tryRegionServerReport(long reportStartTime, long reportEndTime) {
// do nothing
return true;
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -61,23 +61,13 @@ public static class MyMaster extends HMaster {
public MyMaster(Configuration conf) throws IOException, KeeperException, InterruptedException {
super(conf);
}

@Override
protected void tryRegionServerReport(long reportStartTime, long reportEndTime) {
// do nothing
}
}

public static class MyRegionServer extends MiniHBaseCluster.MiniHBaseClusterRegionServer {

public MyRegionServer(Configuration conf) throws IOException, InterruptedException {
super(conf);
}

@Override
protected void tryRegionServerReport(long reportStartTime, long reportEndTime) {
// do nothing
}
}

@BeforeClass
Expand Down Expand Up @@ -108,11 +98,14 @@ public void testClusterRequests() throws Exception {
request.setServer(ProtobufUtil.toServerName(serverName));
long expectedRequestNumber = 10000;

MetricsMasterSource masterSource = master.getMasterMetrics().getMetricsSource();
ClusterStatusProtos.ServerLoad sl = ClusterStatusProtos.ServerLoad.newBuilder()
.setTotalNumberOfRequests(expectedRequestNumber).build();
request.setLoad(sl);

MetricsMasterSource masterSource = master.getMasterMetrics().getMetricsSource();
// Init master source again to reset cluster requests counter
masterSource.init();

master.getMasterRpcServices().regionServerReport(null, request.build());
metricsHelper.assertCounter("cluster_requests", expectedRequestNumber, masterSource);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -84,13 +84,14 @@ public IgnoreYouAreDeadRS(Configuration conf) throws IOException, InterruptedExc
}

@Override
protected void tryRegionServerReport(long reportStartTime, long reportEndTime)
protected boolean tryRegionServerReport(long reportStartTime, long reportEndTime)
throws IOException {
try {
super.tryRegionServerReport(reportStartTime, reportEndTime);
} catch (YouAreDeadException e) {
// ignore, do not abort
}
return true;
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -85,13 +85,15 @@ public MyRegionServer(Configuration conf) throws IOException {
}

@Override
protected void tryRegionServerReport(long reportStartTime, long reportEndTime)
protected boolean tryRegionServerReport(long reportStartTime, long reportEndTime)
throws IOException {
try {
super.tryRegionServerReport(reportStartTime, reportEndTime);
} catch (YouAreDeadException e) {
LOG.info("Caught YouAreDeadException, ignore", e);
return false;
}
return true;
}

@Override
Expand Down