Skip to content

Commit 18c856d

Browse files
committed
HBASE-25973 Balancer should explain progress in a better way in log - backport branch-2
1 parent dfc9ac8 commit 18c856d

File tree

7 files changed

+913
-62
lines changed

7 files changed

+913
-62
lines changed

hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1877,6 +1877,8 @@ public List<RegionPlan> executeRegionPlansWithThrottling(List<RegionPlan> plans)
18771877
}
18781878
}
18791879
}
1880+
LOG.info("Balancer is going into sleep until next period in {}ms", getConfiguration()
1881+
.getInt(HConstants.HBASE_BALANCER_PERIOD, HConstants.DEFAULT_HBASE_BALANCER_PERIOD));
18801882
return successRegionPlans;
18811883
}
18821884

hbase-server/src/main/java/org/apache/hadoop/hbase/master/balancer/StochasticLoadBalancer.java

Lines changed: 68 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -133,7 +133,8 @@ public class StochasticLoadBalancer extends BaseLoadBalancer {
133133

134134
private List<CandidateGenerator> candidateGenerators;
135135
private List<CostFunction> costFunctions; // FindBugs: Wants this protected; IS2_INCONSISTENT_SYNC
136-
136+
// To save the currently configured sum of multipliers. Defaults to 1 for cases that carry high cost
137+
private float sumMultiplier = 1.0f;
137138
// to save and report costs to JMX
138139
private double curOverallCost = 0d;
139140
private double[] tempFunctionCosts;
@@ -229,7 +230,6 @@ protected void loadConf(Configuration conf) {
229230

230231
regionReplicaHostCostFunction = new RegionReplicaHostCostFunction(conf);
231232
regionReplicaRackCostFunction = new RegionReplicaRackCostFunction(conf);
232-
233233
costFunctions = new ArrayList<>();
234234
addCostFunction(new RegionCountSkewCostFunction(conf));
235235
addCostFunction(new PrimaryRegionCountSkewCostFunction(conf));
@@ -310,63 +310,66 @@ private boolean areSomeRegionReplicasColocated(BalancerClusterState c) {
310310
boolean needsBalance(TableName tableName, BalancerClusterState cluster) {
311311
ClusterLoadState cs = new ClusterLoadState(cluster.clusterState);
312312
if (cs.getNumServers() < MIN_SERVER_BALANCE) {
313-
if (LOG.isDebugEnabled()) {
314-
LOG.debug("Not running balancer because only " + cs.getNumServers()
315-
+ " active regionserver(s)");
316-
}
317-
if (this.isBalancerRejectionRecording) {
318-
sendRejectionReasonToRingBuffer("The number of RegionServers " +
319-
cs.getNumServers() + " < MIN_SERVER_BALANCE(" + MIN_SERVER_BALANCE + ")", null);
320-
}
313+
LOG.info("Not running balancer because only " + cs.getNumServers() +
314+
" active regionserver(s)");
315+
sendRejectionReasonToRingBuffer(
316+
"The number of RegionServers " + cs.getNumServers() + " < MIN_SERVER_BALANCE(" +
317+
MIN_SERVER_BALANCE + ")", null);
321318
return false;
322319
}
323320
if (areSomeRegionReplicasColocated(cluster)) {
321+
LOG.info("Running balancer because at least one server hosts replicas of the same region.");
324322
return true;
325323
}
326324

327325
if (idleRegionServerExist(cluster)){
326+
LOG.info("Running balancer because cluster has idle server(s).");
328327
return true;
329328
}
330329

330+
sumMultiplier = 0.0f;
331331
double total = 0.0;
332-
float sumMultiplier = 0.0f;
333332
for (CostFunction c : costFunctions) {
334333
float multiplier = c.getMultiplier();
335-
if (multiplier <= 0) {
336-
LOG.trace("{} not needed because multiplier is <= 0", c.getClass().getSimpleName());
337-
continue;
338-
}
334+
double cost = c.cost();
339335
if (!c.isNeeded()) {
340336
LOG.trace("{} not needed", c.getClass().getSimpleName());
341337
continue;
342338
}
339+
total += cost * multiplier;
343340
sumMultiplier += multiplier;
344-
total += c.cost() * multiplier;
345-
}
346-
347-
boolean balanced = total <= 0 || sumMultiplier <= 0 ||
348-
(sumMultiplier > 0 && (total / sumMultiplier) < minCostNeedBalance);
349-
if(balanced && isBalancerRejectionRecording){
350-
String reason = "";
351-
if (total <= 0) {
352-
reason = "(cost1*multiplier1)+(cost2*multiplier2)+...+(costn*multipliern) = " + total + " <= 0";
353-
} else if (sumMultiplier <= 0) {
354-
reason = "sumMultiplier = " + sumMultiplier + " <= 0";
355-
} else if ((total / sumMultiplier) < minCostNeedBalance) {
356-
reason =
357-
"[(cost1*multiplier1)+(cost2*multiplier2)+...+(costn*multipliern)]/sumMultiplier = " + (total
358-
/ sumMultiplier) + " <= minCostNeedBalance(" + minCostNeedBalance + ")";
359-
}
360-
sendRejectionReasonToRingBuffer(reason, costFunctions);
361-
}
362-
if (LOG.isDebugEnabled()) {
363-
LOG.debug("{} {}; total cost={}, sum multiplier={}; cost/multiplier to need a balance is {}",
364-
balanced ? "Skipping load balancing because balanced" : "We need to load balance",
365-
isByTable ? String.format("table (%s)", tableName) : "cluster",
366-
total, sumMultiplier, minCostNeedBalance);
367-
if (LOG.isTraceEnabled()) {
368-
LOG.trace("Balance decision detailed function costs={}", functionCost());
341+
}
342+
if (sumMultiplier <= 0) {
343+
LOG.error("At least one cost function needs a multiplier > 0. For example, set "
344+
+ "hbase.master.balancer.stochastic.regionCountCost to a positive value or default");
345+
return false;
346+
}
347+
348+
boolean balanced = (total / sumMultiplier < minCostNeedBalance);
349+
if (balanced) {
350+
if (isBalancerRejectionRecording) {
351+
String reason = "";
352+
if (total <= 0) {
353+
reason = "(cost1*multiplier1)+(cost2*multiplier2)+...+(costn*multipliern) = " +
354+
total + " <= 0";
355+
} else if (sumMultiplier <= 0) {
356+
reason = "sumMultiplier = " + sumMultiplier + " <= 0";
357+
} else if ((total / sumMultiplier) < minCostNeedBalance) {
358+
reason =
359+
"[(cost1*multiplier1)+(cost2*multiplier2)+...+(costn*multipliern)]/sumMultiplier = " +
360+
(total / sumMultiplier) + " <= minCostNeedBalance(" + minCostNeedBalance + ")";
361+
}
362+
sendRejectionReasonToRingBuffer(reason, costFunctions);
369363
}
364+
LOG.info("{} - skipping load balancing because weighted average imbalance={} <= "
365+
+ "threshold({}). If you want more aggressive balancing, either lower "
366+
+ "hbase.master.balancer.stochastic.minCostNeedBalance from {} or increase the relative "
367+
+ "multiplier(s) of the specific cost function(s). functionCost={}",
368+
isByTable ? "Table specific ("+tableName+")" : "Cluster wide", total / sumMultiplier,
369+
minCostNeedBalance, minCostNeedBalance, functionCost());
370+
} else {
371+
LOG.info("{} - Calculating plan. may take up to {}ms to complete.",
372+
isByTable ? "Table specific ("+tableName+")" : "Cluster wide", maxRunningTime);
370373
}
371374
return !balanced;
372375
}
@@ -452,8 +455,9 @@ protected List<RegionPlan> balanceTable(TableName tableName, Map<ServerName,
452455
maxSteps);
453456
}
454457
}
455-
LOG.info("start StochasticLoadBalancer.balancer, initCost=" + currentCost + ", functionCost="
456-
+ functionCost() + " computedMaxSteps: " + computedMaxSteps);
458+
LOG.info("Start StochasticLoadBalancer.balancer, initial weighted average imbalance={}, "
459+
+ "functionCost={} computedMaxSteps={}",
460+
currentCost / sumMultiplier, functionCost(), computedMaxSteps);
457461

458462
final String initFunctionTotalCosts = totalCostsPerFunc();
459463
// Perform a stochastic walk to see if we can get a good fit.
@@ -499,17 +503,19 @@ protected List<RegionPlan> balanceTable(TableName tableName, Map<ServerName,
499503
updateStochasticCosts(tableName, curOverallCost, curFunctionCosts);
500504
if (initCost > currentCost) {
501505
plans = createRegionPlans(cluster);
502-
LOG.info("Finished computing new load balance plan. Computation took {}" +
503-
" to try {} different iterations. Found a solution that moves " +
504-
"{} regions; Going from a computed cost of {}" +
505-
" to a new cost of {}", java.time.Duration.ofMillis(endTime - startTime),
506-
step, plans.size(), initCost, currentCost);
506+
LOG.info("Finished computing new moving plan. Computation took {} ms" +
507+
" to try {} different iterations. Found a solution that moves " +
508+
"{} regions; Going from a computed imbalance of {}" +
509+
" to a new imbalance of {}. ",
510+
endTime - startTime, step, plans.size(),
511+
initCost / sumMultiplier, currentCost / sumMultiplier);
512+
507513
sendRegionPlansToRingBuffer(plans, currentCost, initCost, initFunctionTotalCosts, step);
508514
return plans;
509515
}
510-
LOG.info("Could not find a better load balance plan. Tried {} different configurations in " +
511-
"{}, and did not find anything with a computed cost less than {}", step,
512-
java.time.Duration.ofMillis(endTime - startTime), initCost);
516+
LOG.info("Could not find a better moving plan. Tried {} different configurations in " +
517+
"{} ms, and did not find anything with an imbalance score less than {}", step,
518+
endTime - startTime, initCost / sumMultiplier);
513519
return null;
514520
}
515521

@@ -520,8 +526,7 @@ private void sendRejectionReasonToRingBuffer(String reason, List<CostFunction> c
520526
.setReason(reason);
521527
if (costFunctions != null) {
522528
for (CostFunction c : costFunctions) {
523-
float multiplier = c.getMultiplier();
524-
if (multiplier <= 0 || !c.isNeeded()) {
529+
if (!c.isNeeded()) {
525530
continue;
526531
}
527532
builder.addCostFuncInfo(c.getClass().getName(), c.cost(), c.getMultiplier());
@@ -580,7 +585,8 @@ private void updateStochasticCosts(TableName tableName, double overall, double[]
580585
}
581586

582587
private void addCostFunction(CostFunction costFunction) {
583-
if (costFunction.getMultiplier() > 0) {
588+
float multiplier = costFunction.getMultiplier();
589+
if (multiplier > 0) {
584590
costFunctions.add(costFunction);
585591
}
586592
}
@@ -591,9 +597,13 @@ private String functionCost() {
591597
builder.append(c.getClass().getSimpleName());
592598
builder.append(" : (");
593599
if (c.isNeeded()) {
594-
builder.append(c.getMultiplier());
600+
builder.append("multiplier=" + c.getMultiplier());
595601
builder.append(", ");
596-
builder.append(c.cost());
602+
double cost = c.cost();
603+
builder.append("imbalance=" + cost);
604+
if (cost < minCostNeedBalance) {
605+
builder.append(", balanced");
606+
}
597607
} else {
598608
builder.append("not needed");
599609
}
@@ -605,7 +615,7 @@ private String functionCost() {
605615
private String totalCostsPerFunc() {
606616
StringBuilder builder = new StringBuilder();
607617
for (CostFunction c : costFunctions) {
608-
if (c.getMultiplier() <= 0 || !c.isNeeded()) {
618+
if (!c.isNeeded()) {
609619
continue;
610620
}
611621
double cost = c.getMultiplier() * c.cost();
@@ -689,7 +699,7 @@ void initCosts(BalancerClusterState cluster) {
689699
allowedOnPath = ".*(/src/test/.*|StochasticLoadBalancer).java")
690700
void updateCostsWithAction(BalancerClusterState cluster, BalanceAction action) {
691701
for (CostFunction c : costFunctions) {
692-
if (c.getMultiplier() > 0 && c.isNeeded()) {
702+
if (c.isNeeded()) {
693703
c.postAction(action);
694704
}
695705
}
@@ -728,7 +738,7 @@ String[] getCostFunctionNames() {
728738
CostFunction c = costFunctions.get(i);
729739
this.tempFunctionCosts[i] = 0.0;
730740

731-
if (c.getMultiplier() <= 0 || !c.isNeeded()) {
741+
if (!c.isNeeded()) {
732742
continue;
733743
}
734744

hbase-server/src/test/java/org/apache/hadoop/hbase/master/balancer/BalancerTestBase.java

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,6 @@ public static void beforeAllTests() throws Exception {
7575
conf.setFloat("hbase.master.balancer.stochastic.maxMovePercent", 0.75f);
7676
conf.setFloat("hbase.regions.slop", 0.0f);
7777
conf.setFloat("hbase.master.balancer.stochastic.localityCost", 0);
78-
conf.setBoolean("hbase.master.balancer.stochastic.runMaxSteps", true);
7978
loadBalancer = new StochasticLoadBalancer();
8079
MasterServices services = mock(MasterServices.class);
8180
when(services.getConfiguration()).thenReturn(conf);

hbase-server/src/test/java/org/apache/hadoop/hbase/master/balancer/TestStochasticLoadBalancer.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -169,6 +169,7 @@ public void testNeedBalance() {
169169
for (boolean isByTable : perTableBalancerConfigs) {
170170
conf.setBoolean(HConstants.HBASE_MASTER_LOADBALANCE_BYTABLE, isByTable);
171171
loadBalancer.onConfigurationChange(conf);
172+
172173
for (int[] mockCluster : clusterStateMocks) {
173174
Map<ServerName, List<RegionInfo>> servers = mockClusterServers(mockCluster);
174175
Map<TableName, Map<ServerName, List<RegionInfo>>> LoadOfAllTable =

hbase-server/src/test/java/org/apache/hadoop/hbase/master/balancer/TestStochasticLoadBalancerBalanceCluster.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,12 +53,13 @@ public class TestStochasticLoadBalancerBalanceCluster extends BalancerTestBase {
5353
public void testBalanceCluster() throws Exception {
5454
conf.setLong("hbase.master.balancer.stochastic.maxRunningTime", 3 * 60 * 1000); // 3 min
5555
conf.setFloat("hbase.master.balancer.stochastic.maxMovePercent", 1.0f);
56+
conf.setLong(StochasticLoadBalancer.MAX_STEPS_KEY, 20000000L);
5657
loadBalancer.onConfigurationChange(conf);
58+
5759
for (int[] mockCluster : clusterStateMocks) {
5860
Map<ServerName, List<RegionInfo>> servers = mockClusterServers(mockCluster);
5961
List<ServerAndLoad> list = convertToList(servers);
6062
LOG.info("Mock Cluster : " + printMock(list) + " " + printStats(list));
61-
6263
Map<TableName, Map<ServerName, List<RegionInfo>>> LoadOfAllTable =
6364
(Map) mockClusterServersWithTables(servers);
6465
List<RegionPlan> plans = loadBalancer.balanceCluster(LoadOfAllTable);

hbase-server/src/test/java/org/apache/hadoop/hbase/master/balancer/TestStochasticLoadBalancerLargeCluster.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -38,8 +38,8 @@ public void testLargeCluster() {
3838
int numRegionsPerServer = 80; // all servers except one
3939
int numTables = 100;
4040
int replication = 1;
41-
conf.setLong("hbase.master.balancer.stochastic.maxRunningTime", 6 * 60 * 1000);
42-
conf.setFloat("hbase.master.balancer.stochastic.maxMovePercent", 1.0f);
41+
conf.setLong("hbase.master.balancer.stochastic.maxRunningTime", 3 * 60 * 1000);
42+
conf.setBoolean("hbase.master.balancer.stochastic.runMaxSteps", true);
4343
loadBalancer.onConfigurationChange(conf);
4444
testWithCluster(numNodes, numRegions, numRegionsPerServer, replication, numTables, true, true);
4545
}

0 commit comments

Comments
 (0)