Skip to content

Commit c53e94b

Browse files
committed
HBASE-25973 Balancer should explain progress in a better way in log - backport branch-2
1 parent dfc9ac8 commit c53e94b

File tree

2 files changed

+67
-58
lines changed

2 files changed

+67
-58
lines changed

hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1877,6 +1877,8 @@ public List<RegionPlan> executeRegionPlansWithThrottling(List<RegionPlan> plans)
18771877
}
18781878
}
18791879
}
1880+
LOG.info("Balancer is going into sleep until next period in {}ms", getConfiguration()
1881+
.getInt(HConstants.HBASE_BALANCER_PERIOD, HConstants.DEFAULT_HBASE_BALANCER_PERIOD));
18801882
return successRegionPlans;
18811883
}
18821884

hbase-server/src/main/java/org/apache/hadoop/hbase/master/balancer/StochasticLoadBalancer.java

Lines changed: 65 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -133,7 +133,8 @@ public class StochasticLoadBalancer extends BaseLoadBalancer {
133133

134134
private List<CandidateGenerator> candidateGenerators;
135135
private List<CostFunction> costFunctions; // FindBugs: Wants this protected; IS2_INCONSISTENT_SYNC
136-
136+
// To save the currently configured sum of multipliers. Defaults to 1 for cases that carry high cost
137+
private float sumMultiplier = 1.0f;
137138
// to save and report costs to JMX
138139
private double curOverallCost = 0d;
139140
private double[] tempFunctionCosts;
@@ -229,7 +230,6 @@ protected void loadConf(Configuration conf) {
229230

230231
regionReplicaHostCostFunction = new RegionReplicaHostCostFunction(conf);
231232
regionReplicaRackCostFunction = new RegionReplicaRackCostFunction(conf);
232-
233233
costFunctions = new ArrayList<>();
234234
addCostFunction(new RegionCountSkewCostFunction(conf));
235235
addCostFunction(new PrimaryRegionCountSkewCostFunction(conf));
@@ -310,63 +310,63 @@ private boolean areSomeRegionReplicasColocated(BalancerClusterState c) {
310310
boolean needsBalance(TableName tableName, BalancerClusterState cluster) {
311311
ClusterLoadState cs = new ClusterLoadState(cluster.clusterState);
312312
if (cs.getNumServers() < MIN_SERVER_BALANCE) {
313-
if (LOG.isDebugEnabled()) {
314-
LOG.debug("Not running balancer because only " + cs.getNumServers()
315-
+ " active regionserver(s)");
316-
}
317-
if (this.isBalancerRejectionRecording) {
318-
sendRejectionReasonToRingBuffer("The number of RegionServers " +
319-
cs.getNumServers() + " < MIN_SERVER_BALANCE(" + MIN_SERVER_BALANCE + ")", null);
320-
}
313+
LOG.info("Not running balancer because only " + cs.getNumServers() + " active regionserver(s)");
314+
sendRejectionReasonToRingBuffer(
315+
"The number of RegionServers " + cs.getNumServers() + " < MIN_SERVER_BALANCE(" + MIN_SERVER_BALANCE + ")", null);
321316
return false;
322317
}
323318
if (areSomeRegionReplicasColocated(cluster)) {
319+
LOG.info("Running balancer because at least one server hosts replicas of the same region.");
324320
return true;
325321
}
326322

327323
if (idleRegionServerExist(cluster)){
324+
LOG.info("Running balancer because cluster has idle server(s).");
328325
return true;
329326
}
330327

328+
sumMultiplier = 0.0f;
331329
double total = 0.0;
332-
float sumMultiplier = 0.0f;
333330
for (CostFunction c : costFunctions) {
334331
float multiplier = c.getMultiplier();
335-
if (multiplier <= 0) {
336-
LOG.trace("{} not needed because multiplier is <= 0", c.getClass().getSimpleName());
337-
continue;
338-
}
332+
double cost = c.cost();
339333
if (!c.isNeeded()) {
340334
LOG.trace("{} not needed", c.getClass().getSimpleName());
341335
continue;
342336
}
337+
total += cost * multiplier;
343338
sumMultiplier += multiplier;
344-
total += c.cost() * multiplier;
345-
}
346-
347-
boolean balanced = total <= 0 || sumMultiplier <= 0 ||
348-
(sumMultiplier > 0 && (total / sumMultiplier) < minCostNeedBalance);
349-
if(balanced && isBalancerRejectionRecording){
350-
String reason = "";
351-
if (total <= 0) {
352-
reason = "(cost1*multiplier1)+(cost2*multiplier2)+...+(costn*multipliern) = " + total + " <= 0";
353-
} else if (sumMultiplier <= 0) {
354-
reason = "sumMultiplier = " + sumMultiplier + " <= 0";
355-
} else if ((total / sumMultiplier) < minCostNeedBalance) {
356-
reason =
357-
"[(cost1*multiplier1)+(cost2*multiplier2)+...+(costn*multipliern)]/sumMultiplier = " + (total
358-
/ sumMultiplier) + " <= minCostNeedBalance(" + minCostNeedBalance + ")";
359-
}
360-
sendRejectionReasonToRingBuffer(reason, costFunctions);
361-
}
362-
if (LOG.isDebugEnabled()) {
363-
LOG.debug("{} {}; total cost={}, sum multiplier={}; cost/multiplier to need a balance is {}",
364-
balanced ? "Skipping load balancing because balanced" : "We need to load balance",
365-
isByTable ? String.format("table (%s)", tableName) : "cluster",
366-
total, sumMultiplier, minCostNeedBalance);
367-
if (LOG.isTraceEnabled()) {
368-
LOG.trace("Balance decision detailed function costs={}", functionCost());
339+
}
340+
if (sumMultiplier <= 0) {
341+
LOG.error("At least one cost function needs a multiplier > 0. For example, set "
342+
+ "hbase.master.balancer.stochastic.regionCountCost to a positive value or default");
343+
return false;
344+
}
345+
346+
boolean balanced = (total / sumMultiplier < minCostNeedBalance);
347+
if (balanced) {
348+
if (isBalancerRejectionRecording) {
349+
String reason = "";
350+
if (total <= 0) {
351+
reason = "(cost1*multiplier1)+(cost2*multiplier2)+...+(costn*multipliern) = " + total + " <= 0";
352+
} else if (sumMultiplier <= 0) {
353+
reason = "sumMultiplier = " + sumMultiplier + " <= 0";
354+
} else if ((total / sumMultiplier) < minCostNeedBalance) {
355+
reason =
356+
"[(cost1*multiplier1)+(cost2*multiplier2)+...+(costn*multipliern)]/sumMultiplier = " + (
357+
total / sumMultiplier) + " <= minCostNeedBalance(" + minCostNeedBalance + ")";
358+
}
359+
sendRejectionReasonToRingBuffer(reason, costFunctions);
369360
}
361+
LOG.info("{} - skipping load balancing because weighted average imbalance={} <= "
362+
+ "threshold({}). If you want more aggressive balancing, either lower "
363+
+ "hbase.master.balancer.stochastic.minCostNeedBalance from {} or increase the relative "
364+
+ "multiplier(s) of the specific cost function(s). functionCost={}",
365+
isByTable ? "Table specific ("+tableName+")" : "Cluster wide", total / sumMultiplier,
366+
minCostNeedBalance, minCostNeedBalance, functionCost());
367+
} else {
368+
LOG.info("{} - Calculating plan. may take up to {}ms to complete.",
369+
isByTable ? "Table specific ("+tableName+")" : "Cluster wide", maxRunningTime);
370370
}
371371
return !balanced;
372372
}
@@ -452,8 +452,9 @@ protected List<RegionPlan> balanceTable(TableName tableName, Map<ServerName,
452452
maxSteps);
453453
}
454454
}
455-
LOG.info("start StochasticLoadBalancer.balancer, initCost=" + currentCost + ", functionCost="
456-
+ functionCost() + " computedMaxSteps: " + computedMaxSteps);
455+
LOG.info("Start StochasticLoadBalancer.balancer, initial weighted average imbalance={}, "
456+
+ "functionCost={} computedMaxSteps={}",
457+
currentCost / sumMultiplier, functionCost(), computedMaxSteps);
457458

458459
final String initFunctionTotalCosts = totalCostsPerFunc();
459460
// Perform a stochastic walk to see if we can get a good fit.
@@ -499,17 +500,19 @@ protected List<RegionPlan> balanceTable(TableName tableName, Map<ServerName,
499500
updateStochasticCosts(tableName, curOverallCost, curFunctionCosts);
500501
if (initCost > currentCost) {
501502
plans = createRegionPlans(cluster);
502-
LOG.info("Finished computing new load balance plan. Computation took {}" +
503-
" to try {} different iterations. Found a solution that moves " +
504-
"{} regions; Going from a computed cost of {}" +
505-
" to a new cost of {}", java.time.Duration.ofMillis(endTime - startTime),
506-
step, plans.size(), initCost, currentCost);
503+
LOG.info("Finished computing new moving plan. Computation took {} ms" +
504+
" to try {} different iterations. Found a solution that moves " +
505+
"{} regions; Going from a computed imbalance of {}" +
506+
" to a new imbalance of {}. ",
507+
endTime - startTime, step, plans.size(),
508+
initCost / sumMultiplier, currentCost / sumMultiplier);
509+
507510
sendRegionPlansToRingBuffer(plans, currentCost, initCost, initFunctionTotalCosts, step);
508511
return plans;
509512
}
510-
LOG.info("Could not find a better load balance plan. Tried {} different configurations in " +
511-
"{}, and did not find anything with a computed cost less than {}", step,
512-
java.time.Duration.ofMillis(endTime - startTime), initCost);
513+
LOG.info("Could not find a better moving plan. Tried {} different configurations in " +
514+
"{} ms, and did not find anything with an imbalance score less than {}", step,
515+
endTime - startTime, initCost / sumMultiplier);
513516
return null;
514517
}
515518

@@ -520,8 +523,7 @@ private void sendRejectionReasonToRingBuffer(String reason, List<CostFunction> c
520523
.setReason(reason);
521524
if (costFunctions != null) {
522525
for (CostFunction c : costFunctions) {
523-
float multiplier = c.getMultiplier();
524-
if (multiplier <= 0 || !c.isNeeded()) {
526+
if (!c.isNeeded()) {
525527
continue;
526528
}
527529
builder.addCostFuncInfo(c.getClass().getName(), c.cost(), c.getMultiplier());
@@ -580,7 +582,8 @@ private void updateStochasticCosts(TableName tableName, double overall, double[]
580582
}
581583

582584
private void addCostFunction(CostFunction costFunction) {
583-
if (costFunction.getMultiplier() > 0) {
585+
float multiplier = costFunction.getMultiplier();
586+
if (multiplier > 0) {
584587
costFunctions.add(costFunction);
585588
}
586589
}
@@ -591,9 +594,13 @@ private String functionCost() {
591594
builder.append(c.getClass().getSimpleName());
592595
builder.append(" : (");
593596
if (c.isNeeded()) {
594-
builder.append(c.getMultiplier());
597+
builder.append("multiplier=" + c.getMultiplier());
595598
builder.append(", ");
596-
builder.append(c.cost());
599+
double cost = c.cost();
600+
builder.append("imbalance=" + cost);
601+
if (cost < minCostNeedBalance) {
602+
builder.append(", balanced");
603+
}
597604
} else {
598605
builder.append("not needed");
599606
}
@@ -605,7 +612,7 @@ private String functionCost() {
605612
private String totalCostsPerFunc() {
606613
StringBuilder builder = new StringBuilder();
607614
for (CostFunction c : costFunctions) {
608-
if (c.getMultiplier() <= 0 || !c.isNeeded()) {
615+
if (!c.isNeeded()) {
609616
continue;
610617
}
611618
double cost = c.getMultiplier() * c.cost();
@@ -689,7 +696,7 @@ void initCosts(BalancerClusterState cluster) {
689696
allowedOnPath = ".*(/src/test/.*|StochasticLoadBalancer).java")
690697
void updateCostsWithAction(BalancerClusterState cluster, BalanceAction action) {
691698
for (CostFunction c : costFunctions) {
692-
if (c.getMultiplier() > 0 && c.isNeeded()) {
699+
if (c.isNeeded()) {
693700
c.postAction(action);
694701
}
695702
}
@@ -728,7 +735,7 @@ String[] getCostFunctionNames() {
728735
CostFunction c = costFunctions.get(i);
729736
this.tempFunctionCosts[i] = 0.0;
730737

731-
if (c.getMultiplier() <= 0 || !c.isNeeded()) {
738+
if (!c.isNeeded()) {
732739
continue;
733740
}
734741

0 commit comments

Comments
 (0)