Skip to content

Commit 4f3e63e

Browse files
authored
HBASE-28168 Add option in RegionMover.java to isolate one or more reg… (#5476)
Signed-off-by: Wellington Chevreuil <[email protected]>
1 parent e876afe commit 4f3e63e

File tree

2 files changed

+351
-7
lines changed

2 files changed

+351
-7
lines changed

hbase-server/src/main/java/org/apache/hadoop/hbase/util/RegionMover.java

Lines changed: 174 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
import java.nio.file.Files;
3131
import java.nio.file.Paths;
3232
import java.util.ArrayList;
33+
import java.util.Arrays;
3334
import java.util.Collections;
3435
import java.util.EnumSet;
3536
import java.util.HashSet;
@@ -52,15 +53,23 @@
5253
import org.apache.hadoop.hbase.ClusterMetrics.Option;
5354
import org.apache.hadoop.hbase.HBaseConfiguration;
5455
import org.apache.hadoop.hbase.HConstants;
56+
import org.apache.hadoop.hbase.HRegionLocation;
57+
import org.apache.hadoop.hbase.MetaTableAccessor;
5558
import org.apache.hadoop.hbase.ServerName;
5659
import org.apache.hadoop.hbase.UnknownRegionException;
5760
import org.apache.hadoop.hbase.client.Admin;
5861
import org.apache.hadoop.hbase.client.Connection;
5962
import org.apache.hadoop.hbase.client.ConnectionFactory;
6063
import org.apache.hadoop.hbase.client.DoNotRetryRegionException;
6164
import org.apache.hadoop.hbase.client.RegionInfo;
65+
import org.apache.hadoop.hbase.client.RegionInfoBuilder;
66+
import org.apache.hadoop.hbase.client.Result;
6267
import org.apache.hadoop.hbase.master.RackManager;
68+
import org.apache.hadoop.hbase.master.RegionState;
6369
import org.apache.hadoop.hbase.master.assignment.AssignmentManager;
70+
import org.apache.hadoop.hbase.zookeeper.MetaTableLocator;
71+
import org.apache.hadoop.hbase.zookeeper.ZKWatcher;
72+
import org.apache.hadoop.hbase.zookeeper.ZNodePaths;
6473
import org.apache.yetus.audience.InterfaceAudience;
6574
import org.slf4j.Logger;
6675
import org.slf4j.LoggerFactory;
@@ -93,6 +102,7 @@ public class RegionMover extends AbstractHBaseTool implements Closeable {
93102
private boolean ack = true;
94103
private int maxthreads = 1;
95104
private int timeout;
105+
private List<String> isolateRegionIdArray;
96106
private String loadUnload;
97107
private String hostname;
98108
private String filename;
@@ -109,6 +119,7 @@ private RegionMover(RegionMoverBuilder builder) throws IOException {
109119
this.excludeFile = builder.excludeFile;
110120
this.designatedFile = builder.designatedFile;
111121
this.maxthreads = builder.maxthreads;
122+
this.isolateRegionIdArray = builder.isolateRegionIdArray;
112123
this.ack = builder.ack;
113124
this.port = builder.port;
114125
this.timeout = builder.timeout;
@@ -153,6 +164,7 @@ public static class RegionMoverBuilder {
153164
private boolean ack = true;
154165
private int maxthreads = 1;
155166
private int timeout = Integer.MAX_VALUE;
167+
private List<String> isolateRegionIdArray = new ArrayList<>();
156168
private String hostname;
157169
private String filename;
158170
private String excludeFile = null;
@@ -213,6 +225,14 @@ public RegionMoverBuilder maxthreads(int threads) {
213225
return this;
214226
}
215227

228+
/**
229+
* Set the region ID to isolate on the region server.
230+
*/
231+
public RegionMoverBuilder isolateRegionIdArray(List<String> isolateRegionIdArray) {
232+
this.isolateRegionIdArray = isolateRegionIdArray;
233+
return this;
234+
}
235+
216236
/**
217237
* Path of file containing hostnames to be excluded during region movement. Exclude file should
218238
* have 'host:port' per line. Port is mandatory here as we can have many RS running on a single
@@ -406,6 +426,25 @@ public boolean unloadFromRack()
406426
}
407427

408428
private boolean unloadRegions(boolean unloadFromRack)
429+
throws ExecutionException, InterruptedException, TimeoutException {
430+
return unloadRegions(unloadFromRack, null);
431+
}
432+
433+
/**
434+
* Isolated regions specified in {@link #isolateRegionIdArray} on {@link #hostname} in ack Mode
435+
* and Unload regions from given {@link #hostname} using ack/noAck mode and {@link #maxthreads}.
436+
* In noAck mode we do not make sure that region is successfully online on the target region
437+
* server,hence it is the best effort. We do not unload regions to hostnames given in
438+
* {@link #excludeFile}. If designatedFile is present with some contents, we will unload regions
439+
* to hostnames provided in {@link #designatedFile}
440+
* @return true if region isolation succeeded, false otherwise
441+
*/
442+
public boolean isolateRegions()
443+
throws ExecutionException, InterruptedException, TimeoutException {
444+
return unloadRegions(false, isolateRegionIdArray);
445+
}
446+
447+
private boolean unloadRegions(boolean unloadFromRack, List<String> isolateRegionIdArray)
409448
throws InterruptedException, ExecutionException, TimeoutException {
410449
deleteFile(this.filename);
411450
ExecutorService unloadPool = Executors.newFixedThreadPool(1);
@@ -459,7 +498,7 @@ private boolean unloadRegions(boolean unloadFromRack)
459498
LOG.warn("No Regions were moved - no servers available");
460499
return false;
461500
}
462-
unloadRegions(server, regionServers, movedRegions);
501+
unloadRegions(server, regionServers, movedRegions, isolateRegionIdArray);
463502
} catch (Exception e) {
464503
LOG.error("Error while unloading regions ", e);
465504
return false;
@@ -474,9 +513,111 @@ private boolean unloadRegions(boolean unloadFromRack)
474513
}
475514

476515
private void unloadRegions(ServerName server, List<ServerName> regionServers,
477-
List<RegionInfo> movedRegions) throws Exception {
516+
List<RegionInfo> movedRegions, List<String> isolateRegionIdArray) throws Exception {
478517
while (true) {
518+
List<RegionInfo> isolateRegionInfoList = Collections.synchronizedList(new ArrayList<>());
519+
RegionInfo isolateRegionInfo = null;
520+
if (isolateRegionIdArray != null && !isolateRegionIdArray.isEmpty()) {
521+
// Region will be moved to target region server with Ack mode.
522+
final ExecutorService isolateRegionPool = Executors.newFixedThreadPool(maxthreads);
523+
List<Future<Boolean>> isolateRegionTaskList = new ArrayList<>();
524+
List<RegionInfo> recentlyIsolatedRegion = Collections.synchronizedList(new ArrayList<>());
525+
boolean allRegionOpsSuccessful = true;
526+
boolean isMetaIsolated = false;
527+
RegionInfo metaRegionInfo = RegionInfoBuilder.FIRST_META_REGIONINFO;
528+
List<HRegionLocation> hRegionLocationRegionIsolation =
529+
Collections.synchronizedList(new ArrayList<>());
530+
for (String isolateRegionId : isolateRegionIdArray) {
531+
if (isolateRegionId.equalsIgnoreCase(metaRegionInfo.getEncodedName())) {
532+
isMetaIsolated = true;
533+
continue;
534+
}
535+
Result result = MetaTableAccessor.scanByRegionEncodedName(conn, isolateRegionId);
536+
HRegionLocation hRegionLocation =
537+
MetaTableAccessor.getRegionLocation(conn, result.getRow());
538+
if (hRegionLocation != null) {
539+
hRegionLocationRegionIsolation.add(hRegionLocation);
540+
} else {
541+
LOG.error("Region " + isolateRegionId + " doesn't exists/can't fetch from"
542+
+ " meta...Quitting now");
543+
// We only move the regions if all the regions were found.
544+
allRegionOpsSuccessful = false;
545+
break;
546+
}
547+
}
548+
549+
if (!allRegionOpsSuccessful) {
550+
break;
551+
}
552+
// If hbase:meta region was isolated, then it needs to be part of isolateRegionInfoList.
553+
if (isMetaIsolated) {
554+
ZKWatcher zkWatcher = new ZKWatcher(conf, null, null);
555+
List<HRegionLocation> result = new ArrayList<>();
556+
for (String znode : zkWatcher.getMetaReplicaNodes()) {
557+
String path = ZNodePaths.joinZNode(zkWatcher.getZNodePaths().baseZNode, znode);
558+
int replicaId = zkWatcher.getZNodePaths().getMetaReplicaIdFromPath(path);
559+
RegionState state = MetaTableLocator.getMetaRegionState(zkWatcher, replicaId);
560+
result.add(new HRegionLocation(state.getRegion(), state.getServerName()));
561+
}
562+
ServerName metaSeverName = result.get(0).getServerName();
563+
// For isolating hbase:meta, it should move explicitly in Ack mode,
564+
// hence the forceMoveRegionByAck = true.
565+
if (!metaSeverName.equals(server)) {
566+
LOG.info("Region of hbase:meta " + metaRegionInfo.getEncodedName() + " is on server "
567+
+ metaSeverName + " moving to " + server);
568+
submitRegionMovesWhileUnloading(metaSeverName, Collections.singletonList(server),
569+
movedRegions, Collections.singletonList(metaRegionInfo), true);
570+
} else {
571+
LOG.info("Region of hbase:meta " + metaRegionInfo.getEncodedName() + " already exists"
572+
+ " on server : " + server);
573+
}
574+
isolateRegionInfoList.add(RegionInfoBuilder.FIRST_META_REGIONINFO);
575+
}
576+
577+
if (!hRegionLocationRegionIsolation.isEmpty()) {
578+
for (HRegionLocation hRegionLocation : hRegionLocationRegionIsolation) {
579+
isolateRegionInfo = hRegionLocation.getRegion();
580+
isolateRegionInfoList.add(isolateRegionInfo);
581+
if (hRegionLocation.getServerName() == server) {
582+
LOG.info("Region " + hRegionLocation.getRegion().getEncodedName() + " already exists"
583+
+ " on server : " + server.getHostname());
584+
} else {
585+
Future<Boolean> isolateRegionTask =
586+
isolateRegionPool.submit(new MoveWithAck(conn, isolateRegionInfo,
587+
hRegionLocation.getServerName(), server, recentlyIsolatedRegion));
588+
isolateRegionTaskList.add(isolateRegionTask);
589+
}
590+
}
591+
}
592+
593+
if (!isolateRegionTaskList.isEmpty()) {
594+
isolateRegionPool.shutdown();
595+
// Now that we have fetched all the region's regionInfo, we can move them.
596+
waitMoveTasksToFinish(isolateRegionPool, isolateRegionTaskList,
597+
admin.getConfiguration().getLong(MOVE_WAIT_MAX_KEY, DEFAULT_MOVE_WAIT_MAX));
598+
599+
Set<RegionInfo> currentRegionsOnTheServer = new HashSet<>(admin.getRegions(server));
600+
if (!currentRegionsOnTheServer.containsAll(isolateRegionInfoList)) {
601+
// If all the regions are not online on the target server,
602+
// we don't put RS in decommission mode and exit from here.
603+
LOG.error("One of the Region move failed OR stuck in transition...Quitting now");
604+
break;
605+
}
606+
} else {
607+
LOG.info("All regions already exists on server : " + server.getHostname());
608+
}
609+
// Once region has been moved to target RS, put the target RS into decommission mode,
610+
// so master doesn't assign new region to the target RS while we unload the target RS.
611+
// Also pass 'offload' flag as false since we don't want master to offload the target RS.
612+
List<ServerName> listOfServer = new ArrayList<>();
613+
listOfServer.add(server);
614+
LOG.info("Putting server : " + server.getHostname() + " in decommission/draining mode");
615+
admin.decommissionRegionServers(listOfServer, false);
616+
}
479617
List<RegionInfo> regionsToMove = admin.getRegions(server);
618+
// Remove all the regions from the online Region list, that we just isolated.
619+
// This will also include hbase:meta if it was isolated.
620+
regionsToMove.removeAll(isolateRegionInfoList);
480621
regionsToMove.removeAll(movedRegions);
481622
if (regionsToMove.isEmpty()) {
482623
LOG.info("No Regions to move....Quitting now");
@@ -488,21 +629,25 @@ private void unloadRegions(ServerName server, List<ServerName> regionServers,
488629
Optional<RegionInfo> metaRegion = getMetaRegionInfoIfToBeMoved(regionsToMove);
489630
if (metaRegion.isPresent()) {
490631
RegionInfo meta = metaRegion.get();
632+
// hbase:meta should move explicitly in Ack mode.
491633
submitRegionMovesWhileUnloading(server, regionServers, movedRegions,
492-
Collections.singletonList(meta));
634+
Collections.singletonList(meta), true);
493635
regionsToMove.remove(meta);
494636
}
495-
submitRegionMovesWhileUnloading(server, regionServers, movedRegions, regionsToMove);
637+
submitRegionMovesWhileUnloading(server, regionServers, movedRegions, regionsToMove, false);
496638
}
497639
}
498640

499641
private void submitRegionMovesWhileUnloading(ServerName server, List<ServerName> regionServers,
500-
List<RegionInfo> movedRegions, List<RegionInfo> regionsToMove) throws Exception {
642+
List<RegionInfo> movedRegions, List<RegionInfo> regionsToMove, boolean forceMoveRegionByAck)
643+
throws Exception {
501644
final ExecutorService moveRegionsPool = Executors.newFixedThreadPool(this.maxthreads);
502645
List<Future<Boolean>> taskList = new ArrayList<>();
503646
int serverIndex = 0;
504647
for (RegionInfo regionToMove : regionsToMove) {
505-
if (ack) {
648+
// To move/isolate hbase:meta on a server, it should happen explicitly by Ack mode, hence the
649+
// forceMoveRegionByAck = true.
650+
if (ack || forceMoveRegionByAck) {
506651
Future<Boolean> task = moveRegionsPool.submit(new MoveWithAck(conn, regionToMove, server,
507652
regionServers.get(serverIndex), movedRegions));
508653
taskList.add(task);
@@ -748,9 +893,17 @@ private ServerName stripServer(List<ServerName> regionServers, String hostname,
748893
@Override
749894
protected void addOptions() {
750895
this.addRequiredOptWithArg("r", "regionserverhost", "region server <hostname>|<hostname:port>");
751-
this.addRequiredOptWithArg("o", "operation", "Expected: load/unload/unload_from_rack");
896+
this.addRequiredOptWithArg("o", "operation",
897+
"Expected: load/unload/unload_from_rack/isolate_regions");
752898
this.addOptWithArg("m", "maxthreads",
753899
"Define the maximum number of threads to use to unload and reload the regions");
900+
this.addOptWithArg("i", "isolateRegionIds",
901+
"Comma separated list of Region IDs hash to isolate on a RegionServer and put region server"
902+
+ " in draining mode. This option should only be used with '-o isolate_regions'."
903+
+ " By putting region server in decommission/draining mode, master can't assign any"
904+
+ " new region on this server. If one or more regions are not found OR failed to isolate"
905+
+ " successfully, utility will exist without putting RS in draining/decommission mode."
906+
+ " Ex. --isolateRegionIds id1,id2,id3 OR -i id1,id2,id3");
754907
this.addOptWithArg("x", "excludefile",
755908
"File with <hostname:port> per line to exclude as unload targets; default excludes only "
756909
+ "target host; useful for rack decommisioning.");
@@ -772,9 +925,14 @@ protected void addOptions() {
772925
protected void processOptions(CommandLine cmd) {
773926
String hostname = cmd.getOptionValue("r");
774927
rmbuilder = new RegionMoverBuilder(hostname);
928+
this.loadUnload = cmd.getOptionValue("o").toLowerCase(Locale.ROOT);
775929
if (cmd.hasOption('m')) {
776930
rmbuilder.maxthreads(Integer.parseInt(cmd.getOptionValue('m')));
777931
}
932+
if (this.loadUnload.equals("isolate_regions") && cmd.hasOption("isolateRegionIds")) {
933+
rmbuilder
934+
.isolateRegionIdArray(Arrays.asList(cmd.getOptionValue("isolateRegionIds").split(",")));
935+
}
778936
if (cmd.hasOption('n')) {
779937
rmbuilder.ack(false);
780938
}
@@ -803,6 +961,15 @@ protected int doWork() throws Exception {
803961
success = rm.unload();
804962
} else if (loadUnload.equalsIgnoreCase("unload_from_rack")) {
805963
success = rm.unloadFromRack();
964+
} else if (loadUnload.equalsIgnoreCase("isolate_regions")) {
965+
if (rm.isolateRegionIdArray != null && !rm.isolateRegionIdArray.isEmpty()) {
966+
success = rm.isolateRegions();
967+
} else {
968+
LOG.error("Missing -i/--isolate_regions option with '-o isolate_regions' option");
969+
LOG.error("Use -h or --help for usage instructions");
970+
printUsage();
971+
success = false;
972+
}
806973
} else {
807974
printUsage();
808975
success = false;

0 commit comments

Comments
 (0)