181181import org .apache .hadoop .hbase .mob .MobFileCompactionChore ;
182182import org .apache .hadoop .hbase .monitoring .MemoryBoundedLogMessageBuffer ;
183183import org .apache .hadoop .hbase .monitoring .MonitoredTask ;
184+ import org .apache .hadoop .hbase .monitoring .TaskGroup ;
184185import org .apache .hadoop .hbase .monitoring .TaskMonitor ;
185186import org .apache .hadoop .hbase .namequeues .NamedQueueRecorder ;
186187import org .apache .hadoop .hbase .procedure .MasterProcedureManagerHost ;
@@ -462,6 +463,8 @@ public class HMaster extends HBaseServerBase<MasterRpcServices> implements Maste
462463 public static final String WARMUP_BEFORE_MOVE = "hbase.master.warmup.before.move" ;
463464 private static final boolean DEFAULT_WARMUP_BEFORE_MOVE = true ;
464465
466+ private TaskGroup startupTaskGroup ;
467+
465468 /**
466469 * Initializes the HMaster. The steps are as follows:
467470 * <p>
@@ -908,12 +911,12 @@ private void tryMigrateMetaLocationsFromZooKeeper() throws IOException, KeeperEx
908911 * Notice that now we will not schedule a special procedure to make meta online(unless the first
909912 * time where meta has not been created yet), we will rely on SCP to bring meta online.
910913 */
911- private void finishActiveMasterInitialization (MonitoredTask status )
914+ private void finishActiveMasterInitialization (TaskGroup startupTaskGroup )
912915 throws IOException , InterruptedException , KeeperException , ReplicationException {
913916 /*
914917 * We are active master now... go initialize components we need to run.
915918 */
916- status . setStatus ("Initializing Master file system" );
919+ startupTaskGroup . addTask ("Initializing Master file system" );
917920
918921 this .masterActiveTime = EnvironmentEdgeManager .currentTime ();
919922 // TODO: Do this using Dependency Injection, using PicoContainer, Guice or Spring.
@@ -926,15 +929,15 @@ private void finishActiveMasterInitialization(MonitoredTask status)
926929
927930 // warm-up HTDs cache on master initialization
928931 if (preLoadTableDescriptors ) {
929- status . setStatus ("Pre-loading table descriptors" );
932+ startupTaskGroup . addTask ("Pre-loading table descriptors" );
930933 this .tableDescriptors .getAll ();
931934 }
932935
933936 // Publish cluster ID; set it in Master too. The superclass RegionServer does this later but
934937 // only after it has checked in with the Master. At least a few tests ask Master for clusterId
935938 // before it has called its run method and before RegionServer has done the reportForDuty.
936939 ClusterId clusterId = fileSystemManager .getClusterId ();
937- status . setStatus ("Publishing Cluster ID " + clusterId + " in ZooKeeper" );
940+ startupTaskGroup . addTask ("Publishing Cluster ID " + clusterId + " in ZooKeeper" );
938941 ZKClusterId .setClusterId (this .zooKeeper , fileSystemManager .getClusterId ());
939942 this .clusterId = clusterId .toString ();
940943
@@ -953,7 +956,7 @@ private void finishActiveMasterInitialization(MonitoredTask status)
953956 }
954957 }
955958
956- status . setStatus ("Initialize ServerManager and schedule SCP for crash servers" );
959+ startupTaskGroup . addTask ("Initialize ServerManager and schedule SCP for crash servers" );
957960 // The below two managers must be created before loading procedures, as they will be used during
958961 // loading.
959962 // initialize master local region
@@ -1000,9 +1003,9 @@ private void finishActiveMasterInitialization(MonitoredTask status)
10001003 // This manager must be accessed AFTER hbase:meta is confirmed online.
10011004 this .tableStateManager = new TableStateManager (this );
10021005
1003- status . setStatus ("Initializing ZK system trackers" );
1006+ startupTaskGroup . addTask ("Initializing ZK system trackers" );
10041007 initializeZKBasedSystemTrackers ();
1005- status . setStatus ("Loading last flushed sequence id of regions" );
1008+ startupTaskGroup . addTask ("Loading last flushed sequence id of regions" );
10061009 try {
10071010 this .serverManager .loadLastFlushedSequenceIds ();
10081011 } catch (IOException e ) {
@@ -1018,7 +1021,7 @@ private void finishActiveMasterInitialization(MonitoredTask status)
10181021 zombieDetector .start ();
10191022
10201023 if (!maintenanceMode ) {
1021- status . setStatus ("Initializing master coprocessors" );
1024+ startupTaskGroup . addTask ("Initializing master coprocessors" );
10221025 setQuotasObserver (conf );
10231026 initializeCoprocessorHost (conf );
10241027 } else {
@@ -1029,7 +1032,7 @@ private void finishActiveMasterInitialization(MonitoredTask status)
10291032 }
10301033
10311034 // Checking if meta needs initializing.
1032- status . setStatus ("Initializing meta table if this is a new deploy" );
1035+ startupTaskGroup . addTask ("Initializing meta table if this is a new deploy" );
10331036 InitMetaProcedure initMetaProc = null ;
10341037 // Print out state of hbase:meta on startup; helps debugging.
10351038 if (!this .assignmentManager .getRegionStates ().hasTableRegionStates (TableName .META_TABLE_NAME )) {
@@ -1049,7 +1052,7 @@ private void finishActiveMasterInitialization(MonitoredTask status)
10491052 this .balancer .updateClusterMetrics (getClusterMetricsWithoutCoprocessor ());
10501053
10511054 // start up all service threads.
1052- status . setStatus ("Initializing master service threads" );
1055+ startupTaskGroup . addTask ("Initializing master service threads" );
10531056 startServiceThreads ();
10541057 // wait meta to be initialized after we start procedure executor
10551058 if (initMetaProc != null ) {
@@ -1062,16 +1065,16 @@ private void finishActiveMasterInitialization(MonitoredTask status)
10621065 // With this as part of master initialization, it precludes our being able to start a single
10631066 // server that is both Master and RegionServer. Needs more thought. TODO.
10641067 String statusStr = "Wait for region servers to report in" ;
1065- status . setStatus (statusStr );
1066- LOG .info (Objects .toString (status ));
1067- waitForRegionServers (status );
1068+ MonitoredTask waitRegionServer = startupTaskGroup . addTask (statusStr );
1069+ LOG .info (Objects .toString (waitRegionServer ));
1070+ waitForRegionServers (waitRegionServer );
10681071
10691072 // Check if master is shutting down because of an issue initializing regionservers or balancer.
10701073 if (isStopped ()) {
10711074 return ;
10721075 }
10731076
1074- status . setStatus ("Starting assignment manager" );
1077+ startupTaskGroup . addTask ("Starting assignment manager" );
10751078 // FIRST HBASE:META READ!!!!
10761079 // The below cannot make progress w/o hbase:meta being online.
10771080 // This is the FIRST attempt at going to hbase:meta. Meta on-lining is going on in background
@@ -1136,7 +1139,7 @@ private void finishActiveMasterInitialization(MonitoredTask status)
11361139 this .balancer .updateClusterMetrics (getClusterMetricsWithoutCoprocessor ());
11371140
11381141 // Start balancer and meta catalog janitor after meta and regions have been assigned.
1139- status . setStatus ("Starting balancer and catalog janitor" );
1142+ startupTaskGroup . addTask ("Starting balancer and catalog janitor" );
11401143 this .clusterStatusChore = new ClusterStatusChore (this , balancer );
11411144 getChoreService ().scheduleChore (clusterStatusChore );
11421145 this .balancerChore = new BalancerChore (this );
@@ -1156,7 +1159,7 @@ private void finishActiveMasterInitialization(MonitoredTask status)
11561159 if (!waitForNamespaceOnline ()) {
11571160 return ;
11581161 }
1159- status . setStatus ("Starting cluster schema service" );
1162+ startupTaskGroup . addTask ("Starting cluster schema service" );
11601163 try {
11611164 initClusterSchemaService ();
11621165 } catch (IllegalStateException e ) {
@@ -1179,7 +1182,6 @@ private void finishActiveMasterInitialization(MonitoredTask status)
11791182 }
11801183 }
11811184
1182- status .markComplete ("Initialization successful" );
11831185 LOG .info (String .format ("Master has completed initialization %.3fsec" ,
11841186 (EnvironmentEdgeManager .currentTime () - masterActiveTime ) / 1000.0f ));
11851187 this .masterFinishedInitializationTime = EnvironmentEdgeManager .currentTime ();
@@ -1198,6 +1200,8 @@ private void finishActiveMasterInitialization(MonitoredTask status)
11981200 }
11991201 // Set master as 'initialized'.
12001202 setInitialized (true );
1203+ startupTaskGroup .markComplete ("Initialization successful" );
1204+ MonitoredTask afterInitialized = startupTaskGroup .addTask ("Progress after master initialized" );
12011205
12021206 if (tableFamilyDesc == null && replBarrierFamilyDesc == null ) {
12031207 // create missing CFs in meta table after master is set to 'initialized'.
@@ -1228,7 +1232,7 @@ private void finishActiveMasterInitialization(MonitoredTask status)
12281232 }
12291233
12301234 assignmentManager .checkIfShouldMoveSystemRegionAsync ();
1231- status .setStatus ("Starting quota manager" );
1235+ afterInitialized .setStatus ("Starting quota manager" );
12321236 initQuotaManager ();
12331237 if (QuotaUtil .isQuotaEnabled (conf )) {
12341238 // Create the quota snapshot notifier
@@ -1251,13 +1255,13 @@ private void finishActiveMasterInitialization(MonitoredTask status)
12511255 this .serverManager .clearDeadServersWithSameHostNameAndPortOfOnlineServer ();
12521256
12531257 // Check and set the znode ACLs if needed in case we are overtaking a non-secure configuration
1254- status .setStatus ("Checking ZNode ACLs" );
1258+ afterInitialized .setStatus ("Checking ZNode ACLs" );
12551259 zooKeeper .checkAndSetZNodeAcls ();
12561260
1257- status .setStatus ("Initializing MOB Cleaner" );
1261+ afterInitialized .setStatus ("Initializing MOB Cleaner" );
12581262 initMobCleaner ();
12591263
1260- status .setStatus ("Calling postStartMaster coprocessors" );
1264+ afterInitialized .setStatus ("Calling postStartMaster coprocessors" );
12611265 if (this .cpHost != null ) {
12621266 // don't let cp initialization errors kill the master
12631267 try {
@@ -1282,6 +1286,7 @@ private void finishActiveMasterInitialization(MonitoredTask status)
12821286
12831287 this .rollingUpgradeChore = new RollingUpgradeChore (this );
12841288 getChoreService ().scheduleChore (rollingUpgradeChore );
1289+ afterInitialized .markComplete ("Progress after master initialized complete" );
12851290 }
12861291
12871292 private void createMissingCFsInMetaDuringUpgrade (TableDescriptor metaDescriptor )
@@ -2401,14 +2406,16 @@ private void startActiveMasterManager(int infoPort) throws KeeperException {
24012406 Threads .sleep (timeout );
24022407 }
24032408 }
2404- MonitoredTask status = TaskMonitor .get ().createStatus ("Master startup" );
2405- status .setDescription ("Master startup" );
2409+ boolean ignoreClearStartupStatus =
2410+ conf .getBoolean ("hbase.master.ignore.clear.startup.status" , true );
2411+ startupTaskGroup = TaskGroup .createTaskGroup (ignoreClearStartupStatus );
2412+ startupTaskGroup .setDescription ("Master startup" );
24062413 try {
2407- if (activeMasterManager .blockUntilBecomingActiveMaster (timeout , status )) {
2408- finishActiveMasterInitialization (status );
2414+ if (activeMasterManager .blockUntilBecomingActiveMaster (timeout , startupTaskGroup )) {
2415+ finishActiveMasterInitialization (startupTaskGroup );
24092416 }
24102417 } catch (Throwable t ) {
2411- status . setStatus ("Failed to become active: " + t . getMessage () );
2418+ startupTaskGroup . abort ("Failed to become active master" );
24122419 LOG .error (HBaseMarkers .FATAL , "Failed to become active master" , t );
24132420 // HBASE-5680: Likely hadoop23 vs hadoop 20.x/1.x incompatibility
24142421 if (
@@ -2423,7 +2430,9 @@ private void startActiveMasterManager(int infoPort) throws KeeperException {
24232430 abort ("Unhandled exception. Starting shutdown." , t );
24242431 }
24252432 } finally {
2426- status .cleanup ();
2433+ if (!ignoreClearStartupStatus ) {
2434+ startupTaskGroup .cleanup ();
2435+ }
24272436 }
24282437 }
24292438
@@ -3099,6 +3108,10 @@ public MemoryBoundedLogMessageBuffer getRegionServerFatalLogBuffer() {
30993108 return rsFatals ;
31003109 }
31013110
/**
 * Returns the {@link TaskGroup} held in the {@code startupTaskGroup} field, i.e. the group that
 * {@code startActiveMasterManager} creates and describes as "Master startup".
 * <p>
 * NOTE(review): the field has no initializer and its only assignment visible here is in
 * {@code startActiveMasterManager}, so this presumably returns {@code null} until that method has
 * run -- confirm against callers.
 * @return the startup task group, exactly as currently stored in the field
 */
3111+ public TaskGroup getStartupProgress () {
3112+ return startupTaskGroup ;
3113+ }
3114+
31023115 /**
31033116 * Shutdown the cluster. Master runs a coordinated stop of all RegionServers and then itself.
31043117 */
0 commit comments