2424
2525import java .io .Closeable ;
2626import java .io .IOException ;
27+ import java .net .BindException ;
2728import java .net .InetSocketAddress ;
2829import java .util .ArrayList ;
2930import java .util .Arrays ;
3940import java .util .TreeSet ;
4041import java .util .concurrent .Callable ;
4142import java .util .concurrent .ConcurrentHashMap ;
43+ import java .util .concurrent .ConcurrentMap ;
4244import java .util .concurrent .ExecutionException ;
4345import java .util .concurrent .ExecutorService ;
4446import java .util .concurrent .Future ;
8183import org .apache .hadoop .hbase .client .TableDescriptor ;
8284import org .apache .hadoop .hbase .client .TableDescriptorBuilder ;
8385import org .apache .hadoop .hbase .filter .FirstKeyOnlyFilter ;
86+ import org .apache .hadoop .hbase .http .InfoServer ;
8487import org .apache .hadoop .hbase .tool .CanaryTool .RegionTask .TaskType ;
8588import org .apache .hadoop .hbase .util .Bytes ;
8689import org .apache .hadoop .hbase .util .EnvironmentEdgeManager ;
122125 */
123126@ InterfaceAudience .LimitedPrivate (HBaseInterfaceAudience .TOOLS )
124127public class CanaryTool implements Tool , Canary {
/**
 * Configuration key holding the port of the Canary info (web UI) server. A negative value
 * disables the info server.
 */
128+ public static final String HBASE_CANARY_INFO_PORT = "hbase.canary.info.port" ;
129+
/** Port used for the Canary info server when {@link #HBASE_CANARY_INFO_PORT} is not set. */
130+ public static final int DEFAULT_CANARY_INFOPORT = 16050 ;
131+
/**
 * Configuration key holding the address the Canary info server binds to; the web UI setup
 * falls back to "0.0.0.0" when it is not set.
 */
132+ public static final String HBASE_CANARY_INFO_BINDADDRESS = "hbase.canary.info.bindAddress" ;
133+
/** Info server instance created by the web UI setup; stays unset when the UI is not started. */
134+ private InfoServer infoServer ;
135+
136+ private void putUpWebUI () throws IOException {
137+ int port = conf .getInt (HBASE_CANARY_INFO_PORT , DEFAULT_CANARY_INFOPORT );
138+ // -1 is for disabling info server
139+ if (port < 0 ) {
140+ return ;
141+ }
142+ if (zookeeperMode ) {
143+ LOG .info ("WebUI is not supported in Zookeeper mode" );
144+ } else if (regionServerMode ) {
145+ LOG .info ("WebUI is not supported in RegionServer mode" );
146+ } else {
147+ String addr = conf .get (HBASE_CANARY_INFO_BINDADDRESS , "0.0.0.0" );
148+ try {
149+ infoServer = new InfoServer ("canary" , addr , port , false , conf );
150+ infoServer .addUnprivilegedServlet ("canary" , "/canary-status" , CanaryStatusServlet .class );
151+ infoServer .setAttribute ("sink" , this .sink );
152+ infoServer .start ();
153+ LOG .info ("Bind Canary http info server to {}:{} " , addr , port );
154+ } catch (BindException e ) {
155+ LOG .warn ("Failed binding Canary http info server to {}:{}" , addr , port , e );
156+ }
157+ }
158+ }
125159
126160 @ Override
127161 public int checkRegions (String [] targets ) throws Exception {
@@ -273,17 +307,53 @@ public void publishReadTiming(String znode, String server, long msTime) {
273307 public static class RegionStdOutSink extends StdOutSink {
// LongAdder values keyed by String; presumably accumulated read latency per table name
// (inferred from the field name -- confirm against the code that populates it).
274308 private Map <String , LongAdder > perTableReadLatency = new HashMap <>();
// Write latency accumulator exposed through getWriteLatency().
275309 private LongAdder writeLatency = new LongAdder ();
276- private final Map <String , List <RegionTaskResult >> regionMap = new ConcurrentHashMap <>();
// Lists of RegionTaskResult keyed by String; exposed through getRegionMap().
310+ private final ConcurrentMap <String , List <RegionTaskResult >> regionMap =
311+ new ConcurrentHashMap <>();
// Failure counters keyed by the server a failure was reported for; emptied by
// resetFailuresCountDetails().
312+ private ConcurrentMap <ServerName , LongAdder > perServerFailuresCount =
313+ new ConcurrentHashMap <>();
// Failure counters keyed by table name; emptied by resetFailuresCountDetails().
314+ private ConcurrentMap <String , LongAdder > perTableFailuresCount = new ConcurrentHashMap <>();
315+
316+ public ConcurrentMap <ServerName , LongAdder > getPerServerFailuresCount () {
317+ return perServerFailuresCount ;
318+ }
319+
320+ public ConcurrentMap <String , LongAdder > getPerTableFailuresCount () {
321+ return perTableFailuresCount ;
322+ }
323+
324+ public void resetFailuresCountDetails () {
325+ perServerFailuresCount .clear ();
326+ perTableFailuresCount .clear ();
327+ }
328+
329+ private void incFailuresCountDetails (ServerName serverName , RegionInfo region ) {
330+ perServerFailuresCount .compute (serverName , (server , count ) -> {
331+ if (count == null ) {
332+ count = new LongAdder ();
333+ }
334+ count .increment ();
335+ return count ;
336+ });
337+ perTableFailuresCount .compute (region .getTable ().getNameAsString (), (tableName , count ) -> {
338+ if (count == null ) {
339+ count = new LongAdder ();
340+ }
341+ count .increment ();
342+ return count ;
343+ });
344+ }
277345
278346 public void publishReadFailure (ServerName serverName , RegionInfo region , Exception e ) {
279347 incReadFailureCount ();
348+ incFailuresCountDetails (serverName , region );
280349 LOG .error ("Read from {} on serverName={} failed" ,
281350 region .getRegionNameAsString (), serverName , e );
282351 }
283352
284353 public void publishReadFailure (ServerName serverName , RegionInfo region ,
285354 ColumnFamilyDescriptor column , Exception e ) {
286355 incReadFailureCount ();
356+ incFailuresCountDetails (serverName , region );
287357 LOG .error ("Read from {} on serverName={}, columnFamily={} failed" ,
288358 region .getRegionNameAsString (), serverName ,
289359 column .getNameAsString (), e );
@@ -304,12 +374,14 @@ public void publishReadTiming(ServerName serverName, RegionInfo region,
304374
305375 public void publishWriteFailure (ServerName serverName , RegionInfo region , Exception e ) {
306376 incWriteFailureCount ();
377+ incFailuresCountDetails (serverName , region );
307378 LOG .error ("Write to {} on {} failed" , region .getRegionNameAsString (), serverName , e );
308379 }
309380
310381 public void publishWriteFailure (ServerName serverName , RegionInfo region ,
311382 ColumnFamilyDescriptor column , Exception e ) {
312383 incWriteFailureCount ();
384+ incFailuresCountDetails (serverName , region );
313385 LOG .error ("Write to {} on {} {} failed" , region .getRegionNameAsString (), serverName ,
314386 column .getNameAsString (), e );
315387 }
@@ -345,7 +417,7 @@ public LongAdder getWriteLatency() {
345417 return this .writeLatency ;
346418 }
347419
348- public Map <String , List <RegionTaskResult >> getRegionMap () {
420+ public ConcurrentMap <String , List <RegionTaskResult >> getRegionMap () {
349421 return this .regionMap ;
350422 }
351423
@@ -908,6 +980,7 @@ public int run(String[] args) throws Exception {
908980 System .arraycopy (args , index , monitorTargets , 0 , length );
909981 }
910982
983+ putUpWebUI ();
911984 if (zookeeperMode ) {
912985 return checkZooKeeper ();
913986 } else if (regionServerMode ) {
@@ -1352,6 +1425,7 @@ public void run() {
13521425 try {
13531426 List <Future <Void >> taskFutures = new LinkedList <>();
13541427 RegionStdOutSink regionSink = this .getSink ();
1428+ regionSink .resetFailuresCountDetails ();
13551429 if (this .targets != null && this .targets .length > 0 ) {
13561430 String [] tables = generateMonitorTables (this .targets );
13571431 // Check to see that each table name passed in the -readTableTimeouts argument is also
0 commit comments