From 3a009e90852951cb679b0e675e3ea8ae2ef72d09 Mon Sep 17 00:00:00 2001 From: Aravindan Vijayan Date: Tue, 27 Aug 2019 22:39:37 -0700 Subject: [PATCH 1/5] HDDS-1783 : Latency metric for applyTransaction in ContainerStateMachine. --- .../common/transport/server/ratis/CSMMetrics.java | 10 ++++++++++ .../transport/server/ratis/ContainerStateMachine.java | 7 ++++++- 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/CSMMetrics.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/CSMMetrics.java index ccf57cb2f36e3..28d2cb0397a0e 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/CSMMetrics.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/CSMMetrics.java @@ -60,6 +60,8 @@ public class CSMMetrics { private @Metric MutableCounterLong numStartTransactionVerifyFailures; private @Metric MutableCounterLong numContainerNotOpenVerifyFailures; + private @Metric MutableRate applyTransactionLatency; + public CSMMetrics() { int numCmdTypes = ContainerProtos.Type.values().length; this.opsLatency = new MutableRate[numCmdTypes]; @@ -181,6 +183,11 @@ public long getNumBytesCommittedCount() { return numBytesCommittedCount.value(); } + @VisibleForTesting + public MutableRate getApplyTransactionLatency() { + return applyTransactionLatency; + } + public void incPipelineLatency(ContainerProtos.Type type, long latencyNanos) { opsLatency[type.ordinal()].add(latencyNanos); transactionLatency.add(latencyNanos); @@ -194,6 +201,9 @@ public void incNumContainerNotOpenVerifyFailures() { numContainerNotOpenVerifyFailures.incr(); } + public void recordApplyTransactionLatency(long latencyNanos) { + applyTransactionLatency.add(latencyNanos); + } public void unRegister() { MetricsSystem ms = DefaultMetricsSystem.instance(); diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/ContainerStateMachine.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/ContainerStateMachine.java index aadec8dcd7ca7..5f71a46dfe0af 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/ContainerStateMachine.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/ContainerStateMachine.java @@ -665,6 +665,7 @@ public CompletableFuture applyTransaction(TransactionContext trx) { .setTerm(trx.getLogEntry().getTerm()) .setLogIndex(index); + long applyTxnStartTime = Time.monotonicNowNanos(); try { applyTransactionSemaphore.acquire(); metrics.incNumApplyTransactionsOps(); @@ -732,7 +733,11 @@ public CompletableFuture applyTransaction(TransactionContext trx) { } } return applyTransactionFuture; - }).whenComplete((r, t) -> applyTransactionSemaphore.release()); + }).whenComplete((r, t) -> { + applyTransactionSemaphore.release(); + metrics.recordApplyTransactionLatency( + Time.monotonicNowNanos() - applyTxnStartTime); + }); return applyTransactionFuture; } catch (IOException | InterruptedException e) { metrics.incNumApplyTransactionsFails(); From 643eb4ba4147cafb51f770624084fc4287783ef8 Mon Sep 17 00:00:00 2001 From: Aravindan Vijayan Date: Wed, 28 Aug 2019 11:25:56 -0700 Subject: [PATCH 2/5] HDDS-1783 : Latency metric for applyTransaction in ContainerStateMachine. (Add a rate metric for WriteStateMachineData) --- .../common/transport/server/ratis/CSMMetrics.java | 12 ++++++++---- .../server/ratis/ContainerStateMachine.java | 10 +++++++--- 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/CSMMetrics.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/CSMMetrics.java index 28d2cb0397a0e..f6bab959f42ae 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/CSMMetrics.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/CSMMetrics.java @@ -60,7 +60,8 @@ public class CSMMetrics { private @Metric MutableCounterLong numStartTransactionVerifyFailures; private @Metric MutableCounterLong numContainerNotOpenVerifyFailures; - private @Metric MutableRate applyTransactionLatency; + private @Metric MutableRate applyTransaction; + private @Metric MutableRate writeStateMachineData; public CSMMetrics() { int numCmdTypes = ContainerProtos.Type.values().length; @@ -183,7 +184,6 @@ public long getNumBytesCommittedCount() { return numBytesCommittedCount.value(); } - @VisibleForTesting public MutableRate getApplyTransactionLatency() { return applyTransactionLatency; } @@ -201,8 +201,12 @@ public void incNumContainerNotOpenVerifyFailures() { numContainerNotOpenVerifyFailures.incr(); } - public void recordApplyTransactionLatency(long latencyNanos) { - applyTransactionLatency.add(latencyNanos); + public void recordApplyTransactionCompletion(long latencyNanos) { + applyTransaction.add(latencyNanos); + } + + public void recordWriteStateMachineCompletion(long latencyNanos) { + writeStateMachineData.add(latencyNanos); } public void unRegister() { diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/ContainerStateMachine.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/ContainerStateMachine.java index 5f71a46dfe0af..1876c045ecb46 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/ContainerStateMachine.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/ContainerStateMachine.java @@ -410,7 +410,8 @@ private ExecutorService getCommandExecutor( } private CompletableFuture handleWriteChunk( - ContainerCommandRequestProto requestProto, long entryIndex, long term) { + ContainerCommandRequestProto requestProto, long entryIndex, long term, + long startTime) { final WriteChunkRequestProto write = requestProto.getWriteChunk(); RaftServer server = ratisServer.getServer(); Preconditions.checkState(server instanceof RaftServerProxy); @@ -460,6 +461,8 @@ private CompletableFuture handleWriteChunk( write.getBlockID() + " logIndex " + entryIndex + " chunkName " + write.getChunkData().getChunkName()); raftFuture.complete(r::toByteString); + metrics.recordWriteStateMachineCompletion( + Time.monotonicNowNanos() - startTime); } writeChunkFutureMap.remove(entryIndex); @@ -476,6 +479,7 @@ private CompletableFuture handleWriteChunk( public CompletableFuture writeStateMachineData(LogEntryProto entry) { try { metrics.incNumWriteStateMachineOps(); + long writeStateMachineStartTime = Time.monotonicNowNanos(); ContainerCommandRequestProto requestProto = getContainerCommandRequestProto( entry.getStateMachineLogEntry().getLogData()); @@ -492,7 +496,7 @@ public CompletableFuture writeStateMachineData(LogEntryProto entry) { switch (cmdType) { case WriteChunk: return handleWriteChunk(requestProto, entry.getIndex(), - entry.getTerm()); + entry.getTerm(), writeStateMachineStartTime); default: throw new IllegalStateException("Cmd Type:" + cmdType + " should not have state machine data"); @@ -735,7 +739,7 @@ public CompletableFuture applyTransaction(TransactionContext trx) { return applyTransactionFuture; }).whenComplete((r, t) -> { applyTransactionSemaphore.release(); - metrics.recordApplyTransactionLatency( + metrics.recordApplyTransactionCompletion( Time.monotonicNowNanos() - applyTxnStartTime); }); return applyTransactionFuture; From 9bdbea3237dfa3c548a824657fdb26f0376c3b2a Mon Sep 17 00:00:00 2001 From: Aravindan Vijayan Date: Wed, 28 Aug 2019 11:27:22 -0700 Subject: [PATCH 3/5] HDDS-1783 : Latency metric for applyTransaction in ContainerStateMachine. (Fix compilation issue) --- .../container/common/transport/server/ratis/CSMMetrics.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/CSMMetrics.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/CSMMetrics.java index f6bab959f42ae..f27cd6d57a280 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/CSMMetrics.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/CSMMetrics.java @@ -185,7 +185,7 @@ public long getNumBytesCommittedCount() { } public MutableRate getApplyTransactionLatency() { - return applyTransactionLatency; + return applyTransaction; } public void incPipelineLatency(ContainerProtos.Type type, long latencyNanos) { From 6d82c08205fd70a933ec3fbfd48e4e25a70fa70e Mon Sep 17 00:00:00 2001 From: Aravindan Vijayan Date: Thu, 29 Aug 2019 10:44:00 -0700 Subject: [PATCH 4/5] HDDS-1783 : Latency metric for applyTransaction in ContainerStateMachine. (Add unit test.) --- .../transport/server/ratis/TestCSMMetrics.java | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/TestCSMMetrics.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/TestCSMMetrics.java index 4853059a52f6e..8e3c9578fd156 100644 --- a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/TestCSMMetrics.java +++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/TestCSMMetrics.java @@ -19,6 +19,7 @@ package org.apache.hadoop.ozone.container.common.transport.server.ratis; import static org.apache.hadoop.test.MetricsAsserts.assertCounter; +import static org.apache.hadoop.test.MetricsAsserts.getDoubleGauge; import static org.apache.hadoop.test.MetricsAsserts.getMetrics; import java.io.File; @@ -49,6 +50,8 @@ import org.apache.hadoop.hdds.conf.OzoneConfiguration; import static org.apache.ratis.rpc.SupportedRpcType.GRPC; +import static org.junit.Assert.assertTrue; + import org.apache.ratis.protocol.RaftGroupId; import org.apache.ratis.util.function.CheckedBiConsumer; @@ -118,6 +121,12 @@ static void runContainerStateMachineMetrics( assertCounter("NumStartTransactionVerifyFailures", 0L, metric); assertCounter("NumContainerNotOpenVerifyFailures", 0L, metric); assertCounter("WriteChunkNumOps", 0L, metric); + double applyTransactionLatency = getDoubleGauge( + "ApplyTransactionAvgTime", metric); + assertTrue(applyTransactionLatency == 0.0); + double writeStateMachineLatency = getDoubleGauge( + "WriteStateMachineDataAvgTime", metric); + assertTrue(writeStateMachineLatency == 0.0); // Write Chunk BlockID blockID = ContainerTestHelper.getTestBlockID(ContainerTestHelper. @@ -152,6 +161,13 @@ static void runContainerStateMachineMetrics( RaftGroupId.valueOf(pipeline.getId().getId()).toString()); assertCounter("NumQueryStateMachineOps", 1L, metric); assertCounter("NumApplyTransactionOps", 1L, metric); + applyTransactionLatency = getDoubleGauge( + "ApplyTransactionAvgTime", metric); + assertTrue(applyTransactionLatency > 0.0); + writeStateMachineLatency = getDoubleGauge( + "WriteStateMachineDataAvgTime", metric); + assertTrue(writeStateMachineLatency > 0.0); + } finally { if (client != null) { client.close(); From d567fe4b29c8161c2eb39efebdc710f5a108fc8c Mon Sep 17 00:00:00 2001 From: Aravindan Vijayan Date: Thu, 29 Aug 2019 15:29:20 -0700 Subject: [PATCH 5/5] HDDS-1783 : Latency metric for applyTransaction in ContainerStateMachine. (Fix typo.) --- hadoop-ozone/dist/src/main/compose/ozonesecure-mr/docker-config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hadoop-ozone/dist/src/main/compose/ozonesecure-mr/docker-config b/hadoop-ozone/dist/src/main/compose/ozonesecure-mr/docker-config index 6565eefeac4c0..f5c5fbd230bda 100644 --- a/hadoop-ozone/dist/src/main/compose/ozonesecure-mr/docker-config +++ b/hadoop-ozone/dist/src/main/compose/ozonesecure-mr/docker-config @@ -47,7 +47,7 @@ HDFS-SITE.XML_dfs.datanode.address=0.0.0.0:1019 HDFS-SITE.XML_dfs.datanode.http.address=0.0.0.0:1012 CORE-SITE.XML_dfs.data.transfer.protection=authentication CORE-SITE.XML_hadoop.security.authentication=kerberos -COER-SITE.XML_hadoop.security.auth_to_local=RULE:[2:$1@$0](.*@EXAMPLE.COM)s/@.*///L +CORE-SITE.XML_hadoop.security.auth_to_local=RULE:[2:$1@$0](.*@EXAMPLE.COM)s/@.*///L CORE-SITE.XML_hadoop.security.key.provider.path=kms://http@kms:9600/kms #temporary disable authorization as org.apache.hadoop.yarn.server.api.ResourceTrackerPB is not properly annotated to support it