From 73b3a92a72054822c84e9fd778e88981d692394c Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Thu, 7 Dec 2023 15:44:49 -0800 Subject: [PATCH 1/2] [SPARK-46313][CORE] Log Spark HA recovery duration --- .../main/scala/org/apache/spark/deploy/master/Master.scala | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/core/src/main/scala/org/apache/spark/deploy/master/Master.scala b/core/src/main/scala/org/apache/spark/deploy/master/Master.scala index 56a4530e20b9..2643af81791f 100644 --- a/core/src/main/scala/org/apache/spark/deploy/master/Master.scala +++ b/core/src/main/scala/org/apache/spark/deploy/master/Master.scala @@ -604,8 +604,11 @@ private[deploy] class Master( workers.count(_.state == WorkerState.UNKNOWN) == 0 && apps.count(_.state == ApplicationState.UNKNOWN) == 0 + private var recoveryStartTimeNs = 0L + private def beginRecovery(storedApps: Seq[ApplicationInfo], storedDrivers: Seq[DriverInfo], storedWorkers: Seq[WorkerInfo]): Unit = { + recoveryStartTimeNs = System.nanoTime() for (app <- storedApps) { logInfo("Trying to recover app: " + app.id) try { @@ -662,7 +665,8 @@ private[deploy] class Master( state = RecoveryState.ALIVE schedule() - logInfo("Recovery complete - resuming operations!") + val timeTakenNs = System.nanoTime() - recoveryStartTimeNs + logInfo(s"Recovery complete in ${timeTakenNs / 1000000000d}s - resuming operations!") } /** From 09ca410f7d7c4a46d8da01a136392649d5d522b8 Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Thu, 7 Dec 2023 18:27:23 -0800 Subject: [PATCH 2/2] Address comments --- core/src/main/scala/org/apache/spark/deploy/master/Master.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/main/scala/org/apache/spark/deploy/master/Master.scala b/core/src/main/scala/org/apache/spark/deploy/master/Master.scala index 2643af81791f..c8107bbe4cd9 100644 --- a/core/src/main/scala/org/apache/spark/deploy/master/Master.scala +++ b/core/src/main/scala/org/apache/spark/deploy/master/Master.scala @@ -666,7 +666,7 @@ private[deploy] class Master( state = RecoveryState.ALIVE schedule() val timeTakenNs = System.nanoTime() - recoveryStartTimeNs - logInfo(s"Recovery complete in ${timeTakenNs / 1000000000d}s - resuming operations!") + logInfo(f"Recovery complete in ${timeTakenNs / 1000000000d}%.3fs - resuming operations!") } /**