From 9149cfb800d4baa96ec7e43d8716ac228810ebc1 Mon Sep 17 00:00:00 2001 From: gkc2104 Date: Tue, 4 Apr 2017 17:04:25 -0700 Subject: [PATCH 1/9] [SPARK-4899][MESOS] Support for checkpointing on Coarse and Fine grained schedulers --- docs/running-on-mesos.md | 20 +++++++++++++++++++ .../MesosCoarseGrainedSchedulerBackend.scala | 4 ++-- .../MesosFineGrainedSchedulerBackend.scala | 4 ++-- 3 files changed, 24 insertions(+), 4 deletions(-) diff --git a/docs/running-on-mesos.md b/docs/running-on-mesos.md index 8d5ad12cb85be..6679a05bf0dc6 100644 --- a/docs/running-on-mesos.md +++ b/docs/running-on-mesos.md @@ -516,6 +516,26 @@ See the [configuration page](configuration.html) for information on Spark config Fetcher Cache + + spark.mesos.checkpoint + false + + If set, agents running tasks started by this framework will write the framework pid, executor pids and status updates to disk. + If the agent exits (e.g., due to a crash or as part of upgrading Mesos), this checkpointed data allows the restarted agent to + reconnect to executors that were started by the old instance of the agent. Enabling checkpointing improves fault tolerance, + at the cost of a (usually small) increase in disk I/O. + + + + spark.mesos.failoverTimeout + 0.0 + + The amount of time (in seconds) that the master will wait for thescheduler to failover before it tears down the framework + by killing all its tasks/executors. This should be non-zero if aframework expects to reconnect after a failure and not lose + its tasks/executors. + NOTE: To avoid accidental destruction of tasks, productionframeworks typically set this to a large value (e.g., 1 week). 
+ + # Troubleshooting and Debugging diff --git a/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala b/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala index f555072c3842a..e5f5c3443531d 100644 --- a/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala +++ b/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala @@ -158,8 +158,8 @@ private[spark] class MesosCoarseGrainedSchedulerBackend( sc.appName, sc.conf, sc.conf.getOption("spark.mesos.driver.webui.url").orElse(sc.ui.map(_.webUrl)), - None, - None, + sc.conf.getOption("spark.mesos.checkpoint"), + sc.conf.getOption("spark.mesos.failoverTimeout"), sc.conf.getOption("spark.mesos.driver.frameworkId") ) diff --git a/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosFineGrainedSchedulerBackend.scala b/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosFineGrainedSchedulerBackend.scala index 7e561916a71e2..e90ab60c9cd0f 100644 --- a/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosFineGrainedSchedulerBackend.scala +++ b/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosFineGrainedSchedulerBackend.scala @@ -78,8 +78,8 @@ private[spark] class MesosFineGrainedSchedulerBackend( sc.appName, sc.conf, sc.conf.getOption("spark.mesos.driver.webui.url").orElse(sc.ui.map(_.webUrl)), - Option.empty, - Option.empty, + sc.conf.getOption("spark.mesos.checkpoint"), + sc.conf.getOption("spark.mesos.failoverTimeout"), sc.conf.getOption("spark.mesos.driver.frameworkId") ) From 0cbee4e47cfb4e31727559977d392b604438ec09 Mon Sep 17 00:00:00 2001 From: gkc2104 Date: Wed, 5 Apr 2017 12:19:12 -0700 Subject: [PATCH 2/9] 
[SPARK-4899][MESOS] Updated implementation --- core/src/main/scala/org/apache/spark/SparkConf.scala | 9 +++++++++ .../test/scala/org/apache/spark/SparkConfSuite.scala | 12 ++++++++++++ .../mesos/MesosCoarseGrainedSchedulerBackend.scala | 4 ++-- .../mesos/MesosFineGrainedSchedulerBackend.scala | 4 ++-- 4 files changed, 25 insertions(+), 4 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/SparkConf.scala b/core/src/main/scala/org/apache/spark/SparkConf.scala index fe912e639bcbc..7aa1da17ef867 100644 --- a/core/src/main/scala/org/apache/spark/SparkConf.scala +++ b/core/src/main/scala/org/apache/spark/SparkConf.scala @@ -406,6 +406,15 @@ class SparkConf(loadDefaults: Boolean) extends Cloneable with Logging with Seria def getBoolean(key: String, defaultValue: Boolean): Boolean = { getOption(key).map(_.toBoolean).getOrElse(defaultValue) } + /** Get a parameter as a Option[Boolean] */ + def getOptionBoolean(key: String): Option[Boolean] = { + getOption(key).map(x => x.toBoolean) + } + + /** Get a parameter as a Option[Double] */ + def getOptionDouble(key: String): Option[Double] = { + getOption(key).map(_.toDouble) + } /** Get all executor environment variables set on this SparkConf */ def getExecutorEnv: Seq[(String, String)] = { diff --git a/core/src/test/scala/org/apache/spark/SparkConfSuite.scala b/core/src/test/scala/org/apache/spark/SparkConfSuite.scala index 0897891ee1758..48139c3f06b3f 100644 --- a/core/src/test/scala/org/apache/spark/SparkConfSuite.scala +++ b/core/src/test/scala/org/apache/spark/SparkConfSuite.scala @@ -108,6 +108,18 @@ class SparkConfSuite extends SparkFunSuite with LocalSparkContext with ResetSyst assert(conf.getOption("k4") === None) } + test("option get and set") { + + // These tests will fail + + val conf = new SparkConf(false) + assert(conf.getAll.toSet === Set()) + assert(conf.getOptionBoolean("key") === None) + assert(conf.getOptionDouble("key2") === None) + conf.set("lol", "fail") + assert(conf.getBoolean("lol", false) === 
1) + } + test("creating SparkContext without master and app name") { val conf = new SparkConf(false) intercept[SparkException] { sc = new SparkContext(conf) } diff --git a/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala b/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala index e5f5c3443531d..621390052dc77 100644 --- a/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala +++ b/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala @@ -158,8 +158,8 @@ private[spark] class MesosCoarseGrainedSchedulerBackend( sc.appName, sc.conf, sc.conf.getOption("spark.mesos.driver.webui.url").orElse(sc.ui.map(_.webUrl)), - sc.conf.getOption("spark.mesos.checkpoint"), - sc.conf.getOption("spark.mesos.failoverTimeout"), + sc.conf.getOptionBoolean("spark.mesos.checkpoint"), + sc.conf.getOptionDouble("spark.mesos.failoverTimeout"), sc.conf.getOption("spark.mesos.driver.frameworkId") ) diff --git a/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosFineGrainedSchedulerBackend.scala b/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosFineGrainedSchedulerBackend.scala index e90ab60c9cd0f..0699c097277ca 100644 --- a/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosFineGrainedSchedulerBackend.scala +++ b/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosFineGrainedSchedulerBackend.scala @@ -78,8 +78,8 @@ private[spark] class MesosFineGrainedSchedulerBackend( sc.appName, sc.conf, sc.conf.getOption("spark.mesos.driver.webui.url").orElse(sc.ui.map(_.webUrl)), - sc.conf.getOption("spark.mesos.checkpoint"), - sc.conf.getOption("spark.mesos.failoverTimeout"), + 
sc.conf.getOptionBoolean("spark.mesos.checkpoint"), + sc.conf.getOptionDouble("spark.mesos.failoverTimeout"), sc.conf.getOption("spark.mesos.driver.frameworkId") ) From d94b2f24d5fe1f67b3c79b0ed50e08d0a07cc416 Mon Sep 17 00:00:00 2001 From: Kamal Gurala Date: Thu, 20 Apr 2017 11:37:16 -0700 Subject: [PATCH 3/9] [SPARK-4899][MESOS] Updated implementation --- .../main/scala/org/apache/spark/SparkConf.scala | 9 --------- .../scala/org/apache/spark/SparkConfSuite.scala | 12 ------------ .../mesos/MesosCoarseGrainedSchedulerBackend.scala | 4 ++-- .../mesos/MesosFineGrainedSchedulerBackend.scala | 4 ++-- .../MesosCoarseGrainedSchedulerBackendSuite.scala | 14 ++++++++++++++ 5 files changed, 18 insertions(+), 25 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/SparkConf.scala b/core/src/main/scala/org/apache/spark/SparkConf.scala index 7aa1da17ef867..fe912e639bcbc 100644 --- a/core/src/main/scala/org/apache/spark/SparkConf.scala +++ b/core/src/main/scala/org/apache/spark/SparkConf.scala @@ -406,15 +406,6 @@ class SparkConf(loadDefaults: Boolean) extends Cloneable with Logging with Seria def getBoolean(key: String, defaultValue: Boolean): Boolean = { getOption(key).map(_.toBoolean).getOrElse(defaultValue) } - /** Get a parameter as a Option[Boolean] */ - def getOptionBoolean(key: String): Option[Boolean] = { - getOption(key).map(x => x.toBoolean) - } - - /** Get a parameter as a Option[Double] */ - def getOptionDouble(key: String): Option[Double] = { - getOption(key).map(_.toDouble) - } /** Get all executor environment variables set on this SparkConf */ def getExecutorEnv: Seq[(String, String)] = { diff --git a/core/src/test/scala/org/apache/spark/SparkConfSuite.scala b/core/src/test/scala/org/apache/spark/SparkConfSuite.scala index 48139c3f06b3f..0897891ee1758 100644 --- a/core/src/test/scala/org/apache/spark/SparkConfSuite.scala +++ b/core/src/test/scala/org/apache/spark/SparkConfSuite.scala @@ -108,18 +108,6 @@ class SparkConfSuite extends SparkFunSuite 
with LocalSparkContext with ResetSyst assert(conf.getOption("k4") === None) } - test("option get and set") { - - // These tests will fail - - val conf = new SparkConf(false) - assert(conf.getAll.toSet === Set()) - assert(conf.getOptionBoolean("key") === None) - assert(conf.getOptionDouble("key2") === None) - conf.set("lol", "fail") - assert(conf.getBoolean("lol", false) === 1) - } - test("creating SparkContext without master and app name") { val conf = new SparkConf(false) intercept[SparkException] { sc = new SparkContext(conf) } diff --git a/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala b/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala index 621390052dc77..89b0905300a42 100644 --- a/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala +++ b/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala @@ -158,8 +158,8 @@ private[spark] class MesosCoarseGrainedSchedulerBackend( sc.appName, sc.conf, sc.conf.getOption("spark.mesos.driver.webui.url").orElse(sc.ui.map(_.webUrl)), - sc.conf.getOptionBoolean("spark.mesos.checkpoint"), - sc.conf.getOptionDouble("spark.mesos.failoverTimeout"), + sc.conf.getOption("spark.mesos.checkpoint").map(_.toBoolean), + sc.conf.getOption("spark.mesos.failoverTimeout").map(_.toDouble), sc.conf.getOption("spark.mesos.driver.frameworkId") ) diff --git a/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosFineGrainedSchedulerBackend.scala b/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosFineGrainedSchedulerBackend.scala index 0699c097277ca..71869664fe01e 100644 --- a/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosFineGrainedSchedulerBackend.scala +++ 
b/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosFineGrainedSchedulerBackend.scala @@ -78,8 +78,8 @@ private[spark] class MesosFineGrainedSchedulerBackend( sc.appName, sc.conf, sc.conf.getOption("spark.mesos.driver.webui.url").orElse(sc.ui.map(_.webUrl)), - sc.conf.getOptionBoolean("spark.mesos.checkpoint"), - sc.conf.getOptionDouble("spark.mesos.failoverTimeout"), + sc.conf.getOption("spark.mesos.checkpoint").map(_.toBoolean), + sc.conf.getOption("spark.mesos.failoverTimeout").map(_.toDouble), sc.conf.getOption("spark.mesos.driver.frameworkId") ) diff --git a/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackendSuite.scala b/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackendSuite.scala index cdb3b68489654..586b03107813b 100644 --- a/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackendSuite.scala +++ b/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackendSuite.scala @@ -139,6 +139,20 @@ class MesosCoarseGrainedSchedulerBackendSuite extends SparkFunSuite assert(cpus == offerCores) } + test("mesos supports checkpointing") { + setBackend() + + val executorMemory = backend.executorMemory(sc) + val offerCores = 10 + offerResources(List(Resources(executorMemory * 2, offerCores))) + + val taskInfos = verifyTaskLaunched(driver, "o1") + assert(taskInfos.length == 1) + + val cpus = backend.getResource(taskInfos.head.getResourcesList, "cpus") + assert(cpus == offerCores) + } + test("mesos does not acquire more than spark.cores.max") { val maxCores = 10 setBackend(Map("spark.cores.max" -> maxCores.toString)) From 4ad7650ca30660f0be36caebe42ed7fc64e74cd6 Mon Sep 17 00:00:00 2001 From: Kamal Gurala Date: Thu, 20 Apr 2017 12:30:06 -0700 Subject: [PATCH 4/9] [SPARK-4899][MESOS] Added basic test --- 
...osCoarseGrainedSchedulerBackendSuite.scala | 38 +++++++++++++++---- ...esosFineGrainedSchedulerBackendSuite.scala | 36 ++++++++++++++++++ 2 files changed, 66 insertions(+), 8 deletions(-) diff --git a/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackendSuite.scala b/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackendSuite.scala index 586b03107813b..5ae71a02ce2ad 100644 --- a/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackendSuite.scala +++ b/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackendSuite.scala @@ -140,17 +140,39 @@ class MesosCoarseGrainedSchedulerBackendSuite extends SparkFunSuite } test("mesos supports checkpointing") { - setBackend() - val executorMemory = backend.executorMemory(sc) - val offerCores = 10 - offerResources(List(Resources(executorMemory * 2, offerCores))) + val checkpoint = true + val failoverTimeout = 10 + setBackend(Map("spark.mesos.checkpoint" -> checkpoint.toString, + "spark.mesos.failoverTimeout" -> failoverTimeout.toString)) - val taskInfos = verifyTaskLaunched(driver, "o1") - assert(taskInfos.length == 1) + val taskScheduler = mock[TaskSchedulerImpl] + when(taskScheduler.sc).thenReturn(sc) + val driver = mock[SchedulerDriver] + when(driver.start()).thenReturn(Protos.Status.DRIVER_RUNNING) + val securityManager = mock[SecurityManager] + + val backend = new MesosCoarseGrainedSchedulerBackend( + taskScheduler, sc, "master", securityManager) { + override protected def createSchedulerDriver( + masterUrl: String, + scheduler: Scheduler, + sparkUser: String, + appName: String, + conf: SparkConf, + webuiUrl: Option[String] = None, + checkpoint: Option[Boolean] = None, + failoverTimeout: Option[Double] = None, + frameworkId: Option[String] = None): SchedulerDriver = { + markRegistered() + 
assert(checkpoint.equals(true)) + assert(failoverTimeout.equals(10)) + driver + } + } + + backend.start() - val cpus = backend.getResource(taskInfos.head.getResourcesList, "cpus") - assert(cpus == offerCores) } test("mesos does not acquire more than spark.cores.max") { diff --git a/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosFineGrainedSchedulerBackendSuite.scala b/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosFineGrainedSchedulerBackendSuite.scala index 4ee85b91830a9..9ac8ba8823dfb 100644 --- a/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosFineGrainedSchedulerBackendSuite.scala +++ b/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosFineGrainedSchedulerBackendSuite.scala @@ -79,6 +79,42 @@ class MesosFineGrainedSchedulerBackendSuite backend.start() } + test("mesos supports checkpointing") { + val conf = new SparkConf + conf.set("spark.mesos.checkpoint", "true") + conf.set("spark.mesos.failoverTimeout", 10) + + val sc = mock[SparkContext] + when(sc.conf).thenReturn(conf) + when(sc.sparkUser).thenReturn("sparkUser1") + when(sc.appName).thenReturn("appName1") + + val taskScheduler = mock[TaskSchedulerImpl] + val driver = mock[SchedulerDriver] + when(driver.start()).thenReturn(Protos.Status.DRIVER_RUNNING) + + val backend = new MesosFineGrainedSchedulerBackend(taskScheduler, sc, "master") { + override protected def createSchedulerDriver( + masterUrl: String, + scheduler: Scheduler, + sparkUser: String, + appName: String, + conf: SparkConf, + webuiUrl: Option[String] = None, + checkpoint: Option[Boolean] = None, + failoverTimeout: Option[Double] = None, + frameworkId: Option[String] = None): SchedulerDriver = { + markRegistered() + assert(checkpoint.equals(true)) + assert(failoverTimeout.equals(10)) + driver + } + } + + backend.start() + + } + test("Use configured mesosExecutor.cores for ExecutorInfo") { val 
mesosExecutorCores = 3 val conf = new SparkConf From acfbf1862fb78bfadec50ab606af8fe299583237 Mon Sep 17 00:00:00 2001 From: Kamal Gurala Date: Thu, 20 Apr 2017 16:15:33 -0700 Subject: [PATCH 5/9] [SPARK-4899][MESOS] Fixed bug --- .../cluster/mesos/MesosFineGrainedSchedulerBackendSuite.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosFineGrainedSchedulerBackendSuite.scala b/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosFineGrainedSchedulerBackendSuite.scala index 9ac8ba8823dfb..11acc962098ff 100644 --- a/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosFineGrainedSchedulerBackendSuite.scala +++ b/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosFineGrainedSchedulerBackendSuite.scala @@ -82,7 +82,7 @@ class MesosFineGrainedSchedulerBackendSuite test("mesos supports checkpointing") { val conf = new SparkConf conf.set("spark.mesos.checkpoint", "true") - conf.set("spark.mesos.failoverTimeout", 10) + conf.set("spark.mesos.failoverTimeout", "10") val sc = mock[SparkContext] when(sc.conf).thenReturn(conf) From ceb5d378ee9bde9a20673ba5640444ba621bd28b Mon Sep 17 00:00:00 2001 From: Kamal Gurala Date: Thu, 20 Apr 2017 17:46:41 -0700 Subject: [PATCH 6/9] [SPARK-4899][MESOS] Fixed bug in tests --- .../mesos/MesosCoarseGrainedSchedulerBackendSuite.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackendSuite.scala b/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackendSuite.scala index 5ae71a02ce2ad..27eb296e065de 100644 --- a/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackendSuite.scala +++ 
b/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackendSuite.scala @@ -165,8 +165,8 @@ class MesosCoarseGrainedSchedulerBackendSuite extends SparkFunSuite failoverTimeout: Option[Double] = None, frameworkId: Option[String] = None): SchedulerDriver = { markRegistered() - assert(checkpoint.equals(true)) - assert(failoverTimeout.equals(10)) + assert(checkpoint.contains(true)) + assert(failoverTimeout.contains(10.0)) driver } } From 4745ce23e7b68c60d35f4c614a8bef570a7ef017 Mon Sep 17 00:00:00 2001 From: Kamal Gurala Date: Thu, 20 Apr 2017 18:58:40 -0700 Subject: [PATCH 7/9] [SPARK-4899][MESOS] Fixed bug in tests again --- .../cluster/mesos/MesosFineGrainedSchedulerBackendSuite.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosFineGrainedSchedulerBackendSuite.scala b/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosFineGrainedSchedulerBackendSuite.scala index 11acc962098ff..aa3396ffbec7e 100644 --- a/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosFineGrainedSchedulerBackendSuite.scala +++ b/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosFineGrainedSchedulerBackendSuite.scala @@ -105,8 +105,8 @@ class MesosFineGrainedSchedulerBackendSuite failoverTimeout: Option[Double] = None, frameworkId: Option[String] = None): SchedulerDriver = { markRegistered() - assert(checkpoint.equals(true)) - assert(failoverTimeout.equals(10)) + assert(checkpoint.contains(true)) + assert(failoverTimeout.contains(10)) driver } } From d2c3646d04bcc035b9bc3c5eae613b0ba7554653 Mon Sep 17 00:00:00 2001 From: Kamal Gurala Date: Thu, 20 Apr 2017 19:10:15 -0700 Subject: [PATCH 8/9] [SPARK-4899][MESOS] Fixed bug in tests again --- .../mesos/MesosCoarseGrainedSchedulerBackendSuite.scala | 3 ++- 
.../cluster/mesos/MesosFineGrainedSchedulerBackendSuite.scala | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackendSuite.scala b/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackendSuite.scala index 27eb296e065de..308951769288a 100644 --- a/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackendSuite.scala +++ b/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackendSuite.scala @@ -144,7 +144,8 @@ class MesosCoarseGrainedSchedulerBackendSuite extends SparkFunSuite val checkpoint = true val failoverTimeout = 10 setBackend(Map("spark.mesos.checkpoint" -> checkpoint.toString, - "spark.mesos.failoverTimeout" -> failoverTimeout.toString)) + "spark.mesos.failoverTimeout" -> failoverTimeout.toString, + "spark.mesos.driver.webui.url" -> "http://webui")) val taskScheduler = mock[TaskSchedulerImpl] when(taskScheduler.sc).thenReturn(sc) diff --git a/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosFineGrainedSchedulerBackendSuite.scala b/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosFineGrainedSchedulerBackendSuite.scala index aa3396ffbec7e..ec0e613ded752 100644 --- a/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosFineGrainedSchedulerBackendSuite.scala +++ b/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosFineGrainedSchedulerBackendSuite.scala @@ -83,6 +83,7 @@ class MesosFineGrainedSchedulerBackendSuite val conf = new SparkConf conf.set("spark.mesos.checkpoint", "true") conf.set("spark.mesos.failoverTimeout", "10") + conf.set("spark.mesos.driver.webui.url", "http://webui") val sc = mock[SparkContext] 
when(sc.conf).thenReturn(conf) From 67cf27623672adb419ccb3ce0112cb77471ea5f1 Mon Sep 17 00:00:00 2001 From: Kamal Gurala Date: Fri, 21 Apr 2017 10:48:31 -0700 Subject: [PATCH 9/9] [SPARK-4899][MESOS] Updated docs and small fix --- docs/running-on-mesos.md | 6 +++--- .../cluster/mesos/MesosFineGrainedSchedulerBackendSuite.scala | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/running-on-mesos.md b/docs/running-on-mesos.md index 6679a05bf0dc6..3f5439a287b57 100644 --- a/docs/running-on-mesos.md +++ b/docs/running-on-mesos.md @@ -530,8 +530,8 @@ See the [configuration page](configuration.html) for information on Spark config spark.mesos.failoverTimeout 0.0 - The amount of time (in seconds) that the master will wait for thescheduler to failover before it tears down the framework - by killing all its tasks/executors. This should be non-zero if aframework expects to reconnect after a failure and not lose + The amount of time (in seconds) that the master will wait for the scheduler to failover before it tears down the framework + by killing all its tasks/executors. This should be non-zero if a framework expects to reconnect after a failure and not lose its tasks/executors. - NOTE: To avoid accidental destruction of tasks, productionframeworks typically set this to a large value (e.g., 1 week). + NOTE: To avoid accidental destruction of tasks, production frameworks typically set this to a large value (e.g., 1 week). 
diff --git a/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosFineGrainedSchedulerBackendSuite.scala b/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosFineGrainedSchedulerBackendSuite.scala index ec0e613ded752..21343b063b00f 100644 --- a/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosFineGrainedSchedulerBackendSuite.scala +++ b/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosFineGrainedSchedulerBackendSuite.scala @@ -107,7 +107,7 @@ class MesosFineGrainedSchedulerBackendSuite frameworkId: Option[String] = None): SchedulerDriver = { markRegistered() assert(checkpoint.contains(true)) - assert(failoverTimeout.contains(10)) + assert(failoverTimeout.contains(10.0)) driver } }