
Commit 880bc27

Add Mesos Cluster UI to display driver results

1 parent: 9986731

6 files changed: +193 -28 lines

core/src/main/scala/org/apache/spark/deploy/mesos/MesosClusterDispatcher.scala

Lines changed: 51 additions & 21 deletions
@@ -37,46 +37,52 @@ import org.apache.spark.deploy.worker.DriverRunner
 import java.io.{IOException, File}
 import java.util.Date
 import java.text.SimpleDateFormat
-import scala.collection.mutable
+import org.apache.spark.deploy.mesos.ui.MesosClusterUI
+import org.apache.spark.deploy.mesos.Messages.{DispatcherStateResponse, RequestDispatcherState}
+
+private [deploy] object Messages {
+  case object RequestDispatcherState
+
+  case class DispatcherStateResponse(
+      activeDrivers: Iterable[DriverInfo],
+      completedDrivers: Iterable[DriverInfo]) {
+  }
+}
 
 /*
  * A dispatcher actor that is responsible for managing drivers, that is intended to
  * used for Mesos cluster mode.
  * This class is needed since Mesos doesn't manage frameworks, so the dispatcher acts as
  * a daemon to launch drivers as Mesos frameworks upon request.
  */
-class MesosClusterDispatcher(
+private [spark] class MesosClusterDispatcher(
     host: String,
     serverPort: Int,
     actorPort: Int,
+    webUiPort: Int,
     systemName: String,
     actorName: String,
     conf: SparkConf,
     masterUrl: String,
     workDirPath: Option[String] = None) extends Actor with ActorLogReceive with Logging {
   val server = new MesosRestServer(host, serverPort, self, conf, masterUrl)
 
-  val runners = new HashMap[String, DriverRunner]
-  val drivers = new HashMap[String, DriverInfo]
-  val completedDrivers = new ArrayBuffer[DriverInfo]
-  val RETAINED_DRIVERS = conf.getInt("spark.deploy.retainedDrivers", 200)
-  var nextDriverNumber = 0
-
-  var workDir: File = null
-
-  def createDateFormat = new SimpleDateFormat("yyyyMMddHHmmss")
+  val dispatcherPublicAddress = {
+    val envVar = System.getenv("SPARK_PUBLIC_DNS")
+    if (envVar != null) envVar else host
+  }
 
-  def createWorkDir() {
-    workDir = workDirPath.map(new File(_)).getOrElse(new File(sparkHome, "work"))
+  lazy val workDir: File = {
+    val dir = workDirPath.map(new File(_)).getOrElse(new File(sparkHome, "work"))
 
     // Attempt to remove the work directory if it exists on startup.
     // This is to avoid unbounded growing the work directory as drivers
     // are only deleted when it is over the retained count while it's running.
     // We don't fail startup if we are not able to remove, as this is
     // a short-term solution.
     try {
-      if (workDir.exists()) {
-        workDir.delete()
+      if (dir.exists()) {
+        dir.delete()
       }
     } catch {
       case e: IOException =>
@@ -86,19 +92,31 @@ class MesosClusterDispatcher(
     try {
       // This sporadically fails - not sure why ... !workDir.exists() && !workDir.mkdirs()
      // So attempting to create and then check if directory was created or not.
-      workDir.mkdirs()
-      if (!workDir.exists() || !workDir.isDirectory) {
-        logError("Failed to create work directory " + workDir)
+      dir.mkdirs()
+      if (!dir.exists() || !dir.isDirectory) {
+        logError("Failed to create work directory " + dir)
         System.exit(1)
       }
-      assert (workDir.isDirectory)
+      assert (dir.isDirectory)
     } catch {
       case e: Exception =>
-        logError("Failed to create work directory " + workDir, e)
+        logError("Failed to create work directory " + dir, e)
         System.exit(1)
     }
+    dir
   }
 
+  val webUi = new MesosClusterUI(
+    self, new SecurityManager(conf), webUiPort, conf, workDir, dispatcherPublicAddress)
+
+  val runners = new HashMap[String, DriverRunner]
+  val drivers = new HashMap[String, DriverInfo]
+  val completedDrivers = new ArrayBuffer[DriverInfo]
+  val RETAINED_DRIVERS = conf.getInt("spark.deploy.retainedDrivers", 200)
+  var nextDriverNumber = 0
+
+  def createDateFormat = new SimpleDateFormat("yyyyMMddHHmmss")
+
   val sparkHome =
     new File(sys.env.get("SPARK_HOME").getOrElse("."))
 
@@ -122,11 +140,12 @@ class MesosClusterDispatcher(
   }
 
   override def preStart() {
-    createWorkDir()
     server.start()
+    webUi.bind()
   }
 
   override def postStop() {
+    webUi.stop()
     server.stop()
     runners.values.foreach(_.kill())
   }
@@ -173,6 +192,10 @@ class MesosClusterDispatcher(
         throw new Exception(s"Received unexpected state update for driver $driverId: $state")
       }
     }
+
+    case RequestDispatcherState => {
+      sender ! DispatcherStateResponse(drivers.values, completedDrivers)
+    }
   }
 
   def removeDriver(driverId: String, state: DriverState, exception: Option[Exception]) {
@@ -225,6 +248,7 @@ object MesosClusterDispatcher {
       args.host,
       args.port,
       boundPort,
+      args.webUiPort,
      systemName,
      actorName,
      conf,
@@ -237,6 +261,7 @@ object MesosClusterDispatcher {
   class ClusterDispatcherArguments(args: Array[String], conf: SparkConf) {
     var host = Utils.localHostName()
     var port = 7077
+    var webUiPort = 8081
     var masterUrl: String = null
 
     parse(args.toList)
@@ -251,6 +276,10 @@ object MesosClusterDispatcher {
        port = value
        parse(tail)
 
+      case ("--webui-port" | "-p") :: IntParam(value) :: tail =>
+        webUiPort = value
+        parse(tail)
+
      case ("--master" | "-m") :: value :: tail =>
        if (!value.startsWith("mesos://")) {
          System.err.println("Cluster dispatcher only supports mesos (uri begins with mesos://)")
@@ -283,6 +312,7 @@ object MesosClusterDispatcher {
      "Options:\n" +
      "  -h HOST, --host HOST   Hostname to listen on\n" +
      "  -p PORT, --port PORT   Port to listen on (default: 7077)\n" +
+      "  --webui-port WEBUI_PORT   WebUI Port to listen on (default: 8081)\n" +
      "  -m --master MASTER     URI for connecting to Mesos master\n")
    System.exit(exitCode)
  }
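
The hunks above introduce a small ask/reply protocol between the dispatcher actor and its new web UI: the UI sends RequestDispatcherState and the dispatcher answers with a DispatcherStateResponse snapshot of active and completed drivers. A minimal, self-contained sketch of that protocol (illustrative only, not part of the commit; DriverInfo is reduced to a hypothetical stand-in, and the Akka 2.3-era API of this branch is assumed):

import akka.actor.{Actor, ActorSystem, Props}
import akka.pattern.ask
import akka.util.Timeout
import scala.concurrent.Await
import scala.concurrent.duration._

// Hypothetical stand-in for org.apache.spark.deploy.master.DriverInfo.
case class DriverInfo(id: String)

case object RequestDispatcherState
case class DispatcherStateResponse(
    activeDrivers: Iterable[DriverInfo],
    completedDrivers: Iterable[DriverInfo])

class DispatcherStub extends Actor {
  private val drivers = Map("driver-001" -> DriverInfo("driver-001"))
  private val completedDrivers = Seq(DriverInfo("driver-000"))
  def receive = {
    // Reply with an immutable snapshot, as the dispatcher does above.
    case RequestDispatcherState =>
      sender ! DispatcherStateResponse(drivers.values, completedDrivers)
  }
}

object AskSketch extends App {
  implicit val timeout = Timeout(5.seconds)
  val system = ActorSystem("sketch")
  val dispatcher = system.actorOf(Props[DispatcherStub], "dispatcher")
  // Mirrors DriverOutputPage.render below: ask, then block for the reply.
  val state = Await.result(
    (dispatcher ? RequestDispatcherState).mapTo[DispatcherStateResponse],
    timeout.duration)
  println("active: " + state.activeDrivers.map(_.id).mkString(", "))
  println("completed: " + state.completedDrivers.map(_.id).mkString(", "))
  system.shutdown()  // Akka 2.3-era shutdown
}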
core/src/main/scala/org/apache/spark/deploy/mesos/ui/DriverOutputPage.scala (new file)

Lines changed: 75 additions & 0 deletions
@@ -0,0 +1,75 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.deploy.mesos.ui
+
+import org.apache.spark.ui.{UIUtils, WebUIPage}
+import javax.servlet.http.HttpServletRequest
+import scala.xml.Node
+import org.apache.spark.deploy.mesos.Messages.{DispatcherStateResponse, RequestDispatcherState}
+import scala.concurrent.Await
+import org.apache.spark.deploy.master.DriverInfo
+
+class DriverOutputPage(parent: MesosClusterUI) extends WebUIPage("") {
+  private val dispatcher = parent.dispatcherActorRef
+  private val timeout = parent.timeout
+
+  def render(request: HttpServletRequest): Seq[Node] = {
+    val stateFuture = (dispatcher ? RequestDispatcherState)(timeout).mapTo[DispatcherStateResponse]
+    val state = Await.result(stateFuture, timeout)
+    val content =
+      <div class="row-fluid">
+        <div class="span12">
+          <h3>Active drivers</h3>
+          {state.activeDrivers.map(d => driverContent(d)).flatten}
+          <h3>Completed drivers</h3>
+          {state.completedDrivers.map(d => completedDriverContent(d)).flatten}
+        </div>
+      </div>;
+    UIUtils.basicSparkPage(content, "Spark Drivers for Mesos cluster")
+  }
+
+  def driverContent(info: DriverInfo): Seq[Node] = {
+    <ul class="unstyled">
+      <li><strong>ID:</strong> {info.id}</li>
+      <li><strong>Submit Date:</strong> {info.submitDate}</li>
+      <li><strong>Start Date:</strong> {info.startTime}</li>
+      <li><strong>Output:</strong>
+        <a href={"logPage?driverId=%s&logType=stdout"
+          .format(info.id)}>stdout</a>
+        <a href={"logPage?driverId=%s&logType=stderr"
+          .format(info.id)}>stderr</a>
+      </li>
+    </ul>
+  }
+
+  def completedDriverContent(info: DriverInfo): Seq[Node] = {
+    <ul class="unstyled">
+      <li><strong>ID:</strong> {info.id}</li>
+      <li><strong>Submit Date:</strong> {info.submitDate}</li>
+      <li><strong>Start Date:</strong> {info.startTime}</li>
+      <li><strong>Output:</strong>
+        <a href={"logPage?driverId=%s&logType=stdout"
+          .format(info.id)}>stdout</a>
+        <a href={"logPage?driverId=%s&logType=stderr"
+          .format(info.id)}>stderr</a>
+      </li>
+      <li><strong>State:</strong>{info.state}</li>
+      <li><strong>Exception:</strong>{info.exception}</li>
+    </ul>
+  }
+}
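
DriverOutputPage leans on Scala's XML literals: expressions in braces are spliced into the enclosing element, so driverContent and completedDriverContent can each return a Seq[Node] that render flattens into the page. A tiny sketch of the same technique (names here are hypothetical, not from the commit):

import scala.xml.Node

object XmlSketch extends App {
  // Hypothetical stand-in for DriverInfo, for illustration only.
  case class Info(id: String)

  // Like driverContent: each item renders to a small XML fragment.
  def item(info: Info): Seq[Node] =
    <ul class="unstyled">
      <li><strong>ID:</strong> {info.id}</li>
    </ul>

  val infos = Seq(Info("driver-20150101-0001"), Info("driver-20150101-0002"))

  // Braces splice computed Seq[Node] values into the literal, as in render().
  val content: Seq[Node] =
    <div class="row-fluid">
      <h3>Active drivers</h3>
      {infos.map(item).flatten}
    </div>

  println(content.mkString)
}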
core/src/main/scala/org/apache/spark/deploy/mesos/ui/MesosClusterUI.scala (new file)

Lines changed: 56 additions & 0 deletions
@@ -0,0 +1,56 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.deploy.mesos.ui
+
+import java.io.File
+import org.apache.spark.ui.{SparkUI, WebUI}
+import org.apache.spark.SparkConf
+import org.apache.spark.SecurityManager
+import akka.pattern.AskableActorRef
+import org.apache.spark.ui.JettyUtils._
+import org.apache.spark.util.AkkaUtils
+import org.apache.spark.deploy.worker.ui.{ActiveWebUiUrlAccessor, LogPage}
+
+/**
+ * UI that displays driver results from the [[org.apache.spark.deploy.mesos.MesosClusterDispatcher]]
+ */
+private [spark] class MesosClusterUI(
+    val dispatcherActorRef: AskableActorRef,
+    securityManager: SecurityManager,
+    port: Int,
+    conf: SparkConf,
+    workDir: File,
+    dispatcherPublicAddress: String)
+  extends WebUI(securityManager, port, conf) with ActiveWebUiUrlAccessor {
+
+  val timeout = AkkaUtils.askTimeout(conf)
+
+  initialize()
+
+  def activeWebUiUrl: String = "http://" + dispatcherPublicAddress + ":" + boundPort
+
+  override def initialize() {
+    attachPage(new DriverOutputPage(this))
+    attachPage(new LogPage(this, workDir))
+    attachHandler(createStaticHandler(MesosClusterUI.STATIC_RESOURCE_DIR, "/static"))
+  }
+}
+
+private[spark] object MesosClusterUI {
+  val STATIC_RESOURCE_DIR = SparkUI.STATIC_RESOURCE_DIR
+}
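
Two design details are worth noting here. Typing dispatcherActorRef as akka.pattern.AskableActorRef (rather than a bare ActorRef) gives pages the ? ask syntax without importing the ask-pattern conversions, which is why DriverOutputPage can call dispatcher ? RequestDispatcherState directly. And the dispatcher owns the UI's lifecycle, calling webUi.bind() in preStart and webUi.stop() in postStop, much as the standalone Master and Worker manage their own web UIs.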

core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala

Lines changed: 5 additions & 5 deletions
The changes in this file are whitespace-only: trailing spaces are stripped from five lines (the createDateFormat definition, the CLEANUP_INTERVAL_MILLIS and APP_DATA_RETENTION_SECS declarations, and two blank lines near the workDir and workerSource declarations). Behavior is unchanged.

core/src/main/scala/org/apache/spark/deploy/worker/ui/LogPage.scala

Lines changed: 5 additions & 1 deletion
@@ -17,6 +17,7 @@
 
 package org.apache.spark.deploy.worker.ui
 
+import java.io.File
 import javax.servlet.http.HttpServletRequest
 
 import scala.xml.Node
@@ -30,6 +31,9 @@ private[ui] class LogPage(parent: WorkerWebUI) extends WebUIPage("logPage") with
   private val worker = parent.worker
   private val workDir = parent.workDir
 
+private[spark] class LogPage(
+    urlAccessor: ActiveWebUiUrlAccessor,
+    workDir: File) extends WebUIPage("logPage") with Logging {
   def renderLog(request: HttpServletRequest): String = {
     val defaultBytes = 100 * 1024
 
@@ -73,7 +77,7 @@ private[ui] class LogPage(parent: WorkerWebUI) extends WebUIPage("logPage") with
     }
 
     val (logText, startByte, endByte, logLength) = getLog(logDir, logType, offset, byteLength)
-    val linkToMaster = <p><a href={worker.activeMasterWebUiUrl}>Back to Master</a></p>
+    val linkToMaster = <p><a href={urlAccessor.activeWebUiUrl}>Back to Master</a></p>
     val range = <span>Bytes {startByte.toString} - {endByte.toString} of {logLength}</span>
 
     val backButton =
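
This refactor decouples LogPage from WorkerWebUI behind the ActiveWebUiUrlAccessor trait, so the same log page can link back to either the worker UI or the new Mesos cluster UI. The trait's definition is not shown in this commit; below is a minimal sketch of its assumed shape, inferred from how MesosClusterUI implements activeWebUiUrl and LogPage consumes it (stub names are illustrative):

// Assumed shape, inferred from usage in this diff.
trait ActiveWebUiUrlAccessor {
  def activeWebUiUrl: String
}

// Any UI that knows its externally reachable address can provide the link target.
class ClusterUiStub(host: String, port: Int) extends ActiveWebUiUrlAccessor {
  def activeWebUiUrl: String = s"http://$host:$port"
}

// A page depends only on the trait, not on a concrete WebUI class.
class BackLink(urlAccessor: ActiveWebUiUrlAccessor) {
  def render: String =
    s"""<p><a href="${urlAccessor.activeWebUiUrl}">Back to Master</a></p>"""
}

object TraitSketch extends App {
  println(new BackLink(new ClusterUiStub("example.com", 8081)).render)
  // prints: <p><a href="http://example.com:8081">Back to Master</a></p>
}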

core/src/main/scala/org/apache/spark/deploy/worker/ui/WorkerWebUI.scala

Lines changed: 1 addition & 1 deletion
@@ -44,7 +44,7 @@ class WorkerWebUI(
 
   /** Initialize all components of the server. */
   def initialize() {
-    val logPage = new LogPage(this)
+    val logPage = new LogPage(worker, workDir)
     attachPage(logPage)
     attachPage(new WorkerPage(this))
     attachHandler(createStaticHandler(WorkerWebUI.STATIC_RESOURCE_BASE, "/static"))
