Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
42 commits
Select commit Hold shift + click to select a range
322295c
Start of Scala + Armada work
ClifHouck Dec 2, 2024
e1b2dc1
Skeletonized code and at least one unit test!
ClifHouck Jan 15, 2025
3834f94
Remove code that is causing warnings to be spewed by the test runner
ClifHouck Jan 15, 2025
cede366
Add a README
ClifHouck Jan 15, 2025
20f344b
Update core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala
ClifHouck Jan 28, 2025
30ee857
Update assembly/pom.xml
ClifHouck Jan 28, 2025
1eefe16
Update resource-managers/armada/core/README.md
ClifHouck Jan 28, 2025
60038cf
Update resource-managers/armada/core/README.md
ClifHouck Jan 28, 2025
b214857
Update resource-managers/armada/core/README.md
ClifHouck Jan 28, 2025
cfc7c0d
Update resource-managers/armada/core/README.md
ClifHouck Jan 28, 2025
b36b3f9
Add armada project to project/SparkBuild.scala
EnricoMi Jan 29, 2025
91b0c2b
Fix code style issues
EnricoMi Jan 29, 2025
d46a6c6
Merge remote-tracking branch 'upstream/master' into clif/armada_integ…
EnricoMi Jan 29, 2025
8eed374
Add integration test to Github CI
EnricoMi Jan 27, 2025
ddaa53a
Add some sleeps
EnricoMi Jan 27, 2025
7bd73fc
Add armada spark job yamls
EnricoMi Jan 28, 2025
ef9af65
More sleep
EnricoMi Jan 28, 2025
644cbe6
Use mvn rather than sbt to copy jar into assembly
EnricoMi Jan 29, 2025
e3e3b3b
Use sbt instead of mvn
EnricoMi Jan 29, 2025
7144daa
Add integration test to CI
EnricoMi Jan 29, 2025
abbf1f6
working
Jan 31, 2025
9ad6d65
cleanup
Jan 31, 2025
22ffae9
cleanup
Jan 31, 2025
12a9ba4
cleanup
Jan 31, 2025
9624c06
run script
Jan 31, 2025
f3dd404
cleanup
Jan 31, 2025
f8257a3
cleanup
Jan 31, 2025
36ee2ec
cleanup
Jan 31, 2025
3834812
cleanup entrypoint.sh
Jan 31, 2025
66f009b
restored old yaml files
Jan 31, 2025
6623dd2
cleanup
Jan 31, 2025
5f9caa0
cleanup
Feb 1, 2025
c4f9c64
cleanup
Feb 1, 2025
3b53d36
Merge pull request #37 from GeorgeJahad/addingSparkPi
ClifHouck Feb 3, 2025
6b1d11b
first attempt
Feb 5, 2025
01cab78
updated for latest client
Feb 5, 2025
e0f00b2
cleanup
Feb 5, 2025
e24ff41
cleanup
Feb 5, 2025
a1b86f9
cleanup
Feb 6, 2025
5d1fff1
Merge pull request #38 from GeorgeJahad/addingArmadaClient
ClifHouck Feb 7, 2025
b518e8b
Spark submit works with an armada URL and actually submits a driver t…
ClifHouck Feb 7, 2025
6f5f535
spark driver now loads the right class to execute
ClifHouck Feb 10, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 51 additions & 0 deletions .github/workflows/armada.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
# CI workflow: build Spark with the Armada and Kubernetes profiles, load the
# resulting image into a kind-based Armada cluster (via armada-operator), and
# submit the example driver/executor jobs end-to-end.
name: Armada

on:
  pull_request:

jobs:
  armada:
    name: Armada integration
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
        with:
          path: spark-armada
      - uses: actions/checkout@v4
        with:
          repository: armadaproject/armada-operator
          path: armada-operator
      - run: |
          cd spark-armada
          ./build/sbt package -Parmada -Pkubernetes
          ./bin/docker-image-tool.sh -t testing build
          docker image save -o ../spark_testing.tar spark:testing
          cd ..

          cd armada-operator
          make kind-all
          ./bin/tooling/kind load image-archive ../spark_testing.tar --name armada

          # sleep a bit, or we see: create queue request failed: rpc error: code = DeadlineExceeded
          sleep 60

          ./bin/app/armadactl create queue test

          # sleep a bit, or we see: rpc error: code = PermissionDenied desc = could not find queue "test"
          sleep 60

          ./bin/app/armadactl submit ../spark-armada/examples/spark-driver-job.yaml
          ./bin/app/armadactl submit ../spark-armada/examples/spark-executor-job.yaml

          # wait for the jobs to start
          sleep 60

          # inspect jobs: dump the logs of every armada-managed pod
          kubectl get pods
          for pod in $(kubectl get pods | grep armada | cut -d " " -f 1)
          do
            echo "$pod"
            kubectl logs "pod/$pod"
            echo
          done

10 changes: 10 additions & 0 deletions assembly/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -237,6 +237,16 @@
</dependency>
</dependencies>
</profile>
<profile>
<id>armada</id>
<dependencies>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-armada_${scala.binary.version}</artifactId>
<version>${project.version}</version>
</dependency>
</dependencies>
</profile>
<profile>
<id>hive</id>
<dependencies>
Expand Down
27 changes: 25 additions & 2 deletions core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala
Original file line number Diff line number Diff line change
Expand Up @@ -260,9 +260,10 @@ private[spark] class SparkSubmit extends Logging {
case "yarn" => YARN
case m if m.startsWith("spark") => STANDALONE
case m if m.startsWith("k8s") => KUBERNETES
case m if m.startsWith("armada") => ARMADA
case m if m.startsWith("local") => LOCAL
case _ =>
error("Master must either be yarn or start with spark, k8s, or local")
error("Master must either be yarn or start with spark, k8s, armada, or local")
-1
}
case None => LOCAL // default master or remote mode.
Expand Down Expand Up @@ -296,6 +297,15 @@ private[spark] class SparkSubmit extends Logging {
}
}

if (clusterManager == ARMADA) {
printMessage(s"Armada selected as cluster manager.")
if (!Utils.classIsLoadable(ARMADA_CLUSTER_SUBMIT_CLASS) && !Utils.isTesting) {
error(
s"Could not load ARMADA class \"${ARMADA_CLUSTER_SUBMIT_CLASS}\". " +
"This copy of Spark may not have been compiled with ARMADA support.")
}
}

// Fail fast, the following modes are not supported or applicable
(clusterManager, deployMode) match {
case (STANDALONE, CLUSTER) if args.isPython =>
Expand Down Expand Up @@ -329,6 +339,8 @@ private[spark] class SparkSubmit extends Logging {
val isKubernetesClient = clusterManager == KUBERNETES && deployMode == CLIENT
val isKubernetesClusterModeDriver = isKubernetesClient &&
sparkConf.getBoolean("spark.kubernetes.submitInDriver", false)
// TODO: does client/cluster mode matter here?
val isArmada = clusterManager == ARMADA
val isCustomClasspathInClusterModeDisallowed =
!sparkConf.get(ALLOW_CUSTOM_CLASSPATH_BY_PROXY_USER_IN_CLUSTER_MODE) &&
args.proxyUser != null &&
Expand Down Expand Up @@ -416,6 +428,7 @@ private[spark] class SparkSubmit extends Logging {
downloadFileList(_, targetDir, sparkConf, hadoopConf)
}.orNull

// TODO: May have to do the same/similar for Armada
if (isKubernetesClusterModeDriver) {
// SPARK-33748: this mimics the behaviour of Yarn cluster mode. If the driver is running
// in cluster mode, the archives should be available in the driver's current working
Expand Down Expand Up @@ -670,6 +683,7 @@ private[spark] class SparkSubmit extends Logging {
confKey = KEYTAB.key),
OptionAssigner(args.pyFiles, ALL_CLUSTER_MGRS, CLUSTER, confKey = SUBMIT_PYTHON_FILES.key),

// TODO: Add Armada where appropriate.
// Propagate attributes for dependency resolution at the driver side
OptionAssigner(args.packages, STANDALONE | KUBERNETES,
CLUSTER, confKey = JAR_PACKAGES.key),
Expand Down Expand Up @@ -864,6 +878,12 @@ private[spark] class SparkSubmit extends Logging {
}
}

if (isArmada) {
// FIXME: Make sure we populate what we need here!
childMainClass = ARMADA_CLUSTER_SUBMIT_CLASS
childArgs ++= Array("--class", args.mainClass)
}

// Load any properties specified through --conf and the default properties file
for ((k, v) <- args.sparkProperties) {
sparkConf.setIfMissing(k, v)
Expand Down Expand Up @@ -1071,7 +1091,8 @@ object SparkSubmit extends CommandLineUtils with Logging {
private val STANDALONE = 2
private val LOCAL = 8
private val KUBERNETES = 16
private val ALL_CLUSTER_MGRS = YARN | STANDALONE | LOCAL | KUBERNETES
private val ARMADA = 32
private val ALL_CLUSTER_MGRS = YARN | STANDALONE | LOCAL | KUBERNETES | ARMADA

// Deploy modes
private val CLIENT = 1
Expand All @@ -1095,6 +1116,8 @@ object SparkSubmit extends CommandLineUtils with Logging {
private[deploy] val STANDALONE_CLUSTER_SUBMIT_CLASS = classOf[ClientApp].getName()
private[deploy] val KUBERNETES_CLUSTER_SUBMIT_CLASS =
"org.apache.spark.deploy.k8s.submit.KubernetesClientApplication"
private[deploy] val ARMADA_CLUSTER_SUBMIT_CLASS =
"org.apache.spark.deploy.armada.submit.ArmadaClientApplication"

override def main(args: Array[String]): Unit = {
Option(System.getenv("SPARK_PREFER_IPV6"))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -515,7 +515,8 @@ private[deploy] class SparkSubmitArguments(args: Seq[String], env: Map[String, S
s"""
|Options:
| --master MASTER_URL spark://host:port, yarn,
| k8s://https://host:port, or local (Default: local[*]).
| k8s://https://host:port, armada://host:port,
| or local (Default: local[*]).
| --deploy-mode DEPLOY_MODE Whether to launch the driver program locally ("client") or
| on one of the worker machines inside the cluster ("cluster")
| (Default: client).
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -279,12 +279,10 @@ abstract class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter
executorStageSummaryWrappers.foreach { exec =>
// only the first executor is expected to be excluded
val expectedExcludedFlag = exec.executorId == execIds.head
assert(exec.info.isBlacklistedForStage === expectedExcludedFlag)
assert(exec.info.isExcludedForStage === expectedExcludedFlag)
}

check[ExecutorSummaryWrapper](execIds.head) { exec =>
assert(exec.info.blacklistedInStages === Set(stages.head.stageId))
assert(exec.info.excludedInStages === Set(stages.head.stageId))

}
Expand All @@ -306,7 +304,6 @@ abstract class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter
assert(executorStageSummaryWrappersForNode.nonEmpty)
executorStageSummaryWrappersForNode.foreach { exec =>
// both executor is expected to be excluded
assert(exec.info.isBlacklistedForStage)
assert(exec.info.isExcludedForStage)

}
Expand Down Expand Up @@ -467,7 +464,6 @@ abstract class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter
}

check[ExecutorSummaryWrapper](execIds.head) { exec =>
assert(exec.info.blacklistedInStages === Set())
assert(exec.info.excludedInStages === Set())
}

Expand Down Expand Up @@ -495,7 +491,6 @@ abstract class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter
stageAttemptId = stages.last.attemptNumber()))

check[ExecutorSummaryWrapper](execIds.head) { exec =>
assert(exec.info.blacklistedInStages === Set(stages.last.stageId))
assert(exec.info.excludedInStages === Set(stages.last.stageId))
}

Expand Down Expand Up @@ -652,29 +647,25 @@ abstract class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter
time += 1
listener.onExecutorExcluded(SparkListenerExecutorExcluded(time, "1", 42))
check[ExecutorSummaryWrapper]("1") { exec =>
assert(exec.info.isBlacklisted)
assert(exec.info.isExcluded)
}

time += 1
listener.onExecutorUnexcluded(SparkListenerExecutorUnexcluded(time, "1"))
check[ExecutorSummaryWrapper]("1") { exec =>
assert(!exec.info.isBlacklisted)
assert(!exec.info.isExcluded)
}

// Exclude a node.
time += 1
listener.onNodeExcluded(SparkListenerNodeExcluded(time, "1.example.com", 2))
check[ExecutorSummaryWrapper]("1") { exec =>
assert(exec.info.isBlacklisted)
assert(exec.info.isExcluded)
}

time += 1
listener.onNodeUnexcluded(SparkListenerNodeUnexcluded(time, "1.example.com"))
check[ExecutorSummaryWrapper]("1") { exec =>
assert(!exec.info.isBlacklisted)
assert(!exec.info.isExcluded)
}

Expand Down
28 changes: 28 additions & 0 deletions examples/runSparkPi.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
#!/bin/bash

# Start up the SparkPi driver on Armada, discover its pod IP address,
# then start the executor pointed at that driver.
set -e

echo
echo starting SparkPi driver
armadactl submit examples/spark-pi-driver.yaml > /tmp/jobid.txt
# armadactl prints the job id as the fifth whitespace-separated field.
JOB_ID=$(awk '{print $5}' /tmp/jobid.txt)
cat /tmp/jobid.txt
echo


echo waiting for SparkPi driver to start
# NOTE(review): fixed sleep is racy — consider polling the pod phase instead.
sleep 20

echo
echo SparkPi driver ip addr:
IP_ADDR=$(kubectl get pod "armada-$JOB_ID-0" -o jsonpath='{.status.podIP}')
echo "$IP_ADDR"
echo

echo "passing driver's ip addr to executor and starting it"
# envsubst substitutes ${IP_ADDR} occurrences in the executor yaml template.
IP_ADDR="$IP_ADDR" envsubst < examples/spark-pi-executor.yaml > /tmp/ex.yaml
armadactl submit /tmp/ex.yaml
echo

echo SparkPi driver/executor started
31 changes: 31 additions & 0 deletions examples/spark-driver-job.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# Example Armada job spec: launches a Spark driver container that runs the
# LocalPi example via the image's /opt/entrypoint.sh.
queue: test
jobSetId: job-set-1
jobs:
  - namespace: default
    priority: 0
    podSpec:
      terminationGracePeriodSeconds: 0
      restartPolicy: Never
      containers:
        - name: spark-driver
          image: spark:testing
          env:
            - name: SPARK_DRIVER_BIND_ADDRESS
              value: "0.0.0.0:1234"
          command:
            - /opt/entrypoint.sh
          args:
            - driver
            - --verbose
            - --class
            - org.apache.spark.examples.LocalPi
            - --master
            # NOTE(review): hard-coded Armada gRPC endpoint — replace with the
            # address of your own Armada server before submitting.
            - armada://192.168.1.167:50051
            - submit
          resources:
            limits:
              memory: 1Gi
              cpu: 1
            requests:
              memory: 1Gi
              cpu: 1
Loading
Loading