
Commit 2b0d745

[SPARK-5342][YARN] Allow long running Spark apps to run on secure YARN/HDFS.
Spark apps running on secure YARN/HDFS currently cannot write data to HDFS once 7 days have passed, since delegation tokens cannot be renewed beyond that point. This means Spark Streaming apps cannot run on secure YARN. This commit adds basic functionality to fix the issue. In this patch:
- new parameters are added - principal and keytab, which can be used to log in to a KDC
- the client logs in and then gets tokens to start the AM
- the keytab is copied to the staging directory
- the AM waits until 60% of the tokens' lifetime has elapsed and then logs in using the keytab
- after each such 60% interval, new tokens are created and sent to the executors
1 parent ccba5bc commit 2b0d745
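
The renewal schedule described above is driven by a single calculation: take the max date of the HDFS delegation token and re-login once 60% of the remaining lifetime has elapsed. A minimal sketch of that arithmetic, with a hypothetical fresh token and the default 7-day maximum lifetime:

    // Hypothetical: a token minted just now, with HDFS's default 7-day max lifetime.
    val maxDate = System.currentTimeMillis() + 7L * 24 * 60 * 60 * 1000

    // The AM schedules the keytab re-login at 60% of the remaining lifetime,
    // i.e. roughly 4.2 days from now for a fresh token.
    val timeToRenewal = (0.6 * (maxDate - System.currentTimeMillis())).toLong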

File tree

6 files changed: 159 additions & 29 deletions

core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala

Lines changed: 110 additions & 5 deletions
@@ -17,20 +17,26 @@
 
 package org.apache.spark.deploy
 
+import java.io.{ByteArrayInputStream, DataInputStream, DataOutputStream, ByteArrayOutputStream}
 import java.lang.reflect.Method
+import java.net.URI
+import java.nio.ByteBuffer
 import java.security.PrivilegedExceptionAction
+import java.util.concurrent.atomic.AtomicBoolean
+import java.util.concurrent.{TimeUnit, ThreadFactory, Executors}
 
 import org.apache.hadoop.conf.Configuration
-import org.apache.hadoop.fs.{FileStatus, FileSystem, Path}
+import org.apache.hadoop.fs.{FileUtil, FileStatus, FileSystem, Path}
 import org.apache.hadoop.fs.FileSystem.Statistics
-import org.apache.hadoop.mapred.JobConf
+import org.apache.hadoop.hdfs.security.token.delegation.DelegationTokenIdentifier
+import org.apache.hadoop.mapred.{Master, JobConf}
 import org.apache.hadoop.mapreduce.{JobContext, TaskAttemptContext}
 import org.apache.hadoop.security.Credentials
 import org.apache.hadoop.security.UserGroupInformation
 
-import org.apache.spark.{Logging, SparkContext, SparkConf, SparkException}
+import org.apache.spark._
 import org.apache.spark.annotation.DeveloperApi
-import org.apache.spark.util.Utils
+import org.apache.spark.util.{SerializableBuffer, Utils}
 
 import scala.collection.JavaConversions._
 
@@ -40,9 +46,14 @@ import scala.collection.JavaConversions._
  */
 @DeveloperApi
 class SparkHadoopUtil extends Logging {
-  val conf: Configuration = newConfiguration(new SparkConf())
+  val sparkConf = new SparkConf()
+  val conf: Configuration = newConfiguration(sparkConf)
   UserGroupInformation.setConfiguration(conf)
 
+  private var keytabFile: Option[String] = None
+  private var loginPrincipal: Option[String] = None
+  private val loggedInViaKeytab = new AtomicBoolean(false)
+
   /**
    * Runs the given function with a Hadoop UserGroupInformation as a thread local variable
    * (distributed to child threads), used for authenticating HDFS and YARN calls.
@@ -121,6 +132,100 @@ class SparkHadoopUtil extends Logging {
     UserGroupInformation.loginUserFromKeytab(principalName, keytabFilename)
   }
 
+  def setPrincipalAndKeytabForLogin(principal: String, keytab: String): Unit ={
+    loginPrincipal = Option(principal)
+    keytabFile = Option(keytab)
+  }
+
+  private[spark] def scheduleLoginFromKeytab(callback: (SerializableBuffer) => Unit): Unit = {
+
+    loginPrincipal match {
+      case Some(principal) =>
+        val keytab = keytabFile.get
+        val remoteFs = FileSystem.get(conf)
+        val remoteKeytabPath = new Path(
+          remoteFs.getHomeDirectory, System.getenv("SPARK_STAGING_DIR") + Path.SEPARATOR + keytab)
+        val localFS = FileSystem.getLocal(conf)
+        // At this point, SparkEnv is likely no initialized, so create a dir, put the keytab there.
+        val tempDir = Utils.createTempDir()
+        val localURI = new URI(tempDir.getAbsolutePath + Path.SEPARATOR + keytab)
+        val qualifiedURI = new URI(localFS.makeQualified(new Path(localURI)).toString)
+        FileUtil.copy(
+          remoteFs, remoteKeytabPath, localFS, new Path(qualifiedURI), false, false, conf)
+        // Get the current credentials, find out when they expire.
+        val creds = UserGroupInformation.getCurrentUser.getCredentials
+        val credStream = new ByteArrayOutputStream()
+        creds.writeTokenStorageToStream(new DataOutputStream(credStream))
+        val in = new DataInputStream(new ByteArrayInputStream(credStream.toByteArray))
+        val tokenIdentifier = new DelegationTokenIdentifier()
+        tokenIdentifier.readFields(in)
+        val timeToRenewal = (0.6 * (tokenIdentifier.getMaxDate - System.currentTimeMillis())).toLong
+        Executors.newSingleThreadScheduledExecutor(new ThreadFactory {
+          override def newThread(r: Runnable): Thread = {
+            val t = new Thread(r)
+            t.setName("Delegation Token Refresh Thread")
+            t.setDaemon(true)
+            t
+          }
+        }).scheduleWithFixedDelay(new Runnable {
+          override def run(): Unit = {
+            if (!loggedInViaKeytab.get()) {
+              loginUserFromKeytab(principal, tempDir.getAbsolutePath + Path.SEPARATOR + keytab)
+              loggedInViaKeytab.set(true)
+            }
+            val nns = getNameNodesToAccess(sparkConf) + remoteKeytabPath
+            val newCredentials = new Credentials()
+            obtainTokensForNamenodes(nns, conf, newCredentials)
+            // Now write this out via Akka to executors.
+            val outputStream = new ByteArrayOutputStream()
+            newCredentials.writeTokenStorageToStream(new DataOutputStream(outputStream))
+            callback(new SerializableBuffer(ByteBuffer.wrap(outputStream.toByteArray)))
+          }
+        }, timeToRenewal, timeToRenewal, TimeUnit.MILLISECONDS)
+
+    }
+  }
+
+  /**
+   * Get the list of namenodes the user may access.
+   */
+  def getNameNodesToAccess(sparkConf: SparkConf): Set[Path] = {
+    sparkConf.get("spark.yarn.access.namenodes", "")
+      .split(",")
+      .map(_.trim())
+      .filter(!_.isEmpty)
+      .map(new Path(_))
+      .toSet
+  }
+
+  def getTokenRenewer(conf: Configuration): String = {
+    val delegTokenRenewer = Master.getMasterPrincipal(conf)
+    logDebug("delegation token renewer is: " + delegTokenRenewer)
+    if (delegTokenRenewer == null || delegTokenRenewer.length() == 0) {
+      val errorMessage = "Can't get Master Kerberos principal for use as renewer"
+      logError(errorMessage)
+      throw new SparkException(errorMessage)
+    }
+    delegTokenRenewer
+  }
+
+  /**
+   * Obtains tokens for the namenodes passed in and adds them to the credentials.
+   */
+  def obtainTokensForNamenodes(
+      paths: Set[Path],
+      conf: Configuration,
+      creds: Credentials): Unit = {
+    if (UserGroupInformation.isSecurityEnabled()) {
+      val delegTokenRenewer = getTokenRenewer(conf)
+      paths.foreach { dst =>
+        val dstFs = dst.getFileSystem(conf)
+        logDebug("getting token for namenode: " + dst)
+        dstFs.addDelegationTokens(delegTokenRenewer, creds)
+      }
+    }
+  }
+
   /**
    * Returns a function that can be called to find Hadoop FileSystem bytes read. If
    * getFSBytesReadOnThreadCallback is called from thread r at time t, the returned callback will
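
The three helpers added to SparkHadoopUtil here (getNameNodesToAccess, getTokenRenewer, obtainTokensForNamenodes) are the same ones Client.scala below now delegates to. A minimal sketch of calling them directly, with an illustrative secondary namenode (the token call is a no-op unless Kerberos security is enabled):

    import org.apache.hadoop.security.Credentials

    import org.apache.spark.SparkConf
    import org.apache.spark.deploy.SparkHadoopUtil

    // Illustrative config: an extra namenode the job needs delegation tokens for.
    val sparkConf = new SparkConf()
      .set("spark.yarn.access.namenodes", "hdfs://nn2.example.com:8020")

    val util = SparkHadoopUtil.get
    val nns = util.getNameNodesToAccess(sparkConf)   // Set(new Path("hdfs://nn2.example.com:8020"))
    val creds = new Credentials()
    // On a Kerberized cluster this adds one delegation token per namenode, with the
    // master principal (getTokenRenewer) registered as the renewer; otherwise it does nothing.
    util.obtainTokensForNamenodes(nns, util.conf, creds)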

core/src/main/scala/org/apache/spark/executor/CoarseGrainedExecutorBackend.scala

Lines changed: 8 additions & 0 deletions
@@ -18,6 +18,7 @@
 package org.apache.spark.executor
 
 import java.net.URL
+import java.io.{ByteArrayInputStream, DataInputStream}
 import java.nio.ByteBuffer
 
 import scala.collection.mutable
@@ -26,6 +27,7 @@ import scala.concurrent.Await
 import akka.actor.{Actor, ActorSelection, Props}
 import akka.pattern.Patterns
 import akka.remote.{RemotingLifecycleEvent, DisassociatedEvent}
+import org.apache.hadoop.security.Credentials
 
 import org.apache.spark.{Logging, SecurityManager, SparkConf, SparkEnv}
 import org.apache.spark.TaskState.TaskState
@@ -105,6 +107,12 @@ private[spark] class CoarseGrainedExecutorBackend(
       executor.stop()
       context.stop(self)
       context.system.shutdown()
+
+    case UpdateCredentials(newCredentials) =>
+      val credentials = new Credentials()
+      credentials.readTokenStorageStream(
+        new DataInputStream(new ByteArrayInputStream(newCredentials.value.array())))
+      SparkHadoopUtil.get.addCurrentUserCredentials(credentials)
   }
 
   override def statusUpdate(taskId: Long, state: TaskState, data: ByteBuffer) {
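
The UpdateCredentials handler above is the receiving end of the byte buffer that scheduleLoginFromKeytab produces on the driver; the round trip is plain Hadoop token-storage serialization wrapped in a SerializableBuffer. A self-contained sketch of that encode/decode path (the Credentials instance here is empty and purely illustrative):

    import java.io.{ByteArrayInputStream, ByteArrayOutputStream, DataInputStream, DataOutputStream}
    import java.nio.ByteBuffer

    import org.apache.hadoop.security.Credentials

    import org.apache.spark.util.SerializableBuffer

    // Driver side: serialize freshly obtained credentials into an Akka-friendly buffer.
    val newCredentials = new Credentials()   // would carry the renewed delegation tokens
    val out = new ByteArrayOutputStream()
    newCredentials.writeTokenStorageToStream(new DataOutputStream(out))
    val message = new SerializableBuffer(ByteBuffer.wrap(out.toByteArray))

    // Executor side: decode the buffer back into Credentials and merge them into the current
    // user, which is what SparkHadoopUtil.get.addCurrentUserCredentials does in the handler.
    val received = new Credentials()
    received.readTokenStorageStream(
      new DataInputStream(new ByteArrayInputStream(message.value.array())))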

core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedClusterMessage.scala

Lines changed: 4 additions & 0 deletions
@@ -51,6 +51,10 @@ private[spark] object CoarseGrainedClusterMessages {
   case class StatusUpdate(executorId: String, taskId: Long, state: TaskState,
     data: SerializableBuffer) extends CoarseGrainedClusterMessage
 
+  // Driver to all executors.
+  case class UpdateCredentials(newCredentials: SerializableBuffer)
+    extends CoarseGrainedClusterMessage
+
   object StatusUpdate {
     /** Alternate factory method that takes a ByteBuffer directly for the data field */
     def apply(executorId: String, taskId: Long, state: TaskState, data: ByteBuffer)

core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala

Lines changed: 11 additions & 0 deletions
@@ -27,6 +27,7 @@ import akka.actor._
 import akka.pattern.ask
 import akka.remote.{DisassociatedEvent, RemotingLifecycleEvent}
 
+import org.apache.spark.deploy.SparkHadoopUtil
 import org.apache.spark.{ExecutorAllocationClient, Logging, SparkEnv, SparkException, TaskState}
 import org.apache.spark.scheduler._
 import org.apache.spark.scheduler.cluster.CoarseGrainedClusterMessages._
@@ -75,6 +76,10 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val actorSyste
     override protected def log = CoarseGrainedSchedulerBackend.this.log
     private val addressToExecutorId = new HashMap[Address, String]
 
+    // If a principal and keytab have been set, use that to create new credentials for executors
+    // periodically
+    SparkHadoopUtil.get.scheduleLoginFromKeytab(sendNewCredentialsToExecutors _)
+
     override def preStart() {
       // Listen for remote client disconnection events, since they don't go through Akka's watch()
       context.system.eventStream.subscribe(self, classOf[RemotingLifecycleEvent])
@@ -85,6 +90,12 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val actorSyste
       context.system.scheduler.schedule(0.millis, reviveInterval.millis, self, ReviveOffers)
     }
 
+    def sendNewCredentialsToExecutors(credentials: SerializableBuffer): Unit = {
+      executorDataMap.values.foreach{ x =>
+        x.executorActor ! UpdateCredentials(credentials)
+      }
+    }
+
     def receiveWithLogging = {
       case RegisterExecutor(executorId, hostPort, cores, logUrls) =>
         Utils.checkHostPort(hostPort, "Host port expected " + hostPort)
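
On the driver side, sendNewCredentialsToExecutors is simply the callback handed to scheduleLoginFromKeytab, so any function of type SerializableBuffer => Unit satisfies the contract. A hypothetical stand-in callback that only reports the payload size, to show the wiring (scheduleLoginFromKeytab is private[spark], so this compiles only from within Spark's own packages, and it assumes the principal, keytab and SPARK_STAGING_DIR environment have been set up as in the AM flow below):

    import org.apache.spark.deploy.SparkHadoopUtil
    import org.apache.spark.util.SerializableBuffer

    // Hypothetical principal/keytab; in the real flow the AM reads these from the
    // SPARK_PRINCIPAL / SPARK_KEYTAB environment variables that Client.scala exports.
    SparkHadoopUtil.get.setPrincipalAndKeytabForLogin("spark-user@EXAMPLE.COM", "spark-user.keytab")

    // The scheduler backend passes sendNewCredentialsToExecutors _ here; this stand-in
    // just logs how many bytes of refreshed tokens each round produces.
    SparkHadoopUtil.get.scheduleLoginFromKeytab { (buf: SerializableBuffer) =>
      println(s"refreshed delegation tokens: ${buf.value.remaining()} bytes")
    }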

yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala

Lines changed: 5 additions & 0 deletions
@@ -576,6 +576,11 @@ object ApplicationMaster extends Logging {
       master = new ApplicationMaster(amArgs, new YarnRMClient(amArgs))
       System.exit(master.run())
     }
+    // At this point, we have tokens that will expire only after a while, so we now schedule a
+    // login for some time before the tokens expire. Since the SparkContext has already started,
+    // we can now get access to the driver actor as well.
+    SparkHadoopUtil.get.setPrincipalAndKeytabForLogin(
+      System.getenv("SPARK_PRINCIPAL"), System.getenv("SPARK_KEYTAB"))
   }
 
   private[spark] def sparkContextInitialized(sc: SparkContext) = {

yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala

Lines changed: 21 additions & 24 deletions
@@ -70,7 +70,7 @@ private[spark] class Client(
   private val isClusterMode = args.isClusterMode
 
   private var loginFromKeytab = false
-  private var kerberosFileName: String = null
+  private var keytabFileName: String = null
 
 
   def stop(): Unit = yarnClient.stop()
@@ -89,6 +89,7 @@
    * available in the alpha API.
    */
   def submitApplication(): ApplicationId = {
+    // Setup the credentials before doing anything else, so we have don't have issues at any point.
     setupCredentials()
     yarnClient.init(yarnConf)
     yarnClient.start()
@@ -319,6 +320,21 @@
     env("SPARK_YARN_MODE") = "true"
     env("SPARK_YARN_STAGING_DIR") = stagingDir
     env("SPARK_USER") = UserGroupInformation.getCurrentUser().getShortUserName()
+    // If we logged in from keytab, make sure we copy the keytab to the staging directory on
+    // HDFS, and setup the relevant environment vars, so the AM can login again.
+    if (loginFromKeytab) {
+      val fs = FileSystem.get(hadoopConf)
+      val stagingDirPath = new Path(fs.getHomeDirectory, stagingDir)
+      val localUri = new URI(args.keytab)
+      val localPath = getQualifiedLocalPath(localUri, hadoopConf)
+      val destinationPath = new Path(stagingDirPath, keytabFileName)
+      val replication = sparkConf.getInt("spark.yarn.submit.file.replication",
+        fs.getDefaultReplication(destinationPath)).toShort
+      copyFileToRemote(destinationPath, localPath, replication)
+      env("SPARK_PRINCIPAL") = args.principal
+      env("SPARK_KEYTAB") = keytabFileName
+    }
+
 
     // Set the environment variables to be passed on to the executors.
     distCacheMgr.setDistFilesEnv(env)
@@ -553,7 +569,7 @@
     // Generate a file name that can be used for the keytab file, that does not conflict
     // with any user file.
     val f = new File(keytabPath)
-    kerberosFileName = f.getName + "-" + System.currentTimeMillis()
+    keytabFileName = f.getName + "-" + System.currentTimeMillis()
     val ugi = UserGroupInformation.loginUserFromKeytabAndReturnUGI(principal, keytabPath)
     credentials = ugi.getCredentials
     loginFromKeytab = true
@@ -891,23 +907,11 @@
    * Get the list of namenodes the user may access.
    */
   private[yarn] def getNameNodesToAccess(sparkConf: SparkConf): Set[Path] = {
-    sparkConf.get("spark.yarn.access.namenodes", "")
-      .split(",")
-      .map(_.trim())
-      .filter(!_.isEmpty)
-      .map(new Path(_))
-      .toSet
+    SparkHadoopUtil.get.getNameNodesToAccess(sparkConf)
   }
 
   private[yarn] def getTokenRenewer(conf: Configuration): String = {
-    val delegTokenRenewer = Master.getMasterPrincipal(conf)
-    logDebug("delegation token renewer is: " + delegTokenRenewer)
-    if (delegTokenRenewer == null || delegTokenRenewer.length() == 0) {
-      val errorMessage = "Can't get Master Kerberos principal for use as renewer"
-      logError(errorMessage)
-      throw new SparkException(errorMessage)
-    }
-    delegTokenRenewer
+    SparkHadoopUtil.get.getTokenRenewer(conf)
  }
 
   /**
@@ -917,14 +921,7 @@
       paths: Set[Path],
       conf: Configuration,
       creds: Credentials): Unit = {
-    if (UserGroupInformation.isSecurityEnabled()) {
-      val delegTokenRenewer = getTokenRenewer(conf)
-      paths.foreach { dst =>
-        val dstFs = dst.getFileSystem(conf)
-        logDebug("getting token for namenode: " + dst)
-        dstFs.addDelegationTokens(delegTokenRenewer, creds)
-      }
-    }
+    SparkHadoopUtil.get.obtainTokensForNamenodes(paths, conf, creds)
   }
 
   /**
