
Commit f6954da

Got rid of the Akka communication used to renew credentials; instead, the executors check a known HDFS file's modification time and re-read the credentials from it when it changes.

1 parent 5c11c3e · commit f6954da
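
For context, here is a minimal sketch of the executor-side check the commit message describes, assuming a hypothetical helper of my own (CredentialsPoller, refreshIfUpdated, lastRefresh are not names from this commit): the driver overwrites a well-known credentials file on HDFS, and each executor re-reads it only when the file's modification time advances past the last refresh it applied.

import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.security.{Credentials, UserGroupInformation}

// Hypothetical helper, not the commit's code: poll a well-known HDFS file and
// apply its delegation tokens when the modification time moves forward.
object CredentialsPoller {
  private var lastRefresh = 0L

  def refreshIfUpdated(fs: FileSystem, credentialsPath: Path): Unit = {
    if (fs.exists(credentialsPath)) {
      val modTime = fs.getFileStatus(credentialsPath).getModificationTime
      if (modTime > lastRefresh) {
        val in = fs.open(credentialsPath)
        try {
          val creds = new Credentials()
          // Reads the format written by Credentials.writeTokenStorageToStream.
          creds.readTokenStorageStream(in)
          UserGroupInformation.getCurrentUser.addCredentials(creds)
          lastRefresh = modTime
        } finally {
          in.close()
        }
      }
    }
  }
}

The actual wiring in the commit lives in SparkHadoopUtil.updateCredentialsIfRequired and its YARN override, shown in the diffs below.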

7 files changed: +136 -86 lines changed

core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala

Lines changed: 10 additions & 3 deletions
@@ -55,13 +55,16 @@ class SparkHadoopUtil extends Logging {
   def runAsSparkUser(func: () => Unit) {
     val user = Utils.getCurrentUserName()
     logDebug("running as user: " + user)
+    updateCredentialsIfRequired()
     val ugi = UserGroupInformation.createRemoteUser(user)
     transferCredentials(UserGroupInformation.getCurrentUser(), ugi)
     ugi.doAs(new PrivilegedExceptionAction[Unit] {
       def run: Unit = func()
     })
   }

+  def updateCredentialsIfRequired(): Unit = {}
+
   def transferCredentials(source: UserGroupInformation, dest: UserGroupInformation) {
     for (token <- source.getTokens()) {
       dest.addToken(token)

@@ -122,9 +125,13 @@ class SparkHadoopUtil extends Logging {
     UserGroupInformation.loginUserFromKeytab(principalName, keytabFilename)
   }

-  def setPrincipalAndKeytabForLogin(principal: String, keytab: String): Unit = ???
-
-  private[spark] def scheduleLoginFromKeytab(callback: (String) => Unit): Unit = {}
+  /**
+   * Schedule a login from the keytab and principal set using the --principal and --keytab
+   * arguments to spark-submit. This login happens only when the credentials of the current user
+   * are about to expire. This method reads SPARK_PRINCIPAL and SPARK_KEYTAB from the environment
+   * to do the login. This method is a no-op in non-YARN mode.
+   */
+  private[spark] def scheduleLoginFromKeytab(): Unit = {}

   /**
    * Returns a function that can be called to find Hadoop FileSystem bytes read. If
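
Both new methods stay no-ops in the base class so non-YARN deployments are unaffected; only the YARN utility overrides them. A rough sketch of that override pattern, with simplified class names and bodies that are not the real implementation:

// Simplified sketch: the generic utility exposes no-op hooks that the
// cluster-manager-specific subclass fills in.
class HadoopUtilSketch {
  def runAsSparkUser(func: () => Unit): Unit = {
    updateCredentialsIfRequired() // picks up refreshed tokens before doAs
    func()
  }
  def updateCredentialsIfRequired(): Unit = {} // no-op by default
  def scheduleLoginFromKeytab(): Unit = {}     // no-op outside YARN
}

class YarnHadoopUtilSketch extends HadoopUtilSketch {
  override def updateCredentialsIfRequired(): Unit = {
    // read the credentials file from the staging directory
    // (see the YarnSparkHadoopUtil diff below)
  }
}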

core/src/main/scala/org/apache/spark/executor/CoarseGrainedExecutorBackend.scala

Lines changed: 0 additions & 9 deletions
@@ -109,15 +109,6 @@ private[spark] class CoarseGrainedExecutorBackend(
       context.stop(self)
       context.system.shutdown()

-    // Add new credentials received from the driver to the current user.
-    case UpdateCredentials(newCredentialsPath) =>
-      logInfo("New credentials received from driver, adding the credentials to the current user")
-      val credentials = new Credentials()
-      val remoteFs = FileSystem.get(SparkHadoopUtil.get.conf)
-      val inStream = remoteFs.open(new Path(newCredentialsPath))
-      credentials.readTokenStorageStream(inStream)
-      SparkHadoopUtil.get.addCurrentUserCredentials(credentials)
-      inStream.close()
   }

   override def statusUpdate(taskId: Long, state: TaskState, data: ByteBuffer) {

core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedClusterMessage.scala

Lines changed: 0 additions & 5 deletions
@@ -51,11 +51,6 @@ private[spark] object CoarseGrainedClusterMessages {
   case class StatusUpdate(executorId: String, taskId: Long, state: TaskState,
     data: SerializableBuffer) extends CoarseGrainedClusterMessage

-  // When the delegation tokens are about expire, the driver creates new tokens and sends them to
-  // the executors via this message.
-  case class UpdateCredentials(newCredentialsLocation: String)
-    extends CoarseGrainedClusterMessage
-
   object StatusUpdate {
     /** Alternate factory method that takes a ByteBuffer directly for the data field */
     def apply(executorId: String, taskId: Long, state: TaskState, data: ByteBuffer)

core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala

Lines changed: 1 addition & 11 deletions
@@ -73,16 +73,6 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val actorSyste
   // Executors we have requested the cluster manager to kill that have not died yet
   private val executorsPendingToRemove = new HashSet[String]

-  /**
-   * Send new credentials to executors. This is the method that is called when the scheduled
-   * login completes, so the new credentials can be sent to the executors.
-   * @param credentialsPath
-   */
-  def sendNewCredentialsToExecutors(credentialsPath: String): Unit = {
-    // We don't care about the reply, so going to deadLetters is fine.
-    executorDataMap.values.foreach(_.executorActor ! UpdateCredentials(credentialsPath))
-  }
-
   class DriverActor(sparkProperties: Seq[(String, String)]) extends Actor with ActorLogReceive {
     override protected def log = CoarseGrainedSchedulerBackend.this.log
     private val addressToExecutorId = new HashMap[Address, String]

@@ -253,7 +243,7 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val actorSyste

     // If a principal and keytab have been set, use that to create new credentials for executors
     // periodically
-    SparkHadoopUtil.get.scheduleLoginFromKeytab(sendNewCredentialsToExecutors _)
+    SparkHadoopUtil.get.scheduleLoginFromKeytab()
   }

   def stopExecutors() {

yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala

Lines changed: 0 additions & 5 deletions
@@ -257,11 +257,6 @@ private[spark] class ApplicationMaster(
   private def runDriver(securityMgr: SecurityManager): Unit = {
     addAmIpFilter()

-    // This must be done before SparkContext is initialized, since the CoarseGrainedSchedulerBackend
-    // is started at that time. That is what schedules the re-logins. It is scheduled only if the
-    // principal is actually setup. So we make sure it is available.
-    SparkHadoopUtil.get.setPrincipalAndKeytabForLogin(
-      System.getenv("SPARK_PRINCIPAL"), System.getenv("SPARK_KEYTAB"))
     userClassThread = startUserApplication()

     // This a bit hacky, but we need to wait until the spark.driver.port property has

yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala

Lines changed: 6 additions & 6 deletions
@@ -246,14 +246,12 @@ private[spark] class Client(
     // If we passed in a keytab, make sure we copy the keytab to the staging directory on
     // HDFS, and setup the relevant environment vars, so the AM can login again.
     if (loginFromKeytab) {
-      val fs = FileSystem.get(hadoopConf)
-      val stagingDirPath = new Path(fs.getHomeDirectory, appStagingDir)
       val localUri = new URI(args.keytab)
       val localPath = getQualifiedLocalPath(localUri, hadoopConf)
-      val destinationPath = new Path(stagingDirPath, keytabFileName)
-      copyFileToRemote(destinationPath, localPath, replication)
+      val destinationPath = copyFileToRemote(dst, localPath, replication)
+      val destFs = FileSystem.get(destinationPath.toUri(), hadoopConf)
       distCacheMgr.addResource(
-        fs, hadoopConf, destinationPath, localResources, LocalResourceType.FILE, keytabFileName,
+        destFs, hadoopConf, destinationPath, localResources, LocalResourceType.FILE, keytabFileName,
         statCache, appMasterOnly = true)
     }

@@ -577,10 +575,12 @@ private[spark] class Client(
       val f = new File(args.keytab)
       // Generate a file name that can be used for the keytab file, that does not conflict
       // with any user file.
-      keytabFileName = f.getName + "-" + UUID.randomUUID()
+      keytabFileName = f.getName + "-" + UUID.randomUUID().toString
       val ugi = UserGroupInformation.loginUserFromKeytabAndReturnUGI(args.principal, args.keytab)
      credentials = ugi.getCredentials
       loginFromKeytab = true
+      val credentialsFile = "credentials-" + UUID.randomUUID().toString
+      sparkConf.set("spark.yarn.credentials.file", credentialsFile)
       logInfo("Successfully logged into Kerberos.")
     } else {
       credentials = UserGroupInformation.getCurrentUser.getCredentials
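
On the write side, the random name stored in spark.yarn.credentials.file becomes the well-known file the executors poll. A hedged sketch of the write protocol the AM follows (the full version is in the YarnSparkHadoopUtil diff below; the helper object and parameter names here are illustrative, not Spark's): write the refreshed tokens to a temporary file first, then replace the final file, so a poller never observes a half-written token file.

import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.security.Credentials

object CredentialsWriter {
  // Illustrative helper: write refreshed tokens under the staging directory
  // using the name from "spark.yarn.credentials.file", via a temp file + rename.
  def writeCredentials(
      fs: FileSystem,
      stagingDir: Path,
      credentialsFileName: String,
      creds: Credentials): Unit = {
    val finalPath = new Path(stagingDir, credentialsFileName)
    val tempPath = new Path(stagingDir, credentialsFileName + ".tmp")
    val out = fs.create(tempPath, true) // overwrite any stale temp file
    try {
      creds.writeTokenStorageToStream(out)
      out.hflush()
    } finally {
      out.close()
    }
    fs.delete(finalPath, true)          // FileSystem.rename does not overwrite
    fs.rename(tempPath, finalPath)
  }
}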

yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnSparkHadoopUtil.scala

Lines changed: 119 additions & 47 deletions
@@ -20,14 +20,17 @@ package org.apache.spark.deploy.yarn
 import java.io._
 import java.net.URI
 import java.nio.ByteBuffer
-import java.util.concurrent.{TimeUnit, Executors}
+import java.util.concurrent.{ TimeUnit, Executors}
 import java.util.regex.Matcher
 import java.util.regex.Pattern

 import scala.collection.mutable.HashMap
+import scala.collection.JavaConversions._
 import scala.util.Try

+import org.apache.hadoop.fs.Options.Rename
 import org.apache.hadoop.fs.{FileUtil, Path, FileSystem}
+import org.apache.hadoop.hdfs.DistributedFileSystem
 import org.apache.hadoop.hdfs.security.token.delegation.DelegationTokenIdentifier
 import org.apache.hadoop.io.Text
 import org.apache.hadoop.mapred.{Master, JobConf}

@@ -41,7 +44,7 @@ import org.apache.hadoop.conf.Configuration

 import org.apache.spark.{SparkException, SecurityManager, SparkConf}
 import org.apache.spark.deploy.SparkHadoopUtil
-import org.apache.spark.util.{SerializableBuffer, Utils}
+import org.apache.spark.util.Utils

 /**
  * Contains util methods to interact with Hadoop from spark.

@@ -52,6 +55,13 @@ class YarnSparkHadoopUtil extends SparkHadoopUtil {
   private var principal: String = null
   @volatile private var loggedInViaKeytab = false
   @volatile private var loggedInUGI: UserGroupInformation = null
+  @volatile private var lastCredentialsRefresh = 0l
+  private lazy val delegationTokenRenewer =
+    Executors.newSingleThreadScheduledExecutor(
+      Utils.namedThreadFactory("Delegation Token Refresh Thread"))
+  private lazy val delegationTokenExecuterUpdaterThread = new Runnable {
+    override def run(): Unit = updateCredentialsIfRequired()
+  }

   override def transferCredentials(source: UserGroupInformation, dest: UserGroupInformation) {
     dest.addCredentials(source.getCredentials())

@@ -92,57 +102,118 @@ class YarnSparkHadoopUtil extends SparkHadoopUtil {
     if (credentials != null) credentials.getSecretKey(new Text(key)) else null
   }

-  override def setPrincipalAndKeytabForLogin(principal: String, keytab: String): Unit = {
-    this.principal = principal
-    this.keytab = keytab
+  private[spark] override def scheduleLoginFromKeytab(): Unit = {
+    val principal = System.getenv("SPARK_PRINCIPAL")
+    val keytab = System.getenv("SPARK_KEYTAB")
+    if (principal != null) {
+      val delegationTokenRenewerThread =
+        new Runnable {
+          override def run(): Unit = {
+            if (!loggedInViaKeytab) {
+              // Keytab is copied by YARN to the working directory of the AM, so full path is
+              // not needed.
+              loggedInUGI = UserGroupInformation.loginUserFromKeytabAndReturnUGI(
+                principal, keytab)
+              loggedInViaKeytab = true
+            }
+            val nns = getNameNodesToAccess(sparkConf)
+            val newCredentials = loggedInUGI.getCredentials
+            obtainTokensForNamenodes(nns, conf, newCredentials)
+            val remoteFs = FileSystem.get(conf)
+            val stagingDirPath =
+              new Path(remoteFs.getHomeDirectory, System.getenv("SPARK_YARN_STAGING_DIR"))
+            val tokenPathStr = sparkConf.get("spark.yarn.credentials.file")
+            val tokenPath = new Path(stagingDirPath.toString, tokenPathStr)
+            val tempTokenPath = new Path(stagingDirPath.toString, tokenPathStr + ".tmp")
+            val stream = remoteFs.create(tempTokenPath, true)
+            // Now write this out to HDFS
+            newCredentials.writeTokenStorageToStream(stream)
+            stream.hflush()
+            stream.close()
+            remoteFs.delete(tokenPath, true)
+            remoteFs.rename(tempTokenPath, tokenPath)
+            delegationTokenRenewer.schedule(
+              this, (0.75 * (getLatestValidity - System.currentTimeMillis())).toLong,
+              TimeUnit.MILLISECONDS)
+          }
+        }
+      val timeToRenewal = (0.75 * (getLatestValidity - System.currentTimeMillis())).toLong
+      delegationTokenRenewer.schedule(
+        delegationTokenRenewerThread, timeToRenewal, TimeUnit.MILLISECONDS)
+    }
   }

-  private[spark] override def scheduleLoginFromKeytab(
-      callback: (String) => Unit): Unit = {
-    if (principal != null) {
-      // Get the current credentials, find out when they expire.
-      val creds = {
-        if (loggedInUGI == null) {
-          UserGroupInformation.getCurrentUser.getCredentials
-        } else {
-          loggedInUGI.getCredentials
+  override def updateCredentialsIfRequired(): Unit = {
+    try {
+      val credentialsFile = sparkConf.get("spark.yarn.credentials.file")
+      if (credentialsFile != null && !credentialsFile.isEmpty) {
+        val remoteFs = FileSystem.get(conf)
+        val sparkStagingDir = System.getenv("SPARK_YARN_STAGING_DIR")
+        val stagingDirPath = new Path(remoteFs.getHomeDirectory, sparkStagingDir)
+        val credentialsFilePath = new Path(stagingDirPath, credentialsFile)
+        if (remoteFs.exists(credentialsFilePath)) {
+          val status = remoteFs.getFileStatus(credentialsFilePath)
+          val modTimeAtStart = status.getModificationTime
+          if (modTimeAtStart > lastCredentialsRefresh) {
+            val newCredentials = getCredentialsFromHDFSFile(remoteFs, credentialsFilePath)
+            val newStatus = remoteFs.getFileStatus(credentialsFilePath)
+            // File was updated after we started reading it, lets come back later and try to read it.
+            if (newStatus.getModificationTime != modTimeAtStart) {
+              delegationTokenRenewer
+                .schedule(delegationTokenExecuterUpdaterThread, 1, TimeUnit.HOURS)
+            } else {
+              UserGroupInformation.getCurrentUser.addCredentials(newCredentials)
+              lastCredentialsRefresh = status.getModificationTime
+              val totalValidity = getLatestValidity - lastCredentialsRefresh
+              val timeToRunRenewal = lastCredentialsRefresh + (0.8 * totalValidity).toLong
+              val timeFromNowToRenewal = timeToRunRenewal - System.currentTimeMillis()
+              delegationTokenRenewer.schedule(delegationTokenExecuterUpdaterThread,
                timeFromNowToRenewal, TimeUnit.MILLISECONDS)
+            }
+          } else {
+            // Check every hour to see if new credentials arrived.
+            delegationTokenRenewer.schedule(delegationTokenExecuterUpdaterThread, 1, TimeUnit.HOURS)
+          }
         }
       }
-      val credStream = new ByteArrayOutputStream()
-      creds.writeTokenStorageToStream(new DataOutputStream(credStream))
-      val in = new DataInputStream(new ByteArrayInputStream(credStream.toByteArray))
-      val tokenIdentifier = new DelegationTokenIdentifier()
-      tokenIdentifier.readFields(in)
-      val timeToRenewal = (0.6 * (tokenIdentifier.getMaxDate - System.currentTimeMillis())).toLong
-      Executors.newSingleThreadScheduledExecutor(
-        Utils.namedThreadFactory("Delegation Token Refresh Thread")).scheduleWithFixedDelay(
-        new Runnable {
-          override def run(): Unit = {
-            if (!loggedInViaKeytab) {
-              // Keytab is copied by YARN to the working directory of the AM, so full path is
-              // not needed.
-              loggedInUGI = UserGroupInformation.loginUserFromKeytabAndReturnUGI(
-                principal, keytab)
-              loggedInViaKeytab = true
-            }
-            val nns = getNameNodesToAccess(sparkConf)
-            val newCredentials = loggedInUGI.getCredentials
-            obtainTokensForNamenodes(nns, conf, newCredentials)
-            val remoteFs = FileSystem.get(conf)
-            val stagingDir = System.getenv("SPARK_YARN_STAGING_DIR")
-            val tokenPath = new Path(remoteFs.getHomeDirectory, stagingDir + Path.SEPARATOR +
-              "credentials - " + System.currentTimeMillis())
-            val stream = remoteFs.create(tokenPath, true)
-            // Now write this out via Akka to executors.
-            newCredentials.writeTokenStorageToStream(stream)
-            stream.hflush()
-            stream.close()
-            callback(tokenPath.toString)
-          }
-        }, timeToRenewal, timeToRenewal, TimeUnit.MILLISECONDS)
+    } catch {
+      // Since the file may get deleted while we are reading it,
+      case e: Exception =>
+        logWarning(
+          "Error encountered while trying to update credentials, will try again in 1 hour", e)
+        delegationTokenRenewer.schedule(delegationTokenExecuterUpdaterThread, 1, TimeUnit.HOURS)
     }
   }

+  private[spark] def getCredentialsFromHDFSFile(
+      remoteFs: FileSystem,
+      tokenPath: Path
+    ): Credentials = {
+    val stream = remoteFs.open(tokenPath)
+    val newCredentials = new Credentials()
+    newCredentials.readFields(stream)
+    newCredentials
+  }
+
+  private[spark] def getLatestValidity: Long = {
+    val creds = UserGroupInformation.getCurrentUser.getCredentials
+    var latestValidity: Long = 0
+    creds.getAllTokens
+      .filter(_.getKind == DelegationTokenIdentifier.HDFS_DELEGATION_KIND)
+      .foreach { t =>
+        val identifier = new DelegationTokenIdentifier()
+        identifier.readFields(new DataInputStream(new ByteArrayInputStream(t.getIdentifier)))
+        latestValidity = {
+          if (latestValidity < identifier.getMaxDate) {
+            identifier.getMaxDate
+          } else {
+            latestValidity
+          }
        }
      }
+    latestValidity
+  }
+
   /**
    * Get the list of namenodes the user may access.
    */

@@ -172,7 +243,8 @@ class YarnSparkHadoopUtil extends SparkHadoopUtil {
   def obtainTokensForNamenodes(
       paths: Set[Path],
       conf: Configuration,
-      creds: Credentials): Unit = {
+      creds: Credentials
+    ): Unit = {
     if (UserGroupInformation.isSecurityEnabled()) {
       val delegTokenRenewer = getTokenRenewer(conf)
       paths.foreach { dst =>
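
The renewal cadence comes from the HDFS delegation tokens themselves: getLatestValidity decodes each token identifier and keeps the largest max date, and the next run is scheduled at roughly 75-80% of the remaining validity (0.75 in scheduleLoginFromKeytab, 0.8 in updateCredentialsIfRequired). A standalone sketch of that calculation, with object and method names that are mine rather than Spark's:

import java.io.{ByteArrayInputStream, DataInputStream}

import scala.collection.JavaConversions._

import org.apache.hadoop.hdfs.security.token.delegation.DelegationTokenIdentifier
import org.apache.hadoop.security.UserGroupInformation

object TokenValidity {
  // Largest max date across the current user's HDFS delegation tokens.
  def latestHdfsTokenMaxDate(): Long = {
    val creds = UserGroupInformation.getCurrentUser.getCredentials
    creds.getAllTokens
      .filter(_.getKind == DelegationTokenIdentifier.HDFS_DELEGATION_KIND)
      .map { token =>
        val id = new DelegationTokenIdentifier()
        id.readFields(new DataInputStream(new ByteArrayInputStream(token.getIdentifier)))
        id.getMaxDate
      }
      .foldLeft(0L)(math.max)
  }

  // Delay until the next renewal, at 75% of the remaining validity window.
  def renewalDelayMillis(now: Long = System.currentTimeMillis()): Long =
    (0.75 * (latestHdfsTokenMaxDate() - now)).toLong
}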
