-
Notifications
You must be signed in to change notification settings - Fork 28.9k
[SPARK-21917][CORE][YARN] Supporting adding http(s) resources in yarn mode #19130
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
a67f6a1
96720e8
b3e0961
95b48ea
19b7fbf
a011982
9538cae
915de61
3bd487f
0fb7943
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -25,11 +25,11 @@ import java.text.ParseException | |
|
|
||
| import scala.annotation.tailrec | ||
| import scala.collection.mutable.{ArrayBuffer, HashMap, Map} | ||
| import scala.util.Properties | ||
| import scala.util.{Properties, Try} | ||
|
|
||
| import org.apache.commons.lang3.StringUtils | ||
| import org.apache.hadoop.conf.{Configuration => HadoopConfiguration} | ||
| import org.apache.hadoop.fs.Path | ||
| import org.apache.hadoop.fs.{FileSystem, Path} | ||
| import org.apache.hadoop.security.UserGroupInformation | ||
| import org.apache.hadoop.yarn.conf.YarnConfiguration | ||
| import org.apache.ivy.Ivy | ||
|
|
@@ -48,6 +48,7 @@ import org.apache.spark._ | |
| import org.apache.spark.api.r.RUtils | ||
| import org.apache.spark.deploy.rest._ | ||
| import org.apache.spark.internal.Logging | ||
| import org.apache.spark.internal.config._ | ||
| import org.apache.spark.launcher.SparkLauncher | ||
| import org.apache.spark.util._ | ||
|
|
||
|
|
@@ -367,6 +368,52 @@ object SparkSubmit extends CommandLineUtils with Logging { | |
| }.orNull | ||
| } | ||
|
|
||
// When running in YARN, some remote resources carry a scheme that:
// 1. Hadoop FileSystem doesn't support, or
// 2. we explicitly bypass via "spark.yarn.dist.forceDownloadSchemes".
// Such resources are downloaded to local disk before being added to YARN's
// distributed cache. In yarn client mode they were already downloaded by the
// code above, so here we only resolve the local path and substitute it for
// the remote one.
if (clusterManager == YARN) {
  sparkConf.setIfMissing(SecurityManager.SPARK_AUTH_SECRET_CONF, "unused")
  val secMgr = new SecurityManager(sparkConf)
  val forceDownloadSchemes = sparkConf.get(FORCE_DOWNLOAD_SCHEMES)

  // A scheme needs a manual download when it is listed in
  // "spark.yarn.dist.forceDownloadSchemes", or when no Hadoop FileSystem
  // implementation is registered for it.
  def shouldDownload(scheme: String): Boolean = {
    forceDownloadSchemes.contains(scheme) ||
      Try { FileSystem.getFileSystemClass(scheme, hadoopConf) }.isFailure
  }

  // Resolve one resource: "local"/"file" URIs are returned unchanged; schemes
  // that require a manual download are fetched into `targetDir` (reusing an
  // already-downloaded copy when present); everything else is left to Hadoop.
  def downloadResource(resource: String): String = {
    val uri = Utils.resolveURI(resource)
    uri.getScheme match {
      case "local" | "file" => resource
      case scheme if shouldDownload(scheme) =>
        val localFile = new File(targetDir, new Path(uri).getName)
        if (localFile.exists()) {
          localFile.toURI.toString
        } else {
          downloadFile(resource, targetDir, sparkConf, hadoopConf, secMgr)
        }
      case _ => uri.toString
    }
  }

  // Resolve a comma-separated list of resources, preserving the list form.
  def downloadResourceList(resources: String): String =
    Utils.stringToSeq(resources).map(downloadResource).mkString(",")

  args.primaryResource = Option(args.primaryResource).map(downloadResource).orNull
  args.files = Option(args.files).map(downloadResourceList).orNull
  args.pyFiles = Option(args.pyFiles).map(downloadResourceList).orNull
  args.jars = Option(args.jars).map(downloadResourceList).orNull
  args.archives = Option(args.archives).map(downloadResourceList).orNull
}
|
|
||
| // If we're running a python app, set the main class to our specific python runner | ||
| if (args.isPython && deployMode == CLIENT) { | ||
| if (args.primaryResource == PYSPARK_SHELL) { | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@vanzin, it is a little difficult to mock the download behavior, so here I check whether "spark.testing" is configured and, if so, return a dummy local path. What do you think of this approach?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Sounds good.