[SPARK-21714][CORE][BACKPORT-2.2] Avoiding re-uploading remote resources in yarn client mode #19074
```diff
@@ -208,14 +208,20 @@ object SparkSubmit extends CommandLineUtils {
   /**
    * Prepare the environment for submitting an application.
-   * This returns a 4-tuple:
-   * (1) the arguments for the child process,
-   * (2) a list of classpath entries for the child,
-   * (3) a map of system properties, and
-   * (4) the main class for the child
    *
+   * @param args the parsed SparkSubmitArguments used for environment preparation.
+   * @param conf the Hadoop Configuration, this argument will only be set in unit test.
+   * @return a 4-tuple:
+   *         (1) the arguments for the child process,
+   *         (2) a list of classpath entries for the child,
+   *         (3) a map of system properties, and
+   *         (4) the main class for the child
+   *
    * Exposed for testing.
    */
-  private[deploy] def prepareSubmitEnvironment(args: SparkSubmitArguments)
+  private[deploy] def prepareSubmitEnvironment(
+      args: SparkSubmitArguments,
+      conf: Option[HadoopConfiguration] = None)
     : (Seq[String], Seq[String], Map[String, String], String) = {
     // Return values
     val childArgs = new ArrayBuffer[String]()
```
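The new optional `conf` parameter lets a unit test inject a Hadoop `Configuration` instead of letting the method build one internally. A minimal sketch of how a test might use it — the `s3a` scheme and the `TestFileSystem` stub class are illustrative assumptions, not taken from this diff:

```scala
import org.apache.hadoop.conf.Configuration
import org.apache.spark.deploy.{SparkSubmit, SparkSubmitArguments}

// Build a Hadoop conf whose "s3a" scheme resolves to a stub FileSystem
// (TestFileSystem is a hypothetical test helper, not part of this diff).
val hadoopConf = new Configuration()
hadoopConf.set("fs.s3a.impl", "org.apache.spark.deploy.TestFileSystem")

val args = new SparkSubmitArguments(Seq(
  "--deploy-mode", "client",
  "--master", "yarn",
  "--class", "MyApp",
  "s3a://bucket/app.jar"))

// With Some(hadoopConf), downloads go through the injected configuration;
// with None (the default), the method creates its own Configuration.
val (childArgs, classpath, sysProps, mainClass) =
  SparkSubmit.prepareSubmitEnvironment(args, conf = Some(hadoopConf))
```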
```diff
@@ -311,12 +317,16 @@ object SparkSubmit extends CommandLineUtils {
     }
 
     // In client mode, download remote files.
+    var localPrimaryResource: String = null
+    var localJars: String = null
+    var localPyFiles: String = null
+    var localFiles: String = null
     if (deployMode == CLIENT) {
-      val hadoopConf = new HadoopConfiguration()
-      args.primaryResource = Option(args.primaryResource).map(downloadFile(_, hadoopConf)).orNull
-      args.jars = Option(args.jars).map(downloadFileList(_, hadoopConf)).orNull
-      args.pyFiles = Option(args.pyFiles).map(downloadFileList(_, hadoopConf)).orNull
-      args.files = Option(args.files).map(downloadFileList(_, hadoopConf)).orNull
+      val hadoopConf = conf.getOrElse(new HadoopConfiguration())
+      localPrimaryResource = Option(args.primaryResource).map(downloadFile(_, hadoopConf)).orNull
+      localJars = Option(args.jars).map(downloadFileList(_, hadoopConf)).orNull
+      localPyFiles = Option(args.pyFiles).map(downloadFileList(_, hadoopConf)).orNull
+      localFiles = Option(args.files).map(downloadFileList(_, hadoopConf)).orNull
     }
 
     // Require all python files to be local, so we can add them to the PYTHONPATH
```
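The core of the fix is visible here: the downloaded local copies now go into separate `local*` variables instead of overwriting `args.*`, so the original (possibly remote) URIs survive for YARN to distribute. The `Option(...).map(...).orNull` chains keep the nullable-String convention of `SparkSubmitArguments` while skipping the download for unset options. The download helpers have roughly this shape — a sketch inferred from the calls above, not the full implementation:

```scala
import org.apache.hadoop.conf.{Configuration => HadoopConfiguration}

// Sketch: resolve each entry of a comma-separated URI list to a local path.
def downloadFileList(fileList: String, hadoopConf: HadoopConfiguration): String = {
  require(fileList != null, "fileList cannot be null.")
  fileList.split(",").map(downloadFile(_, hadoopConf)).mkString(",")
}

// Sketch: a remote URI would be copied to a local temp directory here;
// already-local paths pass through unchanged. The copy logic is elided.
def downloadFile(path: String, hadoopConf: HadoopConfiguration): String = {
  require(path != null, "path cannot be null.")
  path
}
```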
```diff
@@ -366,7 +376,7 @@ object SparkSubmit extends CommandLineUtils {
       // If a python file is provided, add it to the child arguments and list of files to deploy.
       // Usage: PythonAppRunner <main python file> <extra python files> [app arguments]
       args.mainClass = "org.apache.spark.deploy.PythonRunner"
-      args.childArgs = ArrayBuffer(args.primaryResource, args.pyFiles) ++ args.childArgs
+      args.childArgs = ArrayBuffer(localPrimaryResource, localPyFiles) ++ args.childArgs
       if (clusterManager != YARN) {
         // The YARN backend distributes the primary file differently, so don't merge it.
         args.files = mergeFileLists(args.files, args.primaryResource)
```
**Contributor:** It is a behavior change here. I think we should use …

**Contributor (Author):** Why do you think it should use local files/primary resource here?

**Contributor (Author):** Also, the changes here are restricted to client deploy mode; I'm not sure why it is related to standalone cluster mode?
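For context on the thread above: `mergeFileLists` simply joins the non-blank comma-separated lists, which is why merging `args.primaryResource` (rather than `localPrimaryResource`) keeps the original, possibly remote, URI in `args.files`. A sketch with the same shape as the helper in `SparkSubmit`:

```scala
import org.apache.commons.lang3.StringUtils

// Sketch: join non-blank comma-separated lists into one list, or return
// null if nothing is left -- matching the nullable-String convention.
def mergeFileLists(lists: String*): String = {
  val merged = lists.filterNot(StringUtils.isBlank).flatMap(_.split(",")).mkString(",")
  if (merged.isEmpty) null else merged
}

// e.g. mergeFileLists("a.txt,b.txt", null, "s3a://bucket/app.jar")
//   == "a.txt,b.txt,s3a://bucket/app.jar"
```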
```diff
@@ -376,8 +386,8 @@ object SparkSubmit extends CommandLineUtils {
       if (clusterManager != YARN) {
         // The YARN backend handles python files differently, so don't merge the lists.
         args.files = mergeFileLists(args.files, args.pyFiles)
       }
-      if (args.pyFiles != null) {
-        sysProps("spark.submit.pyFiles") = args.pyFiles
+      if (localPyFiles != null) {
+        sysProps("spark.submit.pyFiles") = localPyFiles
       }
     }
```

**Contributor:** Ditto.

**Contributor (Author):** The same reason as above.
```diff
@@ -431,7 +441,7 @@ object SparkSubmit extends CommandLineUtils {
       // If an R file is provided, add it to the child arguments and list of files to deploy.
       // Usage: RRunner <main R file> [app arguments]
       args.mainClass = "org.apache.spark.deploy.RRunner"
-      args.childArgs = ArrayBuffer(args.primaryResource) ++ args.childArgs
+      args.childArgs = ArrayBuffer(localPrimaryResource) ++ args.childArgs
       args.files = mergeFileLists(args.files, args.primaryResource)
     }
   }
```
```diff
@@ -468,6 +478,7 @@ object SparkSubmit extends CommandLineUtils {
       OptionAssigner(args.queue, YARN, ALL_DEPLOY_MODES, sysProp = "spark.yarn.queue"),
       OptionAssigner(args.numExecutors, YARN, ALL_DEPLOY_MODES,
         sysProp = "spark.executor.instances"),
+      OptionAssigner(args.pyFiles, YARN, ALL_DEPLOY_MODES, sysProp = "spark.yarn.dist.pyFiles"),
       OptionAssigner(args.jars, YARN, ALL_DEPLOY_MODES, sysProp = "spark.yarn.dist.jars"),
       OptionAssigner(args.files, YARN, ALL_DEPLOY_MODES, sysProp = "spark.yarn.dist.files"),
       OptionAssigner(args.archives, YARN, ALL_DEPLOY_MODES, sysProp = "spark.yarn.dist.archives"),
```
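Each `OptionAssigner` row declares which cluster managers and deploy modes a user argument applies to; a loop below the table copies matching values into command-line options or system properties. The new row forwards the original `--py-files` list to `spark.yarn.dist.pyFiles` so YARN distributes it itself. Roughly how the routing works — a sketch mirroring the private machinery in `SparkSubmit`:

```scala
import scala.collection.mutable

// Sketch: bitmask matching against the chosen cluster manager and deploy
// mode, then copying the value into childArgs or sysProps.
case class OptionAssigner(
    value: String,
    clusterManager: Int,
    deployMode: Int,
    clOption: String = null,
    sysProp: String = null)

def applyOptions(
    options: Seq[OptionAssigner],
    clusterManager: Int,
    deployMode: Int,
    childArgs: mutable.ArrayBuffer[String],
    sysProps: mutable.Map[String, String]): Unit = {
  for (opt <- options) {
    if (opt.value != null &&
        (deployMode & opt.deployMode) != 0 &&
        (clusterManager & opt.clusterManager) != 0) {
      if (opt.clOption != null) { childArgs += (opt.clOption, opt.value) }
      if (opt.sysProp != null) { sysProps.put(opt.sysProp, opt.value) }
    }
  }
}
```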
```diff
@@ -491,15 +502,28 @@ object SparkSubmit extends CommandLineUtils {
         sysProp = "spark.driver.cores"),
       OptionAssigner(args.supervise.toString, STANDALONE | MESOS, CLUSTER,
         sysProp = "spark.driver.supervise"),
-      OptionAssigner(args.ivyRepoPath, STANDALONE, CLUSTER, sysProp = "spark.jars.ivy")
+      OptionAssigner(args.ivyRepoPath, STANDALONE, CLUSTER, sysProp = "spark.jars.ivy"),
+
+      // An internal option used only for spark-shell to add user jars to repl's classloader,
+      // previously it uses "spark.jars" or "spark.yarn.dist.jars" which now may be pointed to
+      // remote jars, so adding a new option to only specify local jars for spark-shell internally.
+      OptionAssigner(localJars, ALL_CLUSTER_MGRS, CLIENT, sysProp = "spark.repl.local.jars")
     )
 
     // In client mode, launch the application main class directly
-    // In addition, add the main application jar and any added jars (if any) to the classpath
-    if (deployMode == CLIENT || isYarnCluster) {
+    // Also add the main application jar and any added jars to classpath in case YARN client
+    // requires these jars.
+    if (deployMode == CLIENT) {
       childMainClass = args.mainClass
+      if (localPrimaryResource != null && isUserJar(localPrimaryResource)) {
+        childClasspath += localPrimaryResource
+      }
+      if (localJars != null) { childClasspath ++= localJars.split(",") }
+    }
+
+    // Add the main application jar and any added jars to classpath in case YARN client
+    // requires these jars.
+    // This assumes both primaryResource and user jars are local jars, otherwise it will not be
+    // added to the classpath of YARN client.
+    if (isYarnCluster) {
       if (isUserJar(args.primaryResource)) {
         childClasspath += args.primaryResource
       }
```
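Since `spark.jars` and `spark.yarn.dist.jars` may now carry remote URIs, `spark.repl.local.jars` gives spark-shell a list that is guaranteed local. A hypothetical consumer-side sketch of the intent — the property name comes from the diff, but the surrounding code is purely illustrative:

```scala
import org.apache.spark.SparkConf

// Illustrative only: the REPL can build its extra classpath from paths
// known to be local, instead of parsing spark.jars, which may now
// contain hdfs:// or http:// URIs.
def replLocalJars(conf: SparkConf): Seq[String] =
  conf.getOption("spark.repl.local.jars")
    .map(_.split(",").toSeq.filter(_.nonEmpty))
    .getOrElse(Seq.empty)
```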
```diff
@@ -556,10 +580,6 @@ object SparkSubmit extends CommandLineUtils {
       if (args.isPython) {
         sysProps.put("spark.yarn.isPython", "true")
       }
-
-      if (args.pyFiles != null) {
-        sysProps("spark.submit.pyFiles") = args.pyFiles
-      }
     }
 
     // assure a keytab is available from any place in a JVM
```
**Contributor:** If you want to avoid the download for YARN, can we just check the cluster mode here?