Commit b4695e7
Author: Andrew Or

Merge branch 'master' of github.com:apache/spark into rest

2 parents: c9a8ad7 + a9ed511

144 files changed: 3947 additions, 1593 deletions

build/mvn

Lines changed: 5 additions & 5 deletions
@@ -34,14 +34,14 @@ install_app() {
   local binary="${_DIR}/$3"
 
   # setup `curl` and `wget` silent options if we're running on Jenkins
-  local curl_opts=""
+  local curl_opts="-L"
   local wget_opts=""
   if [ -n "$AMPLAB_JENKINS" ]; then
-    curl_opts="-s"
-    wget_opts="--quiet"
+    curl_opts="-s ${curl_opts}"
+    wget_opts="--quiet ${wget_opts}"
   else
-    curl_opts="--progress-bar"
-    wget_opts="--progress=bar:force"
+    curl_opts="--progress-bar ${curl_opts}"
+    wget_opts="--progress=bar:force ${wget_opts}"
   fi
 
   if [ -z "$3" -o ! -f "$binary" ]; then

core/src/main/scala/org/apache/spark/SparkContext.scala

Lines changed: 110 additions & 15 deletions
@@ -20,33 +20,42 @@ package org.apache.spark
 import scala.language.implicitConversions
 
 import java.io._
+import java.lang.reflect.Constructor
 import java.net.URI
 import java.util.{Arrays, Properties, UUID}
 import java.util.concurrent.atomic.AtomicInteger
 import java.util.UUID.randomUUID
+
 import scala.collection.{Map, Set}
 import scala.collection.JavaConversions._
 import scala.collection.generic.Growable
 import scala.collection.mutable.HashMap
 import scala.reflect.{ClassTag, classTag}
+
+import akka.actor.Props
+
 import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs.Path
-import org.apache.hadoop.io.{ArrayWritable, BooleanWritable, BytesWritable, DoubleWritable, FloatWritable, IntWritable, LongWritable, NullWritable, Text, Writable}
-import org.apache.hadoop.mapred.{FileInputFormat, InputFormat, JobConf, SequenceFileInputFormat, TextInputFormat}
+import org.apache.hadoop.io.{ArrayWritable, BooleanWritable, BytesWritable, DoubleWritable,
+  FloatWritable, IntWritable, LongWritable, NullWritable, Text, Writable}
+import org.apache.hadoop.mapred.{FileInputFormat, InputFormat, JobConf, SequenceFileInputFormat,
+  TextInputFormat}
 import org.apache.hadoop.mapreduce.{InputFormat => NewInputFormat, Job => NewHadoopJob}
 import org.apache.hadoop.mapreduce.lib.input.{FileInputFormat => NewFileInputFormat}
+
 import org.apache.mesos.MesosNativeLibrary
-import akka.actor.Props
 
 import org.apache.spark.annotation.{DeveloperApi, Experimental}
 import org.apache.spark.broadcast.Broadcast
 import org.apache.spark.deploy.{LocalSparkCluster, SparkHadoopUtil}
 import org.apache.spark.executor.TriggerThreadDump
-import org.apache.spark.input.{StreamInputFormat, PortableDataStream, WholeTextFileInputFormat, FixedLengthBinaryInputFormat}
+import org.apache.spark.input.{StreamInputFormat, PortableDataStream, WholeTextFileInputFormat,
+  FixedLengthBinaryInputFormat}
 import org.apache.spark.partial.{ApproximateEvaluator, PartialResult}
 import org.apache.spark.rdd._
 import org.apache.spark.scheduler._
-import org.apache.spark.scheduler.cluster.{CoarseGrainedSchedulerBackend, SparkDeploySchedulerBackend, SimrSchedulerBackend}
+import org.apache.spark.scheduler.cluster.{CoarseGrainedSchedulerBackend,
+  SparkDeploySchedulerBackend, SimrSchedulerBackend}
 import org.apache.spark.scheduler.cluster.mesos.{CoarseMesosSchedulerBackend, MesosSchedulerBackend}
 import org.apache.spark.scheduler.local.LocalBackend
 import org.apache.spark.storage._
@@ -387,9 +396,6 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli
   }
   executorAllocationManager.foreach(_.start())
 
-  // At this point, all relevant SparkListeners have been registered, so begin releasing events
-  listenerBus.start()
-
   private[spark] val cleaner: Option[ContextCleaner] = {
     if (conf.getBoolean("spark.cleaner.referenceTracking", true)) {
       Some(new ContextCleaner(this))
@@ -399,6 +405,7 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli
   }
   cleaner.foreach(_.start())
 
+  setupAndStartListenerBus()
   postEnvironmentUpdate()
   postApplicationStart()
 
@@ -1017,12 +1024,48 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli
    * filesystems), or an HTTP, HTTPS or FTP URI. To access the file in Spark jobs,
    * use `SparkFiles.get(fileName)` to find its download location.
    */
-  def addFile(path: String) {
+  def addFile(path: String): Unit = {
+    addFile(path, false)
+  }
+
+  /**
+   * Add a file to be downloaded with this Spark job on every node.
+   * The `path` passed can be either a local file, a file in HDFS (or other Hadoop-supported
+   * filesystems), or an HTTP, HTTPS or FTP URI. To access the file in Spark jobs,
+   * use `SparkFiles.get(fileName)` to find its download location.
+   *
+   * A directory can be given if the recursive option is set to true. Currently directories are only
+   * supported for Hadoop-supported filesystems.
+   */
+  def addFile(path: String, recursive: Boolean): Unit = {
     val uri = new URI(path)
-    val key = uri.getScheme match {
-      case null | "file" => env.httpFileServer.addFile(new File(uri.getPath))
-      case "local" => "file:" + uri.getPath
-      case _ => path
+    val schemeCorrectedPath = uri.getScheme match {
+      case null | "local" => "file:" + uri.getPath
+      case _ => path
+    }
+
+    val hadoopPath = new Path(schemeCorrectedPath)
+    val scheme = new URI(schemeCorrectedPath).getScheme
+    if (!Array("http", "https", "ftp").contains(scheme)) {
+      val fs = hadoopPath.getFileSystem(hadoopConfiguration)
+      if (!fs.exists(hadoopPath)) {
+        throw new FileNotFoundException(s"Added file $hadoopPath does not exist.")
+      }
+      val isDir = fs.isDirectory(hadoopPath)
+      if (!isLocal && scheme == "file" && isDir) {
+        throw new SparkException(s"addFile does not support local directories when not running " +
+          "local mode.")
+      }
+      if (!recursive && isDir) {
+        throw new SparkException(s"Added file $hadoopPath is a directory and recursive is not " +
+          "turned on.")
+      }
+    }
+
+    val key = if (!isLocal && scheme == "file") {
+      env.httpFileServer.addFile(new File(uri.getPath))
+    } else {
+      schemeCorrectedPath
     }
     val timestamp = System.currentTimeMillis
    addedFiles(key) = timestamp
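
For illustration only (not part of this diff): a minimal spark-shell style sketch of the new overload, assuming the shell's `sc` and a hypothetical HDFS directory hdfs:///data/lookup containing a file table.csv.

import org.apache.spark.SparkFiles

// Single file: unchanged behavior; addFile(path) now delegates to addFile(path, false).
sc.addFile("hdfs:///data/lookup/table.csv")

// Whole directory: only allowed with recursive = true; per the checks above, a local
// ("file:") directory would additionally require the context to be running in local mode.
sc.addFile("hdfs:///data/lookup", recursive = true)

// On executors, added files are resolved by name through SparkFiles.
val paths = sc.parallelize(1 to 2).map(_ => SparkFiles.get("table.csv")).collect()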
@@ -1563,6 +1606,58 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli
   /** Register a new RDD, returning its RDD ID */
   private[spark] def newRddId(): Int = nextRddId.getAndIncrement()
 
+  /**
+   * Registers listeners specified in spark.extraListeners, then starts the listener bus.
+   * This should be called after all internal listeners have been registered with the listener bus
+   * (e.g. after the web UI and event logging listeners have been registered).
+   */
+  private def setupAndStartListenerBus(): Unit = {
+    // Use reflection to instantiate listeners specified via `spark.extraListeners`
+    try {
+      val listenerClassNames: Seq[String] =
+        conf.get("spark.extraListeners", "").split(',').map(_.trim).filter(_ != "")
+      for (className <- listenerClassNames) {
+        // Use reflection to find the right constructor
+        val constructors = {
+          val listenerClass = Class.forName(className)
+          listenerClass.getConstructors.asInstanceOf[Array[Constructor[_ <: SparkListener]]]
+        }
+        val constructorTakingSparkConf = constructors.find { c =>
+          c.getParameterTypes.sameElements(Array(classOf[SparkConf]))
+        }
+        lazy val zeroArgumentConstructor = constructors.find { c =>
+          c.getParameterTypes.isEmpty
+        }
+        val listener: SparkListener = {
+          if (constructorTakingSparkConf.isDefined) {
+            constructorTakingSparkConf.get.newInstance(conf)
+          } else if (zeroArgumentConstructor.isDefined) {
+            zeroArgumentConstructor.get.newInstance()
+          } else {
+            throw new SparkException(
+              s"$className did not have a zero-argument constructor or a" +
+              " single-argument constructor that accepts SparkConf. Note: if the class is" +
+              " defined inside of another Scala class, then its constructors may accept an" +
+              " implicit parameter that references the enclosing class; in this case, you must" +
+              " define the listener as a top-level class in order to prevent this extra" +
+              " parameter from breaking Spark's ability to find a valid constructor.")
+          }
+        }
+        listenerBus.addListener(listener)
+        logInfo(s"Registered listener $className")
+      }
+    } catch {
+      case e: Exception =>
+        try {
+          stop()
+        } finally {
+          throw new SparkException(s"Exception when registering SparkListener", e)
+        }
+    }
+
+    listenerBus.start()
+  }
+
   /** Post the application start event */
   private def postApplicationStart() {
     // Note: this code assumes that the task scheduler has been initialized and has contacted
@@ -1582,8 +1677,8 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli
     val schedulingMode = getSchedulingMode.toString
     val addedJarPaths = addedJars.keys.toSeq
     val addedFilePaths = addedFiles.keys.toSeq
-    val environmentDetails =
-      SparkEnv.environmentDetails(conf, schedulingMode, addedJarPaths, addedFilePaths)
+    val environmentDetails = SparkEnv.environmentDetails(conf, schedulingMode, addedJarPaths,
+      addedFilePaths)
     val environmentUpdate = SparkListenerEnvironmentUpdate(environmentDetails)
     listenerBus.post(environmentUpdate)
   }
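
For illustration only (not part of this diff): a small end-to-end sketch of the spark.extraListeners path added above. The class and application names here are made up; the listener must be a top-level class with either a zero-argument constructor or a single SparkConf constructor, which is exactly what the reflection code looks for.

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.scheduler.{SparkListener, SparkListenerJobEnd}

// Hypothetical user-defined listener; its SparkConf constructor matches constructorTakingSparkConf.
class JobEndLogger(conf: SparkConf) extends SparkListener {
  override def onJobEnd(jobEnd: SparkListenerJobEnd): Unit = {
    println(s"Job ${jobEnd.jobId} finished")
  }
}

object ExtraListenerDemo {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setAppName("extra-listener-demo")
      .setMaster("local[2]")
      .set("spark.extraListeners", "JobEndLogger") // comma-separated fully qualified class names
    val sc = new SparkContext(conf)
    sc.parallelize(1 to 10).count()
    sc.stop()
  }
}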

core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala

Lines changed: 16 additions & 1 deletion
@@ -21,7 +21,7 @@ import java.lang.reflect.Method
 import java.security.PrivilegedExceptionAction
 
 import org.apache.hadoop.conf.Configuration
-import org.apache.hadoop.fs.{FileSystem, Path}
+import org.apache.hadoop.fs.{FileStatus, FileSystem, Path}
 import org.apache.hadoop.fs.FileSystem.Statistics
 import org.apache.hadoop.mapred.JobConf
 import org.apache.hadoop.mapreduce.{JobContext, TaskAttemptContext}
@@ -191,6 +191,21 @@ class SparkHadoopUtil extends Logging {
     val method = context.getClass.getMethod("getConfiguration")
     method.invoke(context).asInstanceOf[Configuration]
   }
+
+  /**
+   * Get [[FileStatus]] objects for all leaf children (files) under the given base path. If the
+   * given path points to a file, return a single-element collection containing [[FileStatus]] of
+   * that file.
+   */
+  def listLeafStatuses(fs: FileSystem, basePath: Path): Seq[FileStatus] = {
+    def recurse(path: Path) = {
+      val (directories, leaves) = fs.listStatus(path).partition(_.isDir)
+      leaves ++ directories.flatMap(f => listLeafStatuses(fs, f.getPath))
+    }
+
+    val baseStatus = fs.getFileStatus(basePath)
+    if (baseStatus.isDir) recurse(basePath) else Array(baseStatus)
+  }
 }
 
 object SparkHadoopUtil {
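
For illustration only (not part of this diff): a rough spark-shell style sketch of the new helper, assuming a hypothetical directory /data/input on the default filesystem.

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.spark.deploy.SparkHadoopUtil

val basePath = new Path("/data/input")               // hypothetical directory
val fs = basePath.getFileSystem(new Configuration())

// Every file under /data/input, however deeply nested; a plain file would yield a single entry.
val leaves = SparkHadoopUtil.get.listLeafStatuses(fs, basePath)
leaves.foreach(s => println(s"${s.getPath} (${s.getLen} bytes)"))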

core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala

Lines changed: 58 additions & 7 deletions
@@ -506,13 +506,64 @@
    * Get the level we can launch tasks according to delay scheduling, based on current wait time.
    */
   private def getAllowedLocalityLevel(curTime: Long): TaskLocality.TaskLocality = {
-    while (curTime - lastLaunchTime >= localityWaits(currentLocalityIndex) &&
-        currentLocalityIndex < myLocalityLevels.length - 1)
-    {
-      // Jump to the next locality level, and remove our waiting time for the current one since
-      // we don't want to count it again on the next one
-      lastLaunchTime += localityWaits(currentLocalityIndex)
-      currentLocalityIndex += 1
+    // Remove the scheduled or finished tasks lazily
+    def tasksNeedToBeScheduledFrom(pendingTaskIds: ArrayBuffer[Int]): Boolean = {
+      var indexOffset = pendingTaskIds.size
+      while (indexOffset > 0) {
+        indexOffset -= 1
+        val index = pendingTaskIds(indexOffset)
+        if (copiesRunning(index) == 0 && !successful(index)) {
+          return true
+        } else {
+          pendingTaskIds.remove(indexOffset)
+        }
+      }
+      false
+    }
+    // Walk through the list of tasks that can be scheduled at each location and returns true
+    // if there are any tasks that still need to be scheduled. Lazily cleans up tasks that have
+    // already been scheduled.
+    def moreTasksToRunIn(pendingTasks: HashMap[String, ArrayBuffer[Int]]): Boolean = {
+      val emptyKeys = new ArrayBuffer[String]
+      val hasTasks = pendingTasks.exists {
+        case (id: String, tasks: ArrayBuffer[Int]) =>
+          if (tasksNeedToBeScheduledFrom(tasks)) {
+            true
+          } else {
+            emptyKeys += id
+            false
+          }
+      }
+      // The key could be executorId, host or rackId
+      emptyKeys.foreach(id => pendingTasks.remove(id))
+      hasTasks
+    }
+
+    while (currentLocalityIndex < myLocalityLevels.length - 1) {
+      val moreTasks = myLocalityLevels(currentLocalityIndex) match {
+        case TaskLocality.PROCESS_LOCAL => moreTasksToRunIn(pendingTasksForExecutor)
+        case TaskLocality.NODE_LOCAL => moreTasksToRunIn(pendingTasksForHost)
+        case TaskLocality.NO_PREF => pendingTasksWithNoPrefs.nonEmpty
+        case TaskLocality.RACK_LOCAL => moreTasksToRunIn(pendingTasksForRack)
+      }
+      if (!moreTasks) {
+        // This is a performance optimization: if there are no more tasks that can
+        // be scheduled at a particular locality level, there is no point in waiting
+        // for the locality wait timeout (SPARK-4939).
+        lastLaunchTime = curTime
+        logDebug(s"No tasks for locality level ${myLocalityLevels(currentLocalityIndex)}, " +
+          s"so moving to locality level ${myLocalityLevels(currentLocalityIndex + 1)}")
+        currentLocalityIndex += 1
+      } else if (curTime - lastLaunchTime >= localityWaits(currentLocalityIndex)) {
+        // Jump to the next locality level, and reset lastLaunchTime so that the next locality
+        // wait timer doesn't immediately expire
+        lastLaunchTime += localityWaits(currentLocalityIndex)
+        currentLocalityIndex += 1
+        logDebug(s"Moving to ${myLocalityLevels(currentLocalityIndex)} after waiting for " +
+          s"${localityWaits(currentLocalityIndex)}ms")
+      } else {
+        return myLocalityLevels(currentLocalityIndex)
+      }
     }
     myLocalityLevels(currentLocalityIndex)
   }
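
The waits this loop compares against come from the locality configuration; for illustration only (not part of this diff), the relevant settings look like this, with purely illustrative millisecond values.

import org.apache.spark.SparkConf

// Illustrative values; spark.locality.wait feeds localityWaits(...) above, and the
// per-level keys override it for PROCESS_LOCAL, NODE_LOCAL and RACK_LOCAL respectively.
val conf = new SparkConf()
  .set("spark.locality.wait", "3000")
  .set("spark.locality.wait.process", "3000")
  .set("spark.locality.wait.node", "3000")
  .set("spark.locality.wait.rack", "3000")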

core/src/main/scala/org/apache/spark/util/ListenerBus.scala

Lines changed: 2 additions & 1 deletion
@@ -28,7 +28,8 @@ import org.apache.spark.Logging
  */
 private[spark] trait ListenerBus[L <: AnyRef, E] extends Logging {
 
-  private val listeners = new CopyOnWriteArrayList[L]
+  // Marked `private[spark]` for access in tests.
+  private[spark] val listeners = new CopyOnWriteArrayList[L]
 
   /**
    * Add a listener to listen events. This method is thread-safe and can be called in any thread.
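
For illustration only (not part of this diff): a rough sketch of the kind of test this visibility change enables. It assumes the suite lives under the org.apache.spark package, where private[spark] members are visible; the suite name is hypothetical.

package org.apache.spark

import org.apache.spark.scheduler.SparkListener
import org.scalatest.FunSuite

class ListenerBusVisibilitySuite extends FunSuite {
  test("listeners added to the bus are visible") {
    val sc = new SparkContext("local", "listener-bus-visibility")
    try {
      val listener = new SparkListener {}
      sc.listenerBus.addListener(listener)
      // `listeners` is private[spark], so this assertion compiles only under org.apache.spark.
      assert(sc.listenerBus.listeners.contains(listener))
    } finally {
      sc.stop()
    }
  }
}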
