
Commit afc2fa1

Merge remote-tracking branch 'upstream/master' into option_partfile_merge

Conflicts:
    sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala

2 parents: d4ed7e6 + 945d8bc

157 files changed: +3732 −4004 lines


R/pkg/R/deserialize.R

Lines changed: 2 additions & 2 deletions
```diff
@@ -102,11 +102,11 @@ readList <- function(con) {
 
 readRaw <- function(con) {
   dataLen <- readInt(con)
-  data <- readBin(con, raw(), as.integer(dataLen), endian = "big")
+  readBin(con, raw(), as.integer(dataLen), endian = "big")
 }
 
 readRawLen <- function(con, dataLen) {
-  data <- readBin(con, raw(), as.integer(dataLen), endian = "big")
+  readBin(con, raw(), as.integer(dataLen), endian = "big")
 }
 
 readDeserialize <- function(con) {
```

R/pkg/R/sparkR.R

Lines changed: 0 additions & 3 deletions
```diff
@@ -104,16 +104,13 @@ sparkR.init <- function(
     return(get(".sparkRjsc", envir = .sparkREnv))
   }
 
-  sparkMem <- Sys.getenv("SPARK_MEM", "1024m")
   jars <- suppressWarnings(normalizePath(as.character(sparkJars)))
 
   # Classpath separator is ";" on Windows
   # URI needs four /// as from http://stackoverflow.com/a/18522792
   if (.Platform$OS.type == "unix") {
-    collapseChar <- ":"
     uriSep <- "//"
   } else {
-    collapseChar <- ";"
     uriSep <- "////"
   }
 
```

bin/pyspark

Lines changed: 1 addition & 1 deletion
```diff
@@ -82,4 +82,4 @@ fi
 
 export PYSPARK_DRIVER_PYTHON
 export PYSPARK_DRIVER_PYTHON_OPTS
-exec "$SPARK_HOME"/bin/spark-submit pyspark-shell-main "$@"
+exec "$SPARK_HOME"/bin/spark-submit pyspark-shell-main --name "PySparkShell" "$@"
```

bin/pyspark2.cmd

Lines changed: 1 addition & 1 deletion
```diff
@@ -35,4 +35,4 @@ set PYTHONPATH=%SPARK_HOME%\python\lib\py4j-0.8.2.1-src.zip;%PYTHONPATH%
 set OLD_PYTHONSTARTUP=%PYTHONSTARTUP%
 set PYTHONSTARTUP=%SPARK_HOME%\python\pyspark\shell.py
 
-call %SPARK_HOME%\bin\spark-submit2.cmd pyspark-shell-main %*
+call %SPARK_HOME%\bin\spark-submit2.cmd pyspark-shell-main --name "PySparkShell" %*
```

core/src/main/scala/org/apache/spark/Partitioner.scala

Lines changed: 1 addition & 1 deletion
```diff
@@ -56,7 +56,7 @@ object Partitioner {
    */
   def defaultPartitioner(rdd: RDD[_], others: RDD[_]*): Partitioner = {
     val bySize = (Seq(rdd) ++ others).sortBy(_.partitions.size).reverse
-    for (r <- bySize if r.partitioner.isDefined) {
+    for (r <- bySize if r.partitioner.isDefined && r.partitioner.get.numPartitions > 0) {
       return r.partitioner.get
     }
     if (rdd.context.conf.contains("spark.default.parallelism")) {
```

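The added guard keeps defaultPartitioner from reusing a partitioner that has zero partitions, such as one inherited from an empty RDD. Below is a minimal, self-contained sketch of that selection rule using simplified stand-in types (FakeRdd and a bare Partitioner trait), not Spark's real classes:

```scala
// Sketch only: simplified stand-ins for Spark's RDD/Partitioner, not the real API.
object DefaultPartitionerSketch extends App {
  trait Partitioner { def numPartitions: Int }
  case class HashPartitioner(numPartitions: Int) extends Partitioner
  case class FakeRdd(numPartitions: Int, partitioner: Option[Partitioner])

  def defaultPartitioner(rdds: Seq[FakeRdd], defaultParallelism: Int): Partitioner = {
    val bySize = rdds.sortBy(_.numPartitions).reverse
    // As in the fix: only reuse an existing partitioner if it has at least one partition.
    bySize.flatMap(_.partitioner).find(_.numPartitions > 0)
      .getOrElse(HashPartitioner(math.max(defaultParallelism, bySize.head.numPartitions)))
  }

  val empty = FakeRdd(0, Some(HashPartitioner(0))) // e.g. produced from an empty RDD
  val big   = FakeRdd(8, None)
  // Without the numPartitions > 0 check, the useless 0-partition partitioner would win here.
  println(defaultPartitioner(Seq(empty, big), defaultParallelism = 4).numPartitions) // prints 8
}
```
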
core/src/main/scala/org/apache/spark/deploy/master/Master.scala

Lines changed: 79 additions & 37 deletions
```diff
@@ -541,6 +541,7 @@ private[master] class Master(
 
   /**
    * Schedule executors to be launched on the workers.
+   * Returns an array containing number of cores assigned to each worker.
    *
    * There are two modes of launching executors. The first attempts to spread out an application's
    * executors on as many workers as possible, while the second does the opposite (i.e. launch them
@@ -551,59 +552,100 @@ private[master] class Master(
    * multiple executors from the same application may be launched on the same worker if the worker
    * has enough cores and memory. Otherwise, each executor grabs all the cores available on the
    * worker by default, in which case only one executor may be launched on each worker.
+   *
+   * It is important to allocate coresPerExecutor on each worker at a time (instead of 1 core
+   * at a time). Consider the following example: cluster has 4 workers with 16 cores each.
+   * User requests 3 executors (spark.cores.max = 48, spark.executor.cores = 16). If 1 core is
+   * allocated at a time, 12 cores from each worker would be assigned to each executor.
+   * Since 12 < 16, no executors would launch [SPARK-8881].
    */
-  private def startExecutorsOnWorkers(): Unit = {
-    // Right now this is a very simple FIFO scheduler. We keep trying to fit in the first app
-    // in the queue, then the second app, etc.
-    if (spreadOutApps) {
-      // Try to spread out each app among all the workers, until it has all its cores
-      for (app <- waitingApps if app.coresLeft > 0) {
-        val usableWorkers = workers.toArray.filter(_.state == WorkerState.ALIVE)
-          .filter(worker => worker.memoryFree >= app.desc.memoryPerExecutorMB &&
-            worker.coresFree >= app.desc.coresPerExecutor.getOrElse(1))
-          .sortBy(_.coresFree).reverse
-        val numUsable = usableWorkers.length
-        val assigned = new Array[Int](numUsable) // Number of cores to give on each node
-        var toAssign = math.min(app.coresLeft, usableWorkers.map(_.coresFree).sum)
-        var pos = 0
-        while (toAssign > 0) {
-          if (usableWorkers(pos).coresFree - assigned(pos) > 0) {
-            toAssign -= 1
-            assigned(pos) += 1
+  private def scheduleExecutorsOnWorkers(
+      app: ApplicationInfo,
+      usableWorkers: Array[WorkerInfo],
+      spreadOutApps: Boolean): Array[Int] = {
+    // If the number of cores per executor is not specified, then we can just schedule
+    // 1 core at a time since we expect a single executor to be launched on each worker
+    val coresPerExecutor = app.desc.coresPerExecutor.getOrElse(1)
+    val memoryPerExecutor = app.desc.memoryPerExecutorMB
+    val numUsable = usableWorkers.length
+    val assignedCores = new Array[Int](numUsable) // Number of cores to give to each worker
+    val assignedMemory = new Array[Int](numUsable) // Amount of memory to give to each worker
+    var coresToAssign = math.min(app.coresLeft, usableWorkers.map(_.coresFree).sum)
+    var freeWorkers = (0 until numUsable).toIndexedSeq
+
+    def canLaunchExecutor(pos: Int): Boolean = {
+      usableWorkers(pos).coresFree - assignedCores(pos) >= coresPerExecutor &&
+        usableWorkers(pos).memoryFree - assignedMemory(pos) >= memoryPerExecutor
+    }
+
+    while (coresToAssign >= coresPerExecutor && freeWorkers.nonEmpty) {
+      freeWorkers = freeWorkers.filter(canLaunchExecutor)
+      freeWorkers.foreach { pos =>
+        var keepScheduling = true
+        while (keepScheduling && canLaunchExecutor(pos) && coresToAssign >= coresPerExecutor) {
+          coresToAssign -= coresPerExecutor
+          assignedCores(pos) += coresPerExecutor
+          // If cores per executor is not set, we are assigning 1 core at a time
+          // without actually meaning to launch 1 executor for each core assigned
+          if (app.desc.coresPerExecutor.isDefined) {
+            assignedMemory(pos) += memoryPerExecutor
+          }
+
+          // Spreading out an application means spreading out its executors across as
+          // many workers as possible. If we are not spreading out, then we should keep
+          // scheduling executors on this worker until we use all of its resources.
+          // Otherwise, just move on to the next worker.
+          if (spreadOutApps) {
+            keepScheduling = false
           }
-          pos = (pos + 1) % numUsable
-        }
-        // Now that we've decided how many cores to give on each node, let's actually give them
-        for (pos <- 0 until numUsable if assigned(pos) > 0) {
-          allocateWorkerResourceToExecutors(app, assigned(pos), usableWorkers(pos))
         }
       }
-    } else {
-      // Pack each app into as few workers as possible until we've assigned all its cores
-      for (worker <- workers if worker.coresFree > 0 && worker.state == WorkerState.ALIVE) {
-        for (app <- waitingApps if app.coresLeft > 0) {
-          allocateWorkerResourceToExecutors(app, app.coresLeft, worker)
-        }
+    }
+    assignedCores
+  }
+
+  /**
+   * Schedule and launch executors on workers
+   */
+  private def startExecutorsOnWorkers(): Unit = {
+    // Right now this is a very simple FIFO scheduler. We keep trying to fit in the first app
+    // in the queue, then the second app, etc.
+    for (app <- waitingApps if app.coresLeft > 0) {
+      val coresPerExecutor: Option[Int] = app.desc.coresPerExecutor
+      // Filter out workers that don't have enough resources to launch an executor
+      val usableWorkers = workers.toArray.filter(_.state == WorkerState.ALIVE)
+        .filter(worker => worker.memoryFree >= app.desc.memoryPerExecutorMB &&
+          worker.coresFree >= coresPerExecutor.getOrElse(1))
+        .sortBy(_.coresFree).reverse
+      val assignedCores = scheduleExecutorsOnWorkers(app, usableWorkers, spreadOutApps)
+
+      // Now that we've decided how many cores to allocate on each worker, let's allocate them
+      for (pos <- 0 until usableWorkers.length if assignedCores(pos) > 0) {
+        allocateWorkerResourceToExecutors(
+          app, assignedCores(pos), coresPerExecutor, usableWorkers(pos))
       }
     }
   }
 
   /**
    * Allocate a worker's resources to one or more executors.
    * @param app the info of the application which the executors belong to
-   * @param coresToAllocate cores on this worker to be allocated to this application
+   * @param assignedCores number of cores on this worker for this application
+   * @param coresPerExecutor number of cores per executor
    * @param worker the worker info
    */
   private def allocateWorkerResourceToExecutors(
       app: ApplicationInfo,
-      coresToAllocate: Int,
+      assignedCores: Int,
+      coresPerExecutor: Option[Int],
       worker: WorkerInfo): Unit = {
-    val memoryPerExecutor = app.desc.memoryPerExecutorMB
-    val coresPerExecutor = app.desc.coresPerExecutor.getOrElse(coresToAllocate)
-    var coresLeft = coresToAllocate
-    while (coresLeft >= coresPerExecutor && worker.memoryFree >= memoryPerExecutor) {
-      val exec = app.addExecutor(worker, coresPerExecutor)
-      coresLeft -= coresPerExecutor
+    // If the number of cores per executor is specified, we divide the cores assigned
+    // to this worker evenly among the executors with no remainder.
+    // Otherwise, we launch a single executor that grabs all the assignedCores on this worker.
+    val numExecutors = coresPerExecutor.map { assignedCores / _ }.getOrElse(1)
+    val coresToAssign = coresPerExecutor.getOrElse(assignedCores)
+    for (i <- 1 to numExecutors) {
+      val exec = app.addExecutor(worker, coresToAssign)
       launchExecutor(worker, exec)
       app.state = ApplicationState.RUNNING
     }
```

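The allocation arithmetic described in the new scaladoc is easy to reproduce. The following self-contained sketch (illustrative only: it ignores memory, uses plain arrays, and is not the Master's real code) contrasts one-core-at-a-time allocation with whole-executor allocation for the SPARK-8881 scenario of 4 workers with 16 cores each and spark.executor.cores = 16:

```scala
// Simplified stand-in for the scheduler; fixed scenario from the scaladoc above.
object CoreAllocationSketch extends App {
  val workerCores = Array(16, 16, 16, 16) // free cores per worker
  val coresMax = 48                       // spark.cores.max
  val coresPerExecutor = 16               // spark.executor.cores

  // Old behaviour: hand out one core at a time, round-robin across workers.
  def oneCoreAtATime(): Array[Int] = {
    val assigned = Array.fill(workerCores.length)(0)
    var toAssign = math.min(coresMax, workerCores.sum)
    var pos = 0
    while (toAssign > 0) {
      if (workerCores(pos) - assigned(pos) > 0) {
        assigned(pos) += 1
        toAssign -= 1
      }
      pos = (pos + 1) % workerCores.length
    }
    assigned
  }

  // New behaviour: hand out whole executors (coresPerExecutor cores) at a time.
  def executorAtATime(): Array[Int] = {
    val assigned = Array.fill(workerCores.length)(0)
    var toAssign = math.min(coresMax, workerCores.sum)
    var pos = 0
    while (toAssign >= coresPerExecutor) {
      if (workerCores(pos) - assigned(pos) >= coresPerExecutor) {
        assigned(pos) += coresPerExecutor
        toAssign -= coresPerExecutor
      }
      pos = (pos + 1) % workerCores.length
    }
    assigned
  }

  // 12 cores per worker: no worker reaches the 16 cores a single executor needs.
  println(oneCoreAtATime().mkString(", "))   // 12, 12, 12, 12
  // 16 cores on three workers: exactly the three requested executors can launch.
  println(executorAtATime().mkString(", "))  // 16, 16, 16, 0
}
```
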
core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala

Lines changed: 24 additions & 13 deletions
```diff
@@ -128,7 +128,7 @@ class NewHadoopRDD[K, V](
           configurable.setConf(conf)
         case _ =>
       }
-      val reader = format.createRecordReader(
+      private var reader = format.createRecordReader(
         split.serializableHadoopSplit.value, hadoopAttemptContext)
       reader.initialize(split.serializableHadoopSplit.value, hadoopAttemptContext)
 
@@ -141,6 +141,12 @@ class NewHadoopRDD[K, V](
       override def hasNext: Boolean = {
         if (!finished && !havePair) {
           finished = !reader.nextKeyValue
+          if (finished) {
+            // Close and release the reader here; close() will also be called when the task
+            // completes, but for tasks that read from many files, it helps to release the
+            // resources early.
+            close()
+          }
           havePair = !finished
         }
         !finished
@@ -159,18 +165,23 @@ class NewHadoopRDD[K, V](
 
       private def close() {
         try {
-          reader.close()
-          if (bytesReadCallback.isDefined) {
-            inputMetrics.updateBytesRead()
-          } else if (split.serializableHadoopSplit.value.isInstanceOf[FileSplit] ||
-              split.serializableHadoopSplit.value.isInstanceOf[CombineFileSplit]) {
-            // If we can't get the bytes read from the FS stats, fall back to the split size,
-            // which may be inaccurate.
-            try {
-              inputMetrics.incBytesRead(split.serializableHadoopSplit.value.getLength)
-            } catch {
-              case e: java.io.IOException =>
-                logWarning("Unable to get input size to set InputMetrics for task", e)
+          if (reader != null) {
+            // Close reader and release it
+            reader.close()
+            reader = null
+
+            if (bytesReadCallback.isDefined) {
+              inputMetrics.updateBytesRead()
+            } else if (split.serializableHadoopSplit.value.isInstanceOf[FileSplit] ||
+                split.serializableHadoopSplit.value.isInstanceOf[CombineFileSplit]) {
+              // If we can't get the bytes read from the FS stats, fall back to the split size,
+              // which may be inaccurate.
+              try {
+                inputMetrics.incBytesRead(split.serializableHadoopSplit.value.getLength)
+              } catch {
+                case e: java.io.IOException =>
+                  logWarning("Unable to get input size to set InputMetrics for task", e)
+              }
             }
           }
         } catch {
```

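The early-close pattern added to hasNext generalizes beyond Hadoop record readers. A minimal, hypothetical sketch (EagerlyClosingIterator is not a Spark class) of an iterator that releases its underlying resource as soon as the source is exhausted, instead of waiting for the task-completion hook:

```scala
import java.io.Closeable

// Sketch only: wraps any iterator backed by a Closeable resource and closes the
// resource the moment the source reports it has no more elements.
class EagerlyClosingIterator[T](source: Iterator[T], resource: Closeable) extends Iterator[T] {
  private var closed = false

  private def closeIfNeeded(): Unit = {
    if (!closed) {
      closed = true
      resource.close() // release file handles / buffers early
    }
  }

  override def hasNext: Boolean = {
    val more = source.hasNext
    if (!more) closeIfNeeded() // mirrors the early close() call added to hasNext above
    more
  }

  override def next(): T = source.next()
}
```
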
core/src/main/scala/org/apache/spark/ui/WebUI.scala

Lines changed: 19 additions & 0 deletions
```diff
@@ -107,6 +107,25 @@ private[spark] abstract class WebUI(
     }
   }
 
+  /**
+   * Add a handler for static content.
+   *
+   * @param resourceBase Root of where to find resources to serve.
+   * @param path Path in UI where to mount the resources.
+   */
+  def addStaticHandler(resourceBase: String, path: String): Unit = {
+    attachHandler(JettyUtils.createStaticHandler(resourceBase, path))
+  }
+
+  /**
+   * Remove a static content handler.
+   *
+   * @param path Path in UI to unmount.
+   */
+  def removeStaticHandler(path: String): Unit = {
+    handlers.find(_.getContextPath() == path).foreach(detachHandler)
+  }
+
   /** Initialize all components of the server. */
   def initialize()
 
```

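The new helpers wrap a simple attach/detach-by-context-path pattern. Here is a self-contained sketch of that pattern with stand-in types; Handler below is only a placeholder for Jetty's ServletContextHandler, and the registry is not Spark's actual WebUI:

```scala
import scala.collection.mutable.ArrayBuffer

// Sketch only: models mounting static content under a path and unmounting it by path.
final case class Handler(contextPath: String, resourceBase: String)

class StaticHandlerRegistry {
  private val handlers = ArrayBuffer.empty[Handler]

  def addStaticHandler(resourceBase: String, path: String): Unit =
    handlers += Handler(path, resourceBase)

  def removeStaticHandler(path: String): Unit =
    handlers.find(_.contextPath == path).foreach(handlers -= _)

  def mountedPaths: Seq[String] = handlers.map(_.contextPath).toSeq
}

// Usage: mount static files for a hypothetical "/logs" page, then unmount them.
object StaticHandlerDemo extends App {
  val registry = new StaticHandlerRegistry
  registry.addStaticHandler("/tmp/spark-logs", "/logs")
  println(registry.mountedPaths)   // List(/logs)
  registry.removeStaticHandler("/logs")
  println(registry.mountedPaths)   // List()
}
```
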
core/src/main/scala/org/apache/spark/util/Utils.scala

Lines changed: 3 additions & 2 deletions
```diff
@@ -443,18 +443,19 @@ private[spark] object Utils extends Logging {
       val lockFileName = s"${url.hashCode}${timestamp}_lock"
       val localDir = new File(getLocalDir(conf))
       val lockFile = new File(localDir, lockFileName)
-      val raf = new RandomAccessFile(lockFile, "rw")
+      val lockFileChannel = new RandomAccessFile(lockFile, "rw").getChannel()
       // Only one executor entry.
       // The FileLock is only used to control synchronization for executors download file,
       // it's always safe regardless of lock type (mandatory or advisory).
-      val lock = raf.getChannel().lock()
+      val lock = lockFileChannel.lock()
       val cachedFile = new File(localDir, cachedFileName)
       try {
         if (!cachedFile.exists()) {
           doFetchFile(url, localDir, cachedFileName, conf, securityMgr, hadoopConf)
         }
       } finally {
         lock.release()
+        lockFileChannel.close()
       }
       copyFile(
         url,
```

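Besides releasing the FileLock, the change now also closes the lock file's channel so the file descriptor is not leaked. A minimal sketch of that acquire/release pattern in isolation (withFileLock is a hypothetical helper, not part of Spark's Utils):

```scala
import java.io.{File, RandomAccessFile}

// Sketch only: take an exclusive lock on a lock file while doing work, then release
// the lock and close the channel in a finally block.
object FileLockSketch {
  def withFileLock[T](lockFile: File)(doWork: => T): T = {
    val lockFileChannel = new RandomAccessFile(lockFile, "rw").getChannel()
    val lock = lockFileChannel.lock() // blocks until the exclusive lock is acquired
    try {
      doWork
    } finally {
      lock.release()
      lockFileChannel.close() // the step the change adds: don't leak the descriptor
    }
  }
}
```
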
core/src/test/scala/org/apache/spark/PartitioningSuite.scala

Lines changed: 5 additions & 5 deletions
```diff
@@ -91,13 +91,13 @@ class PartitioningSuite extends SparkFunSuite with SharedSparkContext with Priva
 
   test("RangePartitioner for keys that are not Comparable (but with Ordering)") {
     // Row does not extend Comparable, but has an implicit Ordering defined.
-    implicit object RowOrdering extends Ordering[Row] {
-      override def compare(x: Row, y: Row): Int = x.value - y.value
+    implicit object RowOrdering extends Ordering[Item] {
+      override def compare(x: Item, y: Item): Int = x.value - y.value
     }
 
-    val rdd = sc.parallelize(1 to 4500).map(x => (Row(x), Row(x)))
+    val rdd = sc.parallelize(1 to 4500).map(x => (Item(x), Item(x)))
     val partitioner = new RangePartitioner(1500, rdd)
-    partitioner.getPartition(Row(100))
+    partitioner.getPartition(Item(100))
   }
 
   test("RangPartitioner.sketch") {
@@ -252,4 +252,4 @@ class PartitioningSuite extends SparkFunSuite with SharedSparkContext with Priva
 }
 
 
-private sealed case class Row(value: Int)
+private sealed case class Item(value: Int)
```
