
Commit 4564bc5

Merge branch 'master' into linkMasterPage
# Conflicts:
#	project/MimaExcludes.scala

2 parents: 0192b37 + e679bc3

215 files changed: 3,561 additions and 4,771 deletions


common/network-common/src/main/java/org/apache/spark/network/util/TransportConf.java

Lines changed: 4 additions & 0 deletions
@@ -60,6 +60,10 @@ public TransportConf(String module, ConfigProvider conf) {
     SPARK_NETWORK_IO_LAZYFD_KEY = getConfKey("io.lazyFD");
   }
 
+  public int getInt(String name, int defaultValue) {
+    return conf.getInt(name, defaultValue);
+  }
+
   private String getConfKey(String suffix) {
     return "spark." + module + "." + suffix;
   }
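The new accessor lets callers read an integer setting with a fallback (see the spark.shuffle.service.index.cache.entries lookup in the next file). A minimal sketch of the lookup-with-default behaviour, using a plain Map as a hypothetical stand-in for Spark's ConfigProvider:

import java.util.HashMap;
import java.util.Map;

public class GetIntSketch {
  // Hypothetical helper mirroring the added TransportConf.getInt: fall back when the key is unset.
  static int getInt(Map<String, String> settings, String name, int defaultValue) {
    String value = settings.get(name);
    return value == null ? defaultValue : Integer.parseInt(value);
  }

  public static void main(String[] args) {
    Map<String, String> settings = new HashMap<>();
    System.out.println(getInt(settings, "spark.shuffle.service.index.cache.entries", 1024)); // 1024
    settings.put("spark.shuffle.service.index.cache.entries", "2048");
    System.out.println(getInt(settings, "spark.shuffle.service.index.cache.entries", 1024)); // 2048
  }
}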

common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/ExternalShuffleBlockResolver.java

Lines changed: 24 additions & 12 deletions
@@ -21,6 +21,7 @@
 import java.nio.charset.StandardCharsets;
 import java.util.*;
 import java.util.concurrent.ConcurrentMap;
+import java.util.concurrent.ExecutionException;
 import java.util.concurrent.Executor;
 import java.util.concurrent.Executors;
 
@@ -29,6 +30,9 @@
 import com.fasterxml.jackson.databind.ObjectMapper;
 import com.google.common.annotations.VisibleForTesting;
 import com.google.common.base.Objects;
+import com.google.common.cache.CacheBuilder;
+import com.google.common.cache.CacheLoader;
+import com.google.common.cache.LoadingCache;
 import com.google.common.collect.Maps;
 import org.fusesource.leveldbjni.JniDBFactory;
 import org.fusesource.leveldbjni.internal.NativeDB;
@@ -66,6 +70,12 @@ public class ExternalShuffleBlockResolver {
   @VisibleForTesting
   final ConcurrentMap<AppExecId, ExecutorShuffleInfo> executors;
 
+  /**
+   * Caches index file information so that we can avoid open/close the index files
+   * for each block fetch.
+   */
+  private final LoadingCache<File, ShuffleIndexInformation> shuffleIndexCache;
+
   // Single-threaded Java executor used to perform expensive recursive directory deletion.
   private final Executor directoryCleaner;
 
@@ -95,6 +105,15 @@ public ExternalShuffleBlockResolver(TransportConf conf, File registeredExecutorF
       Executor directoryCleaner) throws IOException {
     this.conf = conf;
     this.registeredExecutorFile = registeredExecutorFile;
+    int indexCacheEntries = conf.getInt("spark.shuffle.service.index.cache.entries", 1024);
+    CacheLoader<File, ShuffleIndexInformation> indexCacheLoader =
+        new CacheLoader<File, ShuffleIndexInformation>() {
+          public ShuffleIndexInformation load(File file) throws IOException {
+            return new ShuffleIndexInformation(file);
+          }
+        };
+    shuffleIndexCache = CacheBuilder.newBuilder()
+      .maximumSize(indexCacheEntries).build(indexCacheLoader);
     if (registeredExecutorFile != null) {
       Options options = new Options();
       options.createIfMissing(false);
@@ -265,24 +284,17 @@ private ManagedBuffer getSortBasedShuffleBlockData(
     File indexFile = getFile(executor.localDirs, executor.subDirsPerLocalDir,
       "shuffle_" + shuffleId + "_" + mapId + "_0.index");
 
-    DataInputStream in = null;
     try {
-      in = new DataInputStream(new FileInputStream(indexFile));
-      in.skipBytes(reduceId * 8);
-      long offset = in.readLong();
-      long nextOffset = in.readLong();
+      ShuffleIndexInformation shuffleIndexInformation = shuffleIndexCache.get(indexFile);
+      ShuffleIndexRecord shuffleIndexRecord = shuffleIndexInformation.getIndex(reduceId);
       return new FileSegmentManagedBuffer(
         conf,
         getFile(executor.localDirs, executor.subDirsPerLocalDir,
          "shuffle_" + shuffleId + "_" + mapId + "_0.data"),
-        offset,
-        nextOffset - offset);
-    } catch (IOException e) {
+        shuffleIndexRecord.getOffset(),
+        shuffleIndexRecord.getLength());
+    } catch (ExecutionException e) {
       throw new RuntimeException("Failed to open file: " + indexFile, e);
-    } finally {
-      if (in != null) {
-        JavaUtils.closeQuietly(in);
-      }
     }
   }
 
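The last hunk swaps per-fetch open/skip/read of the .index file for a lookup in a Guava LoadingCache keyed by File and bounded by spark.shuffle.service.index.cache.entries. A self-contained sketch of the same caching pattern, with a toy String loader standing in for ShuffleIndexInformation (only the cache size and the CacheBuilder/CacheLoader usage mirror the diff):

import java.util.concurrent.ExecutionException;

import com.google.common.cache.CacheBuilder;
import com.google.common.cache.CacheLoader;
import com.google.common.cache.LoadingCache;

public class IndexCacheSketch {
  public static void main(String[] args) throws ExecutionException {
    // The loader runs only on a cache miss; later get() calls for the same key are served
    // from memory until the size-bounded cache evicts the entry.
    CacheLoader<String, String> indexCacheLoader = new CacheLoader<String, String>() {
      @Override
      public String load(String indexFileName) {
        System.out.println("loading " + indexFileName);
        return "parsed index for " + indexFileName; // stands in for new ShuffleIndexInformation(file)
      }
    };
    LoadingCache<String, String> shuffleIndexCache =
      CacheBuilder.newBuilder().maximumSize(1024).build(indexCacheLoader);

    // First access loads, second is a hit. get() wraps loader failures in ExecutionException,
    // which is why the catch block above changes from IOException to ExecutionException.
    System.out.println(shuffleIndexCache.get("shuffle_0_1_0.index"));
    System.out.println(shuffleIndexCache.get("shuffle_0_1_0.index"));
  }
}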

common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/ShuffleIndexInformation.java

Lines changed: 58 additions & 0 deletions
@@ -0,0 +1,58 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.network.shuffle;
+
+import java.io.DataInputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.nio.LongBuffer;
+
+/**
+ * Keeps the index information for a particular map output
+ * as an in-memory LongBuffer.
+ */
+public class ShuffleIndexInformation {
+  /** offsets as long buffer */
+  private final LongBuffer offsets;
+
+  public ShuffleIndexInformation(File indexFile) throws IOException {
+    int size = (int)indexFile.length();
+    ByteBuffer buffer = ByteBuffer.allocate(size);
+    offsets = buffer.asLongBuffer();
+    DataInputStream dis = null;
+    try {
+      dis = new DataInputStream(new FileInputStream(indexFile));
+      dis.readFully(buffer.array());
+    } finally {
+      if (dis != null) {
+        dis.close();
+      }
+    }
+  }
+
+  /**
+   * Get index offset for a particular reducer.
+   */
+  public ShuffleIndexRecord getIndex(int reduceId) {
+    long offset = offsets.get(reduceId);
+    long nextOffset = offsets.get(reduceId + 1);
+    return new ShuffleIndexRecord(offset, nextOffset - offset);
+  }
+}
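The class assumes the .index file is nothing but a run of 8-byte longs: one cumulative offset per reducer plus a trailing end offset, so record i covers the byte range [offsets(i), offsets(i+1)) of the .data file. A small sketch of that layout, writing a throwaway index for three reducers and reading it back the way the constructor does (file name and offsets are made up):

import java.io.DataOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.LongBuffer;
import java.nio.file.Files;

public class IndexLayoutSketch {
  public static void main(String[] args) throws IOException {
    // An index for 3 reducers holds 4 longs: cumulative offsets into the .data file.
    long[] offsets = {0L, 100L, 250L, 400L};

    File indexFile = File.createTempFile("shuffle_0_0_0", ".index");
    try (DataOutputStream out = new DataOutputStream(new FileOutputStream(indexFile))) {
      for (long offset : offsets) {
        out.writeLong(offset); // big-endian, matching the default byte order of the ByteBuffer above
      }
    }

    // Same idea as the constructor: slurp the whole file and view it as longs.
    LongBuffer buf = ByteBuffer.wrap(Files.readAllBytes(indexFile.toPath())).asLongBuffer();
    int reduceId = 1;
    long offset = buf.get(reduceId);
    long length = buf.get(reduceId + 1) - offset;
    System.out.println("reducer " + reduceId + ": offset=" + offset + ", length=" + length); // 100, 150

    indexFile.delete();
  }
}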
common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/ShuffleIndexRecord.java

Lines changed: 40 additions & 0 deletions
@@ -0,0 +1,40 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.network.shuffle;
+
+/**
+ * Contains offset and length of the shuffle block data.
+ */
+public class ShuffleIndexRecord {
+  private final long offset;
+  private final long length;
+
+  public ShuffleIndexRecord(long offset, long length) {
+    this.offset = offset;
+    this.length = length;
+  }
+
+  public long getOffset() {
+    return offset;
+  }
+
+  public long getLength() {
+    return length;
+  }
+}
+

conf/metrics.properties.template

Lines changed: 1 addition & 0 deletions
@@ -93,6 +93,7 @@
 #   period    10        Poll period
 #   unit      seconds   Unit of the poll period
 #   ttl       1         TTL of messages sent by Ganglia
+#   dmax      0         Lifetime in seconds of metrics (0 never expired)
 #   mode      multicast Ganglia network mode ('unicast' or 'multicast')
 
 # org.apache.spark.metrics.sink.JmxSink

core/src/main/scala/org/apache/spark/SparkContext.scala

Lines changed: 16 additions & 20 deletions
@@ -21,7 +21,7 @@ import java.io._
 import java.lang.reflect.Constructor
 import java.net.URI
 import java.util.{Arrays, Locale, Properties, ServiceLoader, UUID}
-import java.util.concurrent.ConcurrentMap
+import java.util.concurrent.{ConcurrentHashMap, ConcurrentMap}
 import java.util.concurrent.atomic.{AtomicBoolean, AtomicInteger, AtomicReference}
 
 import scala.collection.JavaConverters._
@@ -262,8 +262,8 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli
   private[spark] def env: SparkEnv = _env
 
   // Used to store a URL for each static file/jar together with the file's local timestamp
-  private[spark] val addedFiles = HashMap[String, Long]()
-  private[spark] val addedJars = HashMap[String, Long]()
+  private[spark] val addedFiles = new ConcurrentHashMap[String, Long]().asScala
+  private[spark] val addedJars = new ConcurrentHashMap[String, Long]().asScala
 
   // Keeps track of all persisted RDDs
   private[spark] val persistentRdds = {
@@ -1430,14 +1430,14 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli
       schemeCorrectedPath
     }
     val timestamp = System.currentTimeMillis
-    addedFiles(key) = timestamp
-
-    // Fetch the file locally in case a job is executed using DAGScheduler.runLocally().
-    Utils.fetchFile(path, new File(SparkFiles.getRootDirectory()), conf, env.securityManager,
-      hadoopConfiguration, timestamp, useCache = false)
-
-    logInfo("Added file " + path + " at " + key + " with timestamp " + addedFiles(key))
-    postEnvironmentUpdate()
+    if (addedFiles.putIfAbsent(key, timestamp).isEmpty) {
+      logInfo(s"Added file $path at $key with timestamp $timestamp")
+      // Fetch the file locally so that closures which are run on the driver can still use the
+      // SparkFiles API to access files.
+      Utils.fetchFile(path, new File(SparkFiles.getRootDirectory()), conf, env.securityManager,
+        hadoopConfiguration, timestamp, useCache = false)
+      postEnvironmentUpdate()
+    }
   }
 
   /**
@@ -1705,12 +1705,6 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli
         case exc: FileNotFoundException =>
          logError(s"Jar not found at $path")
          null
-        case e: Exception =>
-          // For now just log an error but allow to go through so spark examples work.
-          // The spark examples don't really need the jar distributed since its also
-          // the app jar.
-          logError("Error adding jar (" + e + "), was the --addJars option used?")
-          null
       }
     }
     // A JAR file which exists locally on every worker node
@@ -1721,11 +1715,13 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli
         }
       }
       if (key != null) {
-        addedJars(key) = System.currentTimeMillis
-        logInfo("Added JAR " + path + " at " + key + " with timestamp " + addedJars(key))
+        val timestamp = System.currentTimeMillis
+        if (addedJars.putIfAbsent(key, timestamp).isEmpty) {
+          logInfo(s"Added JAR $path at $key with timestamp $timestamp")
+          postEnvironmentUpdate()
+        }
       }
     }
-    postEnvironmentUpdate()
   }
 
   /**
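The addFile/addJar hunks lean on the first-writer-wins contract of ConcurrentHashMap.putIfAbsent: only the caller that actually inserted the key performs the one-time work (logging, fetching the file, posting the environment update). The Scala view obtained via .asScala surfaces the same contract as an Option, hence the .isEmpty checks. A minimal Java sketch of that contract, with an illustrative key and timestamps:

import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;

public class PutIfAbsentSketch {
  public static void main(String[] args) {
    ConcurrentMap<String, Long> addedFiles = new ConcurrentHashMap<>();
    String key = "spark://driver:7077/files/data.txt"; // illustrative key, not a real URL

    // First registration: putIfAbsent returns null, so this caller does the one-time work.
    if (addedFiles.putIfAbsent(key, 1L) == null) {
      System.out.println("added " + key + "; fetch the file and post an environment update");
    }

    // Duplicate registration (addFile called twice with the same path): the existing value is
    // returned, the map is unchanged, and the side effects are skipped.
    if (addedFiles.putIfAbsent(key, 2L) == null) {
      System.out.println("not reached");
    }
    System.out.println("timestamp stays " + addedFiles.get(key)); // prints 1
  }
}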

core/src/main/scala/org/apache/spark/rdd/RDD.scala

Lines changed: 25 additions & 12 deletions
@@ -474,12 +474,17 @@ abstract class RDD[T: ClassTag](
   def sample(
       withReplacement: Boolean,
       fraction: Double,
-      seed: Long = Utils.random.nextLong): RDD[T] = withScope {
-    require(fraction >= 0.0, "Negative fraction value: " + fraction)
-    if (withReplacement) {
-      new PartitionwiseSampledRDD[T, T](this, new PoissonSampler[T](fraction), true, seed)
-    } else {
-      new PartitionwiseSampledRDD[T, T](this, new BernoulliSampler[T](fraction), true, seed)
+      seed: Long = Utils.random.nextLong): RDD[T] = {
+    require(fraction >= 0,
+      s"Fraction must be nonnegative, but got ${fraction}")
+
+    withScope {
+      require(fraction >= 0.0, "Negative fraction value: " + fraction)
+      if (withReplacement) {
+        new PartitionwiseSampledRDD[T, T](this, new PoissonSampler[T](fraction), true, seed)
+      } else {
+        new PartitionwiseSampledRDD[T, T](this, new BernoulliSampler[T](fraction), true, seed)
+      }
     }
   }
 
@@ -493,14 +498,22 @@ abstract class RDD[T: ClassTag](
    */
   def randomSplit(
       weights: Array[Double],
-      seed: Long = Utils.random.nextLong): Array[RDD[T]] = withScope {
-    val sum = weights.sum
-    val normalizedCumWeights = weights.map(_ / sum).scanLeft(0.0d)(_ + _)
-    normalizedCumWeights.sliding(2).map { x =>
-      randomSampleWithRange(x(0), x(1), seed)
-    }.toArray
+      seed: Long = Utils.random.nextLong): Array[RDD[T]] = {
+    require(weights.forall(_ >= 0),
+      s"Weights must be nonnegative, but got ${weights.mkString("[", ",", "]")}")
+    require(weights.sum > 0,
+      s"Sum of weights must be positive, but got ${weights.mkString("[", ",", "]")}")
+
+    withScope {
+      val sum = weights.sum
+      val normalizedCumWeights = weights.map(_ / sum).scanLeft(0.0d)(_ + _)
+      normalizedCumWeights.sliding(2).map { x =>
+        randomSampleWithRange(x(0), x(1), seed)
+      }.toArray
+    }
   }
 
+
   /**
    * Internal method exposed for Random Splits in DataFrames. Samples an RDD given a probability
    * range.
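randomSplit turns the weights into cumulative, normalized boundaries and samples each adjacent [lower, upper) pair; the new require calls reject exactly the inputs that would break that arithmetic (a negative weight, or a zero sum that would divide by zero). A small worked sketch of the boundary computation, with made-up weights:

import java.util.Arrays;

public class RandomSplitWeightsSketch {
  public static void main(String[] args) {
    double[] weights = {1.0, 2.0, 1.0}; // illustrative weights

    // Mirrors the added requires: nonnegative weights and a positive sum.
    double sum = Arrays.stream(weights).sum();
    if (Arrays.stream(weights).anyMatch(w -> w < 0) || sum <= 0) {
      throw new IllegalArgumentException("Weights must be nonnegative with a positive sum");
    }

    // Equivalent of weights.map(_ / sum).scanLeft(0.0)(_ + _): boundaries 0.0, 0.25, 0.75, 1.0.
    double[] bounds = new double[weights.length + 1];
    for (int i = 0; i < weights.length; i++) {
      bounds[i + 1] = bounds[i] + weights[i] / sum;
    }

    // Equivalent of sliding(2): each adjacent pair is one split's probability range.
    for (int i = 0; i < weights.length; i++) {
      System.out.printf("split %d samples the range [%.2f, %.2f)%n", i, bounds[i], bounds[i + 1]);
    }
  }
}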

core/src/main/scala/org/apache/spark/rpc/netty/Dispatcher.scala

Lines changed: 5 additions & 3 deletions
@@ -17,7 +17,7 @@
 
 package org.apache.spark.rpc.netty
 
-import java.util.concurrent.{ConcurrentHashMap, LinkedBlockingQueue, ThreadPoolExecutor, TimeUnit}
+import java.util.concurrent.{ConcurrentHashMap, ConcurrentMap, LinkedBlockingQueue, ThreadPoolExecutor, TimeUnit}
 import javax.annotation.concurrent.GuardedBy
 
 import scala.collection.JavaConverters._
@@ -42,8 +42,10 @@ private[netty] class Dispatcher(nettyEnv: NettyRpcEnv) extends Logging {
     val inbox = new Inbox(ref, endpoint)
   }
 
-  private val endpoints = new ConcurrentHashMap[String, EndpointData]
-  private val endpointRefs = new ConcurrentHashMap[RpcEndpoint, RpcEndpointRef]
+  private val endpoints: ConcurrentMap[String, EndpointData] =
+    new ConcurrentHashMap[String, EndpointData]
+  private val endpointRefs: ConcurrentMap[RpcEndpoint, RpcEndpointRef] =
+    new ConcurrentHashMap[RpcEndpoint, RpcEndpointRef]
 
   // Track the receivers whose inboxes may contain messages.
   private val receivers = new LinkedBlockingQueue[EndpointData]
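This hunk only widens the declared field types from the concrete ConcurrentHashMap to the ConcurrentMap interface; behaviour is unchanged, but callers can no longer rely on implementation-specific API. A trivial Java equivalent of the same declare-against-the-interface style (names are illustrative):

import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;

public class InterfaceTypedFields {
  // Declared against the interface; the concrete map stays an implementation detail that
  // could later be swapped for another ConcurrentMap without touching callers.
  private final ConcurrentMap<String, String> endpoints = new ConcurrentHashMap<>();

  public void register(String name, String endpointRef) {
    endpoints.put(name, endpointRef);
  }
}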

core/src/main/scala/org/apache/spark/rpc/netty/NettyStreamManager.scala

Lines changed: 8 additions & 4 deletions
@@ -66,14 +66,18 @@ private[netty] class NettyStreamManager(rpcEnv: NettyRpcEnv)
   }
 
   override def addFile(file: File): String = {
-    require(files.putIfAbsent(file.getName(), file) == null,
-      s"File ${file.getName()} already registered.")
+    val existingPath = files.putIfAbsent(file.getName, file)
+    require(existingPath == null || existingPath == file,
+      s"File ${file.getName} was already registered with a different path " +
+        s"(old path = $existingPath, new path = $file")
     s"${rpcEnv.address.toSparkURL}/files/${Utils.encodeFileNameToURIRawPath(file.getName())}"
   }
 
   override def addJar(file: File): String = {
-    require(jars.putIfAbsent(file.getName(), file) == null,
-      s"JAR ${file.getName()} already registered.")
+    val existingPath = jars.putIfAbsent(file.getName, file)
+    require(existingPath == null || existingPath == file,
+      s"File ${file.getName} was already registered with a different path " +
+        s"(old path = $existingPath, new path = $file")
     s"${rpcEnv.address.toSparkURL}/jars/${Utils.encodeFileNameToURIRawPath(file.getName())}"
   }
 
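Where the old code rejected any second registration outright, the new check tolerates re-registering the same File under the same name and only fails on a conflicting path. A minimal sketch of that "same value is a no-op, different value is an error" pattern, using Strings instead of Files:

import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;

public class IdempotentRegistrySketch {
  private final ConcurrentMap<String, String> files = new ConcurrentHashMap<>();

  // Registering the same name->path pair twice is harmless; a conflicting path fails fast.
  public void addFile(String name, String path) {
    String existing = files.putIfAbsent(name, path);
    if (existing != null && !existing.equals(path)) {
      throw new IllegalArgumentException("File " + name +
        " was already registered with a different path (old path = " + existing +
        ", new path = " + path + ")");
    }
  }

  public static void main(String[] args) {
    IdempotentRegistrySketch registry = new IdempotentRegistrySketch();
    registry.addFile("data.txt", "/tmp/a/data.txt");
    registry.addFile("data.txt", "/tmp/a/data.txt");    // allowed: same path
    // registry.addFile("data.txt", "/tmp/b/data.txt"); // would throw
  }
}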

core/src/main/scala/org/apache/spark/scheduler/Task.scala

Lines changed: 3 additions & 2 deletions
@@ -21,6 +21,7 @@ import java.io.{DataInputStream, DataOutputStream}
 import java.nio.ByteBuffer
 import java.util.Properties
 
+import scala.collection.mutable
 import scala.collection.mutable.HashMap
 
 import org.apache.spark._
@@ -198,8 +199,8 @@ private[spark] object Task {
    */
   def serializeWithDependencies(
       task: Task[_],
-      currentFiles: HashMap[String, Long],
-      currentJars: HashMap[String, Long],
+      currentFiles: mutable.Map[String, Long],
+      currentJars: mutable.Map[String, Long],
       serializer: SerializerInstance)
     : ByteBuffer = {
 
