
Commit 736f6ce

Merge branch 'master' into SPARK-16775-reduce-internal-warnings-from-deprecated-accumulator-api
2 parents: 46fa97d + e9fc0b6

File tree: 44 files changed (+832, -695 lines)


core/src/main/scala/org/apache/spark/SparkContext.scala

Lines changed: 16 additions & 20 deletions

@@ -21,7 +21,7 @@ import java.io._
 import java.lang.reflect.Constructor
 import java.net.URI
 import java.util.{Arrays, Locale, Properties, ServiceLoader, UUID}
-import java.util.concurrent.ConcurrentMap
+import java.util.concurrent.{ConcurrentHashMap, ConcurrentMap}
 import java.util.concurrent.atomic.{AtomicBoolean, AtomicInteger, AtomicReference}

 import scala.collection.JavaConverters._
@@ -262,8 +262,8 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli
   private[spark] def env: SparkEnv = _env

   // Used to store a URL for each static file/jar together with the file's local timestamp
-  private[spark] val addedFiles = HashMap[String, Long]()
-  private[spark] val addedJars = HashMap[String, Long]()
+  private[spark] val addedFiles = new ConcurrentHashMap[String, Long]().asScala
+  private[spark] val addedJars = new ConcurrentHashMap[String, Long]().asScala

   // Keeps track of all persisted RDDs
   private[spark] val persistentRdds = {
@@ -1430,14 +1430,14 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli
       schemeCorrectedPath
     }
     val timestamp = System.currentTimeMillis
-    addedFiles(key) = timestamp
-
-    // Fetch the file locally in case a job is executed using DAGScheduler.runLocally().
-    Utils.fetchFile(path, new File(SparkFiles.getRootDirectory()), conf, env.securityManager,
-      hadoopConfiguration, timestamp, useCache = false)
-
-    logInfo("Added file " + path + " at " + key + " with timestamp " + addedFiles(key))
-    postEnvironmentUpdate()
+    if (addedFiles.putIfAbsent(key, timestamp).isEmpty) {
+      logInfo(s"Added file $path at $key with timestamp $timestamp")
+      // Fetch the file locally so that closures which are run on the driver can still use the
+      // SparkFiles API to access files.
+      Utils.fetchFile(path, new File(SparkFiles.getRootDirectory()), conf, env.securityManager,
+        hadoopConfiguration, timestamp, useCache = false)
+      postEnvironmentUpdate()
+    }
   }

   /**
@@ -1705,12 +1705,6 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli
             case exc: FileNotFoundException =>
              logError(s"Jar not found at $path")
              null
-            case e: Exception =>
-              // For now just log an error but allow to go through so spark examples work.
-              // The spark examples don't really need the jar distributed since its also
-              // the app jar.
-              logError("Error adding jar (" + e + "), was the --addJars option used?")
-              null
          }
        }
        // A JAR file which exists locally on every worker node
@@ -1721,11 +1715,13 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli
        }
      }
      if (key != null) {
-        addedJars(key) = System.currentTimeMillis
-        logInfo("Added JAR " + path + " at " + key + " with timestamp " + addedJars(key))
+        val timestamp = System.currentTimeMillis
+        if (addedJars.putIfAbsent(key, timestamp).isEmpty) {
+          logInfo(s"Added JAR $path at $key with timestamp $timestamp")
+          postEnvironmentUpdate()
+        }
      }
    }
-    postEnvironmentUpdate()
  }

  /**
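The new addFile/addJar bodies lean on the Scala view of a ConcurrentHashMap: the .asScala wrapper is a scala.collection.concurrent.Map whose putIfAbsent returns an Option, and None means this call inserted the key, so only the first registration performs the fetch and the environment update; later calls with the same path become no-ops. A minimal sketch of that pattern, with illustrative names rather than SparkContext's real fields:

import java.util.concurrent.ConcurrentHashMap

import scala.collection.JavaConverters._

object IdempotentRegistration {
  // The .asScala view is a scala.collection.concurrent.Map, whose
  // putIfAbsent returns Option[V]: None means this call inserted the key.
  private val addedFiles = new ConcurrentHashMap[String, Long]().asScala

  def addFile(key: String): Boolean = {
    val timestamp = System.currentTimeMillis
    if (addedFiles.putIfAbsent(key, timestamp).isEmpty) {
      // First registration: do the one-time work (fetch, env update) here.
      println(s"Added file $key with timestamp $timestamp")
      true
    } else {
      false // Already registered: the repeated call is a harmless no-op.
    }
  }
}

Swapping the plain mutable HashMap for this concurrent view also appears intended to make concurrent addFile/addJar calls safe on the map itself.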

core/src/main/scala/org/apache/spark/rpc/netty/Dispatcher.scala

Lines changed: 5 additions & 3 deletions

@@ -17,7 +17,7 @@

 package org.apache.spark.rpc.netty

-import java.util.concurrent.{ConcurrentHashMap, LinkedBlockingQueue, ThreadPoolExecutor, TimeUnit}
+import java.util.concurrent.{ConcurrentHashMap, ConcurrentMap, LinkedBlockingQueue, ThreadPoolExecutor, TimeUnit}
 import javax.annotation.concurrent.GuardedBy

 import scala.collection.JavaConverters._
@@ -42,8 +42,10 @@ private[netty] class Dispatcher(nettyEnv: NettyRpcEnv) extends Logging {
     val inbox = new Inbox(ref, endpoint)
   }

-  private val endpoints = new ConcurrentHashMap[String, EndpointData]
-  private val endpointRefs = new ConcurrentHashMap[RpcEndpoint, RpcEndpointRef]
+  private val endpoints: ConcurrentMap[String, EndpointData] =
+    new ConcurrentHashMap[String, EndpointData]
+  private val endpointRefs: ConcurrentMap[RpcEndpoint, RpcEndpointRef] =
+    new ConcurrentHashMap[RpcEndpoint, RpcEndpointRef]

   // Track the receivers whose inboxes may contain messages.
   private val receivers = new LinkedBlockingQueue[EndpointData]
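The Dispatcher change only widens the declared field types to the java.util.concurrent.ConcurrentMap interface instead of the concrete ConcurrentHashMap. A small sketch of the same idiom, with illustrative names:

import java.util.concurrent.{ConcurrentHashMap, ConcurrentMap}

class Registry {
  // Declaring against the interface keeps callers decoupled from the concrete
  // map class while still exposing atomic operations such as putIfAbsent.
  private val entries: ConcurrentMap[String, String] =
    new ConcurrentHashMap[String, String]

  def registerOnce(name: String, value: String): Boolean =
    entries.putIfAbsent(name, value) == null
}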

core/src/main/scala/org/apache/spark/rpc/netty/NettyStreamManager.scala

Lines changed: 8 additions & 4 deletions

@@ -66,14 +66,18 @@ private[netty] class NettyStreamManager(rpcEnv: NettyRpcEnv)
   }

   override def addFile(file: File): String = {
-    require(files.putIfAbsent(file.getName(), file) == null,
-      s"File ${file.getName()} already registered.")
+    val existingPath = files.putIfAbsent(file.getName, file)
+    require(existingPath == null || existingPath == file,
+      s"File ${file.getName} was already registered with a different path " +
+        s"(old path = $existingPath, new path = $file")
     s"${rpcEnv.address.toSparkURL}/files/${Utils.encodeFileNameToURIRawPath(file.getName())}"
   }

   override def addJar(file: File): String = {
-    require(jars.putIfAbsent(file.getName(), file) == null,
-      s"JAR ${file.getName()} already registered.")
+    val existingPath = jars.putIfAbsent(file.getName, file)
+    require(existingPath == null || existingPath == file,
+      s"File ${file.getName} was already registered with a different path " +
+        s"(old path = $existingPath, new path = $file")
     s"${rpcEnv.address.toSparkURL}/jars/${Utils.encodeFileNameToURIRawPath(file.getName())}"
   }

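The relaxed require now tolerates re-adding the same File while still rejecting a different path that collides on the file name: java.util.concurrent.ConcurrentHashMap.putIfAbsent returns null on the first insertion and the previously stored value afterwards. A sketch of the check in isolation (illustrative names, not the NettyStreamManager fields):

import java.io.File
import java.util.concurrent.ConcurrentHashMap

object FileRegistry {
  private val files = new ConcurrentHashMap[String, File]()

  def add(file: File): Unit = {
    // null means this call registered the name; a non-null previous value is
    // only acceptable when it points at the very same path.
    val existing = files.putIfAbsent(file.getName, file)
    require(existing == null || existing == file,
      s"File ${file.getName} was already registered with a different path " +
        s"(old path = $existing, new path = $file)")
  }
}
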
core/src/main/scala/org/apache/spark/scheduler/Task.scala

Lines changed: 3 additions & 2 deletions

@@ -21,6 +21,7 @@ import java.io.{DataInputStream, DataOutputStream}
 import java.nio.ByteBuffer
 import java.util.Properties

+import scala.collection.mutable
 import scala.collection.mutable.HashMap

 import org.apache.spark._
@@ -198,8 +199,8 @@ private[spark] object Task {
   */
  def serializeWithDependencies(
      task: Task[_],
-      currentFiles: HashMap[String, Long],
-      currentJars: HashMap[String, Long],
+      currentFiles: mutable.Map[String, Long],
+      currentJars: mutable.Map[String, Long],
      serializer: SerializerInstance)
    : ByteBuffer = {

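serializeWithDependencies now accepts scala.collection.mutable.Map instead of mutable.HashMap, so both a plain HashMap and the .asScala view of a ConcurrentHashMap (what SparkContext now stores) satisfy the signature. A hedged sketch of why the looser type matters; serializeDeps stands in for the real method:

import java.util.concurrent.ConcurrentHashMap

import scala.collection.JavaConverters._
import scala.collection.mutable

object SignatureSketch {
  // Taking mutable.Map rather than mutable.HashMap lets both callers below compile.
  def serializeDeps(currentFiles: mutable.Map[String, Long]): Int = currentFiles.size

  def main(args: Array[String]): Unit = {
    val plainMap = mutable.HashMap("a.jar" -> 1L)
    val concurrentView = new ConcurrentHashMap[String, Long]().asScala
    concurrentView.put("b.jar", 2L)
    println(serializeDeps(plainMap))       // mutable.HashMap is a mutable.Map
    println(serializeDeps(concurrentView)) // concurrent.Map also extends mutable.Map
  }
}
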
core/src/test/scala/org/apache/spark/SparkContextSuite.scala

Lines changed: 51 additions & 0 deletions

@@ -216,6 +216,57 @@ class SparkContextSuite extends SparkFunSuite with LocalSparkContext {
     }
   }

+  test("cannot call addFile with different paths that have the same filename") {
+    val dir = Utils.createTempDir()
+    try {
+      val subdir1 = new File(dir, "subdir1")
+      val subdir2 = new File(dir, "subdir2")
+      assert(subdir1.mkdir())
+      assert(subdir2.mkdir())
+      val file1 = new File(subdir1, "file")
+      val file2 = new File(subdir2, "file")
+      Files.write("old", file1, StandardCharsets.UTF_8)
+      Files.write("new", file2, StandardCharsets.UTF_8)
+      sc = new SparkContext("local-cluster[1,1,1024]", "test")
+      sc.addFile(file1.getAbsolutePath)
+      def getAddedFileContents(): String = {
+        sc.parallelize(Seq(0)).map { _ =>
+          scala.io.Source.fromFile(SparkFiles.get("file")).mkString
+        }.first()
+      }
+      assert(getAddedFileContents() === "old")
+      intercept[IllegalArgumentException] {
+        sc.addFile(file2.getAbsolutePath)
+      }
+      assert(getAddedFileContents() === "old")
+    } finally {
+      Utils.deleteRecursively(dir)
+    }
+  }
+
+  // Regression tests for SPARK-16787
+  for (
+    schedulingMode <- Seq("local-mode", "non-local-mode");
+    method <- Seq("addJar", "addFile")
+  ) {
+    val jarPath = Thread.currentThread().getContextClassLoader.getResource("TestUDTF.jar").toString
+    val master = schedulingMode match {
+      case "local-mode" => "local"
+      case "non-local-mode" => "local-cluster[1,1,1024]"
+    }
+    test(s"$method can be called twice with same file in $schedulingMode (SPARK-16787)") {
+      sc = new SparkContext(master, "test")
+      method match {
+        case "addJar" =>
+          sc.addJar(jarPath)
+          sc.addJar(jarPath)
+        case "addFile" =>
+          sc.addFile(jarPath)
+          sc.addFile(jarPath)
+      }
+    }
+  }
+
   test("Cancelling job group should not cause SparkContext to shutdown (SPARK-6414)") {
     try {
       sc = new SparkContext(new SparkConf().setAppName("test").setMaster("local"))

docs/js/api-docs.js

Lines changed: 20 additions & 0 deletions

@@ -41,3 +41,23 @@ function addBadges(allAnnotations, name, tag, html) {
     .add(annotations.closest("div.fullcomment").prevAll("h4.signature"))
     .prepend(html);
 }
+
+$(document).ready(function() {
+  var script = document.createElement('script');
+  script.type = 'text/javascript';
+  script.async = true;
+  script.onload = function(){
+    MathJax.Hub.Config({
+      displayAlign: "left",
+      tex2jax: {
+        inlineMath: [ ["$", "$"], ["\\\\(","\\\\)"] ],
+        displayMath: [ ["$$","$$"], ["\\[", "\\]"] ],
+        processEscapes: true,
+        skipTags: ['script', 'noscript', 'style', 'textarea', 'pre', 'a']
+      }
+    });
+  };
+  script.src = ('https:' == document.location.protocol ? 'https://' : 'http://') +
+    'cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML';
+  document.getElementsByTagName('head')[0].appendChild(script);
+});

docs/sql-programming-guide.md

Lines changed: 14 additions & 42 deletions

@@ -132,7 +132,7 @@ from a Hive table, or from [Spark data sources](#data-sources).

 As an example, the following creates a DataFrame based on the content of a JSON file:

-{% include_example create_DataFrames r/RSparkSQLExample.R %}
+{% include_example create_df r/RSparkSQLExample.R %}

 </div>
 </div>
@@ -180,7 +180,7 @@ In addition to simple column references and expressions, DataFrames also have a

 <div data-lang="r" markdown="1">

-{% include_example dataframe_operations r/RSparkSQLExample.R %}
+{% include_example untyped_ops r/RSparkSQLExample.R %}

 For a complete list of the types of operations that can be performed on a DataFrame refer to the [API Documentation](api/R/index.html).

@@ -214,7 +214,7 @@ The `sql` function on a `SparkSession` enables applications to run SQL queries p
 <div data-lang="r" markdown="1">
 The `sql` function enables applications to run SQL queries programmatically and returns the result as a `SparkDataFrame`.

-{% include_example sql_query r/RSparkSQLExample.R %}
+{% include_example run_sql r/RSparkSQLExample.R %}

 </div>
 </div>
@@ -377,7 +377,7 @@ In the simplest form, the default data source (`parquet` unless otherwise config

 <div data-lang="r" markdown="1">

-{% include_example source_parquet r/RSparkSQLExample.R %}
+{% include_example generic_load_save_functions r/RSparkSQLExample.R %}

 </div>
 </div>
@@ -400,13 +400,11 @@ using this syntax.
 </div>

 <div data-lang="python" markdown="1">
-
 {% include_example manual_load_options python/sql/datasource.py %}
 </div>
-<div data-lang="r" markdown="1">
-
-{% include_example source_json r/RSparkSQLExample.R %}

+<div data-lang="r" markdown="1">
+{% include_example manual_load_options r/RSparkSQLExample.R %}
 </div>
 </div>

@@ -425,13 +423,11 @@ file directly with SQL.
 </div>

 <div data-lang="python" markdown="1">
-
 {% include_example direct_sql python/sql/datasource.py %}
 </div>

 <div data-lang="r" markdown="1">
-
-{% include_example direct_query r/RSparkSQLExample.R %}
+{% include_example direct_sql r/RSparkSQLExample.R %}

 </div>
 </div>
@@ -523,7 +519,7 @@ Using the data from the above example:

 <div data-lang="r" markdown="1">

-{% include_example load_programmatically r/RSparkSQLExample.R %}
+{% include_example basic_parquet_example r/RSparkSQLExample.R %}

 </div>

@@ -839,7 +835,7 @@ Note that the file that is offered as _a json file_ is not a typical JSON file.
 line must contain a separate, self-contained valid JSON object. As a consequence,
 a regular multi-line JSON file will most often fail.

-{% include_example load_json_file r/RSparkSQLExample.R %}
+{% include_example json_dataset r/RSparkSQLExample.R %}

 </div>

@@ -925,7 +921,7 @@ You may need to grant write privilege to the user who starts the spark applicati
 When working with Hive one must instantiate `SparkSession` with Hive support. This
 adds support for finding tables in the MetaStore and writing queries using HiveQL.

-{% include_example hive_table r/RSparkSQLExample.R %}
+{% include_example spark_hive r/RSparkSQLExample.R %}

 </div>
 </div>
@@ -1067,43 +1063,19 @@ the Data Sources API. The following options are supported:
 <div class="codetabs">

 <div data-lang="scala" markdown="1">
-
-{% highlight scala %}
-val jdbcDF = spark.read.format("jdbc").options(
-  Map("url" -> "jdbc:postgresql:dbserver",
-  "dbtable" -> "schema.tablename")).load()
-{% endhighlight %}
-
+{% include_example jdbc_dataset scala/org/apache/spark/examples/sql/SQLDataSourceExample.scala %}
 </div>

 <div data-lang="java" markdown="1">
-
-{% highlight java %}
-
-Map<String, String> options = new HashMap<>();
-options.put("url", "jdbc:postgresql:dbserver");
-options.put("dbtable", "schema.tablename");
-
-Dataset<Row> jdbcDF = spark.read().format("jdbc"). options(options).load();
-{% endhighlight %}
-
-
+{% include_example jdbc_dataset java/org/apache/spark/examples/sql/JavaSQLDataSourceExample.java %}
 </div>

 <div data-lang="python" markdown="1">
-
-{% highlight python %}
-
-df = spark.read.format('jdbc').options(url='jdbc:postgresql:dbserver', dbtable='schema.tablename').load()
-
-{% endhighlight %}
-
+{% include_example jdbc_dataset python/sql/datasource.py %}
 </div>

 <div data-lang="r" markdown="1">
-
-{% include_example jdbc r/RSparkSQLExample.R %}
-
+{% include_example jdbc_dataset r/RSparkSQLExample.R %}
 </div>

 <div data-lang="sql" markdown="1">