From d516aaed34664f486c12e633c202fa7249a94c26 Mon Sep 17 00:00:00 2001
From: Mingyu Kim
Date: Tue, 15 Sep 2015 00:32:25 -0700
Subject: [PATCH 1/3] SPARK-10611 Clone Configuration for each task for
 NewHadoopRDD

---
 .../org/apache/spark/rdd/BinaryFileRDD.scala  |  5 +--
 .../org/apache/spark/rdd/NewHadoopRDD.scala   | 36 ++++++++++++++++---
 2 files changed, 34 insertions(+), 7 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/rdd/BinaryFileRDD.scala b/core/src/main/scala/org/apache/spark/rdd/BinaryFileRDD.scala
index 6fec00dcd0d8..aedced7408cd 100644
--- a/core/src/main/scala/org/apache/spark/rdd/BinaryFileRDD.scala
+++ b/core/src/main/scala/org/apache/spark/rdd/BinaryFileRDD.scala
@@ -34,12 +34,13 @@ private[spark] class BinaryFileRDD[T](
 
   override def getPartitions: Array[Partition] = {
     val inputFormat = inputFormatClass.newInstance
+    val conf = getConf
     inputFormat match {
       case configurable: Configurable =>
-        configurable.setConf(getConf)
+        configurable.setConf(conf)
       case _ =>
     }
-    val jobContext = newJobContext(getConf, jobId)
+    val jobContext = newJobContext(conf, jobId)
     inputFormat.setMinPartitions(jobContext, minPartitions)
     val rawSplits = inputFormat.getSplits(jobContext).toArray
     val result = new Array[Partition](rawSplits.size)
diff --git a/core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala b/core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala
index 174979aaeb23..f177cac28dae 100644
--- a/core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala
+++ b/core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala
@@ -84,6 +84,27 @@ class NewHadoopRDD[K, V](
 
   @transient protected val jobId = new JobID(jobTrackerId, id)
 
+  private val shouldCloneJobConf = sparkContext.conf.getBoolean("spark.hadoop.cloneConf", false)
+
+  protected def getConf: Configuration = {
+    val conf: Configuration = confBroadcast.value.value
+    if (shouldCloneJobConf) {
+      // Hadoop Configuration objects are not thread-safe, which may lead to various problems if
+      // one job modifies a configuration while another reads it (SPARK-2546, SPARK-10611). This
+      // problem occurs somewhat rarely because most jobs treat the configuration as though it's
+      // immutable. One solution, implemented here, is to clone the Configuration object.
+      // Unfortunately, this clone can be very expensive. To avoid unexpected performance regressions
+      // for workloads and Hadoop versions that do not suffer from these thread-safety issues, this
+      // cloning is disabled by default.
+      NewHadoopRDD.CONFIGURATION_INSTANTIATION_LOCK.synchronized {
+        logDebug("Cloning Hadoop Configuration")
+        new Configuration(conf)
+      }
+    } else {
+      conf
+    }
+  }
+
   override def getPartitions: Array[Partition] = {
     val inputFormat = inputFormatClass.newInstance
     inputFormat match {
@@ -104,7 +125,7 @@ class NewHadoopRDD[K, V](
     val iter = new Iterator[(K, V)] {
       val split = theSplit.asInstanceOf[NewHadoopPartition]
       logInfo("Input split: " + split.serializableHadoopSplit)
-      val conf = confBroadcast.value.value
+      val conf = getConf
 
       val inputMetrics = context.taskMetrics
         .getInputMetricsForReadMethod(DataReadMethod.Hadoop)
@@ -230,11 +251,15 @@ class NewHadoopRDD[K, V](
     super.persist(storageLevel)
   }
 
-
-  def getConf: Configuration = confBroadcast.value.value
 }
 
 private[spark] object NewHadoopRDD {
+  /**
+   * Configuration's constructor is not threadsafe (see SPARK-1097 and HADOOP-10456).
+   * Therefore, we synchronize on this lock before calling new Configuration().
+   */
+  val CONFIGURATION_INSTANTIATION_LOCK = new Object()
+
   /**
    * Analogous to [[org.apache.spark.rdd.MapPartitionsRDD]], but passes in an InputSplit to
    * the given function rather than the index of the partition.
@@ -268,12 +293,13 @@ private[spark] class WholeTextFileRDD(
 
   override def getPartitions: Array[Partition] = {
     val inputFormat = inputFormatClass.newInstance
+    val conf = getConf
    inputFormat match {
       case configurable: Configurable =>
-        configurable.setConf(getConf)
+        configurable.setConf(conf)
       case _ =>
     }
-    val jobContext = newJobContext(getConf, jobId)
+    val jobContext = newJobContext(conf, jobId)
     inputFormat.setMinPartitions(jobContext, minPartitions)
     val rawSplits = inputFormat.getSplits(jobContext).toArray
     val result = new Array[Partition](rawSplits.size)

From 6460266824c1be789388cc7ff4bb1c44023b11d0 Mon Sep 17 00:00:00 2001
From: Mingyu Kim
Date: Tue, 15 Sep 2015 14:58:05 -0700
Subject: [PATCH 2/3] Make getConf public

---
 core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala b/core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala
index f177cac28dae..f280ac5a424b 100644
--- a/core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala
+++ b/core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala
@@ -44,7 +44,6 @@ private[spark] class NewHadoopPartition(
   extends Partition {
 
   val serializableHadoopSplit = new SerializableWritable(rawSplit)
-
   override def hashCode(): Int = 41 * (41 + rddId) + index
 }
 
@@ -86,7 +85,7 @@ class NewHadoopRDD[K, V](
 
   private val shouldCloneJobConf = sparkContext.conf.getBoolean("spark.hadoop.cloneConf", false)
 
-  protected def getConf: Configuration = {
+  def getConf: Configuration = {
     val conf: Configuration = confBroadcast.value.value
     if (shouldCloneJobConf) {
       // Hadoop Configuration objects are not thread-safe, which may lead to various problems if

From 4ba2acd09e41aec97f049f57b68785cf419347f6 Mon Sep 17 00:00:00 2001
From: Mingyu Kim
Date: Tue, 15 Sep 2015 17:08:21 -0700
Subject: [PATCH 3/3] Fixing style issues

---
 core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala b/core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala
index f280ac5a424b..2872b93b8730 100644
--- a/core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala
+++ b/core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala
@@ -92,9 +92,9 @@ class NewHadoopRDD[K, V](
       // one job modifies a configuration while another reads it (SPARK-2546, SPARK-10611). This
       // problem occurs somewhat rarely because most jobs treat the configuration as though it's
       // immutable. One solution, implemented here, is to clone the Configuration object.
-      // Unfortunately, this clone can be very expensive. To avoid unexpected performance regressions
-      // for workloads and Hadoop versions that do not suffer from these thread-safety issues, this
-      // cloning is disabled by default.
+      // Unfortunately, this clone can be very expensive. To avoid unexpected performance
+      // regressions for workloads and Hadoop versions that do not suffer from these thread-safety
+      // issues, this cloning is disabled by default.
       NewHadoopRDD.CONFIGURATION_INSTANTIATION_LOCK.synchronized {
         logDebug("Cloning Hadoop Configuration")
         new Configuration(conf)
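
P.S. A minimal usage sketch, not part of the patches: this assumes a
standalone driver program, and the app name, input path, and the `sc` /
`rdd` names are illustrative placeholders. Only the `spark.hadoop.cloneConf`
flag (introduced in PATCH 1/3, off by default) comes from the series itself.

    import org.apache.hadoop.io.{LongWritable, Text}
    import org.apache.hadoop.mapreduce.lib.input.TextInputFormat
    import org.apache.spark.{SparkConf, SparkContext}

    // Opt in to per-task cloning of the Hadoop Configuration. It is
    // disabled by default because new Configuration(conf) can be expensive.
    val sparkConf = new SparkConf()
      .setAppName("clone-conf-example")
      .set("spark.hadoop.cloneConf", "true")
    val sc = new SparkContext(sparkConf)

    // The new-API input methods build a NewHadoopRDD internally, so with the
    // flag set, each task's call to getConf returns a cloned Configuration
    // (created under CONFIGURATION_INSTANTIATION_LOCK) instead of the shared
    // broadcast value.
    val rdd = sc.newAPIHadoopFile[LongWritable, Text, TextInputFormat](
      "hdfs:///path/to/input")
    println(rdd.count())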