
Commit 96b2280

Merge branch 'master' into issues/SPARK-26060/set_command
2 parents: 4e7b6bb + fa0d4bf

File tree

27 files changed: +529 -439 lines

core/src/main/scala/org/apache/spark/broadcast/TorrentBroadcast.scala

Lines changed: 16 additions & 6 deletions
@@ -18,6 +18,7 @@
 package org.apache.spark.broadcast

 import java.io._
+import java.lang.ref.SoftReference
 import java.nio.ByteBuffer
 import java.util.zip.Adler32

@@ -61,9 +62,11 @@ private[spark] class TorrentBroadcast[T: ClassTag](obj: T, id: Long)
    * Value of the broadcast object on executors. This is reconstructed by [[readBroadcastBlock]],
    * which builds this value by reading blocks from the driver and/or other executors.
    *
-   * On the driver, if the value is required, it is read lazily from the block manager.
+   * On the driver, if the value is required, it is read lazily from the block manager. We hold
+   * a soft reference so that it can be garbage collected if required, as we can always reconstruct
+   * in the future.
    */
-  @transient private lazy val _value: T = readBroadcastBlock()
+  @transient private var _value: SoftReference[T] = _

   /** The compression codec to use, or None if compression is disabled */
   @transient private var compressionCodec: Option[CompressionCodec] = _
@@ -92,8 +95,15 @@ private[spark] class TorrentBroadcast[T: ClassTag](obj: T, id: Long)
   /** The checksum for all the blocks. */
   private var checksums: Array[Int] = _

-  override protected def getValue() = {
-    _value
+  override protected def getValue() = synchronized {
+    val memoized: T = if (_value == null) null.asInstanceOf[T] else _value.get
+    if (memoized != null) {
+      memoized
+    } else {
+      val newlyRead = readBroadcastBlock()
+      _value = new SoftReference[T](newlyRead)
+      newlyRead
+    }
   }

   private def calcChecksum(block: ByteBuffer): Int = {
@@ -205,8 +215,8 @@ private[spark] class TorrentBroadcast[T: ClassTag](obj: T, id: Long)
   }

   private def readBroadcastBlock(): T = Utils.tryOrIOException {
-    TorrentBroadcast.synchronized {
-      val broadcastCache = SparkEnv.get.broadcastManager.cachedValues
+    val broadcastCache = SparkEnv.get.broadcastManager.cachedValues
+    broadcastCache.synchronized {

       Option(broadcastCache.get(broadcastId)).map(_.asInstanceOf[T]).getOrElse {
         setConf(SparkEnv.get.conf)
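
The new getValue() memoizes the broadcast value behind a java.lang.ref.SoftReference, so the JVM may reclaim it under memory pressure and the value can be rebuilt later via readBroadcastBlock(). A minimal, self-contained sketch of that memoization pattern (class and function names here are illustrative, not Spark's):

```scala
import java.lang.ref.SoftReference

// Hypothetical soft-reference memoizer; `recompute` plays the role of readBroadcastBlock().
class SoftMemo[T <: AnyRef](recompute: () => T) {
  private var ref: SoftReference[T] = _

  def get: T = synchronized {
    // SoftReference.get returns null once the GC has cleared the referent.
    val cached = if (ref == null) null.asInstanceOf[T] else ref.get
    if (cached != null) {
      cached
    } else {
      val fresh = recompute()
      ref = new SoftReference[T](fresh)
      fresh
    }
  }
}

object SoftMemoExample extends App {
  val memo = new SoftMemo[String](() => s"computed at ${System.nanoTime()}")
  println(memo.get) // computed on first access
  println(memo.get) // served from the soft reference until the GC clears it
}
```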

core/src/main/scala/org/apache/spark/deploy/DependencyUtils.scala

Lines changed: 2 additions & 1 deletion
@@ -61,11 +61,12 @@ private[deploy] object DependencyUtils extends Logging {
       hadoopConf: Configuration,
       secMgr: SecurityManager): String = {
     val targetDir = Utils.createTempDir()
+    val userJarName = userJar.split(File.separatorChar).last
     Option(jars)
       .map {
         resolveGlobPaths(_, hadoopConf)
           .split(",")
-          .filterNot(_.contains(userJar.split("/").last))
+          .filterNot(_.contains(userJarName))
           .mkString(",")
       }
       .filterNot(_ == "")
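
The fix computes the application jar's file name once, splitting on File.separatorChar rather than a hard-coded "/", and then drops matching entries from the --jars list. A small, hypothetical stand-alone sketch of that filtering (paths and names are made up for illustration):

```scala
import java.io.File

object ExcludeAppJarExample extends App {
  // Hypothetical inputs; in spark-submit these come from the primary app jar and --jars.
  val userJar = s"file:${File.separator}path${File.separator}to${File.separator}myApp.jar"
  val jars = "file:/myJar1.jar,file:/myApp.jar,file:/myJar2.jar"

  // Take the last path segment of the app jar using the platform separator, then drop
  // any --jars entry that refers to the same file name.
  val userJarName = userJar.split(File.separatorChar).last
  val filtered = jars.split(",").filterNot(_.contains(userJarName)).mkString(",")

  println(filtered) // file:/myJar1.jar,file:/myJar2.jar
}
```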

core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala

Lines changed: 19 additions & 0 deletions
@@ -962,6 +962,25 @@ class SparkSubmitSuite
     }
   }

+  test("remove copies of application jar from classpath") {
+    val fs = File.separator
+    val sparkConf = new SparkConf(false)
+    val hadoopConf = new Configuration()
+    val secMgr = new SecurityManager(sparkConf)
+
+    val appJarName = "myApp.jar"
+    val jar1Name = "myJar1.jar"
+    val jar2Name = "myJar2.jar"
+    val userJar = s"file:/path${fs}to${fs}app${fs}jar$fs$appJarName"
+    val jars = s"file:/$jar1Name,file:/$appJarName,file:/$jar2Name"
+
+    val resolvedJars = DependencyUtils
+      .resolveAndDownloadJars(jars, userJar, sparkConf, hadoopConf, secMgr)
+
+    assert(!resolvedJars.contains(appJarName))
+    assert(resolvedJars.contains(jar1Name) && resolvedJars.contains(jar2Name))
+  }
+
   test("Avoid re-upload remote resources in yarn client mode") {
     val hadoopConf = new Configuration()
     updateConfWithFakeS3Fs(hadoopConf)

docs/sql-migration-guide-upgrade.md

Lines changed: 2 additions & 0 deletions
@@ -27,6 +27,8 @@ displayTitle: Spark SQL Upgrading Guide

   - In Spark version 2.4 and earlier, float/double -0.0 is semantically equal to 0.0, but users can still distinguish them via `Dataset.show`, `Dataset.collect` etc. Since Spark 3.0, float/double -0.0 is replaced by 0.0 internally, and users can't distinguish them any more.

+  - In Spark version 2.4 and earlier, users can create a map with duplicated keys via built-in functions like `CreateMap`, `StringToMap`, etc. The behavior of a map with duplicated keys is undefined, e.g. map lookup respects the duplicated key that appears first, `Dataset.collect` only keeps the duplicated key that appears last, `MapKeys` returns duplicated keys, etc. Since Spark 3.0, these built-in functions remove duplicated map keys with a last-wins policy. Users may still read map values with duplicated keys from data sources which do not enforce it (e.g. Parquet); the behavior is undefined.
+
   - In Spark version 2.4 and earlier, the `SET` command works without any warnings even if the specified key is for `SparkConf` entries and it has no effect because the command does not update `SparkConf`, but the behavior might confuse users. Since 3.0, the command fails if a `SparkConf` key is used. You can disable such a check by setting `spark.sql.execution.setCommandRejectsSparkConfs` to `false`.

 ## Upgrading From Spark SQL 2.3 to 2.4
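
A hedged illustration of the duplicated-map-key note added above, runnable in spark-shell (it assumes a SparkSession named `spark`; exact show formatting may vary by version):

```scala
// Spark 2.4 and earlier: the duplicated key survives and a lookup returns the first
// occurrence ("a"). Since Spark 3.0: duplicates are removed with last-wins, so the
// lookup returns "b".
spark.sql("SELECT map(1, 'a', 1, 'b')[1] AS value").show()

// Collecting shows the deduplicated map under the Spark 3.0 behavior.
val m = spark.sql("SELECT map(1, 'a', 1, 'b') AS m").head().getMap[Int, String](0)
println(m) // Map(1 -> b)
```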

external/avro/src/main/scala/org/apache/spark/sql/avro/AvroDeserializer.scala

Lines changed: 3 additions & 1 deletion
@@ -17,7 +17,7 @@

 package org.apache.spark.sql.avro

-import java.math.{BigDecimal}
+import java.math.BigDecimal
 import java.nio.ByteBuffer

 import scala.collection.JavaConverters._
@@ -218,6 +218,8 @@ class AvroDeserializer(rootAvroType: Schema, rootCatalystType: DataType) {
           i += 1
         }

+        // The Avro map will never have null or duplicated map keys, so it's safe to create
+        // an ArrayBasedMapData directly here.
         updater.set(ordinal, new ArrayBasedMapData(keyArray, valueArray))

       case (UNION, _) =>

mllib/src/main/scala/org/apache/spark/ml/classification/OneVsRest.scala

Lines changed: 22 additions & 13 deletions
@@ -37,7 +37,7 @@ import org.apache.spark.ml.param.{Param, ParamMap, ParamPair, Params}
 import org.apache.spark.ml.param.shared.{HasParallelism, HasWeightCol}
 import org.apache.spark.ml.util._
 import org.apache.spark.ml.util.Instrumentation.instrumented
-import org.apache.spark.sql.{DataFrame, Dataset, Row}
+import org.apache.spark.sql.{Column, DataFrame, Dataset, Row}
 import org.apache.spark.sql.functions._
 import org.apache.spark.sql.types._
 import org.apache.spark.storage.StorageLevel
@@ -169,6 +169,12 @@ final class OneVsRestModel private[ml] (
     // Check schema
     transformSchema(dataset.schema, logging = true)

+    if (getPredictionCol == "" && getRawPredictionCol == "") {
+      logWarning(s"$uid: OneVsRestModel.transform() was called as NOOP" +
+        " since no output columns were set.")
+      return dataset.toDF
+    }
+
     // determine the input columns: these need to be passed through
     val origCols = dataset.schema.map(f => col(f.name))

@@ -209,6 +215,9 @@ final class OneVsRestModel private[ml] (
       newDataset.unpersist()
     }

+    var predictionColNames = Seq.empty[String]
+    var predictionColumns = Seq.empty[Column]
+
     if (getRawPredictionCol != "") {
       val numClass = models.length

@@ -219,24 +228,24 @@ final class OneVsRestModel private[ml] (
         Vectors.dense(predArray)
       }

-      // output the index of the classifier with highest confidence as prediction
-      val labelUDF = udf { (rawPredictions: Vector) => rawPredictions.argmax.toDouble }
+      predictionColNames = predictionColNames :+ getRawPredictionCol
+      predictionColumns = predictionColumns :+ rawPredictionUDF(col(accColName))
+    }

-      // output confidence as raw prediction, label and label metadata as prediction
-      aggregatedDataset
-        .withColumn(getRawPredictionCol, rawPredictionUDF(col(accColName)))
-        .withColumn(getPredictionCol, labelUDF(col(getRawPredictionCol)), labelMetadata)
-        .drop(accColName)
-    } else {
+    if (getPredictionCol != "") {
       // output the index of the classifier with highest confidence as prediction
       val labelUDF = udf { (predictions: Map[Int, Double]) =>
         predictions.maxBy(_._2)._1.toDouble
       }
-      // output label and label metadata as prediction
-      aggregatedDataset
-        .withColumn(getPredictionCol, labelUDF(col(accColName)), labelMetadata)
-        .drop(accColName)
+
+      predictionColNames = predictionColNames :+ getPredictionCol
+      predictionColumns = predictionColumns :+ labelUDF(col(accColName))
+        .as(getPredictionCol, labelMetadata)
     }
+
+    aggregatedDataset
+      .withColumns(predictionColNames, predictionColumns)
+      .drop(accColName)
   }

   @Since("1.4.1")
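
The rewritten transform collects the requested output column names and expressions first and appends them in one pass, skipping any output whose column name is empty. A minimal, hypothetical sketch of that pattern outside MLlib, using the public withColumn in a fold (all names and data here are illustrative):

```scala
import org.apache.spark.sql.{Column, DataFrame, SparkSession}
import org.apache.spark.sql.functions._

object ConditionalOutputColumns {
  // Hypothetical helper: append only the output columns the caller asked for
  // (an empty name means "skip this output"), mirroring the OneVsRestModel change.
  def addRequestedColumns(df: DataFrame, requested: Seq[(String, Column)]): DataFrame = {
    val active = requested.filter { case (name, _) => name.nonEmpty }
    active.foldLeft(df) { case (acc, (name, expr)) => acc.withColumn(name, expr) }
  }

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[1]").appName("sketch").getOrCreate()
    import spark.implicits._

    val df = Seq((1.0, 2.0), (3.0, 4.0)).toDF("a", "b")

    // "prediction" is requested, the second output is disabled via an empty name.
    val out = addRequestedColumns(df, Seq(
      "prediction" -> (col("a") + col("b")),
      "" -> lit(0.0)))

    out.show() // columns: a, b, prediction
    spark.stop()
  }
}
```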

mllib/src/test/scala/org/apache/spark/ml/classification/OneVsRestSuite.scala

Lines changed: 26 additions & 0 deletions
@@ -290,6 +290,32 @@ class OneVsRestSuite extends MLTest with DefaultReadWriteTest {
     checkModelData(ovaModel, newOvaModel)
   }

+  test("should ignore empty output cols") {
+    val lr = new LogisticRegression().setMaxIter(1)
+    val ovr = new OneVsRest().setClassifier(lr)
+    val ovrModel = ovr.fit(dataset)
+
+    val output1 = ovrModel.setPredictionCol("").setRawPredictionCol("")
+      .transform(dataset)
+    assert(output1.schema.fieldNames.toSet ===
+      Set("label", "features"))
+
+    val output2 = ovrModel.setPredictionCol("prediction").setRawPredictionCol("")
+      .transform(dataset)
+    assert(output2.schema.fieldNames.toSet ===
+      Set("label", "features", "prediction"))
+
+    val output3 = ovrModel.setPredictionCol("").setRawPredictionCol("rawPrediction")
+      .transform(dataset)
+    assert(output3.schema.fieldNames.toSet ===
+      Set("label", "features", "rawPrediction"))
+
+    val output4 = ovrModel.setPredictionCol("prediction").setRawPredictionCol("rawPrediction")
+      .transform(dataset)
+    assert(output4.schema.fieldNames.toSet ===
+      Set("label", "features", "prediction", "rawPrediction"))
+  }
+
   test("should support all NumericType labels and not support other types") {
     val ovr = new OneVsRest().setClassifier(new LogisticRegression().setMaxIter(1))
     MLTestingUtils.checkNumericTypes[OneVsRestModel, OneVsRest](

python/pyspark/sql/functions.py

Lines changed: 5 additions & 5 deletions
@@ -2656,11 +2656,11 @@ def map_concat(*cols):
     >>> from pyspark.sql.functions import map_concat
     >>> df = spark.sql("SELECT map(1, 'a', 2, 'b') as map1, map(3, 'c', 1, 'd') as map2")
     >>> df.select(map_concat("map1", "map2").alias("map3")).show(truncate=False)
-    +--------------------------------+
-    |map3                            |
-    +--------------------------------+
-    |[1 -> a, 2 -> b, 3 -> c, 1 -> d]|
-    +--------------------------------+
+    +------------------------+
+    |map3                    |
+    +------------------------+
+    |[1 -> d, 2 -> b, 3 -> c]|
+    +------------------------+
     """
     sc = SparkContext._active_spark_context
     if len(cols) == 1 and isinstance(cols[0], (list, set)):

sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeMapData.java

Lines changed: 3 additions & 0 deletions
@@ -28,6 +28,9 @@
  * Currently we just use 2 UnsafeArrayData to represent UnsafeMapData, with extra 8 bytes at head
  * to indicate the number of bytes of the unsafe key array.
  * [unsafe key array numBytes] [unsafe key array] [unsafe value array]
+ *
+ * Note that the user is responsible for guaranteeing that the key array does not have duplicated
+ * elements, otherwise the behavior is undefined.
  */
 // TODO: Use a more efficient format which doesn't depend on unsafe array.
 public final class UnsafeMapData extends MapData {

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystTypeConverters.scala

Lines changed: 0 additions & 6 deletions
@@ -431,12 +431,6 @@ object CatalystTypeConverters {
         map,
         (key: Any) => convertToCatalyst(key),
         (value: Any) => convertToCatalyst(value))
-    case (keys: Array[_], values: Array[_]) =>
-      // case for mapdata with duplicate keys
-      new ArrayBasedMapData(
-        new GenericArrayData(keys.map(convertToCatalyst)),
-        new GenericArrayData(values.map(convertToCatalyst))
-      )
     case other => other
   }
