
Commit 43ce929

Merge branch 'master' into fncAlias
2 parents 8e9e42b + 0945633 commit 43ce929

330 files changed (+6465 / -3627 lines)


.github/workflows/stale.yml

Lines changed: 23 additions & 0 deletions
@@ -0,0 +1,23 @@
+name: Close stale PRs
+
+on:
+  schedule:
+    - cron: "0 0 * * *"
+
+jobs:
+  stale:
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/[email protected]
+      with:
+        repo-token: ${{ secrets.GITHUB_TOKEN }}
+        stale-pr-message: |
+          We're closing this PR because it hasn't been updated in a while.
+          This isn't a judgement on the merit of the PR in any way. It's just
+          a way of keeping the PR queue manageable.
+
+          If you'd like to revive this PR, please reopen it!
+        days-before-stale: 100
+        # Setting this to 0 is the same as setting it to 1.
+        # See: https://github.com/actions/stale/issues/28
+        days-before-close: 0

LICENSE-binary

Lines changed: 0 additions & 1 deletion
@@ -393,7 +393,6 @@ org.eclipse.jetty:jetty-util
 org.eclipse.jetty:jetty-webapp
 org.eclipse.jetty:jetty-xml
 org.scala-lang.modules:scala-xml_2.12
-org.opencypher:okapi-shade
 com.github.joshelser:dropwizard-metrics-hadoop-metrics2-reporter
 com.zaxxer.HikariCP
 org.apache.hive:hive-beeline

R/pkg/R/functions.R

Lines changed: 2 additions & 2 deletions
@@ -3620,8 +3620,8 @@ setMethod("size",
 #' (array indices start at 1, or from the end if start is negative) with the specified length.
 #'
 #' @rdname column_collection_functions
-#' @param start an index indicating the first element occurring in the result.
-#' @param length a number of consecutive elements chosen to the result.
+#' @param start the starting index
+#' @param length the length of the slice
 #' @aliases slice slice,Column-method
 #' @note slice since 2.4.0
 setMethod("slice",

assembly/pom.xml

Lines changed: 0 additions & 5 deletions
@@ -64,11 +64,6 @@
       <artifactId>spark-graphx_${scala.binary.version}</artifactId>
       <version>${project.version}</version>
     </dependency>
-    <dependency>
-      <groupId>org.apache.spark</groupId>
-      <artifactId>spark-graph_${scala.binary.version}</artifactId>
-      <version>${project.version}</version>
-    </dependency>
     <dependency>
       <groupId>org.apache.spark</groupId>
       <artifactId>spark-sql_${scala.binary.version}</artifactId>

common/unsafe/src/main/java/org/apache/spark/unsafe/types/CalendarInterval.java

Lines changed: 12 additions & 0 deletions
@@ -133,4 +133,16 @@ private void appendUnit(StringBuilder sb, long value, String unit) {
    * @throws ArithmeticException if a numeric overflow occurs
    */
   public Duration extractAsDuration() { return Duration.of(microseconds, ChronoUnit.MICROS); }
+
+  /**
+   * A constant holding the minimum value an {@code CalendarInterval} can have.
+   */
+  public static CalendarInterval MIN_VALUE =
+      new CalendarInterval(Integer.MIN_VALUE, Integer.MIN_VALUE, Long.MIN_VALUE);
+
+  /**
+   * A constant holding the maximum value an {@code CalendarInterval} can have.
+   */
+  public static CalendarInterval MAX_VALUE =
+      new CalendarInterval(Integer.MAX_VALUE, Integer.MAX_VALUE, Long.MAX_VALUE);
 }
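
A small, hedged Scala sketch of how the new constants might be exercised; the constructor argument order (months, days, microseconds) is taken from the diff above, and the printed output is whatever CalendarInterval's toString produces.

import org.apache.spark.unsafe.types.CalendarInterval

// Arguments mirror the diff above: months, days, microseconds.
val oneMicro = new CalendarInterval(0, 0, 1L)
println(oneMicro)
// The new bounds pin every field at its extreme value.
println(CalendarInterval.MIN_VALUE)
println(CalendarInterval.MAX_VALUE)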

core/src/main/java/org/apache/spark/unsafe/map/BytesToBytesMap.java

Lines changed: 7 additions & 2 deletions
@@ -694,7 +694,10 @@ public boolean append(Object kbase, long koff, int klen, Object vbase, long voff
     assert (vlen % 8 == 0);
     assert (longArray != null);
 
-    if (numKeys == MAX_CAPACITY
+    // We should not increase number of keys to be MAX_CAPACITY. The usage pattern of this map is
+    // lookup + append. If we append key until the number of keys to be MAX_CAPACITY, next time
+    // the call of lookup will hang forever because it cannot find an empty slot.
+    if (numKeys == MAX_CAPACITY - 1
       // The map could be reused from last spill (because of no enough memory to grow),
       // then we don't try to grow again if hit the `growthThreshold`.
       || !canGrowArray && numKeys >= growthThreshold) {
@@ -741,7 +744,9 @@ public boolean append(Object kbase, long koff, int klen, Object vbase, long voff
       longArray.set(pos * 2 + 1, keyHashcode);
       isDefined = true;
 
-      if (numKeys >= growthThreshold && longArray.size() < MAX_CAPACITY) {
+      // We use two array entries per key, so the array size is twice the capacity.
+      // We should compare the current capacity of the array, instead of its size.
+      if (numKeys >= growthThreshold && longArray.size() / 2 < MAX_CAPACITY) {
         try {
           growAndRehash();
         } catch (SparkOutOfMemoryError oom) {
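
The two added comments carry the key arithmetic: each key occupies two slots in longArray (encoded record address plus hashcode), so the map's current capacity is longArray.size / 2, and appends must stop one key short of MAX_CAPACITY so that lookup can always find an empty slot. A minimal standalone sketch of those checks, with MAX_CAPACITY assumed to be 1 << 29 purely for illustration:

// Hedged sketch of the two checks above; names and MAX_CAPACITY value are illustrative.
object GrowthCheckSketch {
  val MAX_CAPACITY: Int = 1 << 29

  // Refuse the append if the map is one key away from MAX_CAPACITY, or if it
  // cannot grow and has already hit its growth threshold.
  def canAppend(numKeys: Int, canGrowArray: Boolean, growthThreshold: Int): Boolean =
    !(numKeys == MAX_CAPACITY - 1 || (!canGrowArray && numKeys >= growthThreshold))

  // Two long entries per key, so capacity == longArraySize / 2; grow only while
  // the capacity (not the raw array size) is below MAX_CAPACITY.
  def shouldGrow(numKeys: Int, growthThreshold: Int, longArraySize: Long): Boolean =
    numKeys >= growthThreshold && longArraySize / 2 < MAX_CAPACITY
}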

core/src/main/scala/org/apache/spark/SparkContext.scala

Lines changed: 28 additions & 14 deletions
@@ -42,7 +42,7 @@ import org.apache.spark.annotation.DeveloperApi
 import org.apache.spark.broadcast.Broadcast
 import org.apache.spark.deploy.{LocalSparkCluster, SparkHadoopUtil}
 import org.apache.spark.deploy.StandaloneResourceUtils._
-import org.apache.spark.executor.ExecutorMetrics
+import org.apache.spark.executor.{ExecutorMetrics, ExecutorMetricsSource}
 import org.apache.spark.input.{FixedLengthBinaryInputFormat, PortableDataStream, StreamInputFormat, WholeTextFileInputFormat}
 import org.apache.spark.internal.Logging
 import org.apache.spark.internal.config._
@@ -551,9 +551,16 @@ class SparkContext(config: SparkConf) extends Logging {
     _dagScheduler = new DAGScheduler(this)
     _heartbeatReceiver.ask[Boolean](TaskSchedulerIsSet)
 
+    val _executorMetricsSource =
+      if (_conf.get(METRICS_EXECUTORMETRICS_SOURCE_ENABLED)) {
+        Some(new ExecutorMetricsSource)
+      } else {
+        None
+      }
+
     // create and start the heartbeater for collecting memory metrics
     _heartbeater = new Heartbeater(
-      () => SparkContext.this.reportHeartBeat(),
+      () => SparkContext.this.reportHeartBeat(_executorMetricsSource),
       "driver-heartbeater",
       conf.get(EXECUTOR_HEARTBEAT_INTERVAL))
     _heartbeater.start()
@@ -622,6 +629,7 @@ class SparkContext(config: SparkConf) extends Logging {
     _env.metricsSystem.registerSource(_dagScheduler.metricsSource)
     _env.metricsSystem.registerSource(new BlockManagerSource(_env.blockManager))
     _env.metricsSystem.registerSource(new JVMCPUSource())
+    _executorMetricsSource.foreach(_.register(_env.metricsSystem))
     _executorAllocationManager.foreach { e =>
       _env.metricsSystem.registerSource(e.executorAllocationManagerSource)
     }
@@ -1525,17 +1533,17 @@ class SparkContext(config: SparkConf) extends Logging {
    */
   def addFile(path: String, recursive: Boolean): Unit = {
     val uri = new Path(path).toUri
-    val schemeCorrectedPath = uri.getScheme match {
-      case null => new File(path).getCanonicalFile.toURI.toString
+    val schemeCorrectedURI = uri.getScheme match {
+      case null => new File(path).getCanonicalFile.toURI
       case "local" =>
         logWarning("File with 'local' scheme is not supported to add to file server, since " +
           "it is already available on every node.")
         return
-      case _ => path
+      case _ => uri
     }
 
-    val hadoopPath = new Path(schemeCorrectedPath)
-    val scheme = new URI(schemeCorrectedPath).getScheme
+    val hadoopPath = new Path(schemeCorrectedURI)
+    val scheme = schemeCorrectedURI.getScheme
     if (!Array("http", "https", "ftp").contains(scheme)) {
       val fs = hadoopPath.getFileSystem(hadoopConfiguration)
       val isDir = fs.getFileStatus(hadoopPath).isDirectory
@@ -1555,7 +1563,11 @@ class SparkContext(config: SparkConf) extends Logging {
     val key = if (!isLocal && scheme == "file") {
       env.rpcEnv.fileServer.addFile(new File(uri.getPath))
     } else {
-      schemeCorrectedPath
+      if (uri.getScheme == null) {
+        schemeCorrectedURI.toString
+      } else {
+        path
+      }
     }
     val timestamp = System.currentTimeMillis
     if (addedFiles.putIfAbsent(key, timestamp).isEmpty) {
@@ -1848,7 +1860,7 @@ class SparkContext(config: SparkConf) extends Logging {
 
     def checkRemoteJarFile(path: String): String = {
       val hadoopPath = new Path(path)
-      val scheme = new URI(path).getScheme
+      val scheme = hadoopPath.toUri.getScheme
       if (!Array("http", "https", "ftp").contains(scheme)) {
         try {
           val fs = hadoopPath.getFileSystem(hadoopConfiguration)
@@ -1870,21 +1882,21 @@ class SparkContext(config: SparkConf) extends Logging {
       }
     }
 
-    if (path == null) {
-      logWarning("null specified as parameter to addJar")
+    if (path == null || path.isEmpty) {
+      logWarning("null or empty path specified as parameter to addJar")
     } else {
       val key = if (path.contains("\\")) {
         // For local paths with backslashes on Windows, URI throws an exception
         addLocalJarFile(new File(path))
       } else {
-        val uri = new URI(path)
+        val uri = new Path(path).toUri
         // SPARK-17650: Make sure this is a valid URL before adding it to the list of dependencies
         Utils.validateURL(uri)
         uri.getScheme match {
           // A JAR file which exists only on the driver node
           case null =>
             // SPARK-22585 path without schema is not url encoded
-            addLocalJarFile(new File(uri.getRawPath))
+            addLocalJarFile(new File(uri.getPath))
           // A JAR file which exists only on the driver node
           case "file" => addLocalJarFile(new File(uri.getPath))
           // A JAR file which exists locally on every worker node
@@ -2473,8 +2485,10 @@ class SparkContext(config: SparkConf) extends Logging {
   }
 
   /** Reports heartbeat metrics for the driver. */
-  private def reportHeartBeat(): Unit = {
+  private def reportHeartBeat(executorMetricsSource: Option[ExecutorMetricsSource]): Unit = {
     val currentMetrics = ExecutorMetrics.getCurrentMetrics(env.memoryManager)
+    executorMetricsSource.foreach(_.updateMetricsSnapshot(currentMetrics))
+
     val driverUpdates = new HashMap[(Int, Int), ExecutorMetrics]
     // In the driver, we do not track per-stage metrics, so use a dummy stage for the key
     driverUpdates.put(EventLoggingListener.DRIVER_STAGE_KEY, new ExecutorMetrics(currentMetrics))
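
Several of these hunks replace new URI(path) with new Path(path).toUri, and uri.getRawPath with uri.getPath. A hedged sketch of why that matters for scheme-less local paths, such as ones containing spaces; the path below is illustrative and this is not code from the commit.

import org.apache.hadoop.fs.Path

val raw = "/tmp/my deps.jar"
// new java.net.URI(raw) would throw URISyntaxException here, because a space
// is not a legal URI character.
val uri = new Path(raw).toUri          // Hadoop's Path tolerates and percent-encodes it
assert(uri.getScheme == null)          // no scheme: treated as a driver-local file
println(uri.getPath)                   // decoded form, safe to hand to new File(...)
println(uri.getRawPath)                // encoded form ("%20"), which is why getPath is used now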

core/src/main/scala/org/apache/spark/TestUtils.scala

Lines changed: 11 additions & 1 deletion
@@ -247,6 +247,16 @@ private[spark] object TestUtils {
       url: URL,
       method: String = "GET",
       headers: Seq[(String, String)] = Nil): Int = {
+    withHttpConnection(url, method, headers = headers) { connection =>
+      connection.getResponseCode()
+    }
+  }
+
+  def withHttpConnection[T](
+      url: URL,
+      method: String = "GET",
+      headers: Seq[(String, String)] = Nil)
+      (fn: HttpURLConnection => T): T = {
     val connection = url.openConnection().asInstanceOf[HttpURLConnection]
     connection.setRequestMethod(method)
     headers.foreach { case (k, v) => connection.setRequestProperty(k, v) }
@@ -271,7 +281,7 @@ private[spark] object TestUtils {
 
     try {
       connection.connect()
-      connection.getResponseCode()
+      fn(connection)
     } finally {
       connection.disconnect()
     }
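
A hedged usage sketch of the new loan-pattern helper: withHttpConnection opens and connects the connection, hands it to the caller's function, and (per the surrounding try/finally) always disconnects afterwards. The URL and header read below are illustrative only.

import java.net.URL
import org.apache.spark.TestUtils

// Only usable from Spark's own code, since TestUtils is private[spark].
val contentType = TestUtils.withHttpConnection(new URL("http://localhost:4040/api/v1/version")) { connection =>
  connection.getHeaderField("Content-Type")
}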

core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala

Lines changed: 2 additions & 2 deletions
@@ -137,10 +137,10 @@ private[deploy] class SparkSubmitArguments(args: Seq[String], env: Map[String, S
    * Remove keys that don't start with "spark." from `sparkProperties`.
    */
   private def ignoreNonSparkProperties(): Unit = {
-    sparkProperties.foreach { case (k, v) =>
+    sparkProperties.keys.foreach { k =>
       if (!k.startsWith("spark.")) {
         sparkProperties -= k
-        logWarning(s"Ignoring non-spark config property: $k=$v")
+        logWarning(s"Ignoring non-Spark config property: $k")
       }
     }
   }
