From ebf76126046ef918488b8a69ab0bc5763cbd3a34 Mon Sep 17 00:00:00 2001 From: Gibson Chikafa Date: Tue, 18 Feb 2025 10:57:49 +0100 Subject: [PATCH 01/86] Hopsfy spark-3.5 --- assembly/pom.xml | 2 +- assembly/src/main/assembly/assembly.xml | 2 + common/network-yarn/pom.xml | 4 +- connector/docker-integration-tests/pom.xml | 2 +- connector/kafka-0-10-assembly/pom.xml | 4 +- connector/kafka-0-10-sql/pom.xml | 2 +- connector/kafka-0-10-token-provider/pom.xml | 2 +- connector/kinesis-asl-assembly/pom.xml | 4 +- core/pom.xml | 10 +- dev/make-distribution.sh | 8 +- hadoop-cloud/pom.xml | 14 +- launcher/pom.xml | 4 +- pom.xml | 182 +++++++++++++----- resource-managers/yarn/pom.xml | 6 +- sql/core/pom.xml | 2 +- sql/hive-thriftserver/pom.xml | 2 +- sql/hive/pom.xml | 21 +- .../sql/hive/client/HiveClientImpl.scala | 8 +- .../hive/client/IsolatedClientLoader.scala | 35 ++-- .../sql/hive/execution/HiveFileFormat.scala | 2 +- 20 files changed, 210 insertions(+), 106 deletions(-) diff --git a/assembly/pom.xml b/assembly/pom.xml index 7c4c5d84792bc..e2b19febfb57d 100644 --- a/assembly/pom.xml +++ b/assembly/pom.xml @@ -137,7 +137,7 @@ ${project.version} - org.apache.hadoop + ${hadoop.group} hadoop-yarn-server-web-proxy diff --git a/assembly/src/main/assembly/assembly.xml b/assembly/src/main/assembly/assembly.xml index 009d4b92f406c..7db5b432aaf50 100644 --- a/assembly/src/main/assembly/assembly.xml +++ b/assembly/src/main/assembly/assembly.xml @@ -83,6 +83,8 @@ false org.apache.hadoop:*:jar + io.hops:*:jar + io.hops.metadata:*:jar org.apache.spark:*:jar org.apache.zookeeper:*:jar org.apache.avro:*:jar diff --git a/common/network-yarn/pom.xml b/common/network-yarn/pom.xml index ee1492783cd9b..7cd1d526fdab5 100644 --- a/common/network-yarn/pom.xml +++ b/common/network-yarn/pom.xml @@ -64,12 +64,12 @@ - org.apache.hadoop + ${hadoop.group} hadoop-client-api ${hadoop.version} - org.apache.hadoop + ${hadoop.group} hadoop-client-runtime ${hadoop.version} diff --git a/connector/docker-integration-tests/pom.xml b/connector/docker-integration-tests/pom.xml index 91b89665d4700..d7a6ef9547aba 100644 --- a/connector/docker-integration-tests/pom.xml +++ b/connector/docker-integration-tests/pom.xml @@ -92,7 +92,7 @@ test - org.apache.hadoop + ${hadoop.group} hadoop-minikdc test diff --git a/connector/kafka-0-10-assembly/pom.xml b/connector/kafka-0-10-assembly/pom.xml index 73f5c23a9f5c7..5e42afb48354e 100644 --- a/connector/kafka-0-10-assembly/pom.xml +++ b/connector/kafka-0-10-assembly/pom.xml @@ -70,13 +70,13 @@ provided - org.apache.hadoop + ${hadoop.group} hadoop-client-api ${hadoop.version} provided - org.apache.hadoop + ${hadoop.group} hadoop-client-runtime ${hadoop.version} provided diff --git a/connector/kafka-0-10-sql/pom.xml b/connector/kafka-0-10-sql/pom.xml index 89ce0a2ff5cef..38e481ffae6b6 100644 --- a/connector/kafka-0-10-sql/pom.xml +++ b/connector/kafka-0-10-sql/pom.xml @@ -121,7 +121,7 @@ - org.apache.hadoop + ${hadoop.group} hadoop-minikdc diff --git a/connector/kafka-0-10-token-provider/pom.xml b/connector/kafka-0-10-token-provider/pom.xml index a139af88905ac..1159507872b2b 100644 --- a/connector/kafka-0-10-token-provider/pom.xml +++ b/connector/kafka-0-10-token-provider/pom.xml @@ -65,7 +65,7 @@ test - org.apache.hadoop + ${hadoop.group} hadoop-client-runtime ${hadoop.deps.scope} diff --git a/connector/kinesis-asl-assembly/pom.xml b/connector/kinesis-asl-assembly/pom.xml index e6c3f0219f3e1..c555fd6cfd35d 100644 --- a/connector/kinesis-asl-assembly/pom.xml +++ 
b/connector/kinesis-asl-assembly/pom.xml @@ -100,13 +100,13 @@ provided - org.apache.hadoop + ${hadoop.group} hadoop-client-api ${hadoop.version} provided - org.apache.hadoop + ${hadoop.group} hadoop-client-runtime ${hadoop.version} provided diff --git a/core/pom.xml b/core/pom.xml index f0771a62db3a3..359e47f238acf 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -71,12 +71,12 @@ xbean-asm9-shaded - org.apache.hadoop + ${hadoop.group} hadoop-client-api ${hadoop.version} - org.apache.hadoop + ${hadoop.group} hadoop-client-runtime ${hadoop.version} @@ -426,7 +426,7 @@ test - org.apache.hadoop + ${hadoop.group} hadoop-minikdc test @@ -472,13 +472,13 @@ - org.apache.hadoop + ${hadoop.group} hadoop-aws ${hadoop.version} test - org.apache.hadoop + ${hadoop.group} hadoop-common diff --git a/dev/make-distribution.sh b/dev/make-distribution.sh index ef7c010e930a1..6758a8aee0322 100755 --- a/dev/make-distribution.sh +++ b/dev/make-distribution.sh @@ -187,7 +187,13 @@ echo "Spark $VERSION$GITREVSTRING built for Hadoop $SPARK_HADOOP_VERSION" > "$DI echo "Build flags: $@" >> "$DISTDIR/RELEASE" # Copy jars -cp "$SPARK_HOME"/assembly/target/scala*/jars/* "$DISTDIR/jars/" +# Fabio: copy jars from the spark-assembly-*-dist directory which +# contains the distribution prepared by the maven-assembly-plugin +# The maven-assembly-plugin has rules to remove the hadoop/hops dependencies +# from the final distribution +# You need to run the -Pbigtop-dist profile for this to work +cp "$SPARK_HOME"/assembly/target/spark-assembly_"$SCALA_VERSION"-"$VERSION"-dist/lib/* "$DISTDIR/jars/" +cp "$SPARK_HOME"/assembly/target/spark-assembly_"$SCALA_VERSION"-"$VERSION"-dist/*.jar "$DISTDIR/jars/" # Only create the yarn directory if the yarn artifacts were built. if [ -f "$SPARK_HOME"/common/network-yarn/target/scala*/spark-*-yarn-shuffle.jar ]; then diff --git a/hadoop-cloud/pom.xml b/hadoop-cloud/pom.xml index fc2d9b5799adb..c4f00569c207c 100644 --- a/hadoop-cloud/pom.xml +++ b/hadoop-cloud/pom.xml @@ -59,13 +59,13 @@ test - org.apache.hadoop + ${hadoop.group} hadoop-client-api ${hadoop.version} provided - org.apache.hadoop + ${hadoop.group} hadoop-client-runtime ${hadoop.version} @@ -74,7 +74,7 @@ intra-jackson-module version problems. --> - org.apache.hadoop + ${hadoop.group} hadoop-aws ${hadoop.version} ${hadoop.deps.scope} @@ -130,7 +130,7 @@ ${hadoop.deps.scope} - org.apache.hadoop + ${hadoop.group} hadoop-azure ${hadoop.version} ${hadoop.deps.scope} @@ -146,13 +146,13 @@ but it still needs some selective exclusion across versions, especially 3.0.x. --> - org.apache.hadoop + ${hadoop.group} hadoop-cloud-storage ${hadoop.version} ${hadoop.deps.scope} - org.apache.hadoop + ${hadoop.group} hadoop-common @@ -167,7 +167,7 @@ manually exclude it to avoid recurring issues similar to HADOOP-18159 in Spark.
--> - org.apache.hadoop + ${hadoop.group} hadoop-cos diff --git a/launcher/pom.xml b/launcher/pom.xml index 5a56efc2d168f..c67b33fee6c9c 100644 --- a/launcher/pom.xml +++ b/launcher/pom.xml @@ -91,13 +91,13 @@ - org.apache.hadoop + ${hadoop.group} hadoop-client-api ${hadoop.version} test - org.apache.hadoop + ${hadoop.group} hadoop-client-runtime ${hadoop.version} test diff --git a/pom.xml b/pom.xml index 4f912329beed6..ccadad0f84238 100644 --- a/pom.xml +++ b/pom.xml @@ -119,23 +119,24 @@ 3.1.0 spark 9.5 - 2.0.7 - 2.20.0 + 2.0.17 + 2.24.3 - 3.3.4 + 3.2.0.2 + io.hops 3.23.4 3.11.4 ${hadoop.version} 3.6.3 2.13.0 - org.apache.hive + io.hops.hive core - 2.3.9 - 2.3.9 + 3.0.0.8-SNAPSHOT + 3.0.0.8-SNAPSHOT - 2.3 + 3.0 3.4.1 @@ -266,12 +267,14 @@ --> compile compile - 2.8.1 + 2.6.1.2 compile compile compile compile compile + compile + compile compile compile test @@ -348,6 +351,17 @@ false + + Hops + Hops Repository + https://archiva.hops.works/repository/Hops/ + + true + + + true + + @@ -773,6 +787,11 @@ log4j-slf4j2-impl ${log4j.version} + + org.apache.logging.log4j + log4j-slf4j-impl + ${log4j.version} + org.apache.logging.log4j log4j-api @@ -1326,26 +1345,26 @@ - org.apache.hadoop + ${hadoop.group} hadoop-client-api ${hadoop.version} ${hadoop.deps.scope} - org.apache.hadoop + ${hadoop.group} hadoop-client-runtime ${hadoop.version} ${hadoop.deps.scope} - org.apache.hadoop + ${hadoop.group} hadoop-client-minicluster ${yarn.version} test - org.apache.hadoop + ${hadoop.group} hadoop-client ${hadoop.version} ${hadoop.deps.scope} @@ -1435,7 +1454,7 @@ - org.apache.hadoop + ${hadoop.group} hadoop-minikdc ${hadoop.version} test @@ -1567,7 +1586,7 @@ ${hadoop.deps.scope} - org.apache.hadoop + ${hadoop.group} hadoop-yarn-api ${yarn.version} ${hadoop.deps.scope} @@ -1611,7 +1630,7 @@ - org.apache.hadoop + ${hadoop.group} hadoop-yarn-common ${yarn.version} ${hadoop.deps.scope} @@ -1659,7 +1678,7 @@ - org.apache.hadoop + ${hadoop.group} hadoop-yarn-server-tests ${yarn.version} tests @@ -1702,7 +1721,7 @@ * - org.apache.hadoop + ${hadoop.group} hadoop-yarn-server-resourcemanager @@ -1714,27 +1733,27 @@ fail, see also SPARK-33104. 
--> - org.apache.hadoop + ${hadoop.group} hadoop-yarn-server-resourcemanager ${yarn.version} test - org.apache.hadoop + ${hadoop.group} hadoop-yarn-server-web-proxy ${yarn.version} ${hadoop.deps.scope} - org.apache.hadoop + ${hadoop.group} hadoop-yarn-server-common - org.apache.hadoop + ${hadoop.group} hadoop-yarn-common - org.apache.hadoop + ${hadoop.group} hadoop-yarn-api @@ -1797,7 +1816,7 @@ - org.apache.hadoop + ${hadoop.group} hadoop-yarn-client ${yarn.version} ${hadoop.deps.scope} @@ -1912,7 +1931,7 @@ ${hive.group} - hive-jdbc + hops-jdbc ${hive.group} @@ -1968,7 +1987,7 @@ ${hive.group} - hive-jdbc + hops-jdbc ${hive.group} @@ -2023,11 +2042,11 @@ ant - org.apache.hadoop + ${hadoop.group} hadoop-common - org.apache.hadoop + ${hadoop.group} hadoop-auth @@ -2077,7 +2096,7 @@ - org.apache.hive + io.hops.hive hive-storage-api @@ -2137,11 +2156,6 @@ org.apache.avro avro-mapred - - - org.apache.calcite - calcite-core - org.apache.calcite calcite-avatica @@ -2178,6 +2192,10 @@ org.slf4j slf4j-log4j12 + + org.apache.logging.log4j + * + log4j log4j @@ -2190,6 +2208,10 @@ org.codehaus.groovy groovy-all + + com.sun.jersey + * + jline jline @@ -2240,7 +2262,7 @@ ${hive.group} - hive-jdbc + hops-jdbc ${hive.version} @@ -2287,10 +2309,26 @@ org.slf4j slf4j-api + + log4j + log4j + + + log4j + log4j + + + com.sun.jersey + * + org.slf4j slf4j-log4j12 + + org.apache.logging.log4j + * + log4j log4j @@ -2340,6 +2378,14 @@ org.slf4j slf4j-api + + log4j + log4j + + + org.apache.logging.log4j + log4j-slf4j-impl + org.slf4j slf4j-log4j12 @@ -2438,7 +2484,18 @@ ${hive.group} hive-service-rpc - 3.1.3 + ${hive.version} + + + * + * + + + + + ${hive.group} + hive-service + ${hive.version} * @@ -2473,7 +2530,7 @@ guava - org.apache.hadoop + ${hadoop.group} hadoop-yarn-server-resourcemanager @@ -2508,6 +2565,10 @@ org.codehaus.groovy groovy-all + + com.sun.jersey + * + @@ -2521,42 +2582,50 @@ - org.apache.hive + io.hops.hive hive-llap-common ${hive23.version} ${hive.llap.scope} - org.apache.hive + io.hops.hive hive-common - org.apache.hive + io.hops.hive hive-serde org.slf4j slf4j-api + + log4j + log4j + + + com.sun.jersey + * + - org.apache.hive + io.hops.hive hive-llap-client ${hive23.version} test - org.apache.hive + io.hops.hive hive-common - org.apache.hive + io.hops.hive hive-serde - org.apache.hive + io.hops.hive hive-llap-common @@ -2575,6 +2644,10 @@ org.slf4j slf4j-api + + log4j + log4j + @@ -2586,19 +2659,19 @@ ${orc.deps.scope} - org.apache.hadoop + ${hadoop.group} hadoop-common - org.apache.hadoop + ${hadoop.group} hadoop-hdfs - org.apache.hadoop + ${hadoop.group} hadoop-client-api - org.apache.hive + io.hops.hive hive-storage-api @@ -2616,11 +2689,11 @@ ${orc.deps.scope} - org.apache.hadoop + ${hadoop.group} hadoop-common - org.apache.hadoop + ${hadoop.group} hadoop-mapreduce-client-core @@ -2628,7 +2701,7 @@ orc-core - org.apache.hive + io.hops.hive hive-storage-api @@ -2804,7 +2877,7 @@ 2.9.1 - org.apache.hive + io.hops.hive hive-storage-api ${hive.storage.version} ${hive.storage.scope} @@ -3909,4 +3982,11 @@ + + + Hops + Hops Repo + https://archiva.hops.works/repository/Hops/ + + diff --git a/resource-managers/yarn/pom.xml b/resource-managers/yarn/pom.xml index 6e45fb3113ece..dd494ec5b3b1a 100644 --- a/resource-managers/yarn/pom.xml +++ b/resource-managers/yarn/pom.xml @@ -40,13 +40,13 @@ - org.apache.hadoop + ${hadoop.group} hadoop-client-runtime ${hadoop.version} ${hadoop.deps.scope} - org.apache.hadoop + ${hadoop.group} hadoop-client-minicluster ${hadoop.version} test @@ -92,7 +92,7 @@ test 
- org.apache.hadoop + ${hadoop.group} hadoop-client-api ${hadoop.version} diff --git a/sql/core/pom.xml b/sql/core/pom.xml index def8f5ddf98fb..d10c1782c8a75 100644 --- a/sql/core/pom.xml +++ b/sql/core/pom.xml @@ -106,7 +106,7 @@ ${orc.classifier} - org.apache.hive + ${hive.group} hive-storage-api diff --git a/sql/hive-thriftserver/pom.xml b/sql/hive-thriftserver/pom.xml index 9cf2c20f6761a..918bb9fa5c72c 100644 --- a/sql/hive-thriftserver/pom.xml +++ b/sql/hive-thriftserver/pom.xml @@ -77,7 +77,7 @@ ${hive.group} - hive-jdbc + hops-jdbc ${hive.group} diff --git a/sql/hive/pom.xml b/sql/hive/pom.xml index 3d85c41481dfe..65ff31c0b1d87 100644 --- a/sql/hive/pom.xml +++ b/sql/hive/pom.xml @@ -110,15 +110,30 @@ ${hive.shims.scope} - org.apache.hive + ${hive.group} hive-llap-common ${hive.llap.scope} - org.apache.hive + ${hive.group} hive-llap-client ${hive.llap.scope} + + ${hive.group} + hops-jdbc + ${hive.jdbc.scope} + + + ${hive.group} + hive-service-rpc + ${hive.service.scope} + + + ${hive.group} + hive-service + ${hive.service.scope} + org.apache.avro @@ -158,7 +173,7 @@ datanucleus-core - org.apache.hadoop + ${hadoop.group} hadoop-client-runtime ${hadoop.deps.scope} diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala index e51658355b10d..baf22ca001fe9 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala @@ -866,10 +866,10 @@ private[hive] class HiveClientImpl( def closeDriver(driver: Driver): Unit = { // Since HIVE-18238(Hive 3.0.0), the Driver.close function's return type changed // and the CommandProcessorFactory.clean function removed. - driver.getClass.getMethod("close").invoke(driver) - if (version != hive.v3_0 && version != hive.v3_1) { - CommandProcessorFactory.clean(conf) - } + // Fabio: Comment this to avoid compilation issue with Hive3 + // if (version != hive.v3_0 && version != hive.v3_1) { + // CommandProcessorFactory.clean(conf) + // } } // Hive query needs to start SessionState. 
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/IsolatedClientLoader.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/IsolatedClientLoader.scala index 18090b53e3c10..5ae63b6efad17 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/IsolatedClientLoader.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/IsolatedClientLoader.scala @@ -89,23 +89,24 @@ private[hive] object IsolatedClientLoader extends Logging { } def hiveVersion(version: String): HiveVersion = { - VersionUtils.majorMinorPatchVersion(version).flatMap { - case (12, _, _) | (0, 12, _) => Some(hive.v12) - case (13, _, _) | (0, 13, _) => Some(hive.v13) - case (14, _, _) | (0, 14, _) => Some(hive.v14) - case (1, 0, _) => Some(hive.v1_0) - case (1, 1, _) => Some(hive.v1_1) - case (1, 2, _) => Some(hive.v1_2) - case (2, 0, _) => Some(hive.v2_0) - case (2, 1, _) => Some(hive.v2_1) - case (2, 2, _) => Some(hive.v2_2) - case (2, 3, _) => Some(hive.v2_3) - case (3, 0, _) => Some(hive.v3_0) - case (3, 1, _) => Some(hive.v3_1) - case _ => None - }.getOrElse { - throw QueryExecutionErrors.unsupportedHiveMetastoreVersionError( - version, HiveUtils.HIVE_METASTORE_VERSION.key) + // Fabio: Remove the last digit of the version string as it's the Hopsworks specific version + version.substring(0, version.lastIndexOf(".")) match { + case "12" | "0.12" | "0.12.0" => hive.v12 + case "13" | "0.13" | "0.13.0" | "0.13.1" => hive.v13 + case "14" | "0.14" | "0.14.0" => hive.v14 + case "1.0" | "1.0.0" | "1.0.1" => hive.v1_0 + case "1.1" | "1.1.0" | "1.1.1" => hive.v1_1 + case "1.2" | "1.2.0" | "1.2.1" | "1.2.2" => hive.v1_2 + case "2.0" | "2.0.0" | "2.0.1" => hive.v2_0 + case "2.1" | "2.1.0" | "2.1.1" => hive.v2_1 + case "2.2" | "2.2.0" => hive.v2_2 + case "2.3" | "2.3.0" | "2.3.1" | "2.3.2" | "2.3.3" | "2.3.4" | "2.3.5" | "2.3.6" | "2.3.7" => + hive.v2_3 + case "3.0" | "3.0.0" => hive.v3_0 + case "3.1" | "3.1.0" | "3.1.1" | "3.1.2" => hive.v3_1 + case version => + throw new UnsupportedOperationException(s"Unsupported Hive Metastore version ($version). " + + s"Please set ${HiveUtils.HIVE_METASTORE_VERSION.key} with a valid version.") } } diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveFileFormat.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveFileFormat.scala index 29734c4de3441..9ee3089977eba 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveFileFormat.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveFileFormat.scala @@ -86,7 +86,7 @@ class HiveFileFormat(fileSinkConf: FileSinkDesc) // Add table properties from storage handler to hadoopConf, so any custom storage // handler settings can be set to hadoopConf HiveTableUtil.configureJobPropertiesForStorageHandler(tableDesc, conf, false) - Utilities.copyTableJobPropertiesToConf(tableDesc, conf) + Utilities.copyTableJobPropertiesToConf(tableDesc, new JobConf(conf)) // Avoid referencing the outer object. 
val fileSinkConfSer = fileSinkConf From 9bdf676b913651e4111480965e4af3453edf6017 Mon Sep 17 00:00:00 2001 From: Gibson Chikafa Date: Fri, 21 Feb 2025 11:57:53 +0100 Subject: [PATCH 02/86] Drop Index support, not compatible with the hive version --- .../org/apache/spark/sql/hive/client/HiveClientImpl.scala | 3 +++ .../scala/org/apache/spark/sql/hive/client/HiveShim.scala | 6 ++++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala index baf22ca001fe9..22ceb6dacf932 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala @@ -1029,6 +1029,8 @@ private[hive] class HiveClientImpl( others.foreach { table => val t = table.getTableName logDebug(s"Deleting table $t") + shim.dropTable(client, "default", t) + /* try { shim.getIndexes(client, "default", t, 255).foreach { index => shim.dropIndex(client, "default", t, index.getIndexName) @@ -1041,6 +1043,7 @@ private[hive] class HiveClientImpl( // HIVE-18448 Hive 3.0 remove index APIs shim.dropTable(client, "default", t) } + */ } shim.getAllDatabases(client).filterNot(_ == "default").foreach { db => logDebug(s"Dropping Database: $db") diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala index 01c3fc724822c..6f797d36c5f79 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala @@ -30,7 +30,7 @@ import org.apache.hadoop.fs.Path import org.apache.hadoop.hive.conf.HiveConf import org.apache.hadoop.hive.metastore.IMetaStoreClient import org.apache.hadoop.hive.metastore.TableType -import org.apache.hadoop.hive.metastore.api.{Database, EnvironmentContext, Function => HiveFunction, FunctionType, Index, MetaException, PrincipalType, ResourceType, ResourceUri} +import org.apache.hadoop.hive.metastore.api.{Database, EnvironmentContext, Function => HiveFunction, FunctionType, MetaException, PrincipalType, ResourceType, ResourceUri} import org.apache.hadoop.hive.ql.Driver import org.apache.hadoop.hive.ql.io.AcidUtils import org.apache.hadoop.hive.ql.metadata.{Hive, HiveException, Partition, Table} @@ -243,7 +243,7 @@ private[client] sealed abstract class Shim { def getMSC(hive: Hive): IMetaStoreClient - def getIndexes(hive: Hive, dbName: String, tableName: String, max: Short): Seq[Index] + // def getIndexes(hive: Hive, dbName: String, tableName: String, max: Short): Seq[Index] protected def findMethod(klass: Class[_], name: String, args: Class[_]*): Method = { klass.getMethod(name, args: _*) @@ -688,6 +688,7 @@ private[client] class Shim_v0_12 extends Shim with Logging { hive.renamePartition(table, oldPartSpec, newPart) } + /* override def getIndexes( hive: Hive, dbName: String, @@ -696,6 +697,7 @@ private[client] class Shim_v0_12 extends Shim with Logging { recordHiveCall() hive.getIndexes(dbName, tableName, max).asScala.toSeq } + */ } private[client] class Shim_v0_13 extends Shim_v0_12 { From d871e4ce3d0a38a68bfe7fd167f32e0246cbc96c Mon Sep 17 00:00:00 2001 From: Gibson Chikafa Date: Fri, 21 Feb 2025 17:20:19 +0100 Subject: [PATCH 03/86] DaysWritableV2 --- sql/core/pom.xml | 8 ++ .../datasources/DaysWritableV2.scala | 77 +++++++++++++++++++ 
.../spark/sql/hive/HiveInspectors.scala | 23 +++--- 3 files changed, 97 insertions(+), 11 deletions(-) create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DaysWritableV2.scala diff --git a/sql/core/pom.xml b/sql/core/pom.xml index d10c1782c8a75..d202c1c60d738 100644 --- a/sql/core/pom.xml +++ b/sql/core/pom.xml @@ -109,6 +109,14 @@ ${hive.group} hive-storage-api + + ${hive.group} + hive-common + + + ${hive.group} + hive-serde + org.apache.parquet parquet-column diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DaysWritableV2.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DaysWritableV2.scala new file mode 100644 index 0000000000000..3133d7b12eb2f --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DaysWritableV2.scala @@ -0,0 +1,77 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.datasources + +import java.io.{DataInput, DataOutput, IOException} + +import org.apache.hadoop.hive.common.`type`.Date +import org.apache.hadoop.hive.serde2.io.DateWritableV2 +import org.apache.hadoop.io.WritableUtils + +import org.apache.spark.sql.catalyst.util.RebaseDateTime.{rebaseGregorianToJulianDays, rebaseJulianToGregorianDays} + + +/** + * The class accepts/returns days in Gregorian calendar and rebases them + * via conversion to local date in Julian calendar for dates before 1582-10-15 + * in read/write for backward compatibility with Spark 2.4 and earlier versions. + * + * @param gregorianDays The number of days since the epoch 1970-01-01 in + * Gregorian calendar. + * @param julianDays The number of days since the epoch 1970-01-01 in + * Julian calendar.
+ */ +class DaysWritableV2( + var gregorianDays: Int, + var julianDays: Int) + extends DateWritableV2 { + + def this() = this(0, 0) + def this(gregorianDays: Int) = + this(gregorianDays, rebaseGregorianToJulianDays(gregorianDays)) + def this(dateWritable: DateWritableV2) = { + this( + gregorianDays = dateWritable match { + case daysWritable: DaysWritableV2 => daysWritable.gregorianDays + case dateWritable: DateWritableV2 => + rebaseJulianToGregorianDays(dateWritable.getDays) + }, + julianDays = dateWritable.getDays) + } + + override def getDays: Int = julianDays + override def get: Date = { + Date.ofEpochMilli(DateWritableV2.daysToMillis(julianDays)) + } + + override def set(d: Int): Unit = { + gregorianDays = d + julianDays = rebaseGregorianToJulianDays(d) + } + + @throws[IOException] + override def write(out: DataOutput): Unit = { + WritableUtils.writeVInt(out, julianDays) + } + + @throws[IOException] + override def readFields(in: DataInput): Unit = { + julianDays = WritableUtils.readVInt(in) + gregorianDays = rebaseJulianToGregorianDays(julianDays) + } +} diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala index 707532f2bad38..8d146064980e8 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala @@ -23,7 +23,7 @@ import java.time.Duration import scala.collection.JavaConverters._ import org.apache.hadoop.{io => hadoopIo} -import org.apache.hadoop.hive.common.`type`.{HiveChar, HiveDecimal, HiveIntervalDayTime, HiveIntervalYearMonth, HiveVarchar} +import org.apache.hadoop.hive.common.`type`.{HiveChar, HiveDecimal, HiveIntervalDayTime, HiveIntervalYearMonth, HiveVarchar, Timestamp} import org.apache.hadoop.hive.serde2.{io => hiveIo} import org.apache.hadoop.hive.serde2.objectinspector.{StructField => HiveStructField, _} import org.apache.hadoop.hive.serde2.objectinspector.primitive._ @@ -33,7 +33,7 @@ import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.util._ -import org.apache.spark.sql.execution.datasources.DaysWritable +import org.apache.spark.sql.execution.datasources.DaysWritableV2 import org.apache.spark.sql.types import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.UTF8String @@ -91,8 +91,8 @@ import org.apache.spark.unsafe.types.UTF8String * org.apache.hadoop.hive.serde2.io.ShortWritable * org.apache.hadoop.hive.serde2.io.ByteWritable * org.apache.hadoop.io.BytesWritable - * org.apache.hadoop.hive.serde2.io.DateWritable - * org.apache.hadoop.hive.serde2.io.TimestampWritable + * org.apache.hadoop.hive.serde2.io.DateWritableV2 + * org.apache.hadoop.hive.serde2.io.TimestampWritableV2 * org.apache.hadoop.hive.serde2.io.HiveDecimalWritable * Complex Type * List: Object[] / java.util.List @@ -189,8 +189,8 @@ private[hive] trait HiveInspectors { case c: Class[_] if c == classOf[hiveIo.HiveDecimalWritable] => DecimalType.SYSTEM_DEFAULT case c: Class[_] if c == classOf[hiveIo.ByteWritable] => ByteType case c: Class[_] if c == classOf[hiveIo.ShortWritable] => ShortType - case c: Class[_] if c == classOf[hiveIo.DateWritable] => DateType - case c: Class[_] if c == classOf[hiveIo.TimestampWritable] => TimestampType + case c: Class[_] if c == classOf[hiveIo.DateWritableV2] => DateType + case c: Class[_] if c == 
classOf[hiveIo.TimestampWritableV2] => TimestampType case c: Class[_] if c == classOf[hadoopIo.Text] => StringType case c: Class[_] if c == classOf[hadoopIo.IntWritable] => IntegerType case c: Class[_] if c == classOf[hadoopIo.LongWritable] => LongType @@ -631,7 +631,7 @@ private[hive] trait HiveInspectors { case x: DateObjectInspector if x.preferWritable() => data: Any => { if (data != null) { - new DaysWritable(x.getPrimitiveWritableObject(data)).gregorianDays + new DaysWritableV2(x.getPrimitiveWritableObject(data).getDays).gregorianDays } else { null } @@ -1079,18 +1079,19 @@ private[hive] trait HiveInspectors { new hadoopIo.BytesWritable(value.asInstanceOf[Array[Byte]]) } - private def getDateWritable(value: Any): DaysWritable = + private def getDateWritable(value: Any): DaysWritableV2 = if (value == null) { null } else { - new DaysWritable(value.asInstanceOf[Int]) + new DaysWritableV2(value.asInstanceOf[Int]) } - private def getTimestampWritable(value: Any): hiveIo.TimestampWritable = + private def getTimestampWritable(value: Any): hiveIo.TimestampWritableV2 = if (value == null) { null } else { - new hiveIo.TimestampWritable(DateTimeUtils.toJavaTimestamp(value.asInstanceOf[Long])) + val ts = DateTimeUtils.toJavaTimestamp(value.asInstanceOf[Long]) + new hiveIo.TimestampWritableV2(Timestamp.ofEpochMilli(ts.getTime)) } private def getHiveIntervalDayTimeWritable(value: Any): hiveIo.HiveIntervalDayTimeWritable = From 2135dcf31eee6ab837f89c0c2b119e287e2e5625 Mon Sep 17 00:00:00 2001 From: Gibson Chikafa Date: Fri, 21 Feb 2025 14:42:45 +0100 Subject: [PATCH 04/86] Exclude com.vlkan:flatbuffers:1.2.0 dependency from build --- pom.xml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pom.xml b/pom.xml index ccadad0f84238..95408ce38d399 100644 --- a/pom.xml +++ b/pom.xml @@ -2477,6 +2477,10 @@ tomcat jasper-runtime + + com.vlkan + flatbuffers + From 3056fcc52bb49b39a7b363daf58c32e0da916cda Mon Sep 17 00:00:00 2001 From: Gibson Chikafa Date: Tue, 25 Feb 2025 00:05:30 +0100 Subject: [PATCH 05/86] Apply uniffle changes --- .../scala/org/apache/spark/ExecutorAllocationManager.scala | 4 +++- core/src/main/scala/org/apache/spark/SparkConf.scala | 7 +++++++ .../scala/org/apache/spark/scheduler/DAGScheduler.scala | 3 ++- .../scala/org/apache/spark/scheduler/TaskSetManager.scala | 2 +- .../org/apache/spark/sql/execution/ShuffledRowRDD.scala | 3 +++ 5 files changed, 16 insertions(+), 3 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala b/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala index 441bf60e4891e..c7e0deb4d61ed 100644 --- a/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala +++ b/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala @@ -205,7 +205,9 @@ private[spark] class ExecutorAllocationManager( s"s${DYN_ALLOCATION_SUSTAINED_SCHEDULER_BACKLOG_TIMEOUT.key} must be > 0!") } if (!conf.get(config.SHUFFLE_SERVICE_ENABLED) && !reliableShuffleStorage) { - if (conf.get(config.DYN_ALLOCATION_SHUFFLE_TRACKING_ENABLED)) { + if (conf.isRssEnable()) { + logInfo("Dynamic allocation will use remote shuffle service") + } else if (conf.get(config.DYN_ALLOCATION_SHUFFLE_TRACKING_ENABLED)) { logInfo("Dynamic allocation is enabled without a shuffle service.") } else if (decommissionEnabled && conf.get(config.STORAGE_DECOMMISSION_SHUFFLE_BLOCKS_ENABLED)) { diff --git a/core/src/main/scala/org/apache/spark/SparkConf.scala b/core/src/main/scala/org/apache/spark/SparkConf.scala index f49e9e357c84d..e1927f3a6df3d 
100644 --- a/core/src/main/scala/org/apache/spark/SparkConf.scala +++ b/core/src/main/scala/org/apache/spark/SparkConf.scala @@ -580,6 +580,13 @@ class SparkConf(loadDefaults: Boolean) extends Cloneable with Logging with Seria Utils.redact(this, getAll).sorted.map { case (k, v) => k + "=" + v }.mkString("\n") } + /** + * Return true if remote shuffle service is enabled. + */ + def isRssEnable(): Boolean = { + val shuffleMgr = get("spark.shuffle.manager", "sort") + shuffleMgr.contains("RssShuffleManager") || shuffleMgr.contains("UniffleShuffleManager") + } } private[spark] object SparkConf extends Logging { diff --git a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala index 5ae29a5cd0230..9cc4dbafbcc10 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala @@ -2628,7 +2628,8 @@ private[spark] class DAGScheduler( // we know to unregister shuffle output. (Note that "worker" specifically refers to the process // from a Standalone cluster, where the shuffle service lives in the Worker.) val fileLost = !sc.shuffleDriverComponents.supportsReliableStorage() && - (workerHost.isDefined || !env.blockManager.externalShuffleServiceEnabled) + (workerHost.isDefined || !env.blockManager.externalShuffleServiceEnabled) && + !sc.getConf.isRssEnable() removeExecutorAndUnregisterOutputs( execId = execId, fileLost = fileLost, diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala index 69b626029e4f4..13198f758d797 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala @@ -1046,7 +1046,7 @@ private[spark] class TaskSetManager( // could serve the shuffle outputs or the executor lost is caused by decommission (which // can destroy the whole host). The reason is the next stage wouldn't be able to fetch the // data from this dead executor so we would need to rerun these tasks on other executors. 
- val maybeShuffleMapOutputLoss = isShuffleMapTasks && + val maybeShuffleMapOutputLoss = isShuffleMapTasks && !conf.isRssEnable() && !sched.sc.shuffleDriverComponents.supportsReliableStorage() && (reason.isInstanceOf[ExecutorDecommission] || !env.blockManager.externalShuffleServiceEnabled) if (maybeShuffleMapOutputLoss && !isZombie) { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/ShuffledRowRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/ShuffledRowRDD.scala index 367732dbb2059..ad42e8272718f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/ShuffledRowRDD.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/ShuffledRowRDD.scala @@ -166,6 +166,9 @@ class ShuffledRowRDD( } override def getPreferredLocations(partition: Partition): Seq[String] = { + if (conf.isRssEnable()) { + return Nil + } val tracker = SparkEnv.get.mapOutputTracker.asInstanceOf[MapOutputTrackerMaster] partition.asInstanceOf[ShuffledRowRDDPartition].spec match { case CoalescedPartitionSpec(startReducerIndex, endReducerIndex, _) => From 9374587494a22b79a2de995c6f4ac4e9fbad9336 Mon Sep 17 00:00:00 2001 From: Fabio Buso Date: Wed, 6 Oct 2021 00:49:19 +0200 Subject: [PATCH 06/86] Reduce number of event logs flushes (#26) --- .../scheduler/EventLoggingListener.scala | 46 +++++++++---------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/scheduler/EventLoggingListener.scala b/core/src/main/scala/org/apache/spark/scheduler/EventLoggingListener.scala index b52a0f2f999dd..6e41e41fab14a 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/EventLoggingListener.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/EventLoggingListener.scala @@ -152,96 +152,96 @@ private[spark] class EventLoggingListener( } // log stage completed event - logEvent(event, flushLogger = true) + logEvent(event) } override def onJobStart(event: SparkListenerJobStart): Unit = { - logEvent(event.copy(properties = redactProperties(event.properties)), flushLogger = true) + logEvent(event.copy(properties = redactProperties(event.properties))) } override def onJobEnd(event: SparkListenerJobEnd): Unit = logEvent(event, flushLogger = true) override def onBlockManagerAdded(event: SparkListenerBlockManagerAdded): Unit = { - logEvent(event, flushLogger = true) + logEvent(event) } override def onBlockManagerRemoved(event: SparkListenerBlockManagerRemoved): Unit = { - logEvent(event, flushLogger = true) + logEvent(event) } override def onUnpersistRDD(event: SparkListenerUnpersistRDD): Unit = { - logEvent(event, flushLogger = true) + logEvent(event) } override def onApplicationStart(event: SparkListenerApplicationStart): Unit = { - logEvent(event, flushLogger = true) + logEvent(event) } override def onApplicationEnd(event: SparkListenerApplicationEnd): Unit = { logEvent(event, flushLogger = true) } override def onExecutorAdded(event: SparkListenerExecutorAdded): Unit = { - logEvent(event, flushLogger = true) + logEvent(event) } override def onExecutorRemoved(event: SparkListenerExecutorRemoved): Unit = { - logEvent(event, flushLogger = true) + logEvent(event) } override def onExecutorBlacklisted(event: SparkListenerExecutorBlacklisted): Unit = { - logEvent(event, flushLogger = true) + logEvent(event) } override def onExecutorExcluded(event: SparkListenerExecutorExcluded): Unit = { - logEvent(event, flushLogger = true) + logEvent(event) } override def onExecutorBlacklistedForStage( event: SparkListenerExecutorBlacklistedForStage): Unit = 
{ - logEvent(event, flushLogger = true) + logEvent(event) } override def onExecutorExcludedForStage( event: SparkListenerExecutorExcludedForStage): Unit = { - logEvent(event, flushLogger = true) + logEvent(event) } override def onNodeBlacklistedForStage(event: SparkListenerNodeBlacklistedForStage): Unit = { - logEvent(event, flushLogger = true) + logEvent(event) } override def onNodeExcludedForStage(event: SparkListenerNodeExcludedForStage): Unit = { - logEvent(event, flushLogger = true) + logEvent(event) } override def onExecutorUnblacklisted(event: SparkListenerExecutorUnblacklisted): Unit = { - logEvent(event, flushLogger = true) + logEvent(event) } override def onExecutorUnexcluded(event: SparkListenerExecutorUnexcluded): Unit = { - logEvent(event, flushLogger = true) + logEvent(event) } override def onNodeBlacklisted(event: SparkListenerNodeBlacklisted): Unit = { - logEvent(event, flushLogger = true) + logEvent(event) } override def onNodeExcluded(event: SparkListenerNodeExcluded): Unit = { - logEvent(event, flushLogger = true) + logEvent(event) } override def onNodeUnblacklisted(event: SparkListenerNodeUnblacklisted): Unit = { - logEvent(event, flushLogger = true) + logEvent(event) } override def onNodeUnexcluded(event: SparkListenerNodeUnexcluded): Unit = { - logEvent(event, flushLogger = true) + logEvent(event) } override def onBlockUpdated(event: SparkListenerBlockUpdated): Unit = { if (shouldLogBlockUpdates) { - logEvent(event, flushLogger = true) + logEvent(event) } } @@ -266,12 +266,12 @@ private[spark] class EventLoggingListener( } override def onResourceProfileAdded(event: SparkListenerResourceProfileAdded): Unit = { - logEvent(event, flushLogger = true) + logEvent(event) } override def onOtherEvent(event: SparkListenerEvent): Unit = { if (event.logEvent) { - logEvent(event, flushLogger = true) + logEvent(event) } } From eeb2aaabad61aac94f5400f8aab939a9a9d86b86 Mon Sep 17 00:00:00 2001 From: Gibson Chikafa Date: Thu, 4 Jul 2024 18:12:50 +0200 Subject: [PATCH 07/86] [HWORKS-1405] Get correct hive version in spark (#40) --- .../hive/client/IsolatedClientLoader.scala | 40 ++++++++++--------- 1 file changed, 22 insertions(+), 18 deletions(-) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/IsolatedClientLoader.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/IsolatedClientLoader.scala index 5ae63b6efad17..f5cc72860ff7c 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/IsolatedClientLoader.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/IsolatedClientLoader.scala @@ -89,24 +89,28 @@ private[hive] object IsolatedClientLoader extends Logging { } def hiveVersion(version: String): HiveVersion = { - // Fabio: Remove the last digit of the version string as it's the Hopsworks specific version - version.substring(0, version.lastIndexOf(".")) match { - case "12" | "0.12" | "0.12.0" => hive.v12 - case "13" | "0.13" | "0.13.0" | "0.13.1" => hive.v13 - case "14" | "0.14" | "0.14.0" => hive.v14 - case "1.0" | "1.0.0" | "1.0.1" => hive.v1_0 - case "1.1" | "1.1.0" | "1.1.1" => hive.v1_1 - case "1.2" | "1.2.0" | "1.2.1" | "1.2.2" => hive.v1_2 - case "2.0" | "2.0.0" | "2.0.1" => hive.v2_0 - case "2.1" | "2.1.0" | "2.1.1" => hive.v2_1 - case "2.2" | "2.2.0" => hive.v2_2 - case "2.3" | "2.3.0" | "2.3.1" | "2.3.2" | "2.3.3" | "2.3.4" | "2.3.5" | "2.3.6" | "2.3.7" => - hive.v2_3 - case "3.0" | "3.0.0" => hive.v3_0 - case "3.1" | "3.1.0" | "3.1.1" | "3.1.2" => hive.v3_1 - case version => - throw new 
UnsupportedOperationException(s"Unsupported Hive Metastore version ($version). " + - s"Please set ${HiveUtils.HIVE_METASTORE_VERSION.key} with a valid version.") + def extractMajorMinorVersion(version: String): String = { + val parts = version.split("\\.") + if (parts.length >= 2) parts(0) + "." + parts(1) else parts(0) + } + + val majorMinorVersion = extractMajorMinorVersion(version) + majorMinorVersion match { + case "0.12" => hive.v12 + case "0.13" => hive.v13 + case "0.14" => hive.v14 + case "1.0" => hive.v1_0 + case "1.1" => hive.v1_1 + case "1.2" => hive.v1_2 + case "2.0" => hive.v2_0 + case "2.1" => hive.v2_1 + case "2.2" => hive.v2_2 + case "2.3" => hive.v2_3 + case "3.0" => hive.v3_0 + case "3.1" => hive.v3_1 + case _ => throw new UnsupportedOperationException(s"Unsupported " + + s"Hive Metastore version ($version). Please set " + + s"${HiveUtils.HIVE_METASTORE_VERSION.key} with a valid version.") } } From 0c3b65b4d9b03d2491fd775789d943a75e2f950b Mon Sep 17 00:00:00 2001 From: Gibson Chikafa Date: Fri, 21 Feb 2025 14:39:04 +0100 Subject: [PATCH 08/86] Timestamp incompatibility Spark/Hive/Hudi - Hive fix - release 3.1.1.3 --- .../apache/spark/sql/hive/HiveInspectors.scala | 16 ++++++++++------ .../org/apache/spark/sql/hive/TableReader.scala | 6 ++++-- .../spark/sql/hive/client/HiveClient.scala | 8 ++++---- .../spark/sql/hive/client/HiveClientImpl.scala | 16 +++++++++------- 4 files changed, 27 insertions(+), 19 deletions(-) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala index 8d146064980e8..955ffd6b3b5b9 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala @@ -478,7 +478,7 @@ private[hive] trait HiveInspectors { _ => constant case poi: WritableConstantTimestampObjectInspector => val t = poi.getWritableConstantValue - val constant = DateTimeUtils.fromJavaTimestamp(t.getTimestamp) + val constant = DateTimeUtils.fromJavaTimestamp(t.getTimestamp.toSqlTimestamp) _ => constant case poi: WritableConstantIntObjectInspector => val constant = poi.getWritableConstantValue.get() @@ -496,7 +496,8 @@ private[hive] trait HiveInspectors { val constant = poi.getWritableConstantValue.get() _ => constant case poi: WritableConstantShortObjectInspector => - val constant = poi.getWritableConstantValue.get() + val constant = DateTimeUtils.fromJavaDate(new java.sql.Date(poi.getWritableConstantValue + .get())) _ => constant case poi: WritableConstantByteObjectInspector => val constant = poi.getWritableConstantValue.get() @@ -507,7 +508,8 @@ private[hive] trait HiveInspectors { System.arraycopy(writable.getBytes, 0, constant, 0, constant.length) _ => constant case poi: WritableConstantDateObjectInspector => - val constant = DateTimeUtils.fromJavaDate(poi.getWritableConstantValue.get()) + val constant = DateTimeUtils.fromJavaDate(new java.sql.Date(poi.getWritableConstantValue + .get().toEpochMilli)) _ => constant case mi: StandardConstantMapObjectInspector => val keyUnwrapper = unwrapperFor(mi.getMapKeyObjectInspector) @@ -639,7 +641,8 @@ private[hive] trait HiveInspectors { case x: DateObjectInspector => data: Any => { if (data != null) { - DateTimeUtils.fromJavaDate(x.getPrimitiveJavaObject(data)) + DateTimeUtils.fromJavaDate(new java.sql.Date(x.getPrimitiveJavaObject(data) + .toEpochMilli)) } else { null } @@ -647,7 +650,8 @@ private[hive] trait HiveInspectors { case x: 
TimestampObjectInspector if x.preferWritable() => data: Any => { if (data != null) { - DateTimeUtils.fromJavaTimestamp(x.getPrimitiveWritableObject(data).getTimestamp) + DateTimeUtils.fromJavaTimestamp(x.getPrimitiveWritableObject(data).getTimestamp + .toSqlTimestamp) } else { null } @@ -655,7 +659,7 @@ private[hive] trait HiveInspectors { case ti: TimestampObjectInspector => data: Any => { if (data != null) { - DateTimeUtils.fromJavaTimestamp(ti.getPrimitiveJavaObject(data)) + DateTimeUtils.fromJavaTimestamp(ti.getPrimitiveJavaObject(data).toSqlTimestamp) } else { null } diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/TableReader.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/TableReader.scala index cd1d236dd36c9..e3c93cbb692b3 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/TableReader.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/TableReader.scala @@ -522,10 +522,12 @@ private[hive] object HadoopTableReader extends HiveInspectors with Logging { row.update(ordinal, HiveShim.toCatalystDecimal(oi, value)) case oi: TimestampObjectInspector => (value: Any, row: InternalRow, ordinal: Int) => - row.setLong(ordinal, DateTimeUtils.fromJavaTimestamp(oi.getPrimitiveJavaObject(value))) + row.setLong(ordinal, DateTimeUtils.fromJavaTimestamp(oi.getPrimitiveJavaObject(value) + .toSqlTimestamp)) case oi: DateObjectInspector => (value: Any, row: InternalRow, ordinal: Int) => - row.setInt(ordinal, DateTimeUtils.fromJavaDate(oi.getPrimitiveJavaObject(value))) + row.setInt(ordinal, DateTimeUtils.fromJavaDate(new java.sql.Date(oi + .getPrimitiveJavaObject(value).toEpochMilli))) case oi: BinaryObjectInspector => (value: Any, row: InternalRow, ordinal: Int) => row.update(ordinal, oi.getPrimitiveJavaObject(value)) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClient.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClient.scala index 7dbabcae484f1..226c722721e0d 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClient.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClient.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql.hive.client -import java.io.PrintStream +import org.apache.hadoop.hive.common.io.SessionStream import org.apache.spark.sql.catalyst.analysis._ import org.apache.spark.sql.catalyst.catalog._ @@ -59,9 +59,9 @@ private[hive] trait HiveClient { */ def runSqlHive(sql: String): Seq[String] - def setOut(stream: PrintStream): Unit - def setInfo(stream: PrintStream): Unit - def setError(stream: PrintStream): Unit + def setOut(stream: SessionStream): Unit + def setInfo(stream: SessionStream): Unit + def setError(stream: SessionStream): Unit /** Returns the names of all tables in the given database. 
*/ def listTables(dbName: String): Seq[String] diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala index 22ceb6dacf932..d6a28c1f81c41 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala @@ -17,7 +17,6 @@ package org.apache.spark.sql.hive.client -import java.io.PrintStream import java.lang.{Iterable => JIterable} import java.lang.reflect.InvocationTargetException import java.nio.charset.StandardCharsets.UTF_8 @@ -31,6 +30,7 @@ import scala.collection.mutable.ArrayBuffer import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.hadoop.hive.common.StatsSetupConst +import org.apache.hadoop.hive.common.io.SessionStream import org.apache.hadoop.hive.conf.HiveConf import org.apache.hadoop.hive.conf.HiveConf.ConfVars import org.apache.hadoop.hive.metastore.{IMetaStoreClient, TableType => HiveTableType} @@ -183,8 +183,8 @@ private[hive] class HiveClientImpl( // got changed. We reset it to clientLoader.ClassLoader here. state.getConf.setClassLoader(clientLoader.classLoader) shim.setCurrentSessionState(state) - state.out = new PrintStream(outputBuffer, true, UTF_8.name()) - state.err = new PrintStream(outputBuffer, true, UTF_8.name()) + state.out = new SessionStream(outputBuffer, true, UTF_8.name()) + state.err = new SessionStream(outputBuffer, true, UTF_8.name()) state } @@ -312,15 +312,15 @@ private[hive] class HiveClientImpl( ret } - def setOut(stream: PrintStream): Unit = withHiveState { + def setOut(stream: SessionStream): Unit = withHiveState { state.out = stream } - def setInfo(stream: PrintStream): Unit = withHiveState { + def setInfo(stream: SessionStream): Unit = withHiveState { state.info = stream } - def setError(stream: PrintStream): Unit = withHiveState { + def setError(stream: SessionStream): Unit = withHiveState { state.err = stream } @@ -874,6 +874,8 @@ private[hive] class HiveClientImpl( // Hive query needs to start SessionState. SessionState.start(state) + state.out = new SessionStream(outputBuffer, true, UTF_8.name()) + state.err = new SessionStream(outputBuffer, true, UTF_8.name()) logDebug(s"Running hiveql '$cmd'") if (cmd.toLowerCase(Locale.ROOT).startsWith("set")) { logDebug(s"Changing config: $cmd") } try { @@ -1353,7 +1355,7 @@ private[hive] object HiveClientImpl extends Logging { new HiveConf(conf, classOf[HiveConf]) } try { - Hive.getWithoutRegisterFns(hiveConf) + Hive.getWithFastCheck(hiveConf, false) } catch { // SPARK-37069: not all Hive versions have the above method (e.g., Hive 2.3.9 has it but // 2.3.8 don't), therefore here we fallback when encountering the exception. 
From 0bea3ba45b4a6f0e2fba7d18437f5aeb1c34f647 Mon Sep 17 00:00:00 2001 From: Gibson Chikafa Date: Wed, 26 Jun 2024 13:31:12 +0200 Subject: [PATCH 09/86] Bump hive version to 3.0.0.13.5 (#39) --- pom.xml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pom.xml b/pom.xml index 95408ce38d399..0435adc65bf4e 100644 --- a/pom.xml +++ b/pom.xml @@ -133,8 +133,8 @@ io.hops.hive core - 3.0.0.8-SNAPSHOT - 3.0.0.8-SNAPSHOT + 3.0.0.13.5 + 3.0.0.13.5 3.0 From f7d50b23338a58f55f283e0758ef52a2cc7e3b8f Mon Sep 17 00:00:00 2001 From: Gibson Chikafa Date: Thu, 20 Feb 2025 17:56:37 +0100 Subject: [PATCH 10/86] Change hadoop version --- pom.xml | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/pom.xml b/pom.xml index 0435adc65bf4e..d88302a032677 100644 --- a/pom.xml +++ b/pom.xml @@ -122,7 +122,7 @@ 2.0.17 2.24.3 - 3.2.0.2 + 3.2.0.16-EE-SNAPSHOT io.hops 3.23.4 @@ -352,9 +352,9 @@ - Hops - Hops Repository - https://archiva.hops.works/repository/Hops/ + HopsEE + Hops Release Repository + https://nexus.hops.works/repository/hops-artifacts true @@ -3988,9 +3988,9 @@ - Hops - Hops Repo - https://archiva.hops.works/repository/Hops/ + HopsEE + Hops Release Repository + https://nexus.hops.works/repository/hops-artifacts From 2a5fee43a923699d6ce496f9a194711a1a8a8fd2 Mon Sep 17 00:00:00 2001 From: Gibson Chikafa Date: Wed, 11 Jun 2025 15:49:25 +0200 Subject: [PATCH 11/86] Add Hops Repo --- pom.xml | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/pom.xml b/pom.xml index d88302a032677..8244102acca78 100644 --- a/pom.xml +++ b/pom.xml @@ -362,6 +362,17 @@ true + + Hops + Hops Repo + https://archiva.hops.works/repository/Hops/ + + true + + + true + + From 4688cfda16bd937aba9b6a87bf5edd7c3ab66c62 Mon Sep 17 00:00:00 2001 From: Hyukjin Kwon Date: Sat, 14 Jun 2025 20:06:45 +0900 Subject: [PATCH 12/86] [SPARK-52420][PYTHON][TESTS][FOLLOW-UP][3.5] Make test_udtf_with_invalid_return_type compatible with Python only client --- python/pyspark/sql/tests/connect/test_parity_udtf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyspark/sql/tests/connect/test_parity_udtf.py b/python/pyspark/sql/tests/connect/test_parity_udtf.py index 17df397d2f11b..2e37c6962caf0 100644 --- a/python/pyspark/sql/tests/connect/test_parity_udtf.py +++ b/python/pyspark/sql/tests/connect/test_parity_udtf.py @@ -54,7 +54,7 @@ class TestUDTF: def eval(self, a: int): yield a + 1, - with self.assertRaisesRegex(InvalidPlanInput, "Invalid.*type"): + with self.assertRaisesRegex(Exception, "Invalid.*type"): TestUDTF(lit(1)).collect() @unittest.skipIf( From 5f98a2206b4d122b6dd7705a30db622518e85504 Mon Sep 17 00:00:00 2001 From: Gibson Chikafa Date: Tue, 17 Jun 2025 00:46:06 +0200 Subject: [PATCH 13/86] Change version to 3.5.8 --- assembly/pom.xml | 2 +- common/kvstore/pom.xml | 2 +- common/network-common/pom.xml | 2 +- common/network-shuffle/pom.xml | 2 +- common/network-yarn/pom.xml | 2 +- common/sketch/pom.xml | 2 +- common/tags/pom.xml | 2 +- common/unsafe/pom.xml | 2 +- common/utils/pom.xml | 2 +- connector/avro/pom.xml | 2 +- connector/connect/client/jvm/pom.xml | 2 +- connector/connect/common/pom.xml | 2 +- connector/connect/server/pom.xml | 2 +- connector/docker-integration-tests/pom.xml | 2 +- connector/kafka-0-10-assembly/pom.xml | 2 +- connector/kafka-0-10-sql/pom.xml | 2 +- connector/kafka-0-10-token-provider/pom.xml | 2 +- connector/kafka-0-10/pom.xml | 2 +- connector/kinesis-asl-assembly/pom.xml | 2 +- connector/kinesis-asl/pom.xml | 2 +- 
connector/protobuf/pom.xml | 2 +- connector/spark-ganglia-lgpl/pom.xml | 2 +- core/pom.xml | 2 +- docs/_config.yml | 2 +- examples/pom.xml | 2 +- graphx/pom.xml | 2 +- hadoop-cloud/pom.xml | 2 +- launcher/pom.xml | 2 +- mllib-local/pom.xml | 2 +- mllib/pom.xml | 2 +- pom.xml | 14 +++++++++++++- repl/pom.xml | 2 +- resource-managers/kubernetes/core/pom.xml | 2 +- .../kubernetes/integration-tests/pom.xml | 2 +- resource-managers/mesos/pom.xml | 2 +- resource-managers/yarn/pom.xml | 2 +- sql/api/pom.xml | 2 +- sql/catalyst/pom.xml | 2 +- sql/core/pom.xml | 2 +- sql/hive-thriftserver/pom.xml | 2 +- sql/hive/pom.xml | 2 +- streaming/pom.xml | 2 +- tools/pom.xml | 2 +- 43 files changed, 55 insertions(+), 43 deletions(-) diff --git a/assembly/pom.xml b/assembly/pom.xml index e2b19febfb57d..667a8743d3359 100644 --- a/assembly/pom.xml +++ b/assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8-SNAPSHOT + 3.5.8 ../pom.xml diff --git a/common/kvstore/pom.xml b/common/kvstore/pom.xml index a543c74ca9003..b63a467f00da7 100644 --- a/common/kvstore/pom.xml +++ b/common/kvstore/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8-SNAPSHOT + 3.5.8 ../../pom.xml diff --git a/common/network-common/pom.xml b/common/network-common/pom.xml index 60d6066b3cc0e..4fd4e87656487 100644 --- a/common/network-common/pom.xml +++ b/common/network-common/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8-SNAPSHOT + 3.5.8 ../../pom.xml diff --git a/common/network-shuffle/pom.xml b/common/network-shuffle/pom.xml index a8750506edcb2..ac34aaae59620 100644 --- a/common/network-shuffle/pom.xml +++ b/common/network-shuffle/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8-SNAPSHOT + 3.5.8 ../../pom.xml diff --git a/common/network-yarn/pom.xml b/common/network-yarn/pom.xml index 7cd1d526fdab5..393ea344b80c3 100644 --- a/common/network-yarn/pom.xml +++ b/common/network-yarn/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8-SNAPSHOT + 3.5.8 ../../pom.xml diff --git a/common/sketch/pom.xml b/common/sketch/pom.xml index 258b786ed308b..8a9acdf00b624 100644 --- a/common/sketch/pom.xml +++ b/common/sketch/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8-SNAPSHOT + 3.5.8 ../../pom.xml diff --git a/common/tags/pom.xml b/common/tags/pom.xml index b237b1e272ecc..83a17c7dd9823 100644 --- a/common/tags/pom.xml +++ b/common/tags/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8-SNAPSHOT + 3.5.8 ../../pom.xml diff --git a/common/unsafe/pom.xml b/common/unsafe/pom.xml index 4f2ffa892d523..222c805d6823c 100644 --- a/common/unsafe/pom.xml +++ b/common/unsafe/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8-SNAPSHOT + 3.5.8 ../../pom.xml diff --git a/common/utils/pom.xml b/common/utils/pom.xml index 7c445789d8fac..148fe7f466f96 100644 --- a/common/utils/pom.xml +++ b/common/utils/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8-SNAPSHOT + 3.5.8 ../../pom.xml diff --git a/connector/avro/pom.xml b/connector/avro/pom.xml index a6b310bdefa72..187e4d777c0d0 100644 --- a/connector/avro/pom.xml +++ b/connector/avro/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8-SNAPSHOT + 3.5.8 ../../pom.xml diff --git a/connector/connect/client/jvm/pom.xml b/connector/connect/client/jvm/pom.xml index 1f309e2db75ab..88f4804e4bee2 100644 --- a/connector/connect/client/jvm/pom.xml +++ b/connector/connect/client/jvm/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.12 - 
3.5.8-SNAPSHOT + 3.5.8 ../../../../pom.xml diff --git a/connector/connect/common/pom.xml b/connector/connect/common/pom.xml index f676a5eccbad5..7762a6e65a9a1 100644 --- a/connector/connect/common/pom.xml +++ b/connector/connect/common/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8-SNAPSHOT + 3.5.8 ../../../pom.xml diff --git a/connector/connect/server/pom.xml b/connector/connect/server/pom.xml index 97ff8140a3632..69f95faaa833c 100644 --- a/connector/connect/server/pom.xml +++ b/connector/connect/server/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8-SNAPSHOT + 3.5.8 ../../../pom.xml diff --git a/connector/docker-integration-tests/pom.xml b/connector/docker-integration-tests/pom.xml index d7a6ef9547aba..77787a9ae4e5b 100644 --- a/connector/docker-integration-tests/pom.xml +++ b/connector/docker-integration-tests/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8-SNAPSHOT + 3.5.8 ../../pom.xml diff --git a/connector/kafka-0-10-assembly/pom.xml b/connector/kafka-0-10-assembly/pom.xml index 5e42afb48354e..ab244bc46339b 100644 --- a/connector/kafka-0-10-assembly/pom.xml +++ b/connector/kafka-0-10-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8-SNAPSHOT + 3.5.8 ../../pom.xml diff --git a/connector/kafka-0-10-sql/pom.xml b/connector/kafka-0-10-sql/pom.xml index 38e481ffae6b6..e9e4eb00b674a 100644 --- a/connector/kafka-0-10-sql/pom.xml +++ b/connector/kafka-0-10-sql/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8-SNAPSHOT + 3.5.8 ../../pom.xml diff --git a/connector/kafka-0-10-token-provider/pom.xml b/connector/kafka-0-10-token-provider/pom.xml index 1159507872b2b..a399672aa5d2c 100644 --- a/connector/kafka-0-10-token-provider/pom.xml +++ b/connector/kafka-0-10-token-provider/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8-SNAPSHOT + 3.5.8 ../../pom.xml diff --git a/connector/kafka-0-10/pom.xml b/connector/kafka-0-10/pom.xml index 3d804b66248dd..17273cf66d350 100644 --- a/connector/kafka-0-10/pom.xml +++ b/connector/kafka-0-10/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8-SNAPSHOT + 3.5.8 ../../pom.xml diff --git a/connector/kinesis-asl-assembly/pom.xml b/connector/kinesis-asl-assembly/pom.xml index c555fd6cfd35d..7299a9ff6b158 100644 --- a/connector/kinesis-asl-assembly/pom.xml +++ b/connector/kinesis-asl-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8-SNAPSHOT + 3.5.8 ../../pom.xml diff --git a/connector/kinesis-asl/pom.xml b/connector/kinesis-asl/pom.xml index 1bfc3384fa35c..007fea1c4a71f 100644 --- a/connector/kinesis-asl/pom.xml +++ b/connector/kinesis-asl/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8-SNAPSHOT + 3.5.8 ../../pom.xml diff --git a/connector/protobuf/pom.xml b/connector/protobuf/pom.xml index de2e490c00341..3d820f8c6f880 100644 --- a/connector/protobuf/pom.xml +++ b/connector/protobuf/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8-SNAPSHOT + 3.5.8 ../../pom.xml diff --git a/connector/spark-ganglia-lgpl/pom.xml b/connector/spark-ganglia-lgpl/pom.xml index 2fd4109af2d4f..be66665dc1d3e 100644 --- a/connector/spark-ganglia-lgpl/pom.xml +++ b/connector/spark-ganglia-lgpl/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8-SNAPSHOT + 3.5.8 ../../pom.xml diff --git a/core/pom.xml b/core/pom.xml index 359e47f238acf..188ef108cb889 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8-SNAPSHOT 
+ 3.5.8 ../pom.xml diff --git a/docs/_config.yml b/docs/_config.yml index d19efdb99ba06..c0c5e92d8f1a4 100644 --- a/docs/_config.yml +++ b/docs/_config.yml @@ -19,7 +19,7 @@ include: # These allow the documentation to be updated with newer releases # of Spark, Scala, and Mesos. -SPARK_VERSION: 3.5.8-SNAPSHOT +SPARK_VERSION: 3.5.8 SPARK_VERSION_SHORT: 3.5.8 SCALA_BINARY_VERSION: "2.12" SCALA_VERSION: "2.12.18" diff --git a/examples/pom.xml b/examples/pom.xml index 00bc0d7bca367..afe985b612b5d 100644 --- a/examples/pom.xml +++ b/examples/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8-SNAPSHOT + 3.5.8 ../pom.xml diff --git a/graphx/pom.xml b/graphx/pom.xml index 9f9ff9b0d9157..aea672a98b06d 100644 --- a/graphx/pom.xml +++ b/graphx/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8-SNAPSHOT + 3.5.8 ../pom.xml diff --git a/hadoop-cloud/pom.xml b/hadoop-cloud/pom.xml index c4f00569c207c..a8be8dd7a0423 100644 --- a/hadoop-cloud/pom.xml +++ b/hadoop-cloud/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8-SNAPSHOT + 3.5.8 ../pom.xml diff --git a/launcher/pom.xml b/launcher/pom.xml index c67b33fee6c9c..91f3753fb06b5 100644 --- a/launcher/pom.xml +++ b/launcher/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8-SNAPSHOT + 3.5.8 ../pom.xml diff --git a/mllib-local/pom.xml b/mllib-local/pom.xml index 0f1e9fa843757..e2c1fc385560e 100644 --- a/mllib-local/pom.xml +++ b/mllib-local/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8-SNAPSHOT + 3.5.8 ../pom.xml diff --git a/mllib/pom.xml b/mllib/pom.xml index adfe9b29141fa..9685f904c7ff2 100644 --- a/mllib/pom.xml +++ b/mllib/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8-SNAPSHOT + 3.5.8 ../pom.xml diff --git a/pom.xml b/pom.xml index 8244102acca78..3fd0769089683 100644 --- a/pom.xml +++ b/pom.xml @@ -26,7 +26,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8-SNAPSHOT + 3.5.8 pom Spark Project Parent POM https://spark.apache.org/ @@ -2730,6 +2730,12 @@ parquet-column ${parquet.version} ${parquet.deps.scope} + + + com.h2database + h2 + + org.apache.parquet @@ -2751,6 +2757,12 @@ ${parquet.version} ${parquet.test.deps.scope} tests + + + com.h2database + h2 + + org.apache.parquet diff --git a/repl/pom.xml b/repl/pom.xml index 8f3ae8b4f2fc7..5f08f83e826ea 100644 --- a/repl/pom.xml +++ b/repl/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8-SNAPSHOT + 3.5.8 ../pom.xml diff --git a/resource-managers/kubernetes/core/pom.xml b/resource-managers/kubernetes/core/pom.xml index f1feb2a61325f..1b9d4799634e4 100644 --- a/resource-managers/kubernetes/core/pom.xml +++ b/resource-managers/kubernetes/core/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8-SNAPSHOT + 3.5.8 ../../../pom.xml diff --git a/resource-managers/kubernetes/integration-tests/pom.xml b/resource-managers/kubernetes/integration-tests/pom.xml index 115c6ae85ab96..96f4443370938 100644 --- a/resource-managers/kubernetes/integration-tests/pom.xml +++ b/resource-managers/kubernetes/integration-tests/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8-SNAPSHOT + 3.5.8 ../../../pom.xml diff --git a/resource-managers/mesos/pom.xml b/resource-managers/mesos/pom.xml index e57c69d1040f3..16786559c11d7 100644 --- a/resource-managers/mesos/pom.xml +++ b/resource-managers/mesos/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8-SNAPSHOT + 3.5.8 ../../pom.xml diff --git a/resource-managers/yarn/pom.xml b/resource-managers/yarn/pom.xml 
index dd494ec5b3b1a..9653c7ff8e0ca 100644 --- a/resource-managers/yarn/pom.xml +++ b/resource-managers/yarn/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8-SNAPSHOT + 3.5.8 ../../pom.xml diff --git a/sql/api/pom.xml b/sql/api/pom.xml index 49dee295ff196..4cd51159630fe 100644 --- a/sql/api/pom.xml +++ b/sql/api/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8-SNAPSHOT + 3.5.8 ../../pom.xml diff --git a/sql/catalyst/pom.xml b/sql/catalyst/pom.xml index 7317a3f7ab94a..424c9decb6164 100644 --- a/sql/catalyst/pom.xml +++ b/sql/catalyst/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8-SNAPSHOT + 3.5.8 ../../pom.xml diff --git a/sql/core/pom.xml b/sql/core/pom.xml index d202c1c60d738..1953607a0ba0d 100644 --- a/sql/core/pom.xml +++ b/sql/core/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8-SNAPSHOT + 3.5.8 ../../pom.xml diff --git a/sql/hive-thriftserver/pom.xml b/sql/hive-thriftserver/pom.xml index 918bb9fa5c72c..42d5e5bc97780 100644 --- a/sql/hive-thriftserver/pom.xml +++ b/sql/hive-thriftserver/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8-SNAPSHOT + 3.5.8 ../../pom.xml diff --git a/sql/hive/pom.xml b/sql/hive/pom.xml index 65ff31c0b1d87..b0c39ec7831c4 100644 --- a/sql/hive/pom.xml +++ b/sql/hive/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8-SNAPSHOT + 3.5.8 ../../pom.xml diff --git a/streaming/pom.xml b/streaming/pom.xml index a8e82cb10d377..d94c5951de2f4 100644 --- a/streaming/pom.xml +++ b/streaming/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8-SNAPSHOT + 3.5.8 ../pom.xml diff --git a/tools/pom.xml b/tools/pom.xml index b864ede1149ec..162830e13fc7e 100644 --- a/tools/pom.xml +++ b/tools/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8-SNAPSHOT + 3.5.8 ../pom.xml From e03729fb79346521a0039e9a2b3dd620e44b6d51 Mon Sep 17 00:00:00 2001 From: Gibson Chikafa Date: Tue, 17 Jun 2025 00:47:08 +0200 Subject: [PATCH 14/86] Change kafka version to 2.6.0 --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 3fd0769089683..0a7adac116725 100644 --- a/pom.xml +++ b/pom.xml @@ -138,7 +138,7 @@ 3.0 - 3.4.1 + 2.6.0 10.14.2.0 1.13.1 From 8efa025109e8074c416cadd5fe21e4b99f2ba4ec Mon Sep 17 00:00:00 2001 From: Gibson Chikafa Date: Thu, 19 Jun 2025 09:17:00 +0200 Subject: [PATCH 15/86] Change version to 3.5.5 --- R/pkg/DESCRIPTION | 2 +- assembly/pom.xml | 2 +- common/kvstore/pom.xml | 2 +- common/network-common/pom.xml | 2 +- common/network-shuffle/pom.xml | 2 +- common/network-yarn/pom.xml | 2 +- common/sketch/pom.xml | 2 +- common/tags/pom.xml | 2 +- common/unsafe/pom.xml | 2 +- common/utils/pom.xml | 2 +- connector/avro/pom.xml | 2 +- connector/connect/client/jvm/pom.xml | 2 +- connector/connect/common/pom.xml | 2 +- connector/connect/server/pom.xml | 2 +- connector/docker-integration-tests/pom.xml | 2 +- connector/kafka-0-10-assembly/pom.xml | 2 +- connector/kafka-0-10-sql/pom.xml | 2 +- connector/kafka-0-10-token-provider/pom.xml | 2 +- connector/kafka-0-10/pom.xml | 2 +- connector/kinesis-asl-assembly/pom.xml | 2 +- connector/kinesis-asl/pom.xml | 2 +- connector/protobuf/pom.xml | 2 +- connector/spark-ganglia-lgpl/pom.xml | 2 +- core/pom.xml | 2 +- docs/_config.yml | 6 +++--- examples/pom.xml | 2 +- graphx/pom.xml | 2 +- hadoop-cloud/pom.xml | 2 +- launcher/pom.xml | 2 +- mllib-local/pom.xml | 2 +- mllib/pom.xml | 2 +- pom.xml | 2 +- python/pyspark/version.py | 2 +- repl/pom.xml | 2 +- 
resource-managers/kubernetes/core/pom.xml | 2 +- resource-managers/kubernetes/integration-tests/pom.xml | 2 +- resource-managers/mesos/pom.xml | 2 +- resource-managers/yarn/pom.xml | 2 +- sql/api/pom.xml | 2 +- sql/catalyst/pom.xml | 2 +- sql/core/pom.xml | 2 +- sql/hive-thriftserver/pom.xml | 2 +- sql/hive/pom.xml | 2 +- streaming/pom.xml | 2 +- tools/pom.xml | 2 +- 45 files changed, 47 insertions(+), 47 deletions(-) diff --git a/R/pkg/DESCRIPTION b/R/pkg/DESCRIPTION index d8b7f76b262f3..5eca59375425e 100644 --- a/R/pkg/DESCRIPTION +++ b/R/pkg/DESCRIPTION @@ -1,6 +1,6 @@ Package: SparkR Type: Package -Version: 3.5.8 +Version: 3.5.5 Title: R Front End for 'Apache Spark' Description: Provides an R Front end for 'Apache Spark' . Authors@R: diff --git a/assembly/pom.xml b/assembly/pom.xml index 667a8743d3359..6eb1c9c341b81 100644 --- a/assembly/pom.xml +++ b/assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8 + 3.5.5 ../pom.xml diff --git a/common/kvstore/pom.xml b/common/kvstore/pom.xml index b63a467f00da7..a1ec2748329b9 100644 --- a/common/kvstore/pom.xml +++ b/common/kvstore/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8 + 3.5.5 ../../pom.xml diff --git a/common/network-common/pom.xml b/common/network-common/pom.xml index 4fd4e87656487..6ae7863161b1e 100644 --- a/common/network-common/pom.xml +++ b/common/network-common/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8 + 3.5.5 ../../pom.xml diff --git a/common/network-shuffle/pom.xml b/common/network-shuffle/pom.xml index ac34aaae59620..7537e39d93ea5 100644 --- a/common/network-shuffle/pom.xml +++ b/common/network-shuffle/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8 + 3.5.5 ../../pom.xml diff --git a/common/network-yarn/pom.xml b/common/network-yarn/pom.xml index 393ea344b80c3..f458b6c4e7e15 100644 --- a/common/network-yarn/pom.xml +++ b/common/network-yarn/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8 + 3.5.5 ../../pom.xml diff --git a/common/sketch/pom.xml b/common/sketch/pom.xml index 8a9acdf00b624..43313bd0ec28f 100644 --- a/common/sketch/pom.xml +++ b/common/sketch/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8 + 3.5.5 ../../pom.xml diff --git a/common/tags/pom.xml b/common/tags/pom.xml index 83a17c7dd9823..471b499c37297 100644 --- a/common/tags/pom.xml +++ b/common/tags/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8 + 3.5.5 ../../pom.xml diff --git a/common/unsafe/pom.xml b/common/unsafe/pom.xml index 222c805d6823c..4dac8356b77b6 100644 --- a/common/unsafe/pom.xml +++ b/common/unsafe/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8 + 3.5.5 ../../pom.xml diff --git a/common/utils/pom.xml b/common/utils/pom.xml index 148fe7f466f96..2e4e0dcdaa2eb 100644 --- a/common/utils/pom.xml +++ b/common/utils/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8 + 3.5.5 ../../pom.xml diff --git a/connector/avro/pom.xml b/connector/avro/pom.xml index 187e4d777c0d0..11811ed080bca 100644 --- a/connector/avro/pom.xml +++ b/connector/avro/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8 + 3.5.5 ../../pom.xml diff --git a/connector/connect/client/jvm/pom.xml b/connector/connect/client/jvm/pom.xml index 88f4804e4bee2..f2630bfb9303f 100644 --- a/connector/connect/client/jvm/pom.xml +++ b/connector/connect/client/jvm/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8 + 3.5.5 ../../../../pom.xml diff --git 
a/connector/connect/common/pom.xml b/connector/connect/common/pom.xml index 7762a6e65a9a1..a4f010f7076d4 100644 --- a/connector/connect/common/pom.xml +++ b/connector/connect/common/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8 + 3.5.5 ../../../pom.xml diff --git a/connector/connect/server/pom.xml b/connector/connect/server/pom.xml index 69f95faaa833c..54c63a6f6ded8 100644 --- a/connector/connect/server/pom.xml +++ b/connector/connect/server/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8 + 3.5.5 ../../../pom.xml diff --git a/connector/docker-integration-tests/pom.xml b/connector/docker-integration-tests/pom.xml index 77787a9ae4e5b..d35c4809b529f 100644 --- a/connector/docker-integration-tests/pom.xml +++ b/connector/docker-integration-tests/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8 + 3.5.5 ../../pom.xml diff --git a/connector/kafka-0-10-assembly/pom.xml b/connector/kafka-0-10-assembly/pom.xml index ab244bc46339b..4cd2f31a94645 100644 --- a/connector/kafka-0-10-assembly/pom.xml +++ b/connector/kafka-0-10-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8 + 3.5.5 ../../pom.xml diff --git a/connector/kafka-0-10-sql/pom.xml b/connector/kafka-0-10-sql/pom.xml index e9e4eb00b674a..ebb6f15ad3697 100644 --- a/connector/kafka-0-10-sql/pom.xml +++ b/connector/kafka-0-10-sql/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8 + 3.5.5 ../../pom.xml diff --git a/connector/kafka-0-10-token-provider/pom.xml b/connector/kafka-0-10-token-provider/pom.xml index a399672aa5d2c..cde5d2c4c1348 100644 --- a/connector/kafka-0-10-token-provider/pom.xml +++ b/connector/kafka-0-10-token-provider/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8 + 3.5.5 ../../pom.xml diff --git a/connector/kafka-0-10/pom.xml b/connector/kafka-0-10/pom.xml index 17273cf66d350..b59e6401191be 100644 --- a/connector/kafka-0-10/pom.xml +++ b/connector/kafka-0-10/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8 + 3.5.5 ../../pom.xml diff --git a/connector/kinesis-asl-assembly/pom.xml b/connector/kinesis-asl-assembly/pom.xml index 7299a9ff6b158..1b5ee194c268a 100644 --- a/connector/kinesis-asl-assembly/pom.xml +++ b/connector/kinesis-asl-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8 + 3.5.5 ../../pom.xml diff --git a/connector/kinesis-asl/pom.xml b/connector/kinesis-asl/pom.xml index 007fea1c4a71f..608671f47a0c3 100644 --- a/connector/kinesis-asl/pom.xml +++ b/connector/kinesis-asl/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8 + 3.5.5 ../../pom.xml diff --git a/connector/protobuf/pom.xml b/connector/protobuf/pom.xml index 3d820f8c6f880..91df2118e6092 100644 --- a/connector/protobuf/pom.xml +++ b/connector/protobuf/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8 + 3.5.5 ../../pom.xml diff --git a/connector/spark-ganglia-lgpl/pom.xml b/connector/spark-ganglia-lgpl/pom.xml index be66665dc1d3e..572766941ed93 100644 --- a/connector/spark-ganglia-lgpl/pom.xml +++ b/connector/spark-ganglia-lgpl/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8 + 3.5.5 ../../pom.xml diff --git a/core/pom.xml b/core/pom.xml index 188ef108cb889..29747b80a431e 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8 + 3.5.5 ../pom.xml diff --git a/docs/_config.yml b/docs/_config.yml index c0c5e92d8f1a4..b9a4294bbb8f8 100644 --- a/docs/_config.yml +++ b/docs/_config.yml @@ 
-19,8 +19,8 @@ include: # These allow the documentation to be updated with newer releases # of Spark, Scala, and Mesos. -SPARK_VERSION: 3.5.8 -SPARK_VERSION_SHORT: 3.5.8 +SPARK_VERSION: 3.5.5 +SPARK_VERSION_SHORT: 3.5.5 SCALA_BINARY_VERSION: "2.12" SCALA_VERSION: "2.12.18" MESOS_VERSION: 1.0.0 @@ -40,7 +40,7 @@ DOCSEARCH_SCRIPT: | inputSelector: '#docsearch-input', enhancedSearchInput: true, algoliaOptions: { - 'facetFilters': ["version:3.5.8"] + 'facetFilters': ["version:3.5.5"] }, debug: false // Set debug to true if you want to inspect the dropdown }); diff --git a/examples/pom.xml b/examples/pom.xml index afe985b612b5d..4cf1847cc16c3 100644 --- a/examples/pom.xml +++ b/examples/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8 + 3.5.5 ../pom.xml diff --git a/graphx/pom.xml b/graphx/pom.xml index aea672a98b06d..287116cca802a 100644 --- a/graphx/pom.xml +++ b/graphx/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8 + 3.5.5 ../pom.xml diff --git a/hadoop-cloud/pom.xml b/hadoop-cloud/pom.xml index a8be8dd7a0423..94cb21db3a01d 100644 --- a/hadoop-cloud/pom.xml +++ b/hadoop-cloud/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8 + 3.5.5 ../pom.xml diff --git a/launcher/pom.xml b/launcher/pom.xml index 91f3753fb06b5..97e74a0998958 100644 --- a/launcher/pom.xml +++ b/launcher/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8 + 3.5.5 ../pom.xml diff --git a/mllib-local/pom.xml b/mllib-local/pom.xml index e2c1fc385560e..a7020fabd259a 100644 --- a/mllib-local/pom.xml +++ b/mllib-local/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8 + 3.5.5 ../pom.xml diff --git a/mllib/pom.xml b/mllib/pom.xml index 9685f904c7ff2..a9e8fca6e1b19 100644 --- a/mllib/pom.xml +++ b/mllib/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8 + 3.5.5 ../pom.xml diff --git a/pom.xml b/pom.xml index 0a7adac116725..7f501405be76d 100644 --- a/pom.xml +++ b/pom.xml @@ -26,7 +26,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8 + 3.5.5 pom Spark Project Parent POM https://spark.apache.org/ diff --git a/python/pyspark/version.py b/python/pyspark/version.py index e67736d7e0548..df09fc3284fbd 100644 --- a/python/pyspark/version.py +++ b/python/pyspark/version.py @@ -16,4 +16,4 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-__version__: str = "3.5.8.dev0" +__version__: str = "3.5.5.dev0" diff --git a/repl/pom.xml b/repl/pom.xml index 5f08f83e826ea..1efb8b8fbe1f1 100644 --- a/repl/pom.xml +++ b/repl/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8 + 3.5.5 ../pom.xml diff --git a/resource-managers/kubernetes/core/pom.xml b/resource-managers/kubernetes/core/pom.xml index 1b9d4799634e4..7a97c4c1ff06c 100644 --- a/resource-managers/kubernetes/core/pom.xml +++ b/resource-managers/kubernetes/core/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8 + 3.5.5 ../../../pom.xml diff --git a/resource-managers/kubernetes/integration-tests/pom.xml b/resource-managers/kubernetes/integration-tests/pom.xml index 96f4443370938..a004dd12fedec 100644 --- a/resource-managers/kubernetes/integration-tests/pom.xml +++ b/resource-managers/kubernetes/integration-tests/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8 + 3.5.5 ../../../pom.xml diff --git a/resource-managers/mesos/pom.xml b/resource-managers/mesos/pom.xml index 16786559c11d7..566e0baf8e23c 100644 --- a/resource-managers/mesos/pom.xml +++ b/resource-managers/mesos/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8 + 3.5.5 ../../pom.xml diff --git a/resource-managers/yarn/pom.xml b/resource-managers/yarn/pom.xml index 9653c7ff8e0ca..f7bfa6d6ee6e7 100644 --- a/resource-managers/yarn/pom.xml +++ b/resource-managers/yarn/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8 + 3.5.5 ../../pom.xml diff --git a/sql/api/pom.xml b/sql/api/pom.xml index 4cd51159630fe..4a21574462c3a 100644 --- a/sql/api/pom.xml +++ b/sql/api/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8 + 3.5.5 ../../pom.xml diff --git a/sql/catalyst/pom.xml b/sql/catalyst/pom.xml index 424c9decb6164..ff0992e7b21c7 100644 --- a/sql/catalyst/pom.xml +++ b/sql/catalyst/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8 + 3.5.5 ../../pom.xml diff --git a/sql/core/pom.xml b/sql/core/pom.xml index 1953607a0ba0d..5d678c9464593 100644 --- a/sql/core/pom.xml +++ b/sql/core/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8 + 3.5.5 ../../pom.xml diff --git a/sql/hive-thriftserver/pom.xml b/sql/hive-thriftserver/pom.xml index 42d5e5bc97780..176ad85f71237 100644 --- a/sql/hive-thriftserver/pom.xml +++ b/sql/hive-thriftserver/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8 + 3.5.5 ../../pom.xml diff --git a/sql/hive/pom.xml b/sql/hive/pom.xml index b0c39ec7831c4..a3ebd9f98c96a 100644 --- a/sql/hive/pom.xml +++ b/sql/hive/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8 + 3.5.5 ../../pom.xml diff --git a/streaming/pom.xml b/streaming/pom.xml index d94c5951de2f4..92d716fa5c09d 100644 --- a/streaming/pom.xml +++ b/streaming/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8 + 3.5.5 ../pom.xml diff --git a/tools/pom.xml b/tools/pom.xml index 162830e13fc7e..f2e171b2dfe97 100644 --- a/tools/pom.xml +++ b/tools/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8 + 3.5.5 ../pom.xml From 02f7fb33397798d72c71b48e320bfe5d876899be Mon Sep 17 00:00:00 2001 From: Bruce Robbins Date: Tue, 24 Jun 2025 14:03:59 +0800 Subject: [PATCH 16/86] [SPARK-52339][SQL][3.5] Fix comparison of `InMemoryFileIndex` instances ### What changes were proposed in this pull request? This is a back-port of #51043. This PR changes `InMemoryFileIndex#equals` to compare a non-distinct collection of root paths rather than a distinct set of root paths. 
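For illustration only, a minimal Scala sketch (not part of the patch itself) of why a set-based comparison hides duplicate root paths while the sorted-sequence comparison used by the fix does not:

```
// Standalone illustration, not Spark code: Set equality collapses duplicate
// root paths, while comparing the sorted sequences preserves multiplicity.
val a = Seq("/tmp/test", "/tmp/test")
val b = Seq("/tmp/test", "/tmp/test", "/tmp/test")
a.toSet == b.toSet    // true  -- both reduce to Set("/tmp/test")
a.sorted == b.sorted  // false -- the sequences have different lengths
```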
Without this change, `InMemoryFileIndex#equals` considers the following two collections of root paths to be equal, even though they represent a different number of rows: ``` ["/tmp/test", "/tmp/test"] ["/tmp/test", "/tmp/test", "/tmp/test"] ``` ### Why are the changes needed? The bug can cause correctness issues, e.g. ``` // create test data val data = Seq((1, 2), (2, 3)).toDF("a", "b") data.write.mode("overwrite").csv("/tmp/test") val fileList1 = List.fill(2)("/tmp/test") val fileList2 = List.fill(3)("/tmp/test") val df1 = spark.read.schema("a int, b int").csv(fileList1: _*) val df2 = spark.read.schema("a int, b int").csv(fileList2: _*) df1.count() // correctly returns 4 df2.count() // correctly returns 6 // the following is the same as above, except df1 is persisted val df1 = spark.read.schema("a int, b int").csv(fileList1: _*).persist val df2 = spark.read.schema("a int, b int").csv(fileList2: _*) df1.count() // correctly returns 4 df2.count() // incorrectly returns 4!! ``` In the above example, df1 and df2 were created with a different number of paths: df1 has 2, and df2 has 3. But since the distinct set of root paths is the same (e.g., `Set("/tmp/test") == Set("/tmp/test"))`, the two dataframes are considered equal. Thus, when df1 is persisted, df2 uses df1's cached plan. The same bug also causes inappropriate exchange reuse. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? New unit test. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #51256 from bersprockets/multi_path_issue_br35. Authored-by: Bruce Robbins Signed-off-by: Kent Yao --- .../datasources/InMemoryFileIndex.scala | 2 +- .../datasources/FileIndexSuite.scala | 24 +++++++++++++++++++ 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InMemoryFileIndex.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InMemoryFileIndex.scala index 44d31131e9c6d..8920ff88be519 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InMemoryFileIndex.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InMemoryFileIndex.scala @@ -101,7 +101,7 @@ class InMemoryFileIndex( } override def equals(other: Any): Boolean = other match { - case hdfs: InMemoryFileIndex => rootPaths.toSet == hdfs.rootPaths.toSet + case hdfs: InMemoryFileIndex => rootPaths.sorted == hdfs.rootPaths.sorted case _ => false } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileIndexSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileIndexSuite.scala index 9ac61f0cee5fc..54403ea99c813 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileIndexSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileIndexSuite.scala @@ -554,6 +554,30 @@ class FileIndexSuite extends SharedSparkSession { assert(FileIndexOptions.isValidOption("modifiedafter")) assert(FileIndexOptions.isValidOption("pathglobfilter")) } + + test("SPARK-52339: Correctly compare root paths") { + withTempDir { dir => + val file1 = new File(dir, "text1.txt") + stringToFile(file1, "text1") + val file2 = new File(dir, "text2.txt") + stringToFile(file2, "text2") + val path1 = new Path(file1.getCanonicalPath) + val path2 = new Path(file2.getCanonicalPath) + + val schema = StructType(Seq(StructField("a", StringType, false))) + + // Verify that the order of paths doesn't matter + 
val fileIndex1a = new InMemoryFileIndex(spark, Seq(path1, path2), Map.empty, Some(schema)) + val fileIndex1b = new InMemoryFileIndex(spark, Seq(path2, path1), Map.empty, Some(schema)) + assert(fileIndex1a == fileIndex1b) + + // Verify that a different number of paths does matter + val fileIndex2a = new InMemoryFileIndex(spark, Seq(path1, path1), Map.empty, Some(schema)) + val fileIndex2b = new InMemoryFileIndex(spark, Seq(path1, path1, path1), + Map.empty, Some(schema)) + assert(fileIndex2a != fileIndex2b) + } + } } object DeletionRaceFileSystem { From a1364217df7cdc5cfb9e763b33f0229fdf71df67 Mon Sep 17 00:00:00 2001 From: Hyukjin Kwon Date: Wed, 25 Jun 2025 08:42:25 +0900 Subject: [PATCH 17/86] [SPARK-52562][INFRA] Automatically create the base of release notes and push MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### What changes were proposed in this pull request? This PR proposes to add the automatic release note process. Here is what it does: 1. Add download link to docs Inserts the new release version link into `documentation.md`, keeping versions sorted by recency. 2. Add download link to spark-website Updates `js/downloads.js` with the new version's metadata for the downloads page. Replaces existing entry if it's a patch; inserts new entry otherwise. Uses different package lists for Spark 3 vs. Spark 4. 3. Generate news & release notes Creates a news post and release notes file as below. Note that I skipped the short link generation step here. - For minor/major releases (x.y.0), describes new features - For patch/maintenance releases (x.y.z, z > 0), mentions stability fixes and encourages upgrades. 4. Build the Website Runs Jekyll to generate updated HTML files for the website. 5. Update latest symlink (only for major/minor) Updates the `site/docs/latest` symlink to point to the new version only if it's a major or minor release (x.y.0), so maintenance releases don’t affect the default documentation version. If the release manager needs to have better release notes, they can create a separate PR to update this. ### Why are the changes needed? To make the release process easier. ### Does this PR introduce _any_ user-facing change? No, dev-only. ### How was this patch tested? I manually tested them in my Mac for now, and checked that they are compatible with Ubuntu. It has to be tested in the official release later again. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #51260 from HyukjinKwon/SPARK-52562. Authored-by: Hyukjin Kwon Signed-off-by: Hyukjin Kwon (cherry picked from commit 82ab68045759602b1b5a4e8308915375de03be7f) Signed-off-by: Hyukjin Kwon --- dev/create-release/release-build.sh | 232 +++++++++++++++++++++++++++- 1 file changed, 231 insertions(+), 1 deletion(-) diff --git a/dev/create-release/release-build.sh b/dev/create-release/release-build.sh index d3f8c509910e9..21235e9a8847f 100755 --- a/dev/create-release/release-build.sh +++ b/dev/create-release/release-build.sh @@ -142,9 +142,239 @@ if [[ "$1" == "finalize" ]]; then git commit -m "Add docs for Apache Spark $RELEASE_VERSION" git push origin HEAD:asf-site cd .. - rm -rf spark-website echo "docs uploaded" + echo "Uploading release docs to spark-website" + cd spark-website + + # TODO: Test it in the actual release + # 1. 
Add download link to documentation.md + python3 <Spark {release_version}' +inserted = False + +def parse_version(v): + return [int(p) for p in v.strip().split(".")] + +def vercmp(v1, v2): + a = parse_version(v1) + b = parse_version(v2) + return (a > b) - (a < b) + +with open("documentation.md") as f: + lines = f.readlines() + +with open("documentation.md", "w") as f: + for line in lines: + match = re.search(r'docs/(\d+\.\d+\.\d+)/', line) + if not inserted and match: + existing_version = match.group(1) + if vercmp(release_version, existing_version) >= 0: + f.write(newline + "\n") + inserted = True + f.write(line) + if not inserted: + f.write(newline + "\n") +EOF + + echo "Edited documentation.md" + + # 2. Add download link to js/downloads.js + RELEASE_DATE=$(TZ=America/Los_Angeles date +"%m/%d/%Y") + IFS='.' read -r rel_maj rel_min rel_patch <<< "$RELEASE_VERSION" + NEW_PACKAGES="packagesV14" + if [[ "$rel_maj" -ge 4 ]]; then + NEW_PACKAGES="packagesV15" + fi + + python3 < b) - (a < b) + +inserted = replaced = False + +with open("js/downloads.js") as f: + lines = f.readlines() + +with open("js/downloads.js", "w") as f: + for line in lines: + m = re.search(r'addRelease\("(\d+\.\d+\.\d+)"', line) + if m: + existing_version = m.group(1) + cmp_result = vercmp(release_version, existing_version) + ex_major, ex_minor, ex_patch = parse_version(existing_version) + + if cmp_result == 0: + f.write(newline + "\n") + replaced = True + elif not replaced and ex_major == new_major and ex_minor == new_minor: + f.write(newline + "\n") + replaced = True + elif not replaced and not inserted and cmp_result > 0: + f.write(newline + "\n") + f.write(line) + inserted = True + else: + f.write(line) + else: + f.write(line) + if not replaced and not inserted: + f.write(newline + "\n") +EOF + + echo "Edited js/downloads.js" + + # 3. Add news post + RELEASE_DATE=$(TZ=America/Los_Angeles date +"%Y-%m-%d") + FILENAME="news/_posts/${RELEASE_DATE}-spark-${RELEASE_VERSION//./-}-released.md" + mkdir -p news/_posts + cat > "$FILENAME" <Apache Spark ${RELEASE_VERSION}! Visit the release notes to read about the new features, or download the release today. +EOF + + echo "Created $FILENAME" + + # 4. Add release notes with Python to extract JIRA version ID + RELEASE_DATE=$(TZ=America/Los_Angeles date +"%Y-%m-%d") + JIRA_PROJECT_ID=12315420 + JIRA_URL="https://issues.apache.org/jira/rest/api/2/project/SPARK/versions" + JSON=$(curl -s "$JIRA_URL") + + VERSION_ID=$(python3 - <&2 + fi + + JIRA_LINK="https://issues.apache.org/jira/secure/ReleaseNote.jspa?projectId=${JIRA_PROJECT_ID}&version=${VERSION_ID}" + + IFS='.' read -r rel_maj rel_min rel_patch <<< "$RELEASE_VERSION" + if [[ "$rel_patch" -eq 0 ]]; then + ACKNOWLEDGE="patches and features to this release." + BODY="Apache Spark ${RELEASE_VERSION} is a new feature release. It introduces new functionality and improvements. We encourage users to try it and provide feedback." + else + ACKNOWLEDGE="patches to this release." + BODY="Apache Spark ${RELEASE_VERSION} is a maintenance release containing security and correctness fixes. This release is based on the branch-${rel_maj}.${rel_min} maintenance branch of Spark. We strongly recommend all ${rel_maj}.${rel_min} users to upgrade to this stable release." + fi + + BODY+=" + +You can find the list of resolved issues and detailed changes in the [JIRA release notes](${JIRA_LINK}). 
+ +We would like to acknowledge all community members for contributing ${ACKNOWLEDGE}" + + FILENAME="releases/_posts/${RELEASE_DATE}-spark-release-${RELEASE_VERSION}.md" + mkdir -p releases/_posts + cat > "$FILENAME" < $RELEASE_VERSION (major version increased)" + elif [[ "$rel_maj" -eq "$cur_maj" && "$rel_min" -gt "$cur_min" ]]; then + ln -sfn "$RELEASE_VERSION" "$LINK_PATH" + echo "Updated symlink $LINK_PATH -> $RELEASE_VERSION (minor version increased)" + else + echo "Symlink $LINK_PATH points to $CURRENT_TARGET with equal or newer major.minor, no change" + fi + else + echo "No valid existing version target." + fi + else + echo "Patch release detected ($RELEASE_VERSION), not updating symlink" + fi + + git add . + git commit -m "Add release docs for Apache Spark $RELEASE_VERSION" + git push origin HEAD:asf-site + cd .. + echo "release docs uploaded" + rm -rf spark-website + # Moves the docs from dev directory to release directory. echo "Moving Spark docs to the release directory" svn mv --username "$ASF_USERNAME" --password "$ASF_PASSWORD" -m"Apache Spark $RELEASE_VERSION" \ From 80a2098d231b231ac6c0dc9326e1b6e2e083c827 Mon Sep 17 00:00:00 2001 From: Kent Yao Date: Wed, 25 Jun 2025 10:14:11 +0800 Subject: [PATCH 18/86] [SPARK-52339][SQL][FOLLOWUP] Sort paths in InMemoryFileIndex#equal only when size matches ### What changes were proposed in this pull request? A follow-up for https://github.com/apache/spark/pull/51043 that sorts paths in InMemoryFileIndex#equal only when size matches ### Why are the changes needed? Avoid potential perf regression. ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? Existing test from #51043 ### Was this patch authored or co-authored using generative AI tooling? No Closes #51263 from yaooqinn/SPARK-52339. Authored-by: Kent Yao Signed-off-by: Kent Yao (cherry picked from commit 1cfe07cdbeea765890cc93f8292ca0a6f13408f2) Signed-off-by: Kent Yao --- .../spark/sql/execution/datasources/InMemoryFileIndex.scala | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InMemoryFileIndex.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InMemoryFileIndex.scala index 8920ff88be519..2180c941aac4f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InMemoryFileIndex.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InMemoryFileIndex.scala @@ -101,7 +101,8 @@ class InMemoryFileIndex( } override def equals(other: Any): Boolean = other match { - case hdfs: InMemoryFileIndex => rootPaths.sorted == hdfs.rootPaths.sorted + case hdfs: InMemoryFileIndex if rootPaths.size == hdfs.rootPaths.size => + rootPaths.sorted == hdfs.rootPaths.sorted case _ => false } From 185380c414d2e9822b90a9c0a9e83052a4aa83c1 Mon Sep 17 00:00:00 2001 From: Hyukjin Kwon Date: Thu, 26 Jun 2025 15:49:22 +0900 Subject: [PATCH 19/86] [SPARK-52584][BUILD] Make build script to support preview releases in finalize step ### What changes were proposed in this pull request? This PR proposes to make release script to support preview releases as well. ### Why are the changes needed? To make the release easier. ### Does this PR introduce _any_ user-facing change? No, dev-only. ### How was this patch tested? Manually tested against spark-website. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #51291 from HyukjinKwon/SPARK-52584. 
Authored-by: Hyukjin Kwon Signed-off-by: Hyukjin Kwon (cherry picked from commit 5432402f7dc6372a87b911778f15a4904afb2079) Signed-off-by: Hyukjin Kwon --- dev/create-release/release-build.sh | 210 +++++++++++++++++++--------- 1 file changed, 147 insertions(+), 63 deletions(-) diff --git a/dev/create-release/release-build.sh b/dev/create-release/release-build.sh index 21235e9a8847f..3b19786e562be 100755 --- a/dev/create-release/release-build.sh +++ b/dev/create-release/release-build.sh @@ -153,7 +153,12 @@ if [[ "$1" == "finalize" ]]; then import re release_version = "${RELEASE_VERSION}" -newline = f'
  • Spark {release_version}
  • ' +is_preview = bool(re.search(r'-preview\d*$', release_version)) +base_version = re.sub(r'-preview\d*$', '', release_version) + +stable_newline = f'
  • Spark {release_version}
  • ' +preview_newline = f'
  • Spark {release_version} preview
  • ' + inserted = False def parse_version(v): @@ -168,29 +173,63 @@ with open("documentation.md") as f: lines = f.readlines() with open("documentation.md", "w") as f: - for line in lines: - match = re.search(r'docs/(\d+\.\d+\.\d+)/', line) - if not inserted and match: - existing_version = match.group(1) - if vercmp(release_version, existing_version) >= 0: - f.write(newline + "\n") + if is_preview: + in_preview_section = False + for i, line in enumerate(lines): + if '

    Documentation for preview releases:

    ' in line: + in_preview_section = True + f.write(line) + continue + + if in_preview_section and re.search(r'docs/\d+\.\d+\.\d+-preview\d*/', line): + existing_version = re.search(r'docs/(\d+\.\d+\.\d+-preview\d*)/', line).group(1) + + if existing_version == release_version: + inserted = True # Already exists, don't add + elif not inserted: + base_existing = re.sub(r'-preview\d*$', '', existing_version) + preview_num_existing = int(re.search(r'preview(\d*)', existing_version).group(1) or "0") + preview_num_new = int(re.search(r'preview(\d*)', release_version).group(1) or "0") + + if (vercmp(base_version, base_existing) > 0) or \ + (vercmp(base_version, base_existing) == 0 and preview_num_new >= preview_num_existing): + f.write(preview_newline + "\n") + inserted = True + + f.write(line) + continue + + if in_preview_section and "" in line and not inserted: + f.write(preview_newline + "\n") inserted = True - f.write(line) - if not inserted: - f.write(newline + "\n") + f.write(line) + else: + for line in lines: + match = re.search(r'docs/(\d+\.\d+\.\d+)/', line) + if not inserted and match: + existing_version = match.group(1) + if vercmp(release_version, existing_version) >= 0: + f.write(stable_newline + "\n") + inserted = True + f.write(line) + if not inserted: + f.write(stable_newline + "\n") EOF echo "Edited documentation.md" # 2. Add download link to js/downloads.js - RELEASE_DATE=$(TZ=America/Los_Angeles date +"%m/%d/%Y") - IFS='.' read -r rel_maj rel_min rel_patch <<< "$RELEASE_VERSION" - NEW_PACKAGES="packagesV14" - if [[ "$rel_maj" -ge 4 ]]; then - NEW_PACKAGES="packagesV15" - fi + if [[ "$RELEASE_VERSION" =~ -preview[0-9]*$ ]]; then + echo "Skipping js/downloads.js for preview release: $RELEASE_VERSION" + else + RELEASE_DATE=$(TZ=America/Los_Angeles date +"%m/%d/%Y") + IFS='.' read -r rel_maj rel_min rel_patch <<< "$RELEASE_VERSION" + NEW_PACKAGES="packagesV14" + if [[ "$rel_maj" -ge 4 ]]; then + NEW_PACKAGES="packagesV15" + fi - python3 < "$FILENAME" < "$FILENAME" <Spark ${RELEASE_VERSION} release. +This preview is not a stable release in terms of either API or functionality, but it is meant to give the community early +access to try the code that will become Spark ${BASE_VERSION}. If you would like to test the release, +please download it, and send feedback using either +mailing lists or +JIRA. +The documentation is available at the link. + +We'd like to thank our contributors and users for their contributions and early feedback to this release. This release would not have been possible without you. +EOF + + else + cat > "$FILENAME" <Apache Spark ${RELEASE_VERSION}! Visit the release notes to read about the new features, or download the release today. EOF + fi echo "Created $FILENAME" # 4. 
Add release notes with Python to extract JIRA version ID - RELEASE_DATE=$(TZ=America/Los_Angeles date +"%Y-%m-%d") - JIRA_PROJECT_ID=12315420 - JIRA_URL="https://issues.apache.org/jira/rest/api/2/project/SPARK/versions" - JSON=$(curl -s "$JIRA_URL") + if [[ "$RELEASE_VERSION" =~ -preview[0-9]*$ ]]; then + echo "Skipping JIRA release notes for preview release: $RELEASE_VERSION" + else + RELEASE_DATE=$(TZ=America/Los_Angeles date +"%Y-%m-%d") + JIRA_PROJECT_ID=12315420 + JIRA_URL="https://issues.apache.org/jira/rest/api/2/project/SPARK/versions" + JSON=$(curl -s "$JIRA_URL") - VERSION_ID=$(python3 - <&2 - fi + if [[ -z "$VERSION_ID" ]]; then + echo "Error: Couldn't find JIRA version ID for $RELEASE_VERSION" >&2 + fi - JIRA_LINK="https://issues.apache.org/jira/secure/ReleaseNote.jspa?projectId=${JIRA_PROJECT_ID}&version=${VERSION_ID}" + JIRA_LINK="https://issues.apache.org/jira/secure/ReleaseNote.jspa?projectId=${JIRA_PROJECT_ID}&version=${VERSION_ID}" - IFS='.' read -r rel_maj rel_min rel_patch <<< "$RELEASE_VERSION" - if [[ "$rel_patch" -eq 0 ]]; then - ACKNOWLEDGE="patches and features to this release." - BODY="Apache Spark ${RELEASE_VERSION} is a new feature release. It introduces new functionality and improvements. We encourage users to try it and provide feedback." - else - ACKNOWLEDGE="patches to this release." - BODY="Apache Spark ${RELEASE_VERSION} is a maintenance release containing security and correctness fixes. This release is based on the branch-${rel_maj}.${rel_min} maintenance branch of Spark. We strongly recommend all ${rel_maj}.${rel_min} users to upgrade to this stable release." - fi + IFS='.' read -r rel_maj rel_min rel_patch <<< "$RELEASE_VERSION" + if [[ "$rel_patch" -eq 0 ]]; then + ACKNOWLEDGE="patches and features to this release." + BODY="Apache Spark ${RELEASE_VERSION} is a new feature release. It introduces new functionality and improvements. We encourage users to try it and provide feedback." + else + ACKNOWLEDGE="patches to this release." + BODY="Apache Spark ${RELEASE_VERSION} is a maintenance release containing security and correctness fixes. This release is based on the branch-${rel_maj}.${rel_min} maintenance branch of Spark. We strongly recommend all ${rel_maj}.${rel_min} users to upgrade to this stable release." + fi - BODY+=" + BODY+=" You can find the list of resolved issues and detailed changes in the [JIRA release notes](${JIRA_LINK}). We would like to acknowledge all community members for contributing ${ACKNOWLEDGE}" - FILENAME="releases/_posts/${RELEASE_DATE}-spark-release-${RELEASE_VERSION}.md" - mkdir -p releases/_posts - cat > "$FILENAME" < "$FILENAME" < $RELEASE_VERSION (major version increased)" - elif [[ "$rel_maj" -eq "$cur_maj" && "$rel_min" -gt "$cur_min" ]]; then - ln -sfn "$RELEASE_VERSION" "$LINK_PATH" - echo "Updated symlink $LINK_PATH -> $RELEASE_VERSION (minor version increased)" + ln -sfn "$RELEASE_VERSION" "$LINK_PATH" + echo "Updated symlink $LINK_PATH -> $RELEASE_VERSION (preview release)" + + else + LINK_PATH="site/docs/latest" + + if [[ "$rel_patch" -eq 0 ]]; then + if [[ -L "$LINK_PATH" ]]; then + CURRENT_TARGET=$(readlink "$LINK_PATH") else - echo "Symlink $LINK_PATH points to $CURRENT_TARGET with equal or newer major.minor, no change" + CURRENT_TARGET="" + fi + + if [[ "$CURRENT_TARGET" =~ ^[0-9]+\.[0-9]+\.[0-9]+$ ]]; then + IFS='.' 
read -r cur_maj cur_min cur_patch <<< "$CURRENT_TARGET" + + if [[ "$rel_maj" -gt "$cur_maj" ]]; then + ln -sfn "$RELEASE_VERSION" "$LINK_PATH" + echo "Updated symlink $LINK_PATH -> $RELEASE_VERSION (major version increased)" + elif [[ "$rel_maj" -eq "$cur_maj" && "$rel_min" -gt "$cur_min" ]]; then + ln -sfn "$RELEASE_VERSION" "$LINK_PATH" + echo "Updated symlink $LINK_PATH -> $RELEASE_VERSION (minor version increased)" + else + echo "Symlink $LINK_PATH points to $CURRENT_TARGET with equal or newer major.minor, no change" + fi + else + echo "No valid existing version target." fi else - echo "No valid existing version target." + echo "Patch release detected ($RELEASE_VERSION), not updating symlink" fi - else - echo "Patch release detected ($RELEASE_VERSION), not updating symlink" fi git add . From 53a22c0c997ae4e45f44202fd6317b8cab21867d Mon Sep 17 00:00:00 2001 From: Cheng Pan Date: Thu, 26 Jun 2025 17:54:44 -0700 Subject: [PATCH 20/86] [SPARK-52568][BUILD][3.5] Fix `exec-maven-plugin` version used by `dev/test-dependencies.sh` Cherry-pick https://github.com/apache/spark/pull/51273 to branch 3.5 ### What changes were proposed in this pull request? Fix `exec-maven-plugin` version used by `dev/test-dependencies.sh` to use the `exec-maven-plugin.version` defined in `pom.xml`, instead of the hardcoded old version(which does not work with Maven 4). ### Why are the changes needed? Keep toolchain version consistency, and prepare for Maven 4 support. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Run `./dev/test-dependencies.sh` ``` ... ++ build/mvn help:evaluate -Dexpression=exec-maven-plugin.version -q -DforceStdout ++ grep -E '[0-9]+\.[0-9]+\.[0-9]+' Using `mvn` from path: /Users/chengpan/Projects/apache-spark-3.5/build/apache-maven-3.9.6/bin/mvn + MVN_EXEC_PLUGIN_VERSION=3.1.0 + set +e ++ build/mvn -q -Dexec.executable=echo '-Dexec.args=${project.version}' --non-recursive org.codehaus.mojo:exec-maven-plugin:3.1.0:exec ... ``` And pass GHA. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #51288 from pan3793/SPARK-52568-3.5. Authored-by: Cheng Pan Signed-off-by: Dongjoon Hyun --- dev/test-dependencies.sh | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/dev/test-dependencies.sh b/dev/test-dependencies.sh index 36cc7a4f994dc..636ed0ffb0cad 100755 --- a/dev/test-dependencies.sh +++ b/dev/test-dependencies.sh @@ -37,6 +37,9 @@ HADOOP_HIVE_PROFILES=( hadoop-3-hive-2.3 ) +MVN_EXEC_PLUGIN_VERSION=$(build/mvn help:evaluate \ + -Dexpression=exec-maven-plugin.version -q -DforceStdout | grep -E "[0-9]+\.[0-9]+\.[0-9]+") + # We'll switch the version to a temp. one, publish POMs using that new version, then switch back to # the old version. We need to do this because the `dependency:build-classpath` task needs to # resolve Spark's internal submodule dependencies. @@ -47,7 +50,7 @@ OLD_VERSION=$($MVN -q \ -Dexec.executable="echo" \ -Dexec.args='${project.version}' \ --non-recursive \ - org.codehaus.mojo:exec-maven-plugin:1.6.0:exec | grep -E '[0-9]+\.[0-9]+\.[0-9]+') + org.codehaus.mojo:exec-maven-plugin:${MVN_EXEC_PLUGIN_VERSION}:exec | grep -E '[0-9]+\.[0-9]+\.[0-9]+') # dependency:get for guava and jetty-io are workaround for SPARK-37302. 
GUAVA_VERSION=$(build/mvn help:evaluate -Dexpression=guava.version -q -DforceStdout | grep -E "^[0-9.]+$") build/mvn dependency:get -Dartifact=com.google.guava:guava:${GUAVA_VERSION} -q @@ -61,7 +64,7 @@ SCALA_BINARY_VERSION=$($MVN -q \ -Dexec.executable="echo" \ -Dexec.args='${scala.binary.version}' \ --non-recursive \ - org.codehaus.mojo:exec-maven-plugin:1.6.0:exec | grep -E '[0-9]+\.[0-9]+') + org.codehaus.mojo:exec-maven-plugin:${MVN_EXEC_PLUGIN_VERSION}:exec | grep -E '[0-9]+\.[0-9]+') if [[ "$SCALA_BINARY_VERSION" != "2.12" ]]; then echo "Skip dependency testing on $SCALA_BINARY_VERSION" exit 0 From 87f58c031b8996fbfb7e261744dc22a8f1c94a6b Mon Sep 17 00:00:00 2001 From: Gibson Chikafa Date: Sun, 29 Jun 2025 11:02:53 +0200 Subject: [PATCH 21/86] Add org.codehaus.jackson --- pom.xml | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/pom.xml b/pom.xml index 7f501405be76d..e5573b19e3405 100644 --- a/pom.xml +++ b/pom.xml @@ -1918,13 +1918,23 @@ org.codehaus.jackson jackson-core-asl ${codehaus.jackson.version} - ${hadoop.deps.scope} + ${hive.deps.scope}
    org.codehaus.jackson jackson-mapper-asl ${codehaus.jackson.version} - ${hadoop.deps.scope} + ${hive.deps.scope} + + + org.codehaus.jackson + jackson-xc + ${codehaus.jackson.version} + + + org.codehaus.jackson + jackson-jaxrs + ${codehaus.jackson.version} ${hive.group} From 7d8a59631291e7f6e6510a2c91bebf7c6913fcb9 Mon Sep 17 00:00:00 2001 From: Gibson Chikafa Date: Sun, 29 Jun 2025 23:07:38 +0200 Subject: [PATCH 22/86] Reset kafka version to 3.4.1 --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index e5573b19e3405..b346f76012b2a 100644 --- a/pom.xml +++ b/pom.xml @@ -138,7 +138,7 @@ 3.0 - 2.6.0 + 3.4.1 10.14.2.0 1.13.1 From 57da4850b10a6fe6e7499cfb7f0f2658d01edcad Mon Sep 17 00:00:00 2001 From: Dongpu Li Date: Mon, 30 Jun 2025 16:48:59 +0900 Subject: [PATCH 23/86] =?UTF-8?q?[SPARK-52611][SQL]=20Fix=20SQLConf=20vers?= =?UTF-8?q?ion=20for=20excludeSubqueryRefsFromRemoveRedundantAliases?= =?UTF-8?q?=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit … configuration ### What changes were proposed in this pull request? This PR fixes the added version of `spark.sql.optimizer.excludeSubqueryRefsFromRemoveRedundantAliases.enabled` to 3.5.1 (also in [SPARK-52611]) ### Why are the changes needed? To show the correct version added. ### Does this PR introduce _any_ user-facing change? Yes but only in the unreleased branches. It will change the version shown in SQL documentation. ### How was this patch tested? Not tested. Jenkins will test it out. ### Was this patch authored or co-authored using generative AI tooling? No Closes #51318 from atongpu/SPARK-52611. Authored-by: Dongpu Li Signed-off-by: Hyukjin Kwon (cherry picked from commit 9fcacba60c5acbf57319db1f1851355eee18ca89) Signed-off-by: Hyukjin Kwon --- .../src/main/scala/org/apache/spark/sql/internal/SQLConf.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 8186d5fa00c3a..ca6938588ddb3 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -4398,7 +4398,7 @@ object SQLConf { .internal() .doc("When true, exclude the references from the subquery expressions (in, exists, etc.) " + s"while removing redundant aliases.") - .version("4.0.0") + .version("3.5.1") .booleanConf .createWithDefault(true) From a53a9c4d77377af9fbd648a8d9b528754d657aea Mon Sep 17 00:00:00 2001 From: PJ Fanning Date: Mon, 30 Jun 2025 22:34:00 +0800 Subject: [PATCH 24/86] [SPARK-52381][CORE][3.5] JsonProtocol: Only accept subclasses of SparkListenerEvent ### What changes were proposed in this pull request? JsonProtocol tidy up. Only parse JSON relating to Spark events. https://issues.apache.org/jira/browse/SPARK-52381 ### Why are the changes needed? Tidier code and https://lists.apache.org/thread/9zwkdo85wcdfppgqvbhjly8wdgf595yp ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? Unit test ### Was this patch authored or co-authored using generative AI tooling? No Closes #51323 from pjfanning/SPARK-52381-br3.5. 
Authored-by: PJ Fanning Signed-off-by: yangjie01 --- .../org/apache/spark/util/JsonProtocol.scala | 10 +++++-- .../apache/spark/util/JsonProtocolSuite.scala | 30 +++++++++++++++++++ 2 files changed, 38 insertions(+), 2 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala b/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala index 377caf776deb0..3b4bc242b4668 100644 --- a/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala +++ b/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala @@ -874,8 +874,14 @@ private[spark] object JsonProtocol extends JsonUtils { case `stageExecutorMetrics` => stageExecutorMetricsFromJson(json) case `blockUpdate` => blockUpdateFromJson(json) case `resourceProfileAdded` => resourceProfileAddedFromJson(json) - case other => mapper.readValue(json.toString, Utils.classForName(other)) - .asInstanceOf[SparkListenerEvent] + case other => + val otherClass = Utils.classForName(other) + if (classOf[SparkListenerEvent].isAssignableFrom(otherClass)) { + mapper.readValue(json.toString, otherClass) + .asInstanceOf[SparkListenerEvent] + } else { + throw new SparkException(s"Unknown event type: $other") + } } } diff --git a/core/src/test/scala/org/apache/spark/util/JsonProtocolSuite.scala b/core/src/test/scala/org/apache/spark/util/JsonProtocolSuite.scala index 8105df64705a4..8af8c8579232f 100644 --- a/core/src/test/scala/org/apache/spark/util/JsonProtocolSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/JsonProtocolSuite.scala @@ -874,6 +874,36 @@ class JsonProtocolSuite extends SparkFunSuite { val jobFailedEvent = JsonProtocol.sparkEventFromJson(exJobFailureNoStackJson) testEvent(jobFailedEvent, exJobFailureExpectedJson) } + + test("SPARK-52381: handle class not found") { + val unknownJson = + """{ + | "Event" : "com.example.UnknownEvent", + | "foo" : "foo" + |}""".stripMargin + try { + JsonProtocol.sparkEventFromJson(unknownJson) + fail("Expected ClassNotFoundException for unknown event type") + } catch { + case e: ClassNotFoundException => + } + } + + test("SPARK-52381: only read classes that extend SparkListenerEvent") { + val unknownJson = + """{ + | "Event" : "org.apache.spark.SparkException", + | "foo" : "foo" + |}""".stripMargin + try { + JsonProtocol.sparkEventFromJson(unknownJson) + fail("Expected SparkException for unknown event type") + } catch { + case e: SparkException => + assert(e.getMessage.startsWith("Unknown event type")) + } + } + } From 3f2a3ba440d9a5e6d719f739f7e026f57a8e1c32 Mon Sep 17 00:00:00 2001 From: Emil Ejbyfeldt Date: Tue, 1 Jul 2025 09:37:36 -0400 Subject: [PATCH 25/86] [SPARK-52023][SQL] Fix data corruption/segfault returning Option[Product] from udaf ### What changes were proposed in this pull request? This fixes so defining a udaf returning a `Option[Product]` produces correct results instead of the current behavior. Where it throws an exception, segfaults or produces incorrect results. ### Why are the changes needed? Fix correctness issue. ### Does this PR introduce _any_ user-facing change? Fixes a correctness issue. ### How was this patch tested? Existing and new unittest. ### Was this patch authored or co-authored using generative AI tooling? No Closes #50827 from eejbyfeldt/SPARK-52023. 
Authored-by: Emil Ejbyfeldt Signed-off-by: Herman van Hovell (cherry picked from commit 5e6e8f12a50340f19f67659d0161326cd5304bcf) Signed-off-by: Herman van Hovell --- .../spark/sql/execution/aggregate/udaf.scala | 2 +- .../sql/hive/execution/UDAQuerySuite.scala | 27 +++++++++++++++++++ 2 files changed, 28 insertions(+), 1 deletion(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/udaf.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/udaf.scala index e517376bc5fc0..fe6307b5bbe86 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/udaf.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/udaf.scala @@ -530,7 +530,7 @@ case class ScalaAggregator[IN, BUF, OUT]( def eval(buffer: BUF): Any = { val row = outputSerializer(agg.finish(buffer)) - if (outputEncoder.isSerializedAsStruct) row else row.get(0, dataType) + if (outputEncoder.isSerializedAsStructForTopLevel) row else row.get(0, dataType) } private[this] lazy val bufferRow = new UnsafeRow(bufferEncoder.namedExpressions.length) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/UDAQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/UDAQuerySuite.scala index 0bd6b1403d39c..daa08666ea0e6 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/UDAQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/UDAQuerySuite.scala @@ -60,6 +60,22 @@ object LongProductSumAgg extends Aggregator[(jlLong, jlLong), Long, jlLong] { def outputEncoder: Encoder[jlLong] = Encoders.LONG } +final case class Reduce[T: Encoder](r: (T, T) => T)(implicit i: Encoder[Option[T]]) + extends Aggregator[T, Option[T], T] { + def zero: Option[T] = None + def reduce(b: Option[T], a: T): Option[T] = Some(b.fold(a)(r(_, a))) + def merge(b1: Option[T], b2: Option[T]): Option[T] = + (b1, b2) match { + case (Some(a), Some(b)) => Some(r(a, b)) + case (Some(a), None) => Some(a) + case (None, Some(b)) => Some(b) + case (None, None) => None + } + def finish(reduction: Option[T]): T = reduction.get + def bufferEncoder: Encoder[Option[T]] = implicitly + def outputEncoder: Encoder[T] = implicitly +} + @SQLUserDefinedType(udt = classOf[CountSerDeUDT]) case class CountSerDeSQL(nSer: Int, nDeSer: Int, sum: Int) @@ -180,6 +196,9 @@ abstract class UDAQuerySuite extends QueryTest with SQLTestUtils with TestHiveSi val data4 = Seq[Boolean](true, false, true).toDF("boolvalues") data4.write.saveAsTable("agg4") + val data5 = Seq[(Int, (Int, Int))]((1, (2, 3))).toDF("key", "value") + data5.write.saveAsTable("agg5") + val emptyDF = spark.createDataFrame( sparkContext.emptyRDD[Row], StructType(StructField("key", StringType) :: StructField("value", IntegerType) :: Nil)) @@ -190,6 +209,8 @@ abstract class UDAQuerySuite extends QueryTest with SQLTestUtils with TestHiveSi spark.udf.register("mydoubleavg", udaf(MyDoubleAvgAgg)) spark.udf.register("longProductSum", udaf(LongProductSumAgg)) spark.udf.register("arraysum", udaf(ArrayDataAgg)) + spark.udf.register("reduceOptionPair", udaf(Reduce[Option[(Int, Int)]]( + (opt1, opt2) => opt1.zip(opt2).map { case ((a1, b1), (a2, b2)) => (a1 + a2, b1 + b2) }))) } override def afterAll(): Unit = { @@ -371,6 +392,12 @@ abstract class UDAQuerySuite extends QueryTest with SQLTestUtils with TestHiveSi Row(Seq(12.0, 15.0, 18.0)) :: Nil) } + test("SPARK-52023: Returning Option[Product] from udaf") { + checkAnswer( + spark.sql("SELECT reduceOptionPair(value) FROM agg5 GROUP BY key"), 
+ Row(Row(2, 3)) :: Nil) + } + test("verify aggregator ser/de behavior") { val data = sparkContext.parallelize((1 to 100).toSeq, 3).toDF("value1") val agg = udaf(CountSerDeAgg) From a56879e67a11967b54812a8365d5bedcfc3d1861 Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Tue, 1 Jul 2025 08:12:19 -0700 Subject: [PATCH 26/86] Revert "[SPARK-52023][SQL] Fix data corruption/segfault returning Option[Product] from udaf" This reverts commit 3f2a3ba440d9a5e6d719f739f7e026f57a8e1c32. --- .../spark/sql/execution/aggregate/udaf.scala | 2 +- .../sql/hive/execution/UDAQuerySuite.scala | 27 ------------------- 2 files changed, 1 insertion(+), 28 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/udaf.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/udaf.scala index fe6307b5bbe86..e517376bc5fc0 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/udaf.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/udaf.scala @@ -530,7 +530,7 @@ case class ScalaAggregator[IN, BUF, OUT]( def eval(buffer: BUF): Any = { val row = outputSerializer(agg.finish(buffer)) - if (outputEncoder.isSerializedAsStructForTopLevel) row else row.get(0, dataType) + if (outputEncoder.isSerializedAsStruct) row else row.get(0, dataType) } private[this] lazy val bufferRow = new UnsafeRow(bufferEncoder.namedExpressions.length) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/UDAQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/UDAQuerySuite.scala index daa08666ea0e6..0bd6b1403d39c 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/UDAQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/UDAQuerySuite.scala @@ -60,22 +60,6 @@ object LongProductSumAgg extends Aggregator[(jlLong, jlLong), Long, jlLong] { def outputEncoder: Encoder[jlLong] = Encoders.LONG } -final case class Reduce[T: Encoder](r: (T, T) => T)(implicit i: Encoder[Option[T]]) - extends Aggregator[T, Option[T], T] { - def zero: Option[T] = None - def reduce(b: Option[T], a: T): Option[T] = Some(b.fold(a)(r(_, a))) - def merge(b1: Option[T], b2: Option[T]): Option[T] = - (b1, b2) match { - case (Some(a), Some(b)) => Some(r(a, b)) - case (Some(a), None) => Some(a) - case (None, Some(b)) => Some(b) - case (None, None) => None - } - def finish(reduction: Option[T]): T = reduction.get - def bufferEncoder: Encoder[Option[T]] = implicitly - def outputEncoder: Encoder[T] = implicitly -} - @SQLUserDefinedType(udt = classOf[CountSerDeUDT]) case class CountSerDeSQL(nSer: Int, nDeSer: Int, sum: Int) @@ -196,9 +180,6 @@ abstract class UDAQuerySuite extends QueryTest with SQLTestUtils with TestHiveSi val data4 = Seq[Boolean](true, false, true).toDF("boolvalues") data4.write.saveAsTable("agg4") - val data5 = Seq[(Int, (Int, Int))]((1, (2, 3))).toDF("key", "value") - data5.write.saveAsTable("agg5") - val emptyDF = spark.createDataFrame( sparkContext.emptyRDD[Row], StructType(StructField("key", StringType) :: StructField("value", IntegerType) :: Nil)) @@ -209,8 +190,6 @@ abstract class UDAQuerySuite extends QueryTest with SQLTestUtils with TestHiveSi spark.udf.register("mydoubleavg", udaf(MyDoubleAvgAgg)) spark.udf.register("longProductSum", udaf(LongProductSumAgg)) spark.udf.register("arraysum", udaf(ArrayDataAgg)) - spark.udf.register("reduceOptionPair", udaf(Reduce[Option[(Int, Int)]]( - (opt1, opt2) => opt1.zip(opt2).map { case ((a1, b1), (a2, b2)) => (a1 + a2, b1 + b2) }))) } 
override def afterAll(): Unit = { @@ -392,12 +371,6 @@ abstract class UDAQuerySuite extends QueryTest with SQLTestUtils with TestHiveSi Row(Seq(12.0, 15.0, 18.0)) :: Nil) } - test("SPARK-52023: Returning Option[Product] from udaf") { - checkAnswer( - spark.sql("SELECT reduceOptionPair(value) FROM agg5 GROUP BY key"), - Row(Row(2, 3)) :: Nil) - } - test("verify aggregator ser/de behavior") { val data = sparkContext.parallelize((1 to 100).toSeq, 3).toDF("value1") val agg = udaf(CountSerDeAgg) From 1c408c31941baf005be6f5bc294128b2ac177815 Mon Sep 17 00:00:00 2001 From: Emil Ejbyfeldt Date: Wed, 2 Jul 2025 06:51:40 -0700 Subject: [PATCH 27/86] [SPARK-52023][SQL][3.5] Fix data corruption/segfault returning Option[Product] from udaf ### What changes were proposed in this pull request? This fixes so defining a udaf returning a `Option[Product]` produces correct results instead of the current behavior. Where it throws an exception, segfaults or produces incorrect results. ### Why are the changes needed? Fix correctness issue. ### Does this PR introduce _any_ user-facing change? Fixes a correctness issue. ### How was this patch tested? Existing and new unittest. ### Was this patch authored or co-authored using generative AI tooling? No Closes #51347 from eejbyfeldt/3.5-SPARK-52023. Authored-by: Emil Ejbyfeldt Signed-off-by: Dongjoon Hyun --- .../spark/sql/execution/aggregate/udaf.scala | 2 +- .../sql/hive/execution/UDAQuerySuite.scala | 28 +++++++++++++++++++ 2 files changed, 29 insertions(+), 1 deletion(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/udaf.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/udaf.scala index e517376bc5fc0..fe6307b5bbe86 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/udaf.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/udaf.scala @@ -530,7 +530,7 @@ case class ScalaAggregator[IN, BUF, OUT]( def eval(buffer: BUF): Any = { val row = outputSerializer(agg.finish(buffer)) - if (outputEncoder.isSerializedAsStruct) row else row.get(0, dataType) + if (outputEncoder.isSerializedAsStructForTopLevel) row else row.get(0, dataType) } private[this] lazy val bufferRow = new UnsafeRow(bufferEncoder.namedExpressions.length) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/UDAQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/UDAQuerySuite.scala index 0bd6b1403d39c..31d0452c70617 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/UDAQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/UDAQuerySuite.scala @@ -60,6 +60,22 @@ object LongProductSumAgg extends Aggregator[(jlLong, jlLong), Long, jlLong] { def outputEncoder: Encoder[jlLong] = Encoders.LONG } +final case class Reduce[T: Encoder](r: (T, T) => T)(implicit i: Encoder[Option[T]]) + extends Aggregator[T, Option[T], T] { + def zero: Option[T] = None + def reduce(b: Option[T], a: T): Option[T] = Some(b.fold(a)(r(_, a))) + def merge(b1: Option[T], b2: Option[T]): Option[T] = + (b1, b2) match { + case (Some(a), Some(b)) => Some(r(a, b)) + case (Some(a), None) => Some(a) + case (None, Some(b)) => Some(b) + case (None, None) => None + } + def finish(reduction: Option[T]): T = reduction.get + def bufferEncoder: Encoder[Option[T]] = implicitly + def outputEncoder: Encoder[T] = implicitly +} + @SQLUserDefinedType(udt = classOf[CountSerDeUDT]) case class CountSerDeSQL(nSer: Int, nDeSer: Int, sum: Int) @@ -180,6 +196,9 @@ 
abstract class UDAQuerySuite extends QueryTest with SQLTestUtils with TestHiveSi val data4 = Seq[Boolean](true, false, true).toDF("boolvalues") data4.write.saveAsTable("agg4") + val data5 = Seq[(Int, (Int, Int))]((1, (2, 3))).toDF("key", "value") + data5.write.saveAsTable("agg5") + val emptyDF = spark.createDataFrame( sparkContext.emptyRDD[Row], StructType(StructField("key", StringType) :: StructField("value", IntegerType) :: Nil)) @@ -190,6 +209,9 @@ abstract class UDAQuerySuite extends QueryTest with SQLTestUtils with TestHiveSi spark.udf.register("mydoubleavg", udaf(MyDoubleAvgAgg)) spark.udf.register("longProductSum", udaf(LongProductSumAgg)) spark.udf.register("arraysum", udaf(ArrayDataAgg)) + spark.udf.register("reduceOptionPair", udaf(Reduce[Option[(Int, Int)]]( + (opt1, opt2) => + opt1.zip(opt2).map { case ((a1, b1), (a2, b2)) => (a1 + a2, b1 + b2) }.headOption))) } override def afterAll(): Unit = { @@ -371,6 +393,12 @@ abstract class UDAQuerySuite extends QueryTest with SQLTestUtils with TestHiveSi Row(Seq(12.0, 15.0, 18.0)) :: Nil) } + test("SPARK-52023: Returning Option[Product] from udaf") { + checkAnswer( + spark.sql("SELECT reduceOptionPair(value) FROM agg5 GROUP BY key"), + Row(Row(2, 3)) :: Nil) + } + test("verify aggregator ser/de behavior") { val data = sparkContext.parallelize((1 to 100).toSeq, 3).toDF("value1") val agg = udaf(CountSerDeAgg) From 20c9add2462cf361067cd71a082c24903fa08978 Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Fri, 4 Jul 2025 16:28:24 -0700 Subject: [PATCH 28/86] [SPARK-52635][BUILD][3.5] Upgrade ORC to 1.9.7 ### What changes were proposed in this pull request? This PR aims to upgrade ORC to 1.9.7. ### Why are the changes needed? To bring the latest bug fixes. - https://github.com/apache/orc/issues/2226 Here is the full release note. - https://github.com/apache/orc/releases/tag/v1.9.7 - https://orc.apache.org/news/2025/07/04/ORC-1.9.7/ ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Pass the CIs. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #51336 from dongjoon-hyun/orc197. 
Authored-by: Dongjoon Hyun Signed-off-by: Dongjoon Hyun --- dev/deps/spark-deps-hadoop-3-hive-2.3 | 6 +++--- pom.xml | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/dev/deps/spark-deps-hadoop-3-hive-2.3 b/dev/deps/spark-deps-hadoop-3-hive-2.3 index dbf0cb34c5353..b4034c171fd3a 100644 --- a/dev/deps/spark-deps-hadoop-3-hive-2.3 +++ b/dev/deps/spark-deps-hadoop-3-hive-2.3 @@ -212,9 +212,9 @@ opencsv/2.3//opencsv-2.3.jar opentracing-api/0.33.0//opentracing-api-0.33.0.jar opentracing-noop/0.33.0//opentracing-noop-0.33.0.jar opentracing-util/0.33.0//opentracing-util-0.33.0.jar -orc-core/1.9.6/shaded-protobuf/orc-core-1.9.6-shaded-protobuf.jar -orc-mapreduce/1.9.6/shaded-protobuf/orc-mapreduce-1.9.6-shaded-protobuf.jar -orc-shims/1.9.6//orc-shims-1.9.6.jar +orc-core/1.9.7/shaded-protobuf/orc-core-1.9.7-shaded-protobuf.jar +orc-mapreduce/1.9.7/shaded-protobuf/orc-mapreduce-1.9.7-shaded-protobuf.jar +orc-shims/1.9.7//orc-shims-1.9.7.jar oro/2.0.8//oro-2.0.8.jar osgi-resource-locator/1.0.3//osgi-resource-locator-1.0.3.jar paranamer/2.8//paranamer-2.8.jar diff --git a/pom.xml b/pom.xml index 4f912329beed6..3a4530374bfee 100644 --- a/pom.xml +++ b/pom.xml @@ -141,7 +141,7 @@ 10.14.2.0 1.13.1 - 1.9.6 + 1.9.7 shaded-protobuf 9.4.56.v20240826 4.0.3 From 029503a1fe2e99a519913361448c86f970d04778 Mon Sep 17 00:00:00 2001 From: Kent Yao Date: Mon, 7 Jul 2025 09:29:04 -0700 Subject: [PATCH 29/86] [SPARK-52684][SQL] Make CACHE TABLE Commands atomic while encountering execution errors ### What changes were proposed in this pull request? This PR makes CACHE TABLE commands atomic while encountering execution errors ### Why are the changes needed? For now, when an AnalysisException occurs, no cache or view will be created, but an execution one occurs, a view or an erroneous 'cache' is created. ### Does this PR introduce _any_ user-facing change? Yes, but it's a bugfix. It only affects rare corner case that a user leverages this bug to create an erroneous 'cache'/view for some particular purposes ### How was this patch tested? new tests ### Was this patch authored or co-authored using generative AI tooling? no Closes #51386 from yaooqinn/SPARK-52684-35. 
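For context, a minimal sketch of the intended post-fix behavior, assuming a running `spark` session; the view name `demo_cache` is hypothetical and simply mirrors the test added below:

```scala
import org.apache.spark.SparkException

// Eager caching executes the query, so raise_error fails at execution time.
try {
  spark.sql("CACHE TABLE demo_cache AS SELECT raise_error('boom') AS c1")
} catch {
  case e: SparkException =>
    println(s"cache command failed as expected: ${e.getMessage}")
}

// With the command made atomic, neither the temp view nor an erroneous
// cache entry should be left behind after the failure.
assert(!spark.catalog.tableExists("demo_cache"))
```

Before this change the failed command could leave the temp view (or a bad cache entry) behind; the new test below asserts it no longer does.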
Authored-by: Kent Yao Signed-off-by: Dongjoon Hyun --- .../datasources/v2/CacheTableExec.scala | 29 +++++++++++++++++-- .../apache/spark/sql/CachedTableSuite.scala | 11 ++++++- 2 files changed, 37 insertions(+), 3 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/CacheTableExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/CacheTableExec.scala index 8c14b5e370736..691629e64956a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/CacheTableExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/CacheTableExec.scala @@ -19,6 +19,8 @@ package org.apache.spark.sql.execution.datasources.v2 import java.util.Locale +import scala.util.control.NonFatal + import org.apache.spark.sql.{DataFrame, Dataset} import org.apache.spark.sql.catalyst.{InternalRow, TableIdentifier} import org.apache.spark.sql.catalyst.analysis.LocalTempView @@ -26,8 +28,10 @@ import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap import org.apache.spark.sql.connector.catalog.CatalogV2Implicits.MultipartIdentifierHelper -import org.apache.spark.sql.execution.command.CreateViewCommand +import org.apache.spark.sql.connector.catalog.Identifier +import org.apache.spark.sql.execution.command.{CreateViewCommand, DropTempViewCommand} import org.apache.spark.storage.StorageLevel +import org.apache.spark.util.Utils trait BaseCacheTableExec extends LeafV2CommandExec { def relationName: String @@ -60,7 +64,16 @@ trait BaseCacheTableExec extends LeafV2CommandExec { if (!isLazy) { // Performs eager caching. - dataFrameForCachedPlan.count() + try { + dataFrameForCachedPlan.count() + } catch { + case NonFatal(e) => + // If the query fails, we should remove the cached table. 
+ Utils.tryLogNonFatalError { + session.sharedState.cacheManager.uncacheQuery(session, planToCache, cascade = false) + } + throw e + } } Seq.empty @@ -113,6 +126,18 @@ case class CacheTableAsSelectExec( override lazy val dataFrameForCachedPlan: DataFrame = { session.table(tempViewName) } + + override def run(): Seq[InternalRow] = { + try { + super.run() + } catch { + case NonFatal(e) => + Utils.tryLogNonFatalError { + DropTempViewCommand(Identifier.of(Array.empty, tempViewName)).run(session) + } + throw e + } + } } case class UncacheTableExec( diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala index 9815cb816c994..e54947266951b 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala @@ -26,7 +26,7 @@ import scala.concurrent.duration._ import org.apache.commons.io.FileUtils -import org.apache.spark.CleanerListener +import org.apache.spark.{CleanerListener, SparkException} import org.apache.spark.executor.DataReadMethod._ import org.apache.spark.executor.DataReadMethod.DataReadMethod import org.apache.spark.scheduler.{SparkListener, SparkListenerEvent, SparkListenerJobStart} @@ -1729,4 +1729,13 @@ class CachedTableSuite extends QueryTest with SQLTestUtils } } + + test("SPARK-52684: Atomicity of cache table on error") { + withTempView("SPARK_52684") { + intercept[SparkException] { + spark.sql("CACHE TABLE SPARK_52684 AS SELECT raise_error('SPARK-52684') AS c1") + } + assert(!spark.catalog.tableExists("SPARK_52684")) + } + } } From 1832d014fa6589d05c2cd73937f34e954fecdd82 Mon Sep 17 00:00:00 2001 From: Hyukjin Kwon Date: Tue, 8 Jul 2025 18:08:00 +0900 Subject: [PATCH 30/86] [SPARK-52707][BUILD] Remove preview postfix when looking up the JIRA versions ### What changes were proposed in this pull request? This PR proposes to remove preview postfix when looking up the JIRA versions ### Why are the changes needed? Otherwise, preview builds fail. ### Does this PR introduce _any_ user-facing change? No, dev-only. ### How was this patch tested? Manually ### Was this patch authored or co-authored using generative AI tooling? No. Closes #51399 from HyukjinKwon/SPARK-52707. 
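For illustration only, the version normalization performed by the `sed` in the script change below, sketched in Scala with a hypothetical version string:

```scala
// Strip the "-previewN" postfix so the JIRA version lookup matches the base version.
val sparkVersion = "4.1.0-preview1"
val sparkVersionBase = sparkVersion.replaceAll("-preview[0-9]*", "")
assert(sparkVersionBase == "4.1.0")
```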
Authored-by: Hyukjin Kwon Signed-off-by: Hyukjin Kwon (cherry picked from commit 51bbae091885647e1b835fdd7dcef760194e3bf0) Signed-off-by: Hyukjin Kwon --- dev/create-release/release-build.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dev/create-release/release-build.sh b/dev/create-release/release-build.sh index 3b19786e562be..cd2c42645fd5a 100755 --- a/dev/create-release/release-build.sh +++ b/dev/create-release/release-build.sh @@ -931,11 +931,12 @@ if [[ "$1" == "publish-release" ]]; then DEADLINE=$(TZ=America/Los_Angeles date -d "+4 days" "+%a, %d %b %Y %H:%M:%S %Z") JIRA_API_URL="https://issues.apache.org/jira/rest/api/2/project/SPARK/versions" + SPARK_VERSION_BASE=$(echo "$SPARK_VERSION" | sed 's/-preview[0-9]*//') JIRA_VERSION_ID=$(curl -s "$JIRA_API_URL" | \ # Split JSON objects by replacing '},{' with a newline-separated pattern tr '}' '\n' | \ # Find the block containing the exact version name - grep -F "\"name\":\"$SPARK_VERSION\"" -A 5 | \ + grep -F "\"name\":\"$SPARK_VERSION_BASE\"" -A 5 | \ # Extract the line with "id" grep '"id"' | \ # Extract the numeric id value (assuming "id":"123456") From fb9cd102620991c53390e1dc62cff320132e385c Mon Sep 17 00:00:00 2001 From: Kent Yao Date: Wed, 9 Jul 2025 17:39:42 +0800 Subject: [PATCH 31/86] [SPARK-52721][PYTHON] Fix message parameter for CANNOT_PARSE_DATATYPE Fix message parameter for CANNOT_PARSE_DATATYPE bugfix No - Before AssertionError: Undefined error message parameter for error class: CANNOT_PARSE_DATATYPE. Parameters: {'error': - After pyspark.errors.exceptions.base.PySparkValueError: [CANNOT_PARSE_DATATYPE] Unable to parse datatype. no Closes #51414 from yaooqinn/SPARK-52721. Authored-by: Kent Yao Signed-off-by: Kent Yao (cherry picked from commit c6e864538e2d8e3c816ba0f0a7bb9e212adc0a9d) Signed-off-by: Kent Yao --- python/pyspark/sql/classic/dataframe.py | 0 python/pyspark/sql/dataframe.py | 2 +- python/pyspark/sql/types.py | 2 +- 3 files changed, 2 insertions(+), 2 deletions(-) create mode 100644 python/pyspark/sql/classic/dataframe.py diff --git a/python/pyspark/sql/classic/dataframe.py b/python/pyspark/sql/classic/dataframe.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py index afa979dab019e..c52fa568b7b31 100644 --- a/python/pyspark/sql/dataframe.py +++ b/python/pyspark/sql/dataframe.py @@ -576,7 +576,7 @@ def schema(self) -> StructType: except Exception as e: raise PySparkValueError( error_class="CANNOT_PARSE_DATATYPE", - message_parameters={"error": str(e)}, + message_parameters={"msg": str(e)}, ) return self._schema diff --git a/python/pyspark/sql/types.py b/python/pyspark/sql/types.py index a2a8796957623..d4d6b6b086aeb 100644 --- a/python/pyspark/sql/types.py +++ b/python/pyspark/sql/types.py @@ -1411,7 +1411,7 @@ def _parse_datatype_json_value(json_value: Union[dict, str]) -> DataType: else: raise PySparkValueError( error_class="CANNOT_PARSE_DATATYPE", - message_parameters={"error": str(json_value)}, + message_parameters={"msg": str(json_value)}, ) else: tpe = json_value["type"] From 61032721449bb778cb608693997fae5982de3a29 Mon Sep 17 00:00:00 2001 From: Hyukjin Kwon Date: Thu, 10 Jul 2025 16:12:47 +0900 Subject: [PATCH 32/86] [SPARK-52749][BUILD] Replace preview1 to dev1 in its PyPI package name in the vote email ### What changes were proposed in this pull request? This PR proposes to replace preview1 to dev1 in its PyPI package name in the vote email. ### Why are the changes needed? 
Otherwise, it fails to download, e.g., `pip install https://dist.apache.org/repos/dist/dev/spark/v4.1.0-preview1-rc1-bin/pyspark-4.1.0-preview1.tar.gz`. It has to be `pip install https://dist.apache.org/repos/dist/dev/spark/v4.1.0-preview1-rc1-bin/pyspark-4.1.0.dev1.tar.gz` ### Does this PR introduce _any_ user-facing change? No, dev-only. ### How was this patch tested? Manually ### Was this patch authored or co-authored using generative AI tooling? No. Closes #51436 from HyukjinKwon/change-email-version. Authored-by: Hyukjin Kwon Signed-off-by: Hyukjin Kwon (cherry picked from commit 35baa9755440b2f31a58b286b9cbfb6041cc189e) Signed-off-by: Hyukjin Kwon --- dev/create-release/release-build.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dev/create-release/release-build.sh b/dev/create-release/release-build.sh index cd2c42645fd5a..8c7056e4d1918 100755 --- a/dev/create-release/release-build.sh +++ b/dev/create-release/release-build.sh @@ -929,6 +929,7 @@ if [[ "$1" == "publish-release" ]]; then # Calculate deadline in Pacific Time (PST/PDT) DEADLINE=$(TZ=America/Los_Angeles date -d "+4 days" "+%a, %d %b %Y %H:%M:%S %Z") + PYSPARK_VERSION=`echo "$RELEASE_VERSION" | sed -e "s/-/./" -e "s/preview/dev/"` JIRA_API_URL="https://issues.apache.org/jira/rest/api/2/project/SPARK/versions" SPARK_VERSION_BASE=$(echo "$SPARK_VERSION" | sed 's/-preview[0-9]*//') @@ -1008,7 +1009,7 @@ EOF echo "reporting any regressions." echo echo "If you're working in PySpark you can set up a virtual env and install" - echo "the current RC via \"pip install https://dist.apache.org/repos/dist/dev/spark/${GIT_REF}-bin/pyspark-${SPARK_VERSION}.tar.gz\"" + echo "the current RC via \"pip install https://dist.apache.org/repos/dist/dev/spark/${GIT_REF}-bin/pyspark-${PYSPARK_VERSION}.tar.gz\"" echo "and see if anything important breaks." echo "In the Java/Scala, you can add the staging repository to your project's resolvers and test" echo "with the RC (make sure to clean up the artifact cache before/after so" From 157c7ecb2d630d1b95c177b6d509dae0314e6f1f Mon Sep 17 00:00:00 2001 From: Kent Yao Date: Thu, 10 Jul 2025 15:28:35 +0800 Subject: [PATCH 33/86] [SPARK-52721][PYTHON][HOTFIX] Fix message parameter for CANNOT_PARSE_DATATYPE --- python/pyspark/sql/classic/dataframe.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 python/pyspark/sql/classic/dataframe.py diff --git a/python/pyspark/sql/classic/dataframe.py b/python/pyspark/sql/classic/dataframe.py deleted file mode 100644 index e69de29bb2d1d..0000000000000 From 218d2926e227b295f1e5682772f6cf4839a0481c Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Wed, 16 Jul 2025 00:09:26 -0700 Subject: [PATCH 34/86] [SPARK-52809][SQL] Don't hold reader and iterator references for all partitions in task completion listeners for metric update This patch adds only one task completion listener for metric updating for `DataSourceRDD`, instead of adding separate one for each partition iterator. For each partition iterator, currently we add one task completion listener used to update final metrics if the task is stopped early. In the listener, the reader and iterator are held. So if the partition is normally exhausted, the references cannot be released early. It is a problem especially if the references are heavy as reported by https://github.com/apache/iceberg/issues/13297. Since the purpose of the callback is to update the final metrics if the task is stopped early, it means that we only need to do it for the last partition iterator. 
So we don't need set up a listener for each partition iterator. Thus, we can set up just one listener for all iterator. Once we advance to next partition iterator, we can update the update target (reader and partition iterator). No Existing tests. No Closes #51503 from viirya/fix_metric_callback. Authored-by: Liang-Chi Hsieh Signed-off-by: Liang-Chi Hsieh (cherry picked from commit cea0051f71e8b50e8b9e19fb0ff4fb100e82e3d0) Signed-off-by: Liang-Chi Hsieh --- .../datasources/v2/DataSourceRDD.scala | 40 +++++++++++++++---- 1 file changed, 33 insertions(+), 7 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceRDD.scala index 67e77a97865df..36872f232e7ed 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceRDD.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceRDD.scala @@ -60,6 +60,14 @@ class DataSourceRDD( private var currentIter: Option[Iterator[Object]] = None private var currentIndex: Int = 0 + private val partitionMetricCallback = new PartitionMetricCallback(customMetrics) + + // In case of early stopping before consuming the entire iterator, + // we need to do one more metric update at the end of the task. + context.addTaskCompletionListener[Unit] { _ => + partitionMetricCallback.execute() + } + override def hasNext: Boolean = currentIter.exists(_.hasNext) || advanceToNextIter() override def next(): Object = { @@ -86,13 +94,10 @@ class DataSourceRDD( new PartitionIterator[InternalRow](rowReader, customMetrics)) (iter, rowReader) } - context.addTaskCompletionListener[Unit] { _ => - // In case of early stopping before consuming the entire iterator, - // we need to do one more metric update at the end of the task. - CustomMetrics.updateMetrics(reader.currentMetricsValues, customMetrics) - iter.forceUpdateMetrics() - reader.close() - } + + // Once we advance to the next partition, update the metric callback for early finish + partitionMetricCallback.advancePartition(iter, reader) + currentIter = Some(iter) hasNext } @@ -107,6 +112,27 @@ class DataSourceRDD( } } +private class PartitionMetricCallback + (customMetrics: Map[String, SQLMetric]) { + private var iter: MetricsIterator[_] = null + private var reader: PartitionReader[_] = null + + def advancePartition(iter: MetricsIterator[_], reader: PartitionReader[_]): Unit = { + execute() + + this.iter = iter + this.reader = reader + } + + def execute(): Unit = { + if (iter != null && reader != null) { + CustomMetrics.updateMetrics(reader.currentMetricsValues, customMetrics) + iter.forceUpdateMetrics() + reader.close() + } + } +} + private class PartitionIterator[T]( reader: PartitionReader[T], customMetrics: Map[String, SQLMetric]) extends Iterator[T] { From eb123a127ebabd9c146ec3e5955745e6142bc3eb Mon Sep 17 00:00:00 2001 From: zml1206 Date: Wed, 16 Jul 2025 17:06:51 +0800 Subject: [PATCH 35/86] [SPARK-46941][SQL][3.5] Can't insert window group limit node for top-k computation if contains SizeBasedWindowFunction ### What changes were proposed in this pull request? This PR backports #44980 to branch-3.5. Don't insert window group limit node for top-k computation if contains `SizeBasedWindowFunction`. ### Why are the changes needed? Bug fix, Insert window group limit node for top-k computation contains `SizeBasedWindowFunction` will cause wrong result of the SizeBasedWindowFunction`. 
### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? New UT. Before this pr UT will not pass. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #51422 from zml1206/SPARK-46941-3.5. Authored-by: zml1206 Signed-off-by: Wenchen Fan --- .../optimizer/InferWindowGroupLimit.scala | 11 ++++---- .../InferWindowGroupLimitSuite.scala | 18 ++++++++++++- .../sql/DataFrameWindowFunctionsSuite.scala | 27 +++++++++++++++++++ 3 files changed, 50 insertions(+), 6 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/InferWindowGroupLimit.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/InferWindowGroupLimit.scala index 04204c6a2e108..f2e99721e9261 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/InferWindowGroupLimit.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/InferWindowGroupLimit.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql.catalyst.optimizer -import org.apache.spark.sql.catalyst.expressions.{Alias, Attribute, CurrentRow, DenseRank, EqualTo, Expression, GreaterThan, GreaterThanOrEqual, IntegerLiteral, LessThan, LessThanOrEqual, Literal, NamedExpression, PredicateHelper, Rank, RowFrame, RowNumber, SpecifiedWindowFrame, UnboundedPreceding, WindowExpression, WindowSpecDefinition} +import org.apache.spark.sql.catalyst.expressions.{Alias, Attribute, CurrentRow, DenseRank, EqualTo, Expression, GreaterThan, GreaterThanOrEqual, IntegerLiteral, LessThan, LessThanOrEqual, Literal, NamedExpression, PredicateHelper, Rank, RowFrame, RowNumber, SizeBasedWindowFunction, SpecifiedWindowFrame, UnboundedPreceding, WindowExpression, WindowSpecDefinition} import org.apache.spark.sql.catalyst.plans.logical.{Filter, Limit, LocalRelation, LogicalPlan, Window, WindowGroupLimit} import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.catalyst.trees.TreePattern.{FILTER, WINDOW} @@ -53,13 +53,14 @@ object InferWindowGroupLimit extends Rule[LogicalPlan] with PredicateHelper { } /** - * All window expressions should use the same expanding window, so that - * we can safely do the early stop. + * All window expressions should use the same expanding window and do not contains + * `SizeBasedWindowFunction`, so that we can safely do the early stop. 
*/ private def isExpandingWindow( windowExpression: NamedExpression): Boolean = windowExpression match { - case Alias(WindowExpression(_, WindowSpecDefinition(_, _, - SpecifiedWindowFrame(RowFrame, UnboundedPreceding, CurrentRow))), _) => true + case Alias(WindowExpression(windowFunction, WindowSpecDefinition(_, _, + SpecifiedWindowFrame(RowFrame, UnboundedPreceding, CurrentRow))), _) + if !windowFunction.isInstanceOf[SizeBasedWindowFunction] => true case _ => false } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/InferWindowGroupLimitSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/InferWindowGroupLimitSuite.scala index 5ffb45084184c..cfd2146d868c2 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/InferWindowGroupLimitSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/InferWindowGroupLimitSuite.scala @@ -20,7 +20,7 @@ package org.apache.spark.sql.catalyst.optimizer import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.catalyst.dsl.plans._ -import org.apache.spark.sql.catalyst.expressions.{CurrentRow, DenseRank, Literal, NthValue, NTile, Rank, RowFrame, RowNumber, SpecifiedWindowFrame, UnboundedPreceding} +import org.apache.spark.sql.catalyst.expressions.{CurrentRow, DenseRank, Literal, NthValue, NTile, PercentRank, Rank, RowFrame, RowNumber, SpecifiedWindowFrame, UnboundedPreceding} import org.apache.spark.sql.catalyst.plans.PlanTest import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan} import org.apache.spark.sql.catalyst.rules.RuleExecutor @@ -338,4 +338,20 @@ class InferWindowGroupLimitSuite extends PlanTest { WithoutOptimize.execute(correctAnswer1.analyze)) } } + + test("SPARK-46941: Can't Insert window group limit node for top-k computation if contains " + + "SizeBasedWindowFunction") { + val originalQuery = + testRelation + .select(a, b, c, + windowExpr(Rank(c :: Nil), + windowSpec(a :: Nil, c.desc :: Nil, windowFrame)).as("rank"), + windowExpr(PercentRank(c :: Nil), + windowSpec(a :: Nil, c.desc :: Nil, windowFrame)).as("percent_rank")) + .where(Symbol("rank") < 2) + + comparePlans( + Optimize.execute(originalQuery.analyze), + WithoutOptimize.execute(originalQuery.analyze)) + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameWindowFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameWindowFunctionsSuite.scala index 47a311c71d55d..a1d5d57933864 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameWindowFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameWindowFunctionsSuite.scala @@ -1637,4 +1637,31 @@ class DataFrameWindowFunctionsSuite extends QueryTest } } } + + test("SPARK-46941: Can't insert window group limit node for top-k computation if contains " + + "SizeBasedWindowFunction") { + val df = Seq( + (1, "Dave", 1, 2020), + (2, "Mark", 2, 2020), + (3, "Amy", 3, 2020), + (4, "Dave", 1, 2021), + (5, "Mark", 2, 2021), + (6, "Amy", 3, 2021), + (7, "John", 4, 2021)).toDF("id", "name", "score", "year") + + val window = Window.partitionBy($"year").orderBy($"score".desc) + + Seq(-1, 100).foreach { threshold => + withSQLConf(SQLConf.WINDOW_GROUP_LIMIT_THRESHOLD.key -> threshold.toString) { + val df2 = df + .withColumn("rank", rank().over(window)) + .withColumn("percent_rank", percent_rank().over(window)) + .sort($"year") + checkAnswer(df2.filter("rank=2"), Seq( + Row(2, "Mark", 2, 
2020, 2, 0.5), + Row(6, "Amy", 3, 2021, 2, 0.3333333333333333) + )) + } + } + } } From 8d85c5a731859f7b9bf91ca66e52ab48362845c5 Mon Sep 17 00:00:00 2001 From: Maxime Xu Date: Wed, 16 Jul 2025 11:19:25 -0500 Subject: [PATCH 36/86] [SPARK-52776][CORE][3.5] Do not split the comm field in ProcfsMetricsGetter ### What changes were proposed in this pull request? This is a backport of #51457. We are fixing an issue in `ProcfsMetricsGetter` when parsing the `/proc//stat` file. The current implementation will split the comm field by spaces if it contains them, thereby causing subsequent numbers to be shifted. The comm field, and only the comm field, is in parentheses so we can resolve this issue by ignoring everything between the first open parenthesis and last closing parenthesis when splitting the stat file. ### Why are the changes needed? These changes are needed to prevent a comm field with spaces from causing incorrect calculations for vmem/rssmem metrics. Please see [JIRA](https://issues.apache.org/jira/projects/SPARK/issues/SPARK-52776) for details. ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? Added a unit test to test for irregular characters in the comm field ### Was this patch authored or co-authored using generative AI tooling? No ### Original PR Info Closes #51457 from max2718281/procfs. Authored-by: Maxime Xu (cherry picked from commit cf097a5ab7fcffcb64a765db9b8913304340506f) Closes #51481 from max2718281/procfs-3.5. Authored-by: Maxime Xu Signed-off-by: Mridul Muralidharan gmail.com> --- .../apache/spark/executor/ProcfsMetricsGetter.scala | 9 ++++++++- core/src/test/resources/ProcfsMetrics/487713/stat | 1 + .../spark/executor/ProcfsMetricsGetterSuite.scala | 12 ++++++++++++ 3 files changed, 21 insertions(+), 1 deletion(-) create mode 100644 core/src/test/resources/ProcfsMetrics/487713/stat diff --git a/core/src/main/scala/org/apache/spark/executor/ProcfsMetricsGetter.scala b/core/src/main/scala/org/apache/spark/executor/ProcfsMetricsGetter.scala index 5448d7da6d6c1..bff8007bccf7d 100644 --- a/core/src/main/scala/org/apache/spark/executor/ProcfsMetricsGetter.scala +++ b/core/src/main/scala/org/apache/spark/executor/ProcfsMetricsGetter.scala @@ -176,7 +176,14 @@ private[spark] class ProcfsMetricsGetter(procfsDir: String = "/proc/") extends L } Utils.tryWithResource(openReader) { in => val procInfo = in.readLine - val procInfoSplit = procInfo.split(" ") + // The comm field, which is inside parentheses, could contain spaces. We should not split + // by those spaces as doing so could cause the numbers after it to be shifted. 
+ val commStartIndex = procInfo.indexOf('(') + val commEndIndex = procInfo.lastIndexOf(')') + 1 + val pidArray = Array(procInfo.substring(0, commStartIndex).trim) + val commArray = Array(procInfo.substring(commStartIndex, commEndIndex)) + val splitAfterComm = procInfo.substring(commEndIndex).trim.split(" ") + val procInfoSplit = pidArray ++ commArray ++ splitAfterComm val vmem = procInfoSplit(22).toLong val rssMem = procInfoSplit(23).toLong * pageSize if (procInfoSplit(1).toLowerCase(Locale.US).contains("java")) { diff --git a/core/src/test/resources/ProcfsMetrics/487713/stat b/core/src/test/resources/ProcfsMetrics/487713/stat new file mode 100644 index 0000000000000..63640b58155b5 --- /dev/null +++ b/core/src/test/resources/ProcfsMetrics/487713/stat @@ -0,0 +1 @@ +487713 ((Executor) task l)) D 474416 474398 474398 0 -1 4194368 5 0 0 0 0 0 0 0 25 5 1 0 1542745216 7469137920 120815 18446744073709551615 104424108929024 104424108932808 140734257079632 0 0 0 4 3 553671884 1 0 0 17 58 0 0 0 0 0 104424108940536 104424108941336 104424532111360 140734257083781 140734257085131 140734257085131 140734257102797 0 \ No newline at end of file diff --git a/core/src/test/scala/org/apache/spark/executor/ProcfsMetricsGetterSuite.scala b/core/src/test/scala/org/apache/spark/executor/ProcfsMetricsGetterSuite.scala index d583afdf07c49..bcafe153be0d5 100644 --- a/core/src/test/scala/org/apache/spark/executor/ProcfsMetricsGetterSuite.scala +++ b/core/src/test/scala/org/apache/spark/executor/ProcfsMetricsGetterSuite.scala @@ -62,4 +62,16 @@ class ProcfsMetricsGetterSuite extends SparkFunSuite { assert(r.pythonVmemTotal == 0) assert(r.pythonRSSTotal == 0) } + + test("SPARK-52776: Whitespace and parentheses in the comm field") { + val p = new ProcfsMetricsGetter(getTestResourcePath("ProcfsMetrics")) + var r = ProcfsMetrics(0, 0, 0, 0, 0, 0) + r = p.addProcfsMetricsFromOneProcess(r, 487713) + assert(r.jvmVmemTotal == 0) + assert(r.jvmRSSTotal == 0) + assert(r.pythonVmemTotal == 0) + assert(r.pythonRSSTotal == 0) + assert(r.otherVmemTotal == 7469137920L) + assert(r.otherRSSTotal == 494858240) + } } From eef9576e90517acc65bba12b6accbc723d48389f Mon Sep 17 00:00:00 2001 From: Hyukjin Kwon Date: Thu, 17 Jul 2025 16:39:29 +0900 Subject: [PATCH 37/86] Revert "Preparing development version 3.5.8-SNAPSHOT" This reverts commit c67a96c1aceda7271464f850794663f7dc6279d0. 
--- R/pkg/DESCRIPTION | 2 +- assembly/pom.xml | 2 +- common/kvstore/pom.xml | 2 +- common/network-common/pom.xml | 2 +- common/network-shuffle/pom.xml | 2 +- common/network-yarn/pom.xml | 2 +- common/sketch/pom.xml | 2 +- common/tags/pom.xml | 2 +- common/unsafe/pom.xml | 2 +- common/utils/pom.xml | 2 +- connector/avro/pom.xml | 2 +- connector/connect/client/jvm/pom.xml | 2 +- connector/connect/common/pom.xml | 2 +- connector/connect/server/pom.xml | 2 +- connector/docker-integration-tests/pom.xml | 2 +- connector/kafka-0-10-assembly/pom.xml | 2 +- connector/kafka-0-10-sql/pom.xml | 2 +- connector/kafka-0-10-token-provider/pom.xml | 2 +- connector/kafka-0-10/pom.xml | 2 +- connector/kinesis-asl-assembly/pom.xml | 2 +- connector/kinesis-asl/pom.xml | 2 +- connector/protobuf/pom.xml | 2 +- connector/spark-ganglia-lgpl/pom.xml | 2 +- core/pom.xml | 2 +- docs/_config.yml | 6 +++--- examples/pom.xml | 2 +- graphx/pom.xml | 2 +- hadoop-cloud/pom.xml | 2 +- launcher/pom.xml | 2 +- mllib-local/pom.xml | 2 +- mllib/pom.xml | 2 +- pom.xml | 2 +- python/pyspark/version.py | 2 +- repl/pom.xml | 2 +- resource-managers/kubernetes/core/pom.xml | 2 +- resource-managers/kubernetes/integration-tests/pom.xml | 2 +- resource-managers/mesos/pom.xml | 2 +- resource-managers/yarn/pom.xml | 2 +- sql/api/pom.xml | 2 +- sql/catalyst/pom.xml | 2 +- sql/core/pom.xml | 2 +- sql/hive-thriftserver/pom.xml | 2 +- sql/hive/pom.xml | 2 +- streaming/pom.xml | 2 +- tools/pom.xml | 2 +- 45 files changed, 47 insertions(+), 47 deletions(-) diff --git a/R/pkg/DESCRIPTION b/R/pkg/DESCRIPTION index d8b7f76b262f3..53f40d803e1a0 100644 --- a/R/pkg/DESCRIPTION +++ b/R/pkg/DESCRIPTION @@ -1,6 +1,6 @@ Package: SparkR Type: Package -Version: 3.5.8 +Version: 3.5.7 Title: R Front End for 'Apache Spark' Description: Provides an R Front end for 'Apache Spark' . 
Authors@R: diff --git a/assembly/pom.xml b/assembly/pom.xml index 7c4c5d84792bc..ffa6a0e2b06af 100644 --- a/assembly/pom.xml +++ b/assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8-SNAPSHOT + 3.5.7 ../pom.xml diff --git a/common/kvstore/pom.xml b/common/kvstore/pom.xml index a543c74ca9003..e1edb8b6b411c 100644 --- a/common/kvstore/pom.xml +++ b/common/kvstore/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8-SNAPSHOT + 3.5.7 ../../pom.xml diff --git a/common/network-common/pom.xml b/common/network-common/pom.xml index 60d6066b3cc0e..9b4b2604a86f7 100644 --- a/common/network-common/pom.xml +++ b/common/network-common/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8-SNAPSHOT + 3.5.7 ../../pom.xml diff --git a/common/network-shuffle/pom.xml b/common/network-shuffle/pom.xml index a8750506edcb2..6b644c01c488c 100644 --- a/common/network-shuffle/pom.xml +++ b/common/network-shuffle/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8-SNAPSHOT + 3.5.7 ../../pom.xml diff --git a/common/network-yarn/pom.xml b/common/network-yarn/pom.xml index ee1492783cd9b..ab366a18bfc08 100644 --- a/common/network-yarn/pom.xml +++ b/common/network-yarn/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8-SNAPSHOT + 3.5.7 ../../pom.xml diff --git a/common/sketch/pom.xml b/common/sketch/pom.xml index 258b786ed308b..42dac42d53040 100644 --- a/common/sketch/pom.xml +++ b/common/sketch/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8-SNAPSHOT + 3.5.7 ../../pom.xml diff --git a/common/tags/pom.xml b/common/tags/pom.xml index b237b1e272ecc..770c5f5676f7a 100644 --- a/common/tags/pom.xml +++ b/common/tags/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8-SNAPSHOT + 3.5.7 ../../pom.xml diff --git a/common/unsafe/pom.xml b/common/unsafe/pom.xml index 4f2ffa892d523..bc78e46ce160b 100644 --- a/common/unsafe/pom.xml +++ b/common/unsafe/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8-SNAPSHOT + 3.5.7 ../../pom.xml diff --git a/common/utils/pom.xml b/common/utils/pom.xml index 7c445789d8fac..2d741b56f2aa8 100644 --- a/common/utils/pom.xml +++ b/common/utils/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8-SNAPSHOT + 3.5.7 ../../pom.xml diff --git a/connector/avro/pom.xml b/connector/avro/pom.xml index a6b310bdefa72..28fc372bb6f52 100644 --- a/connector/avro/pom.xml +++ b/connector/avro/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8-SNAPSHOT + 3.5.7 ../../pom.xml diff --git a/connector/connect/client/jvm/pom.xml b/connector/connect/client/jvm/pom.xml index 1f309e2db75ab..60fd84623e61d 100644 --- a/connector/connect/client/jvm/pom.xml +++ b/connector/connect/client/jvm/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8-SNAPSHOT + 3.5.7 ../../../../pom.xml diff --git a/connector/connect/common/pom.xml b/connector/connect/common/pom.xml index f676a5eccbad5..34f0bcaa9fe6f 100644 --- a/connector/connect/common/pom.xml +++ b/connector/connect/common/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8-SNAPSHOT + 3.5.7 ../../../pom.xml diff --git a/connector/connect/server/pom.xml b/connector/connect/server/pom.xml index 97ff8140a3632..259bc59514eb0 100644 --- a/connector/connect/server/pom.xml +++ b/connector/connect/server/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8-SNAPSHOT + 3.5.7 ../../../pom.xml diff --git a/connector/docker-integration-tests/pom.xml 
b/connector/docker-integration-tests/pom.xml index 91b89665d4700..8173d62fdb9c6 100644 --- a/connector/docker-integration-tests/pom.xml +++ b/connector/docker-integration-tests/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8-SNAPSHOT + 3.5.7 ../../pom.xml diff --git a/connector/kafka-0-10-assembly/pom.xml b/connector/kafka-0-10-assembly/pom.xml index 73f5c23a9f5c7..b7ae9c15f9990 100644 --- a/connector/kafka-0-10-assembly/pom.xml +++ b/connector/kafka-0-10-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8-SNAPSHOT + 3.5.7 ../../pom.xml diff --git a/connector/kafka-0-10-sql/pom.xml b/connector/kafka-0-10-sql/pom.xml index 89ce0a2ff5cef..c79bb6a48874c 100644 --- a/connector/kafka-0-10-sql/pom.xml +++ b/connector/kafka-0-10-sql/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8-SNAPSHOT + 3.5.7 ../../pom.xml diff --git a/connector/kafka-0-10-token-provider/pom.xml b/connector/kafka-0-10-token-provider/pom.xml index a139af88905ac..e7c0947c2bfea 100644 --- a/connector/kafka-0-10-token-provider/pom.xml +++ b/connector/kafka-0-10-token-provider/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8-SNAPSHOT + 3.5.7 ../../pom.xml diff --git a/connector/kafka-0-10/pom.xml b/connector/kafka-0-10/pom.xml index 3d804b66248dd..48356061e243f 100644 --- a/connector/kafka-0-10/pom.xml +++ b/connector/kafka-0-10/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8-SNAPSHOT + 3.5.7 ../../pom.xml diff --git a/connector/kinesis-asl-assembly/pom.xml b/connector/kinesis-asl-assembly/pom.xml index e6c3f0219f3e1..e219ecc1c8050 100644 --- a/connector/kinesis-asl-assembly/pom.xml +++ b/connector/kinesis-asl-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8-SNAPSHOT + 3.5.7 ../../pom.xml diff --git a/connector/kinesis-asl/pom.xml b/connector/kinesis-asl/pom.xml index 1bfc3384fa35c..38b87af5dcc37 100644 --- a/connector/kinesis-asl/pom.xml +++ b/connector/kinesis-asl/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8-SNAPSHOT + 3.5.7 ../../pom.xml diff --git a/connector/protobuf/pom.xml b/connector/protobuf/pom.xml index de2e490c00341..775a83a711818 100644 --- a/connector/protobuf/pom.xml +++ b/connector/protobuf/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8-SNAPSHOT + 3.5.7 ../../pom.xml diff --git a/connector/spark-ganglia-lgpl/pom.xml b/connector/spark-ganglia-lgpl/pom.xml index 2fd4109af2d4f..cfbf7a288fb8e 100644 --- a/connector/spark-ganglia-lgpl/pom.xml +++ b/connector/spark-ganglia-lgpl/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8-SNAPSHOT + 3.5.7 ../../pom.xml diff --git a/core/pom.xml b/core/pom.xml index f0771a62db3a3..d1d88db678b74 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8-SNAPSHOT + 3.5.7 ../pom.xml diff --git a/docs/_config.yml b/docs/_config.yml index d19efdb99ba06..8e657ed3fd9ce 100644 --- a/docs/_config.yml +++ b/docs/_config.yml @@ -19,8 +19,8 @@ include: # These allow the documentation to be updated with newer releases # of Spark, Scala, and Mesos. 
-SPARK_VERSION: 3.5.8-SNAPSHOT -SPARK_VERSION_SHORT: 3.5.8 +SPARK_VERSION: 3.5.7 +SPARK_VERSION_SHORT: 3.5.7 SCALA_BINARY_VERSION: "2.12" SCALA_VERSION: "2.12.18" MESOS_VERSION: 1.0.0 @@ -40,7 +40,7 @@ DOCSEARCH_SCRIPT: | inputSelector: '#docsearch-input', enhancedSearchInput: true, algoliaOptions: { - 'facetFilters': ["version:3.5.8"] + 'facetFilters': ["version:3.5.7"] }, debug: false // Set debug to true if you want to inspect the dropdown }); diff --git a/examples/pom.xml b/examples/pom.xml index 00bc0d7bca367..0ac00c50a4fac 100644 --- a/examples/pom.xml +++ b/examples/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8-SNAPSHOT + 3.5.7 ../pom.xml diff --git a/graphx/pom.xml b/graphx/pom.xml index 9f9ff9b0d9157..defcfefeb2a87 100644 --- a/graphx/pom.xml +++ b/graphx/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8-SNAPSHOT + 3.5.7 ../pom.xml diff --git a/hadoop-cloud/pom.xml b/hadoop-cloud/pom.xml index fc2d9b5799adb..5f9e67b1ec645 100644 --- a/hadoop-cloud/pom.xml +++ b/hadoop-cloud/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8-SNAPSHOT + 3.5.7 ../pom.xml diff --git a/launcher/pom.xml b/launcher/pom.xml index 5a56efc2d168f..d837d390e759f 100644 --- a/launcher/pom.xml +++ b/launcher/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8-SNAPSHOT + 3.5.7 ../pom.xml diff --git a/mllib-local/pom.xml b/mllib-local/pom.xml index 0f1e9fa843757..e4566ce769175 100644 --- a/mllib-local/pom.xml +++ b/mllib-local/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8-SNAPSHOT + 3.5.7 ../pom.xml diff --git a/mllib/pom.xml b/mllib/pom.xml index adfe9b29141fa..8e180556cdc0d 100644 --- a/mllib/pom.xml +++ b/mllib/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8-SNAPSHOT + 3.5.7 ../pom.xml diff --git a/pom.xml b/pom.xml index 3a4530374bfee..ba5217718c0e7 100644 --- a/pom.xml +++ b/pom.xml @@ -26,7 +26,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8-SNAPSHOT + 3.5.7 pom Spark Project Parent POM https://spark.apache.org/ diff --git a/python/pyspark/version.py b/python/pyspark/version.py index e67736d7e0548..b5321d87e9135 100644 --- a/python/pyspark/version.py +++ b/python/pyspark/version.py @@ -16,4 +16,4 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-__version__: str = "3.5.8.dev0" +__version__: str = "3.5.7" diff --git a/repl/pom.xml b/repl/pom.xml index 8f3ae8b4f2fc7..d273ca4730819 100644 --- a/repl/pom.xml +++ b/repl/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8-SNAPSHOT + 3.5.7 ../pom.xml diff --git a/resource-managers/kubernetes/core/pom.xml b/resource-managers/kubernetes/core/pom.xml index f1feb2a61325f..7f58452f2b6d9 100644 --- a/resource-managers/kubernetes/core/pom.xml +++ b/resource-managers/kubernetes/core/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8-SNAPSHOT + 3.5.7 ../../../pom.xml diff --git a/resource-managers/kubernetes/integration-tests/pom.xml b/resource-managers/kubernetes/integration-tests/pom.xml index 115c6ae85ab96..58efb0bd276ea 100644 --- a/resource-managers/kubernetes/integration-tests/pom.xml +++ b/resource-managers/kubernetes/integration-tests/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8-SNAPSHOT + 3.5.7 ../../../pom.xml diff --git a/resource-managers/mesos/pom.xml b/resource-managers/mesos/pom.xml index e57c69d1040f3..aecbacda683aa 100644 --- a/resource-managers/mesos/pom.xml +++ b/resource-managers/mesos/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8-SNAPSHOT + 3.5.7 ../../pom.xml diff --git a/resource-managers/yarn/pom.xml b/resource-managers/yarn/pom.xml index 6e45fb3113ece..81ff619113457 100644 --- a/resource-managers/yarn/pom.xml +++ b/resource-managers/yarn/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8-SNAPSHOT + 3.5.7 ../../pom.xml diff --git a/sql/api/pom.xml b/sql/api/pom.xml index 49dee295ff196..94880617c7e3a 100644 --- a/sql/api/pom.xml +++ b/sql/api/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8-SNAPSHOT + 3.5.7 ../../pom.xml diff --git a/sql/catalyst/pom.xml b/sql/catalyst/pom.xml index 7317a3f7ab94a..d1596e9129796 100644 --- a/sql/catalyst/pom.xml +++ b/sql/catalyst/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8-SNAPSHOT + 3.5.7 ../../pom.xml diff --git a/sql/core/pom.xml b/sql/core/pom.xml index def8f5ddf98fb..e3d324c8edba1 100644 --- a/sql/core/pom.xml +++ b/sql/core/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8-SNAPSHOT + 3.5.7 ../../pom.xml diff --git a/sql/hive-thriftserver/pom.xml b/sql/hive-thriftserver/pom.xml index 9cf2c20f6761a..c4eab9bda22bb 100644 --- a/sql/hive-thriftserver/pom.xml +++ b/sql/hive-thriftserver/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8-SNAPSHOT + 3.5.7 ../../pom.xml diff --git a/sql/hive/pom.xml b/sql/hive/pom.xml index 3d85c41481dfe..1efc51119ead9 100644 --- a/sql/hive/pom.xml +++ b/sql/hive/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8-SNAPSHOT + 3.5.7 ../../pom.xml diff --git a/streaming/pom.xml b/streaming/pom.xml index a8e82cb10d377..18e33b99cb223 100644 --- a/streaming/pom.xml +++ b/streaming/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8-SNAPSHOT + 3.5.7 ../pom.xml diff --git a/tools/pom.xml b/tools/pom.xml index b864ede1149ec..429f84acae4b3 100644 --- a/tools/pom.xml +++ b/tools/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.12 - 3.5.8-SNAPSHOT + 3.5.7 ../pom.xml From baa514fed3a8f717872580a52238fd18a110d65e Mon Sep 17 00:00:00 2001 From: Hyukjin Kwon Date: Thu, 17 Jul 2025 16:39:36 +0900 Subject: [PATCH 38/86] Revert "Preparing Spark release v3.5.7-rc1" This reverts commit 0355e8c9110c5449780ea7e1299dbcc14feb1e25. 
--- assembly/pom.xml | 2 +- common/kvstore/pom.xml | 2 +- common/network-common/pom.xml | 2 +- common/network-shuffle/pom.xml | 2 +- common/network-yarn/pom.xml | 2 +- common/sketch/pom.xml | 2 +- common/tags/pom.xml | 2 +- common/unsafe/pom.xml | 2 +- common/utils/pom.xml | 2 +- connector/avro/pom.xml | 2 +- connector/connect/client/jvm/pom.xml | 2 +- connector/connect/common/pom.xml | 2 +- connector/connect/server/pom.xml | 2 +- connector/docker-integration-tests/pom.xml | 2 +- connector/kafka-0-10-assembly/pom.xml | 2 +- connector/kafka-0-10-sql/pom.xml | 2 +- connector/kafka-0-10-token-provider/pom.xml | 2 +- connector/kafka-0-10/pom.xml | 2 +- connector/kinesis-asl-assembly/pom.xml | 2 +- connector/kinesis-asl/pom.xml | 2 +- connector/protobuf/pom.xml | 2 +- connector/spark-ganglia-lgpl/pom.xml | 2 +- core/pom.xml | 2 +- docs/_config.yml | 2 +- examples/pom.xml | 2 +- graphx/pom.xml | 2 +- hadoop-cloud/pom.xml | 2 +- launcher/pom.xml | 2 +- mllib-local/pom.xml | 2 +- mllib/pom.xml | 2 +- pom.xml | 2 +- python/pyspark/version.py | 2 +- repl/pom.xml | 2 +- resource-managers/kubernetes/core/pom.xml | 2 +- resource-managers/kubernetes/integration-tests/pom.xml | 2 +- resource-managers/mesos/pom.xml | 2 +- resource-managers/yarn/pom.xml | 2 +- sql/api/pom.xml | 2 +- sql/catalyst/pom.xml | 2 +- sql/core/pom.xml | 2 +- sql/hive-thriftserver/pom.xml | 2 +- sql/hive/pom.xml | 2 +- streaming/pom.xml | 2 +- tools/pom.xml | 2 +- 44 files changed, 44 insertions(+), 44 deletions(-) diff --git a/assembly/pom.xml b/assembly/pom.xml index ffa6a0e2b06af..058422820dd4e 100644 --- a/assembly/pom.xml +++ b/assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.12 - 3.5.7 + 3.5.7-SNAPSHOT ../pom.xml diff --git a/common/kvstore/pom.xml b/common/kvstore/pom.xml index e1edb8b6b411c..7ce97474ac11e 100644 --- a/common/kvstore/pom.xml +++ b/common/kvstore/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.12 - 3.5.7 + 3.5.7-SNAPSHOT ../../pom.xml diff --git a/common/network-common/pom.xml b/common/network-common/pom.xml index 9b4b2604a86f7..1569686c98d75 100644 --- a/common/network-common/pom.xml +++ b/common/network-common/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.12 - 3.5.7 + 3.5.7-SNAPSHOT ../../pom.xml diff --git a/common/network-shuffle/pom.xml b/common/network-shuffle/pom.xml index 6b644c01c488c..997d7bc46eb71 100644 --- a/common/network-shuffle/pom.xml +++ b/common/network-shuffle/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.12 - 3.5.7 + 3.5.7-SNAPSHOT ../../pom.xml diff --git a/common/network-yarn/pom.xml b/common/network-yarn/pom.xml index ab366a18bfc08..7935f2f24a32a 100644 --- a/common/network-yarn/pom.xml +++ b/common/network-yarn/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.12 - 3.5.7 + 3.5.7-SNAPSHOT ../../pom.xml diff --git a/common/sketch/pom.xml b/common/sketch/pom.xml index 42dac42d53040..5e626fde4b76c 100644 --- a/common/sketch/pom.xml +++ b/common/sketch/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.12 - 3.5.7 + 3.5.7-SNAPSHOT ../../pom.xml diff --git a/common/tags/pom.xml b/common/tags/pom.xml index 770c5f5676f7a..56e2817a495cc 100644 --- a/common/tags/pom.xml +++ b/common/tags/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.12 - 3.5.7 + 3.5.7-SNAPSHOT ../../pom.xml diff --git a/common/unsafe/pom.xml b/common/unsafe/pom.xml index bc78e46ce160b..3fd75f673acc9 100644 --- a/common/unsafe/pom.xml +++ b/common/unsafe/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.12 - 3.5.7 + 3.5.7-SNAPSHOT ../../pom.xml 
diff --git a/common/utils/pom.xml b/common/utils/pom.xml index 2d741b56f2aa8..f4c4eea09c6c8 100644 --- a/common/utils/pom.xml +++ b/common/utils/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.12 - 3.5.7 + 3.5.7-SNAPSHOT ../../pom.xml diff --git a/connector/avro/pom.xml b/connector/avro/pom.xml index 28fc372bb6f52..601ffa552cf94 100644 --- a/connector/avro/pom.xml +++ b/connector/avro/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.12 - 3.5.7 + 3.5.7-SNAPSHOT ../../pom.xml diff --git a/connector/connect/client/jvm/pom.xml b/connector/connect/client/jvm/pom.xml index 60fd84623e61d..01945546a976e 100644 --- a/connector/connect/client/jvm/pom.xml +++ b/connector/connect/client/jvm/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.12 - 3.5.7 + 3.5.7-SNAPSHOT ../../../../pom.xml diff --git a/connector/connect/common/pom.xml b/connector/connect/common/pom.xml index 34f0bcaa9fe6f..68c14857dc26b 100644 --- a/connector/connect/common/pom.xml +++ b/connector/connect/common/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.12 - 3.5.7 + 3.5.7-SNAPSHOT ../../../pom.xml diff --git a/connector/connect/server/pom.xml b/connector/connect/server/pom.xml index 259bc59514eb0..c5961b69bbdbf 100644 --- a/connector/connect/server/pom.xml +++ b/connector/connect/server/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.12 - 3.5.7 + 3.5.7-SNAPSHOT ../../../pom.xml diff --git a/connector/docker-integration-tests/pom.xml b/connector/docker-integration-tests/pom.xml index 8173d62fdb9c6..62f52dfae3b43 100644 --- a/connector/docker-integration-tests/pom.xml +++ b/connector/docker-integration-tests/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.12 - 3.5.7 + 3.5.7-SNAPSHOT ../../pom.xml diff --git a/connector/kafka-0-10-assembly/pom.xml b/connector/kafka-0-10-assembly/pom.xml index b7ae9c15f9990..5f9a659df45e8 100644 --- a/connector/kafka-0-10-assembly/pom.xml +++ b/connector/kafka-0-10-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.12 - 3.5.7 + 3.5.7-SNAPSHOT ../../pom.xml diff --git a/connector/kafka-0-10-sql/pom.xml b/connector/kafka-0-10-sql/pom.xml index c79bb6a48874c..ea72114fa3c68 100644 --- a/connector/kafka-0-10-sql/pom.xml +++ b/connector/kafka-0-10-sql/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.12 - 3.5.7 + 3.5.7-SNAPSHOT ../../pom.xml diff --git a/connector/kafka-0-10-token-provider/pom.xml b/connector/kafka-0-10-token-provider/pom.xml index e7c0947c2bfea..a6db42170657a 100644 --- a/connector/kafka-0-10-token-provider/pom.xml +++ b/connector/kafka-0-10-token-provider/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.12 - 3.5.7 + 3.5.7-SNAPSHOT ../../pom.xml diff --git a/connector/kafka-0-10/pom.xml b/connector/kafka-0-10/pom.xml index 48356061e243f..db1572705304f 100644 --- a/connector/kafka-0-10/pom.xml +++ b/connector/kafka-0-10/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.12 - 3.5.7 + 3.5.7-SNAPSHOT ../../pom.xml diff --git a/connector/kinesis-asl-assembly/pom.xml b/connector/kinesis-asl-assembly/pom.xml index e219ecc1c8050..5e9b28deb3a62 100644 --- a/connector/kinesis-asl-assembly/pom.xml +++ b/connector/kinesis-asl-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.12 - 3.5.7 + 3.5.7-SNAPSHOT ../../pom.xml diff --git a/connector/kinesis-asl/pom.xml b/connector/kinesis-asl/pom.xml index 38b87af5dcc37..bd50098da9224 100644 --- a/connector/kinesis-asl/pom.xml +++ b/connector/kinesis-asl/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.12 - 3.5.7 + 3.5.7-SNAPSHOT 
../../pom.xml diff --git a/connector/protobuf/pom.xml b/connector/protobuf/pom.xml index 775a83a711818..856873fd3389f 100644 --- a/connector/protobuf/pom.xml +++ b/connector/protobuf/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.12 - 3.5.7 + 3.5.7-SNAPSHOT ../../pom.xml diff --git a/connector/spark-ganglia-lgpl/pom.xml b/connector/spark-ganglia-lgpl/pom.xml index cfbf7a288fb8e..06f653738c51c 100644 --- a/connector/spark-ganglia-lgpl/pom.xml +++ b/connector/spark-ganglia-lgpl/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.12 - 3.5.7 + 3.5.7-SNAPSHOT ../../pom.xml diff --git a/core/pom.xml b/core/pom.xml index d1d88db678b74..9c5e7428d569c 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.12 - 3.5.7 + 3.5.7-SNAPSHOT ../pom.xml diff --git a/docs/_config.yml b/docs/_config.yml index 8e657ed3fd9ce..acfa53576a9ee 100644 --- a/docs/_config.yml +++ b/docs/_config.yml @@ -19,7 +19,7 @@ include: # These allow the documentation to be updated with newer releases # of Spark, Scala, and Mesos. -SPARK_VERSION: 3.5.7 +SPARK_VERSION: 3.5.7-SNAPSHOT SPARK_VERSION_SHORT: 3.5.7 SCALA_BINARY_VERSION: "2.12" SCALA_VERSION: "2.12.18" diff --git a/examples/pom.xml b/examples/pom.xml index 0ac00c50a4fac..a9a19b20d12c6 100644 --- a/examples/pom.xml +++ b/examples/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.12 - 3.5.7 + 3.5.7-SNAPSHOT ../pom.xml diff --git a/graphx/pom.xml b/graphx/pom.xml index defcfefeb2a87..f8fd503746a51 100644 --- a/graphx/pom.xml +++ b/graphx/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.12 - 3.5.7 + 3.5.7-SNAPSHOT ../pom.xml diff --git a/hadoop-cloud/pom.xml b/hadoop-cloud/pom.xml index 5f9e67b1ec645..c6ee7a0d076b8 100644 --- a/hadoop-cloud/pom.xml +++ b/hadoop-cloud/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.12 - 3.5.7 + 3.5.7-SNAPSHOT ../pom.xml diff --git a/launcher/pom.xml b/launcher/pom.xml index d837d390e759f..80e19445c74cd 100644 --- a/launcher/pom.xml +++ b/launcher/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.12 - 3.5.7 + 3.5.7-SNAPSHOT ../pom.xml diff --git a/mllib-local/pom.xml b/mllib-local/pom.xml index e4566ce769175..cf7bba8c1bc90 100644 --- a/mllib-local/pom.xml +++ b/mllib-local/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.12 - 3.5.7 + 3.5.7-SNAPSHOT ../pom.xml diff --git a/mllib/pom.xml b/mllib/pom.xml index 8e180556cdc0d..58ce9d07f99af 100644 --- a/mllib/pom.xml +++ b/mllib/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.12 - 3.5.7 + 3.5.7-SNAPSHOT ../pom.xml diff --git a/pom.xml b/pom.xml index ba5217718c0e7..60d5c12b8db0c 100644 --- a/pom.xml +++ b/pom.xml @@ -26,7 +26,7 @@ org.apache.spark spark-parent_2.12 - 3.5.7 + 3.5.7-SNAPSHOT pom Spark Project Parent POM https://spark.apache.org/ diff --git a/python/pyspark/version.py b/python/pyspark/version.py index b5321d87e9135..d3152f58da705 100644 --- a/python/pyspark/version.py +++ b/python/pyspark/version.py @@ -16,4 +16,4 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-__version__: str = "3.5.7" +__version__: str = "3.5.7.dev0" diff --git a/repl/pom.xml b/repl/pom.xml index d273ca4730819..3cdc95fdaff9b 100644 --- a/repl/pom.xml +++ b/repl/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.12 - 3.5.7 + 3.5.7-SNAPSHOT ../pom.xml diff --git a/resource-managers/kubernetes/core/pom.xml b/resource-managers/kubernetes/core/pom.xml index 7f58452f2b6d9..9473a1aac3123 100644 --- a/resource-managers/kubernetes/core/pom.xml +++ b/resource-managers/kubernetes/core/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.12 - 3.5.7 + 3.5.7-SNAPSHOT ../../../pom.xml diff --git a/resource-managers/kubernetes/integration-tests/pom.xml b/resource-managers/kubernetes/integration-tests/pom.xml index 58efb0bd276ea..2d43f57af8080 100644 --- a/resource-managers/kubernetes/integration-tests/pom.xml +++ b/resource-managers/kubernetes/integration-tests/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.12 - 3.5.7 + 3.5.7-SNAPSHOT ../../../pom.xml diff --git a/resource-managers/mesos/pom.xml b/resource-managers/mesos/pom.xml index aecbacda683aa..1ff3e3b92e511 100644 --- a/resource-managers/mesos/pom.xml +++ b/resource-managers/mesos/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.12 - 3.5.7 + 3.5.7-SNAPSHOT ../../pom.xml diff --git a/resource-managers/yarn/pom.xml b/resource-managers/yarn/pom.xml index 81ff619113457..c03a7450ad786 100644 --- a/resource-managers/yarn/pom.xml +++ b/resource-managers/yarn/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.12 - 3.5.7 + 3.5.7-SNAPSHOT ../../pom.xml diff --git a/sql/api/pom.xml b/sql/api/pom.xml index 94880617c7e3a..d11d496811b20 100644 --- a/sql/api/pom.xml +++ b/sql/api/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.12 - 3.5.7 + 3.5.7-SNAPSHOT ../../pom.xml diff --git a/sql/catalyst/pom.xml b/sql/catalyst/pom.xml index d1596e9129796..981ac6fffc1c3 100644 --- a/sql/catalyst/pom.xml +++ b/sql/catalyst/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.12 - 3.5.7 + 3.5.7-SNAPSHOT ../../pom.xml diff --git a/sql/core/pom.xml b/sql/core/pom.xml index e3d324c8edba1..60110c034e296 100644 --- a/sql/core/pom.xml +++ b/sql/core/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.12 - 3.5.7 + 3.5.7-SNAPSHOT ../../pom.xml diff --git a/sql/hive-thriftserver/pom.xml b/sql/hive-thriftserver/pom.xml index c4eab9bda22bb..fd954cf041ebd 100644 --- a/sql/hive-thriftserver/pom.xml +++ b/sql/hive-thriftserver/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.12 - 3.5.7 + 3.5.7-SNAPSHOT ../../pom.xml diff --git a/sql/hive/pom.xml b/sql/hive/pom.xml index 1efc51119ead9..4df8ad16b05b0 100644 --- a/sql/hive/pom.xml +++ b/sql/hive/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.12 - 3.5.7 + 3.5.7-SNAPSHOT ../../pom.xml diff --git a/streaming/pom.xml b/streaming/pom.xml index 18e33b99cb223..d355b773c3a7e 100644 --- a/streaming/pom.xml +++ b/streaming/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.12 - 3.5.7 + 3.5.7-SNAPSHOT ../pom.xml diff --git a/tools/pom.xml b/tools/pom.xml index 429f84acae4b3..8dd903ce0d1e0 100644 --- a/tools/pom.xml +++ b/tools/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.12 - 3.5.7 + 3.5.7-SNAPSHOT ../pom.xml From ebe6ca894709cc6962fd9f14d6414c526d1ff07d Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Fri, 18 Jul 2025 01:01:54 -0700 Subject: [PATCH 39/86] [SPARK-52516][SQL] Don't hold previous iterator reference after advancing to next file in ParquetPartitionReaderFactory ### What changes were proposed in this pull request? 
This patch adds only one task completion listener for closing iterators in `ParquetPartitionReaderFactory`, instead of adding separate one for each file iterator. ### Why are the changes needed? For each file iterator, currently we add one task completion listener used to update closing the iterator when the task is finished. In the listener, the iterator reference is held. So if the file is normally exhausted, the reference cannot be released early. It is a problem especially if the reference are heavy as reported by https://github.com/apache/iceberg/issues/13297. Similar to #51503, we don't need set up a listener for each file iterator. Thus, we can set up just one listener for all iterator. Once we advance to next file, we can update the update target to new iterator. ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? Existing tests. ### Was this patch authored or co-authored using generative AI tooling? No Closes #51528 from viirya/fix_iter_callback. Authored-by: Liang-Chi Hsieh Signed-off-by: Liang-Chi Hsieh (cherry picked from commit 197c9d6051efcc57f984a9497975d5723e1f2dac) Signed-off-by: Liang-Chi Hsieh --- .../ParquetPartitionReaderFactory.scala | 38 ++++++++++++++++++- 1 file changed, 36 insertions(+), 2 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/parquet/ParquetPartitionReaderFactory.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/parquet/ParquetPartitionReaderFactory.scala index 5774df95ac070..fcebdceb849ef 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/parquet/ParquetPartitionReaderFactory.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/parquet/ParquetPartitionReaderFactory.scala @@ -84,6 +84,8 @@ case class ParquetPartitionReaderFactory( private val datetimeRebaseModeInRead = options.datetimeRebaseModeInRead private val int96RebaseModeInRead = options.int96RebaseModeInRead + private val parquetReaderCallback = new ParquetReaderCallback() + private def getFooter(file: PartitionedFile): ParquetMetadata = { val conf = broadcastedConf.value.value if (aggregation.isDefined || enableVectorizedReader) { @@ -304,7 +306,8 @@ case class ParquetPartitionReaderFactory( reader, readDataSchema) val iter = new RecordReaderIterator(readerWithRowIndexes) // SPARK-23457 Register a task completion listener before `initialization`. - taskContext.foreach(_.addTaskCompletionListener[Unit](_ => iter.close())) + parquetReaderCallback.advanceFile(iter) + taskContext.foreach(parquetReaderCallback.initIfNotAlready) readerWithRowIndexes } @@ -332,8 +335,39 @@ case class ParquetPartitionReaderFactory( capacity) val iter = new RecordReaderIterator(vectorizedReader) // SPARK-23457 Register a task completion listener before `initialization`. - taskContext.foreach(_.addTaskCompletionListener[Unit](_ => iter.close())) + parquetReaderCallback.advanceFile(iter) + taskContext.foreach(parquetReaderCallback.initIfNotAlready) logDebug(s"Appending $partitionSchema $partitionValues") vectorizedReader } } + +/** + * A callback class to handle the cleanup of Parquet readers. + * + * This class is used to ensure that the Parquet readers are closed properly when the task + * completes, and it also allows for the initialization of the reader callback only once per task. 
+ */ +private class ParquetReaderCallback extends Serializable { + private var init: Boolean = false + private var iter: RecordReaderIterator[_] = null + + def initIfNotAlready(taskContext: TaskContext): Unit = { + if (!init) { + taskContext.addTaskCompletionListener[Unit](_ => closeCurrent()) + init = true + } + } + + def advanceFile(iter: RecordReaderIterator[_]): Unit = { + closeCurrent() + + this.iter = iter + } + + def closeCurrent(): Unit = { + if (iter != null) { + iter.close() + } + } +} From 98645a2675110d9f4aab3d6dca1fc4e89cdff053 Mon Sep 17 00:00:00 2001 From: Peter Nguyen Date: Wed, 23 Jul 2025 09:14:42 +0900 Subject: [PATCH 40/86] [SPARK-52791][PS] Fix error when inferring a UDT with a null first element I modified the udt condition to check the first non-null element instead of the first element (which might be null). ``` import pyspark.pandas as ps from pyspark.ml.linalg import SparseVector sparse_values = {0: 0.1, 1: 1.1} ps_series = ps.Series([None, SparseVector(1, \{0: 1.2}), SparseVector(1, \{0: 3})]) ``` Error: ``` pyarrow.lib.ArrowInvalid: Could not convert SparseVector(1, {0: 1.2}) with type SparseVector: did not recognize Python value type when inferring an Arrow data type ``` This should work as normal, but it fails because the first element is None Yes, previously it would error, but now it works properly. This is a behavior change from all previous spark versions, and should probably be backported. Added a test No Closes #51475 from petern48/fix_infer_spark_type. Authored-by: Peter Nguyen Signed-off-by: Hyukjin Kwon (cherry picked from commit 5182eb4c6a51989b37f054ef07173cd797611d2b) Signed-off-by: Hyukjin Kwon --- .../tests/data_type_ops/test_udt_ops.py | 20 +++++++++++++++++++ python/pyspark/pandas/typedef/typehints.py | 5 +++-- 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/python/pyspark/pandas/tests/data_type_ops/test_udt_ops.py b/python/pyspark/pandas/tests/data_type_ops/test_udt_ops.py index 45f8cca56ee94..7b264582e044b 100644 --- a/python/pyspark/pandas/tests/data_type_ops/test_udt_ops.py +++ b/python/pyspark/pandas/tests/data_type_ops/test_udt_ops.py @@ -129,6 +129,26 @@ def test_from_to_pandas(self): self.assert_eq(pser, psser._to_pandas()) self.assert_eq(ps.from_pandas(pser), psser) + def test_with_first_null(self): + lst = [None, None, None, SparseVector(1, {0: 0.1})] + pser = pd.Series(lst) + psser = ps.Series(lst) + self.assert_eq(pser, psser._to_pandas()) + self.assert_eq(ps.from_pandas(pser), psser) + + lst2 = [SparseVector(1, {0: 0.1}), None, None, None] + pdf = pd.DataFrame({"a": lst, "b": lst2}) + psdf = ps.DataFrame({"a": lst, "b": lst2}) + self.assert_eq(pdf, psdf._to_pandas()) + self.assert_eq(ps.from_pandas(pdf), psdf) + + def test_with_all_null(self): + lst = [None, None, None, None] + pser = pd.Series(lst, dtype=object) + psser = ps.Series(lst, dtype=object) + self.assert_eq(pser, psser._to_pandas()) + self.assert_eq(ps.from_pandas(pser), psser) + def test_isnull(self): self.assert_eq(self.pser.isnull(), self.psser.isnull()) diff --git a/python/pyspark/pandas/typedef/typehints.py b/python/pyspark/pandas/typedef/typehints.py index 012eabf958eb8..5c7b3e01686ae 100644 --- a/python/pyspark/pandas/typedef/typehints.py +++ b/python/pyspark/pandas/typedef/typehints.py @@ -354,8 +354,9 @@ def infer_pd_series_spark_type( if dtype == np.dtype("object"): if len(pser) == 0 or pser.isnull().all(): return types.NullType() - elif hasattr(pser.iloc[0], "__UDT__"): - return pser.iloc[0].__UDT__ + notnull = pser[pser.notnull()] + if 
hasattr(notnull.iloc[0], "__UDT__"): + return notnull.iloc[0].__UDT__ else: return from_arrow_type(pa.Array.from_pandas(pser).type, prefer_timestamp_ntz) elif isinstance(dtype, CategoricalDtype): From 80c1f5f2caaaaf059178aa80cad4dfefe68bbebe Mon Sep 17 00:00:00 2001 From: Shardul Mahadik Date: Sat, 26 Jul 2025 19:21:03 +0800 Subject: [PATCH 41/86] [SPARK-52737][CORE] Pushdown predicate and number of apps to FsHistoryProvider when listing applications ### What changes were proposed in this pull request? SPARK-38896 modified how applications are listed from the KVStore to close the KVStore iterator eagerly [Link](https://github.com/apache/spark/pull/36237/files#diff-128a6af0d78f4a6180774faedb335d6168dfc4defff58f5aa3021fc1bd767bc0R328). This meant that `FsHistoryProvider.getListing` now eagerly goes through every application in the KVStore before returning an iterator to the caller. In a couple of contexts where `FsHistoryProvider.getListing` is used, this is very detrimental. e.g. [here](https://github.com/apache/spark/blame/589e93a02725939c266f9ee97f96fdc6d3db33cd/core/src/main/scala/org/apache/spark/deploy/history/HistoryPage.scala#L112), due to `.exists()` we would previously only need to go through a handful of applications before the condition is satisfied. This causes significant perf regression for the SHS homepage in our environment which contains ~10000 Spark apps in a single history server. To fix the issue, while preserving the original intent of closing the iterator early, this PR proposes pushing down filter predicates and number of applications required to FsHistoryProvider. ### Why are the changes needed? To fix a perf regression in SHS due to SPARK-38896 ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? Existing unit tests for `HistoryPage` and `ApplicationListResource` Tested performance on local SHS with a large number of apps (~75k) consistent with production. 
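For reference, a minimal sketch of the bounded, filtered listing pattern this change introduces (the `CloseableIterator` stand-in below is an assumption for illustration; the real helper is the new `KVUtils.mapToSeqWithFilter`, which works over the KVStore's closeable iterator and is what the benchmark numbers below exercise):

```
import java.io.Closeable

object BoundedListingSketch {
  // Hypothetical stand-in for the KVStore's closeable iterator.
  trait CloseableIterator[T] extends Iterator[T] with Closeable

  // Iterator transformations are lazy, so map/filter/take pull at most `max`
  // matching entries from the underlying store, while the finally block keeps
  // the eager-close behavior introduced by SPARK-38896.
  def mapToSeqWithFilter[T, B](iter: CloseableIterator[T], max: Int)
      (mapFunc: T => B)(filterFunc: B => Boolean): Seq[B] = {
    try {
      iter.map(mapFunc).filter(filterFunc).take(max).toList
    } finally {
      iter.close()
    }
  }
}
```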
Before: ``` smahadiklocalhost [ ~ ]$ curl http://localhost:18080/api/v1/applications | jq 'length' 75061 smahadiklocalhost [ ~ ]$ for i in {1..10}; do curl -s -w "\nTotal time: %{time_total}s\n" -o /dev/null http://localhost:18080; done Total time: 3.607995s Total time: 3.564875s Total time: 3.095895s Total time: 3.153576s Total time: 3.157186s Total time: 3.251107s Total time: 3.681727s Total time: 4.622074s Total time: 6.866931s Total time: 3.523224s smahadiklocalhost [ ~ ]$ for i in {1..10}; do curl -s -w "\nTotal time: %{time_total}s\n" -o /dev/null http://localhost:18080/api/v1/applications?limit=10; done Total time: 3.340698s Total time: 3.206455s Total time: 3.140326s Total time: 4.704944s Total time: 3.982831s Total time: 7.375094s Total time: 3.328329s Total time: 3.264700s Total time: 3.283851s Total time: 3.456416s ``` After: ``` smahadiklocalhost [ ~ ]$ curl http://localhost:18080/api/v1/applications | jq 'length' % Total % Received % Xferd Average Speed Time Time Time Current Dload Upload Total Spent Left Speed 100 36.7M 0 36.7M 0 0 7662k 0 --:--:-- 0:00:04 --:--:-- 7663k 75077 smahadiklocalhost [ ~ ]$ for i in {1..10}; do curl -s -w "\nTotal time: %{time_total}s\n" -o /dev/null http://localhost:18080; done Total time: 0.224714s Total time: 0.012205s Total time: 0.014709s Total time: 0.008092s Total time: 0.007284s Total time: 0.006350s Total time: 0.005414s Total time: 0.006391s Total time: 0.005668s Total time: 0.004738s smahadiklocalhost [ ~ ]$ for i in {1..10}; do curl -s -w "\nTotal time: %{time_total}s\n" -o /dev/null http://localhost:18080/api/v1/applications?limit=10; done Total time: 1.439507s Total time: 0.015126s Total time: 0.009085s Total time: 0.007620s Total time: 0.007692s Total time: 0.007420s Total time: 0.007152s Total time: 0.010515s Total time: 0.011493s Total time: 0.007564s ``` ### Was this patch authored or co-authored using generative AI tooling? No Closes #51428 from shardulm94/smahadik/shs-slow. Authored-by: Shardul Mahadik Signed-off-by: yangjie01 (cherry picked from commit aeae9ff7bfbbef574c047dd4d25c1cdb8667da96) Signed-off-by: yangjie01 --- .../history/ApplicationHistoryProvider.scala | 9 +++++++++ .../spark/deploy/history/FsHistoryProvider.scala | 8 ++++++++ .../apache/spark/deploy/history/HistoryPage.scala | 2 +- .../spark/deploy/history/HistoryServer.scala | 5 +++++ .../scala/org/apache/spark/status/KVUtils.scala | 14 ++++++++++++++ .../spark/status/api/v1/ApiRootResource.scala | 4 ++++ .../status/api/v1/ApplicationListResource.scala | 4 ++-- .../main/scala/org/apache/spark/ui/SparkUI.scala | 5 +++++ 8 files changed, 48 insertions(+), 3 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/deploy/history/ApplicationHistoryProvider.scala b/core/src/main/scala/org/apache/spark/deploy/history/ApplicationHistoryProvider.scala index f3f7db6bb0aba..89f0d12935ce1 100644 --- a/core/src/main/scala/org/apache/spark/deploy/history/ApplicationHistoryProvider.scala +++ b/core/src/main/scala/org/apache/spark/deploy/history/ApplicationHistoryProvider.scala @@ -99,6 +99,15 @@ private[history] abstract class ApplicationHistoryProvider { */ def getListing(): Iterator[ApplicationInfo] + /** + * Returns a list of applications available for the history server to show. 
+ * + * @param max The maximum number of applications to return + * @param predicate A function that filters the applications to be returned + * @return An iterator of matching applications up to the specified maximum + */ + def getListing(max: Int)(predicate: ApplicationInfo => Boolean): Iterator[ApplicationInfo] + /** * Returns the Spark UI for a specific application. * diff --git a/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala b/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala index 387bc7d9e45b3..9841b21861d42 100644 --- a/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala +++ b/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala @@ -300,6 +300,14 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) .index("endTime").reverse())(_.toApplicationInfo()).iterator } + override def getListing(max: Int)( + predicate: ApplicationInfo => Boolean): Iterator[ApplicationInfo] = { + // Return the filtered listing in end time descending order. + KVUtils.mapToSeqWithFilter( + listing.view(classOf[ApplicationInfoWrapper]).index("endTime").reverse(), + max)(_.toApplicationInfo())(predicate).iterator + } + override def getApplicationInfo(appId: String): Option[ApplicationInfo] = { try { Some(load(appId).toApplicationInfo()) diff --git a/core/src/main/scala/org/apache/spark/deploy/history/HistoryPage.scala b/core/src/main/scala/org/apache/spark/deploy/history/HistoryPage.scala index f2cd5b7e240dd..dd4921207c961 100644 --- a/core/src/main/scala/org/apache/spark/deploy/history/HistoryPage.scala +++ b/core/src/main/scala/org/apache/spark/deploy/history/HistoryPage.scala @@ -91,7 +91,7 @@ private[history] class HistoryPage(parent: HistoryServer) extends WebUIPage("") } def shouldDisplayApplications(requestedIncomplete: Boolean): Boolean = { - parent.getApplicationList().exists(isApplicationCompleted(_) != requestedIncomplete) + parent.getApplicationInfoList(1)(isApplicationCompleted(_) != requestedIncomplete).nonEmpty } private def makePageLink(request: HttpServletRequest, showIncomplete: Boolean): String = { diff --git a/core/src/main/scala/org/apache/spark/deploy/history/HistoryServer.scala b/core/src/main/scala/org/apache/spark/deploy/history/HistoryServer.scala index bea3f9ec84a4b..97425ba2339fd 100644 --- a/core/src/main/scala/org/apache/spark/deploy/history/HistoryServer.scala +++ b/core/src/main/scala/org/apache/spark/deploy/history/HistoryServer.scala @@ -220,6 +220,11 @@ class HistoryServer( getApplicationList() } + override def getApplicationInfoList(max: Int)( + filter: ApplicationInfo => Boolean): Iterator[ApplicationInfo] = { + provider.getListing(max)(filter) + } + def getApplicationInfo(appId: String): Option[ApplicationInfo] = { provider.getApplicationInfo(appId) } diff --git a/core/src/main/scala/org/apache/spark/status/KVUtils.scala b/core/src/main/scala/org/apache/spark/status/KVUtils.scala index 0dd40962309a4..e7773b8a4350c 100644 --- a/core/src/main/scala/org/apache/spark/status/KVUtils.scala +++ b/core/src/main/scala/org/apache/spark/status/KVUtils.scala @@ -210,6 +210,20 @@ private[spark] object KVUtils extends Logging { } } + /** + * Maps all values of KVStoreView to new values using a transformation function + * and filtered by a filter function. 
+ */ + def mapToSeqWithFilter[T, B]( + view: KVStoreView[T], + max: Int) + (mapFunc: T => B) + (filterFunc: B => Boolean): Seq[B] = { + Utils.tryWithResource(view.closeableIterator()) { iter => + iter.asScala.map(mapFunc).filter(filterFunc).take(max).toList + } + } + def size[T](view: KVStoreView[T]): Int = { Utils.tryWithResource(view.closeableIterator()) { iter => iter.asScala.size diff --git a/core/src/main/scala/org/apache/spark/status/api/v1/ApiRootResource.scala b/core/src/main/scala/org/apache/spark/status/api/v1/ApiRootResource.scala index cc21c1488f67c..44db9f2eec53e 100644 --- a/core/src/main/scala/org/apache/spark/status/api/v1/ApiRootResource.scala +++ b/core/src/main/scala/org/apache/spark/status/api/v1/ApiRootResource.scala @@ -82,6 +82,10 @@ private[spark] trait UIRoot { def withSparkUI[T](appId: String, attemptId: Option[String])(fn: SparkUI => T): T def getApplicationInfoList: Iterator[ApplicationInfo] + + def getApplicationInfoList(max: Int)( + filter: ApplicationInfo => Boolean): Iterator[ApplicationInfo] + def getApplicationInfo(appId: String): Option[ApplicationInfo] /** diff --git a/core/src/main/scala/org/apache/spark/status/api/v1/ApplicationListResource.scala b/core/src/main/scala/org/apache/spark/status/api/v1/ApplicationListResource.scala index 6eb8b2bfd55a3..04a25a818c3c6 100644 --- a/core/src/main/scala/org/apache/spark/status/api/v1/ApplicationListResource.scala +++ b/core/src/main/scala/org/apache/spark/status/api/v1/ApplicationListResource.scala @@ -37,7 +37,7 @@ private[v1] class ApplicationListResource extends ApiRequestContext { val includeCompleted = status.isEmpty || status.contains(ApplicationStatus.COMPLETED) val includeRunning = status.isEmpty || status.contains(ApplicationStatus.RUNNING) - uiRoot.getApplicationInfoList.filter { app => + uiRoot.getApplicationInfoList(numApps) { app => val anyRunning = app.attempts.isEmpty || !app.attempts.head.completed // if any attempt is still running, we consider the app to also still be running; // keep the app if *any* attempts fall in the right time window @@ -45,7 +45,7 @@ private[v1] class ApplicationListResource extends ApiRequestContext { app.attempts.exists { attempt => isAttemptInRange(attempt, minDate, maxDate, minEndDate, maxEndDate, anyRunning) } - }.take(numApps) + } } private def isAttemptInRange( diff --git a/core/src/main/scala/org/apache/spark/ui/SparkUI.scala b/core/src/main/scala/org/apache/spark/ui/SparkUI.scala index 685407c11208f..4cbedc057c16b 100644 --- a/core/src/main/scala/org/apache/spark/ui/SparkUI.scala +++ b/core/src/main/scala/org/apache/spark/ui/SparkUI.scala @@ -192,6 +192,11 @@ private[spark] class SparkUI private ( )) } + override def getApplicationInfoList(max: Int)( + filter: ApplicationInfo => Boolean): Iterator[ApplicationInfo] = { + getApplicationInfoList.filter(filter).take(max) + } + def getApplicationInfo(appId: String): Option[ApplicationInfo] = { getApplicationInfoList.find(_.id == appId) } From 5ccd68b23581915152726cd9ba9d9e5b619a77c9 Mon Sep 17 00:00:00 2001 From: yangjie01 Date: Mon, 28 Jul 2025 12:07:31 +0800 Subject: [PATCH 42/86] [SPARK-52944][CORE][SQL][YARN][TESTS][3.5] Fix invalid assertions in tests ### What changes were proposed in this pull request? This pr fixes some invalid assertions in the test code, mainly addressing two types of issues: 1. Forgetting to use `assert`. 
For example: https://github.com/apache/spark/blob/80c1f5f2caaaaf059178aa80cad4dfefe68bbebe/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingDeduplicationSuite.scala#L321-L322 Here, it is clearly intended to assert that `q.lastProgress.sink.numOutputRows == 0L"`, but the use of `assert` was forgotten, rendering it an invalid expression. 2. Incorrect line breaks in `should be` statements, causing one line of code to be mistakenly treated as two unrelated lines. For example: https://github.com/apache/spark/blob/5a9929c32ef8aafd275a3cf4797bb0ba9a6e61e2/core/src/test/scala/org/apache/spark/SortShuffleSuite.scala#L72-L73 Here, it is clearly intended to make an assertion similar to the following: ``` filesCreatedByShuffle.map(_.getName) should be Set("shuffle_0_0_0.data", "shuffle_0_0_0.index") ``` However, due to the incorrect line break, it actually fails to function as an assertion. Meanwhile, after implementing the aforementioned fixes, this pr also addresses and repairs the failing test cases within it. ### Why are the changes needed? Fix invalid assertions in tests ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? - Pass Github Actions ### Was this patch authored or co-authored using generative AI tooling? No Closes #51677 from LuciferYang/SPARK-52944-3.5. Authored-by: yangjie01 Signed-off-by: yangjie01 --- .../org/apache/spark/SortShuffleSuite.scala | 7 ++- .../spark/deploy/SparkSubmitSuite.scala | 16 +++--- .../yarn/YarnShuffleServiceSuite.scala | 56 +++++++++---------- .../expressions/UnsafeRowConverterSuite.scala | 8 +-- .../StreamingDeduplicationSuite.scala | 2 +- 5 files changed, 45 insertions(+), 44 deletions(-) diff --git a/core/src/test/scala/org/apache/spark/SortShuffleSuite.scala b/core/src/test/scala/org/apache/spark/SortShuffleSuite.scala index 571110784818f..f0e41046ae48b 100644 --- a/core/src/test/scala/org/apache/spark/SortShuffleSuite.scala +++ b/core/src/test/scala/org/apache/spark/SortShuffleSuite.scala @@ -26,7 +26,7 @@ import org.apache.commons.io.filefilter.TrueFileFilter import org.scalatest.BeforeAndAfterAll import org.scalatest.matchers.should.Matchers._ -import org.apache.spark.internal.config.SHUFFLE_MANAGER +import org.apache.spark.internal.config.{SHUFFLE_CHECKSUM_ALGORITHM, SHUFFLE_MANAGER} import org.apache.spark.rdd.ShuffledRDD import org.apache.spark.serializer.{JavaSerializer, KryoSerializer} import org.apache.spark.shuffle.sort.SortShuffleManager @@ -69,8 +69,9 @@ class SortShuffleSuite extends ShuffleSuite with BeforeAndAfterAll { shuffledRdd.count() // Ensure that the shuffle actually created files that will need to be cleaned up val filesCreatedByShuffle = getAllFiles -- filesBeforeShuffle - filesCreatedByShuffle.map(_.getName) should be - Set("shuffle_0_0_0.data", "shuffle_0_0_0.index") + filesCreatedByShuffle.map(_.getName) should be( + Set("shuffle_0_0_0.data", s"shuffle_0_0_0.checksum.${conf.get(SHUFFLE_CHECKSUM_ALGORITHM)}", + "shuffle_0_0_0.index")) // Check that the cleanup actually removes the files sc.env.blockManager.master.removeShuffle(0, blocking = true) for (file <- filesCreatedByShuffle) { diff --git a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala index 8e2d6e6cf5ff3..dd8bb3b96480e 100644 --- a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala @@ -1095,14 +1095,14 @@ class SparkSubmitSuite val appArgs = new 
SparkSubmitArguments(args) val (_, _, conf, _) = submit.prepareSubmitEnvironment(appArgs) - conf.get("spark.yarn.dist.jars").split(",").toSet should be - (Set(jar1.toURI.toString, jar2.toURI.toString)) - conf.get("spark.yarn.dist.files").split(",").toSet should be - (Set(file1.toURI.toString, file2.toURI.toString)) - conf.get("spark.yarn.dist.pyFiles").split(",").toSet should be - (Set(pyFile1.getAbsolutePath, pyFile2.getAbsolutePath)) - conf.get("spark.yarn.dist.archives").split(",").toSet should be - (Set(archive1.toURI.toString, archive2.toURI.toString)) + conf.get("spark.yarn.dist.jars").split(",").toSet should be( + Set(jar1.toURI.toString, jar2.toURI.toString)) + conf.get("spark.yarn.dist.files").split(",").toSet should be( + Set(file1.toURI.toString, file2.toURI.toString)) + conf.get("spark.yarn.dist.pyFiles").split(",").toSet should be( + Set(pyFile1.toURI.toString, pyFile2.toURI.toString)) + conf.get("spark.yarn.dist.archives").split(",").toSet should be( + Set(archive1.toURI.toString, archive2.toURI.toString)) } } } diff --git a/resource-managers/yarn/src/test/scala/org/apache/spark/network/yarn/YarnShuffleServiceSuite.scala b/resource-managers/yarn/src/test/scala/org/apache/spark/network/yarn/YarnShuffleServiceSuite.scala index 552cc98311e8f..5c98785089ef5 100644 --- a/resource-managers/yarn/src/test/scala/org/apache/spark/network/yarn/YarnShuffleServiceSuite.scala +++ b/resource-managers/yarn/src/test/scala/org/apache/spark/network/yarn/YarnShuffleServiceSuite.scala @@ -616,35 +616,35 @@ abstract class YarnShuffleServiceSuite extends SparkFunSuite with Matchers { val mergeManager1DB = ShuffleTestAccessor.mergeManagerLevelDB(mergeManager1) ShuffleTestAccessor.recoveryFile(mergeManager1) should be (mergeMgrFile) - ShuffleTestAccessor.getAppsShuffleInfo(mergeManager1).size() equals 0 - ShuffleTestAccessor.reloadAppShuffleInfo( - mergeManager1, mergeManager1DB).size() equals 0 + assert(ShuffleTestAccessor.getAppsShuffleInfo(mergeManager1).size() equals 0) + assert(ShuffleTestAccessor.reloadAppShuffleInfo( + mergeManager1, mergeManager1DB).size() equals 0) mergeManager1.registerExecutor(app1Id.toString, mergedShuffleInfo1) var appShuffleInfo = ShuffleTestAccessor.getAppsShuffleInfo(mergeManager1) - appShuffleInfo.size() equals 1 + assert(appShuffleInfo.size() equals 1) appShuffleInfo.get(app1Id.toString).getAppPathsInfo should be (appPathsInfo1) var appShuffleInfoAfterReload = ShuffleTestAccessor.reloadAppShuffleInfo(mergeManager1, mergeManager1DB) - appShuffleInfoAfterReload.size() equals 1 + assert(appShuffleInfoAfterReload.size() equals 1) appShuffleInfoAfterReload.get(app1Id.toString).getAppPathsInfo should be (appPathsInfo1) mergeManager1.registerExecutor(app2Attempt1Id.toString, mergedShuffleInfo2Attempt1) appShuffleInfo = ShuffleTestAccessor.getAppsShuffleInfo(mergeManager1) - appShuffleInfo.size() equals 2 + assert(appShuffleInfo.size() equals 2) appShuffleInfo.get(app1Id.toString).getAppPathsInfo should be (appPathsInfo1) appShuffleInfo.get( app2Attempt1Id.toString).getAppPathsInfo should be (appPathsInfo2Attempt1) appShuffleInfoAfterReload = ShuffleTestAccessor.reloadAppShuffleInfo(mergeManager1, mergeManager1DB) - appShuffleInfoAfterReload.size() equals 2 + assert(appShuffleInfoAfterReload.size() equals 2) appShuffleInfoAfterReload.get(app1Id.toString).getAppPathsInfo should be (appPathsInfo1) appShuffleInfoAfterReload.get( app2Attempt1Id.toString).getAppPathsInfo should be (appPathsInfo2Attempt1) mergeManager1.registerExecutor(app3IdNoAttemptId.toString, 
mergedShuffleInfo3NoAttemptId) appShuffleInfo = ShuffleTestAccessor.getAppsShuffleInfo(mergeManager1) - appShuffleInfo.size() equals 3 + assert(appShuffleInfo.size() equals 3) appShuffleInfo.get(app1Id.toString).getAppPathsInfo should be (appPathsInfo1) appShuffleInfo.get( app2Attempt1Id.toString).getAppPathsInfo should be (appPathsInfo2Attempt1) @@ -652,7 +652,7 @@ abstract class YarnShuffleServiceSuite extends SparkFunSuite with Matchers { app3IdNoAttemptId.toString).getAppPathsInfo should be (appPathsInfo3NoAttempt) appShuffleInfoAfterReload = ShuffleTestAccessor.reloadAppShuffleInfo(mergeManager1, mergeManager1DB) - appShuffleInfoAfterReload.size() equals 3 + assert(appShuffleInfoAfterReload.size() equals 3) appShuffleInfoAfterReload.get(app1Id.toString).getAppPathsInfo should be (appPathsInfo1) appShuffleInfoAfterReload.get( app2Attempt1Id.toString).getAppPathsInfo should be (appPathsInfo2Attempt1) @@ -661,7 +661,7 @@ abstract class YarnShuffleServiceSuite extends SparkFunSuite with Matchers { mergeManager1.registerExecutor(app2Attempt2Id.toString, mergedShuffleInfo2Attempt2) appShuffleInfo = ShuffleTestAccessor.getAppsShuffleInfo(mergeManager1) - appShuffleInfo.size() equals 3 + assert(appShuffleInfo.size() equals 3) appShuffleInfo.get(app1Id.toString).getAppPathsInfo should be (appPathsInfo1) appShuffleInfo.get( app2Attempt2Id.toString).getAppPathsInfo should be (appPathsInfo2Attempt2) @@ -669,7 +669,7 @@ abstract class YarnShuffleServiceSuite extends SparkFunSuite with Matchers { app3IdNoAttemptId.toString).getAppPathsInfo should be (appPathsInfo3NoAttempt) appShuffleInfoAfterReload = ShuffleTestAccessor.reloadAppShuffleInfo(mergeManager1, mergeManager1DB) - appShuffleInfoAfterReload.size() equals 3 + assert(appShuffleInfoAfterReload.size() equals 3) appShuffleInfoAfterReload.get(app1Id.toString).getAppPathsInfo should be (appPathsInfo1) appShuffleInfoAfterReload.get( app2Attempt2Id.toString).getAppPathsInfo should be (appPathsInfo2Attempt2) @@ -678,14 +678,14 @@ abstract class YarnShuffleServiceSuite extends SparkFunSuite with Matchers { mergeManager1.applicationRemoved(app2Attempt2Id.toString, true) appShuffleInfo = ShuffleTestAccessor.getAppsShuffleInfo(mergeManager1) - appShuffleInfo.size() equals 2 + assert(appShuffleInfo.size() equals 2) appShuffleInfo.get(app1Id.toString).getAppPathsInfo should be (appPathsInfo1) assert(!appShuffleInfo.containsKey(app2Attempt2Id.toString)) appShuffleInfo.get( app3IdNoAttemptId.toString).getAppPathsInfo should be (appPathsInfo3NoAttempt) appShuffleInfoAfterReload = ShuffleTestAccessor.reloadAppShuffleInfo(mergeManager1, mergeManager1DB) - appShuffleInfoAfterReload.size() equals 2 + assert(appShuffleInfoAfterReload.size() equals 2) appShuffleInfoAfterReload.get(app1Id.toString).getAppPathsInfo should be (appPathsInfo1) assert(!appShuffleInfoAfterReload.containsKey(app2Attempt2Id.toString)) appShuffleInfoAfterReload.get( @@ -725,9 +725,9 @@ abstract class YarnShuffleServiceSuite extends SparkFunSuite with Matchers { val mergeManager1DB = ShuffleTestAccessor.mergeManagerLevelDB(mergeManager1) ShuffleTestAccessor.recoveryFile(mergeManager1) should be (mergeMgrFile) - ShuffleTestAccessor.getAppsShuffleInfo(mergeManager1).size() equals 0 - ShuffleTestAccessor.reloadAppShuffleInfo( - mergeManager1, mergeManager1DB).size() equals 0 + assert(ShuffleTestAccessor.getAppsShuffleInfo(mergeManager1).size() equals 0) + assert(ShuffleTestAccessor.reloadAppShuffleInfo( + mergeManager1, mergeManager1DB).size() equals 0) 
mergeManager1.registerExecutor(app1Id.toString, mergedShuffleInfo1) mergeManager1.registerExecutor(app2Attempt1Id.toString, mergedShuffleInfo2Attempt1) @@ -737,7 +737,7 @@ abstract class YarnShuffleServiceSuite extends SparkFunSuite with Matchers { prepareAppShufflePartition(mergeManager1, partitionId2, 2, "4") var appShuffleInfo = ShuffleTestAccessor.getAppsShuffleInfo(mergeManager1) - appShuffleInfo.size() equals 2 + assert(appShuffleInfo.size() equals 2) appShuffleInfo.get(app1Id.toString).getAppPathsInfo should be (appPathsInfo1) appShuffleInfo.get( app2Attempt1Id.toString).getAppPathsInfo should be (appPathsInfo2Attempt1) @@ -745,7 +745,7 @@ abstract class YarnShuffleServiceSuite extends SparkFunSuite with Matchers { assert(!appShuffleInfo.get(app2Attempt1Id.toString).getShuffles.get(2).isFinalized) var appShuffleInfoAfterReload = ShuffleTestAccessor.reloadAppShuffleInfo(mergeManager1, mergeManager1DB) - appShuffleInfoAfterReload.size() equals 2 + assert(appShuffleInfoAfterReload.size() equals 2) appShuffleInfoAfterReload.get(app1Id.toString).getAppPathsInfo should be (appPathsInfo1) appShuffleInfoAfterReload.get( app2Attempt1Id.toString).getAppPathsInfo should be (appPathsInfo2Attempt1) @@ -765,12 +765,12 @@ abstract class YarnShuffleServiceSuite extends SparkFunSuite with Matchers { mergeManager1.applicationRemoved(app1Id.toString, true) appShuffleInfo = ShuffleTestAccessor.getAppsShuffleInfo(mergeManager1) - appShuffleInfo.size() equals 1 + assert(appShuffleInfo.size() equals 1) assert(!appShuffleInfo.containsKey(app1Id.toString)) assert(appShuffleInfo.get(app2Attempt1Id.toString).getShuffles.get(2).isFinalized) appShuffleInfoAfterReload = ShuffleTestAccessor.reloadAppShuffleInfo(mergeManager1, mergeManager1DB) - appShuffleInfoAfterReload.size() equals 1 + assert(appShuffleInfoAfterReload.size() equals 1) assert(!appShuffleInfoAfterReload.containsKey(app1Id.toString)) assert(appShuffleInfoAfterReload.get(app2Attempt1Id.toString).getShuffles.get(2).isFinalized) @@ -844,7 +844,7 @@ abstract class YarnShuffleServiceSuite extends SparkFunSuite with Matchers { prepareAppShufflePartition(mergeManager1, partitionId2, 2, "4") var appShuffleInfo = ShuffleTestAccessor.getAppsShuffleInfo(mergeManager1) - appShuffleInfo.size() equals 2 + assert(appShuffleInfo.size() equals 2) appShuffleInfo.get(app1Id.toString).getAppPathsInfo should be (appPathsInfo1) appShuffleInfo.get( app2Id.toString).getAppPathsInfo should be (appPathsInfo2Attempt1) @@ -867,20 +867,20 @@ abstract class YarnShuffleServiceSuite extends SparkFunSuite with Matchers { mergeManager1.applicationRemoved(app1Id.toString, true) appShuffleInfo = ShuffleTestAccessor.getAppsShuffleInfo(mergeManager1) - appShuffleInfo.size() equals 1 + assert(appShuffleInfo.size() equals 1) assert(!appShuffleInfo.containsKey(app1Id.toString)) assert(appShuffleInfo.get(app2Id.toString).getShuffles.get(2).isFinalized) // Clear the AppsShuffleInfo hashmap and reload the hashmap from DB appShuffleInfoAfterReload = ShuffleTestAccessor.reloadAppShuffleInfo(mergeManager1, mergeManager1DB) - appShuffleInfoAfterReload.size() equals 1 + assert(appShuffleInfoAfterReload.size() equals 1) assert(!appShuffleInfoAfterReload.containsKey(app1Id.toString)) assert(appShuffleInfoAfterReload.get(app2Id.toString).getShuffles.get(2).isFinalized) // Register application app1Id again and reload the DB again mergeManager1.registerExecutor(app1Id.toString, mergedShuffleInfo1) appShuffleInfo = ShuffleTestAccessor.getAppsShuffleInfo(mergeManager1) - appShuffleInfo.size() equals 2 
+ assert(appShuffleInfo.size() equals 2) appShuffleInfo.get(app1Id.toString).getAppPathsInfo should be (appPathsInfo1) assert(appShuffleInfo.get(app1Id.toString).getShuffles.isEmpty) assert(appShuffleInfo.get(app2Id.toString).getShuffles.get(2).isFinalized) @@ -924,7 +924,7 @@ abstract class YarnShuffleServiceSuite extends SparkFunSuite with Matchers { prepareAppShufflePartition(mergeManager1, partitionId1, 2, "4") var appShuffleInfo = ShuffleTestAccessor.getAppsShuffleInfo(mergeManager1) - appShuffleInfo.size() equals 1 + assert(appShuffleInfo.size() equals 1) appShuffleInfo.get( app1Id.toString).getAppPathsInfo should be (appPathsInfo1Attempt1) assert(!appShuffleInfo.get(app1Id.toString).getShuffles.get(2).isFinalized) @@ -938,7 +938,7 @@ abstract class YarnShuffleServiceSuite extends SparkFunSuite with Matchers { prepareAppShufflePartition(mergeManager1, partitionId2, 2, "4") appShuffleInfo = ShuffleTestAccessor.getAppsShuffleInfo(mergeManager1) - appShuffleInfo.size() equals 1 + assert(appShuffleInfo.size() equals 1) appShuffleInfo.get( app1Id.toString).getAppPathsInfo should be (appPathsInfo1Attempt2) assert(!appShuffleInfo.get(app1Id.toString).getShuffles.get(2).isFinalized) @@ -973,7 +973,7 @@ abstract class YarnShuffleServiceSuite extends SparkFunSuite with Matchers { val mergeManager3 = s3.shuffleMergeManager.asInstanceOf[RemoteBlockPushResolver] val mergeManager3DB = ShuffleTestAccessor.mergeManagerLevelDB(mergeManager3) appShuffleInfo = ShuffleTestAccessor.getAppsShuffleInfo(mergeManager3) - appShuffleInfo.size() equals 1 + assert(appShuffleInfo.size() equals 1) appShuffleInfo.get( app1Id.toString).getAppPathsInfo should be (appPathsInfo1Attempt2) assert(appShuffleInfo.get(app1Id.toString).getShuffles.get(2).isFinalized) @@ -1014,7 +1014,7 @@ abstract class YarnShuffleServiceSuite extends SparkFunSuite with Matchers { mergeManager1.registerExecutor(app1Id.toString, mergedShuffleInfo1Attempt2) val appShuffleInfo = ShuffleTestAccessor.getAppsShuffleInfo(mergeManager1) - appShuffleInfo.size() equals 1 + assert(appShuffleInfo.size() equals 1) appShuffleInfo.get( app1Id.toString).getAppPathsInfo should be (appPathsInfo1Attempt2) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeRowConverterSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeRowConverterSuite.scala index cbab8894cb553..865e63405f4dc 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeRowConverterSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeRowConverterSuite.scala @@ -115,14 +115,14 @@ class UnsafeRowConverterSuite extends SparkFunSuite with Matchers with PlanTestB // Date is represented as Int in unsafeRow assert(DateTimeUtils.toJavaDate(unsafeRow.getInt(2)) === Date.valueOf("1970-01-01")) // Timestamp is represented as Long in unsafeRow - DateTimeUtils.toJavaTimestamp(unsafeRow.getLong(3)) should be - (Timestamp.valueOf("2015-05-08 08:10:25")) + DateTimeUtils.toJavaTimestamp(unsafeRow.getLong(3)) should be( + Timestamp.valueOf("2015-05-08 08:10:25")) unsafeRow.setInt(2, DateTimeUtils.fromJavaDate(Date.valueOf("2015-06-22"))) assert(DateTimeUtils.toJavaDate(unsafeRow.getInt(2)) === Date.valueOf("2015-06-22")) unsafeRow.setLong(3, DateTimeUtils.fromJavaTimestamp(Timestamp.valueOf("2015-06-22 08:10:25"))) - DateTimeUtils.toJavaTimestamp(unsafeRow.getLong(3)) should be - (Timestamp.valueOf("2015-06-22 08:10:25")) + 
DateTimeUtils.toJavaTimestamp(unsafeRow.getLong(3)) should be( + Timestamp.valueOf("2015-06-22 08:10:25")) } testBothCodegenAndInterpreted( diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingDeduplicationSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingDeduplicationSuite.scala index c69088589cc23..168f4f2452709 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingDeduplicationSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingDeduplicationSuite.scala @@ -319,7 +319,7 @@ class StreamingDeduplicationSuite extends StateStoreMetricsTest { }, AssertOnQuery { q => eventually(timeout(streamingTimeout)) { - q.lastProgress.sink.numOutputRows == 0L + assert(q.lastProgress.sink.numOutputRows == 0L) true } } From 0fa45070b013b70f743dad3686a65cd43ef8690b Mon Sep 17 00:00:00 2001 From: yangjie01 Date: Mon, 28 Jul 2025 20:20:59 +0800 Subject: [PATCH 43/86] [SPARK-52945][SQL][TESTS] Split `CastSuiteBase#checkInvalidCastFromNumericType` into three methods and guarantee assertions are valid ### What changes were proposed in this pull request? Due to the absence of `assert` statements, the `CastSuiteBase#checkInvalidCastFromNumericType` method previously performed no assertion checks. Additionally, since `checkInvalidCastFromNumericType` had significant variations in target type validation logic across different `EvalMode` contexts, this pr refactors the method into three specialized methods to ensure robust assertion enforcement: - `checkInvalidCastFromNumericTypeToDateType` - `checkInvalidCastFromNumericTypeToTimestampNTZType` - `checkInvalidCastFromNumericTypeToBinaryType` ### Why are the changes needed? To address the missing assertion validation in `CastSuiteBase`. ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? - Pass GitHub Actions ### Was this patch authored or co-authored using generative AI tooling? No Closes #51668 from LuciferYang/SPARK-52945. 
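To make the failure modes addressed by this commit and the previous one concrete, here is a small ScalaTest sketch (the suite name and values are made up; the shuffle file names echo SortShuffleSuite). Both "checks" in the middle compile but can never fail, which is exactly why the fixes wrap them in `assert` or move the expected value onto the same `should be(...)` call:

```
import org.scalatest.funsuite.AnyFunSuite
import org.scalatest.matchers.should.Matchers._

class SilentAssertionSketch extends AnyFunSuite {
  test("expressions that look like checks but assert nothing") {
    val files = Set("shuffle_0_0_0.data", "shuffle_0_0_0.index")

    // The comparison is evaluated and silently discarded, so the test
    // passes even though the expression is false.
    files.size == 99

    // The line break splits this into two unrelated statements:
    // `files should be` alone checks nothing, and the Set literal is discarded.
    files should be
    Set("something.else")

    // The forms the fixes switch to, which actually fail on a mismatch.
    assert(files.size == 2)
    files should be(Set("shuffle_0_0_0.data", "shuffle_0_0_0.index"))
  }
}
```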
Authored-by: yangjie01 Signed-off-by: yangjie01 (cherry picked from commit 9a452f81dbddb765f55d0610e0e1691bd2ca6e96) Signed-off-by: yangjie01 --- .../catalyst/expressions/CastSuiteBase.scala | 95 ++++++++----------- .../expressions/CastWithAnsiOnSuite.scala | 32 ++++++- .../catalyst/expressions/TryCastSuite.scala | 9 ++ 3 files changed, 77 insertions(+), 59 deletions(-) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuiteBase.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuiteBase.scala index 4352d5bc9c6bb..52ed183d5d2c8 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuiteBase.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuiteBase.scala @@ -545,61 +545,42 @@ abstract class CastSuiteBase extends SparkFunSuite with ExpressionEvalHelper { checkCast("0", false) } - protected def checkInvalidCastFromNumericType(to: DataType): Unit = { - cast(1.toByte, to).checkInputDataTypes() == - DataTypeMismatch( - errorSubClass = "CAST_WITH_FUNC_SUGGESTION", - messageParameters = Map( - "srcType" -> toSQLType(Literal(1.toByte).dataType), - "targetType" -> toSQLType(to), - "functionNames" -> "`DATE_FROM_UNIX_DATE`" - ) - ) - cast(1.toShort, to).checkInputDataTypes() == - DataTypeMismatch( - errorSubClass = "CAST_WITH_FUNC_SUGGESTION", - messageParameters = Map( - "srcType" -> toSQLType(Literal(1.toShort).dataType), - "targetType" -> toSQLType(to), - "functionNames" -> "`DATE_FROM_UNIX_DATE`" - ) - ) - cast(1, to).checkInputDataTypes() == - DataTypeMismatch( - errorSubClass = "CAST_WITH_FUNC_SUGGESTION", - messageParameters = Map( - "srcType" -> toSQLType(Literal(1).dataType), - "targetType" -> toSQLType(to), - "functionNames" -> "`DATE_FROM_UNIX_DATE`" - ) - ) - cast(1L, to).checkInputDataTypes() == - DataTypeMismatch( - errorSubClass = "CAST_WITH_FUNC_SUGGESTION", - messageParameters = Map( - "srcType" -> toSQLType(Literal(1L).dataType), - "targetType" -> toSQLType(to), - "functionNames" -> "`DATE_FROM_UNIX_DATE`" - ) - ) - cast(1.0.toFloat, to).checkInputDataTypes() == - DataTypeMismatch( - errorSubClass = "CAST_WITH_FUNC_SUGGESTION", - messageParameters = Map( - "srcType" -> toSQLType(Literal(1.0.toFloat).dataType), - "targetType" -> toSQLType(to), - "functionNames" -> "`DATE_FROM_UNIX_DATE`" - ) - ) - cast(1.0, to).checkInputDataTypes() == - DataTypeMismatch( - errorSubClass = "CAST_WITH_FUNC_SUGGESTION", - messageParameters = Map( - "srcType" -> toSQLType(Literal(1.0).dataType), - "targetType" -> toSQLType(to), - "functionNames" -> "`DATE_FROM_UNIX_DATE`" - ) - ) + protected def createCastMismatch( + srcType: DataType, + targetType: DataType, + errorSubClass: String, + extraParams: Map[String, String] = Map.empty): DataTypeMismatch = { + val baseParams = Map( + "srcType" -> toSQLType(srcType), + "targetType" -> toSQLType(targetType) + ) + DataTypeMismatch(errorSubClass, baseParams ++ extraParams) + } + + protected def checkInvalidCastFromNumericTypeToDateType(): Unit = { + val errorSubClass = if (evalMode == EvalMode.LEGACY) { + "CAST_WITHOUT_SUGGESTION" + } else { + "CAST_WITH_FUNC_SUGGESTION" + } + val funcParams = if (evalMode == EvalMode.LEGACY) { + Map.empty[String, String] + } else { + Map("functionNames" -> "`DATE_FROM_UNIX_DATE`") + } + Seq(1.toByte, 1.toShort, 1, 1L, 1.0.toFloat, 1.0).foreach { testValue => + val expectedError = + createCastMismatch(Literal(testValue).dataType, DateType, errorSubClass, funcParams) + assert(cast(testValue, 
DateType).checkInputDataTypes() == expectedError) + } + } + protected def checkInvalidCastFromNumericTypeToTimestampNTZType(): Unit = { + // All numeric types: `CAST_WITHOUT_SUGGESTION` + Seq(1.toByte, 1.toShort, 1, 1L, 1.0.toFloat, 1.0).foreach { testValue => + val expectedError = + createCastMismatch(Literal(testValue).dataType, TimestampNTZType, "CAST_WITHOUT_SUGGESTION") + assert(cast(testValue, TimestampNTZType).checkInputDataTypes() == expectedError) + } } test("SPARK-16729 type checking for casting to date type") { @@ -614,7 +595,7 @@ abstract class CastSuiteBase extends SparkFunSuite with ExpressionEvalHelper { ) ) ) - checkInvalidCastFromNumericType(DateType) + checkInvalidCastFromNumericTypeToDateType() } test("SPARK-20302 cast with same structure") { @@ -996,7 +977,7 @@ abstract class CastSuiteBase extends SparkFunSuite with ExpressionEvalHelper { test("disallow type conversions between Numeric types and Timestamp without time zone type") { import DataTypeTestUtils.numericTypes - checkInvalidCastFromNumericType(TimestampNTZType) + checkInvalidCastFromNumericTypeToTimestampNTZType() verifyCastFailure( cast(Literal(0L), TimestampNTZType), DataTypeMismatch( diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastWithAnsiOnSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastWithAnsiOnSuite.scala index 5916e0501f8b6..3c554455426e8 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastWithAnsiOnSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastWithAnsiOnSuite.scala @@ -28,6 +28,7 @@ import org.apache.spark.sql.catalyst.util.DateTimeConstants.MILLIS_PER_SECOND import org.apache.spark.sql.catalyst.util.DateTimeTestUtils import org.apache.spark.sql.catalyst.util.DateTimeTestUtils.{withDefaultTimeZone, UTC} import org.apache.spark.sql.errors.QueryErrorsBase +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.UTF8String @@ -38,6 +39,33 @@ class CastWithAnsiOnSuite extends CastSuiteBase with QueryErrorsBase { override def evalMode: EvalMode.Value = EvalMode.ANSI + protected def checkInvalidCastFromNumericTypeToBinaryType(): Unit = { + def checkNumericTypeCast( + testValue: Any, + srcType: DataType, + to: DataType, + expectedErrorClass: String, + extraParams: Map[String, String] = Map.empty): Unit = { + val expectedError = createCastMismatch(srcType, to, expectedErrorClass, extraParams) + assert(cast(testValue, to).checkInputDataTypes() == expectedError) + } + + // Integer types: suggest config change + val configParams = Map( + "config" -> toSQLConf(SQLConf.ANSI_ENABLED.key), + "configVal" -> toSQLValue("false", StringType) + ) + checkNumericTypeCast(1.toByte, ByteType, BinaryType, "CAST_WITH_CONF_SUGGESTION", configParams) + checkNumericTypeCast( + 1.toShort, ShortType, BinaryType, "CAST_WITH_CONF_SUGGESTION", configParams) + checkNumericTypeCast(1, IntegerType, BinaryType, "CAST_WITH_CONF_SUGGESTION", configParams) + checkNumericTypeCast(1L, LongType, BinaryType, "CAST_WITH_CONF_SUGGESTION", configParams) + + // Floating types: no suggestion + checkNumericTypeCast(1.0.toFloat, FloatType, BinaryType, "CAST_WITHOUT_SUGGESTION") + checkNumericTypeCast(1.0, DoubleType, BinaryType, "CAST_WITHOUT_SUGGESTION") + } + private def isTryCast = evalMode == EvalMode.TRY private def testIntMaxAndMin(dt: DataType): Unit = { @@ -141,7 +169,7 @@ class CastWithAnsiOnSuite extends CastSuiteBase with 
QueryErrorsBase { test("ANSI mode: disallow type conversions between Numeric types and Date type") { import DataTypeTestUtils.numericTypes - checkInvalidCastFromNumericType(DateType) + checkInvalidCastFromNumericTypeToDateType() verifyCastFailure( cast(Literal(0L), DateType), DataTypeMismatch( @@ -167,7 +195,7 @@ class CastWithAnsiOnSuite extends CastSuiteBase with QueryErrorsBase { test("ANSI mode: disallow type conversions between Numeric types and Binary type") { import DataTypeTestUtils.numericTypes - checkInvalidCastFromNumericType(BinaryType) + checkInvalidCastFromNumericTypeToBinaryType() val binaryLiteral = Literal(new Array[Byte](1.toByte), BinaryType) numericTypes.foreach { numericType => assert(cast(binaryLiteral, numericType).checkInputDataTypes() == diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/TryCastSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/TryCastSuite.scala index 9ead075663540..45b98c71ad705 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/TryCastSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/TryCastSuite.scala @@ -53,6 +53,15 @@ class TryCastSuite extends CastWithAnsiOnSuite { checkEvaluation(cast(l, to), tryCastResult, InternalRow(l.value)) } + override protected def checkInvalidCastFromNumericTypeToBinaryType(): Unit = { + // All numeric types: `CAST_WITHOUT_SUGGESTION` + Seq(1.toByte, 1.toShort, 1, 1L, 1.0.toFloat, 1.0).foreach { testValue => + val expectedError = + createCastMismatch(Literal(testValue).dataType, BinaryType, "CAST_WITHOUT_SUGGESTION") + assert(cast(testValue, BinaryType).checkInputDataTypes() == expectedError) + } + } + test("print string") { assert(cast(Literal("1"), IntegerType).toString == "try_cast(1 as int)") assert(cast(Literal("1"), IntegerType).sql == "TRY_CAST('1' AS INT)") From a137e4b09bf10db705db7778bb6f5553122fb855 Mon Sep 17 00:00:00 2001 From: Victor Jouffrey Date: Fri, 1 Aug 2025 15:13:22 +0200 Subject: [PATCH 44/86] Try to create a workflow to build 3.5 --- .github/workflows/build_branch35.yml | 48 ++++++++++++++++++++++++++++ .github/workflows/build_main.yml | 1 + 2 files changed, 49 insertions(+) create mode 100644 .github/workflows/build_branch35.yml diff --git a/.github/workflows/build_branch35.yml b/.github/workflows/build_branch35.yml new file mode 100644 index 0000000000000..9509706a28805 --- /dev/null +++ b/.github/workflows/build_branch35.yml @@ -0,0 +1,48 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# + +name: "Build (branch-3.5, Scala 2.13, Hadoop 3, JDK 8)" + +on: + pull_request: + +jobs: + run-build: + permissions: + packages: write + name: Run + uses: ./.github/workflows/build_and_test.yml + if: github.repository == 'logicalclocks/spark' + with: + java: 8 + branch: branch-3.5 + hadoop: hadoop3 + envs: >- + { + "SCALA_PROFILE": "scala2.13" + } + jobs: >- + { + "build": "true", + "pyspark": "true", + "sparkr": "true", + "tpcds-1g": "true", + "docker-integration-tests": "true", + "lint" : "true" + } diff --git a/.github/workflows/build_main.yml b/.github/workflows/build_main.yml index 9ef52f326375b..6a8d60117414e 100644 --- a/.github/workflows/build_main.yml +++ b/.github/workflows/build_main.yml @@ -23,6 +23,7 @@ on: push: branches: - '**' + pull_request: jobs: call-build-and-test: From 1632daa0052187b963d1bd3bccbac1db42fba920 Mon Sep 17 00:00:00 2001 From: Victor Jouffrey Date: Fri, 1 Aug 2025 16:44:37 +0200 Subject: [PATCH 45/86] Add settings to mvn command --- dev/make-distribution.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/dev/make-distribution.sh b/dev/make-distribution.sh index 6758a8aee0322..a182981dfa063 100755 --- a/dev/make-distribution.sh +++ b/dev/make-distribution.sh @@ -167,6 +167,7 @@ export MAVEN_OPTS="${MAVEN_OPTS:--Xss128m -Xmx4g -XX:ReservedCodeCacheSize=128m} # Normal quoting tricks don't work. # See: http://mywiki.wooledge.org/BashFAQ/050 BUILD_COMMAND=("$MVN" clean package \ + -s ~/.m2/settings.xml \ -DskipTests \ -Dmaven.javadoc.skip=true \ -Dmaven.scaladoc.skip=true \ From 22355bab824c49286acfeb5bf3db16c032cb44a2 Mon Sep 17 00:00:00 2001 From: Robert Dillitz Date: Fri, 1 Aug 2025 23:30:42 +0800 Subject: [PATCH 46/86] [SPARK-53054][CONNECT][3.5] Fix the connect.DataFrameReader default format behavior ### What changes were proposed in this pull request? See title. ### Why are the changes needed? Scala Spark Connect does not adhere to the [documented](https://spark.apache.org/docs/3.5.6/sql-data-sources-load-save-functions.html) behavior. ### Does this PR introduce _any_ user-facing change? As documented in [Generic Load/Save Functions - Spark 3.5.6 Documentation](https://spark.apache.org/docs/3.5.6/sql-data-sources-load-save-functions.html), and similar to Spark Classic and the Python Spark Connect, Scala Spark Connect's `DataFrameReader` will now also default to the format set via the `spark.sql.sources.default` SQL configuration. **Before**: `spark.read.load("..."`) throws ``` java.lang.IllegalArgumentException: The source format must be specified. ``` **Now**: `spark.read.load("...")` uses the format specified via `spark.sql.sources.default` ### How was this patch tested? Test case added to ClientE2ETestSuite. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #51759 from dillitz/fix-default-format-3.5. 
Lead-authored-by: Robert Dillitz Co-authored-by: Robert Dillitz Signed-off-by: Ruifeng Zheng --- .../org/apache/spark/sql/DataFrameReader.scala | 15 ++++----------- .../org/apache/spark/sql/ClientE2ETestSuite.scala | 12 ++++++++++++ 2 files changed, 16 insertions(+), 11 deletions(-) diff --git a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/DataFrameReader.scala b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/DataFrameReader.scala index 10d2af094a08c..f138ca93760cf 100644 --- a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/DataFrameReader.scala +++ b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/DataFrameReader.scala @@ -45,7 +45,7 @@ class DataFrameReader private[sql] (sparkSession: SparkSession) extends Logging * @since 3.4.0 */ def format(source: String): DataFrameReader = { - this.source = source + this.source = Some(source) this } @@ -179,8 +179,7 @@ class DataFrameReader private[sql] (sparkSession: SparkSession) extends Logging def load(paths: String*): DataFrame = { sparkSession.newDataFrame { builder => val dataSourceBuilder = builder.getReadBuilder.getDataSourceBuilder - assertSourceFormatSpecified() - dataSourceBuilder.setFormat(source) + source.foreach(dataSourceBuilder.setFormat) userSpecifiedSchema.foreach(schema => dataSourceBuilder.setSchema(schema.toDDL)) extraOptions.foreach { case (k, v) => dataSourceBuilder.putOptions(k, v) @@ -285,7 +284,7 @@ class DataFrameReader private[sql] (sparkSession: SparkSession) extends Logging sparkSession.newDataFrame { builder => val dataSourceBuilder = builder.getReadBuilder.getDataSourceBuilder format("jdbc") - dataSourceBuilder.setFormat(source) + source.foreach(dataSourceBuilder.setFormat) predicates.foreach(predicate => dataSourceBuilder.addPredicates(predicate)) this.extraOptions ++= Seq("url" -> url, "dbtable" -> table) val params = extraOptions ++ connectionProperties.asScala @@ -539,12 +538,6 @@ class DataFrameReader private[sql] (sparkSession: SparkSession) extends Logging text(paths: _*).select("value").as(StringEncoder) } - private def assertSourceFormatSpecified(): Unit = { - if (source == null) { - throw new IllegalArgumentException("The source format must be specified.") - } - } - private def parse(ds: Dataset[String], format: ParseFormat): DataFrame = { sparkSession.newDataFrame { builder => val parseBuilder = builder.getParseBuilder @@ -571,7 +564,7 @@ class DataFrameReader private[sql] (sparkSession: SparkSession) extends Logging // Builder pattern config options /////////////////////////////////////////////////////////////////////////////////////// - private var source: String = _ + private var source: Option[String] = None private var userSpecifiedSchema: Option[StructType] = None diff --git a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/ClientE2ETestSuite.scala b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/ClientE2ETestSuite.scala index feefd19000d1d..d53a472723b71 100644 --- a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/ClientE2ETestSuite.scala +++ b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/ClientE2ETestSuite.scala @@ -1325,6 +1325,18 @@ class ClientE2ETestSuite extends RemoteSparkSession with SQLHelper with PrivateM .dropDuplicatesWithinWatermark("newcol") testAndVerify(result2) } + + test("SPARK-53054: DataFrameReader defaults to spark.sql.sources.default") { + withTempPath { file => + val path = file.getAbsoluteFile.toURI.toString + 
spark.range(100).write.parquet(file.toPath.toAbsolutePath.toString) + + spark.conf.set("spark.sql.sources.default", "parquet") + + val df = spark.read.load(path) + assert(df.count == 100) + } + } } private[sql] case class ClassData(a: String, b: Int) From ae70bf9143883daabf0584912017ad70a1fc5dd6 Mon Sep 17 00:00:00 2001 From: Victor Jouffrey Date: Mon, 4 Aug 2025 09:36:53 +0200 Subject: [PATCH 47/86] Fix bootstrapper --- version.log | 1 + 1 file changed, 1 insertion(+) create mode 100644 version.log diff --git a/version.log b/version.log new file mode 100644 index 0000000000000..7d280e2cd476e --- /dev/null +++ b/version.log @@ -0,0 +1 @@ +3.5.5 From db1b30b0f7ce56c0bb288a79d7ec15c4cf56dd92 Mon Sep 17 00:00:00 2001 From: Victor Jouffrey Date: Mon, 4 Aug 2025 09:39:29 +0200 Subject: [PATCH 48/86] Again --- dev/infra/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev/infra/Dockerfile b/dev/infra/Dockerfile index f0b88666c040d..811b2dba3768e 100644 --- a/dev/infra/Dockerfile +++ b/dev/infra/Dockerfile @@ -43,7 +43,7 @@ RUN mkdir -p /usr/local/pypy/pypy3.8 && \ ln -sf /usr/local/pypy/pypy3.8/bin/pypy /usr/local/bin/pypy3.8 && \ ln -sf /usr/local/pypy/pypy3.8/bin/pypy /usr/local/bin/pypy3 -RUN curl -sS https://bootstrap.pypa.io/get-pip.py | pypy3 +RUN curl -sS https://bootstrap.pypa.io/pip/3.8/get-pip.py | pypy3 RUN $APT_INSTALL gnupg ca-certificates pandoc RUN echo 'deb https://cloud.r-project.org/bin/linux/ubuntu focal-cran40/' >> /etc/apt/sources.list From 2524c0a6d578136ff63b0a949ab9a90602c4f3e8 Mon Sep 17 00:00:00 2001 From: Hyukjin Kwon Date: Tue, 5 Aug 2025 11:16:56 +0900 Subject: [PATCH 49/86] Revert "[SPARK-49182][DOCS][PYTHON] Stop publish site/docs/{version}/api/python/_sources dir" This reverts commit 0e2d757660212d679118e587f921e640720f9245. --- docs/_plugins/copy_api_dirs.rb | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/_plugins/copy_api_dirs.rb b/docs/_plugins/copy_api_dirs.rb index bafc479974551..28d5e0d82c93a 100644 --- a/docs/_plugins/copy_api_dirs.rb +++ b/docs/_plugins/copy_api_dirs.rb @@ -136,7 +136,6 @@ mkdir_p "api/python" puts "cp -r ../python/docs/build/html/. api/python" - rm_r("../python/docs/build/html/_sources") cp_r("../python/docs/build/html/.", "api/python") end From 665ccb34dd9e00746283fdfc1f380c32f7127aa5 Mon Sep 17 00:00:00 2001 From: Peter Toth Date: Wed, 6 Aug 2025 08:51:30 +0200 Subject: [PATCH 50/86] [SPARK-53094][SQL][3.5] Fix CUBE with aggregate containing HAVING clauses ### What changes were proposed in this pull request? This is an alternative PR to https://github.com/apache/spark/pull/51810 to fix a regresion introduced in Spark 3.2 with https://github.com/apache/spark/pull/32470. This PR defers the resolution of not fully resolved `UnresolvedHaving` nodes from `ResolveGroupingAnalytics`: ``` === Applying Rule org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveGroupingAnalytics === 'Sort ['s DESC NULLS LAST], true 'Sort ['s DESC NULLS LAST], true !+- 'UnresolvedHaving ('count('product) > 2) +- 'UnresolvedHaving ('count(tempresolvedcolumn(product#261, product, false)) > 2) ! +- 'Aggregate [cube(Vector(0), Vector(1), product#261, region#262)], [product#261, region#262, sum(amount#263) AS s#264L] +- Aggregate [product#269, region#270, spark_grouping_id#268L], [product#269, region#270, sum(amount#263) AS s#264L] ! 
+- SubqueryAlias t +- Expand [[product#261, region#262, amount#263, product#266, region#267, 0], [product#261, region#262, amount#263, product#266, null, 1], [product#261, region#262, amount#263, null, region#267, 2], [product#261, region#262, amount#263, null, null, 3]], [product#261, region#262, amount#263, product#269, region#270, spark_grouping_id#268L] ! +- LocalRelation [product#261, region#262, amount#263] +- Project [product#261, region#262, amount#263, product#261 AS product#266, region#262 AS region#267] ! +- SubqueryAlias t ! +- LocalRelation [product#261, region#262, amount#263] ``` to `ResolveAggregateFunctions` to add the correct aggregate expressions (`count(product#261)`): ``` === Applying Rule org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveAggregateFunctions === 'Sort ['s DESC NULLS LAST], true 'Sort ['s DESC NULLS LAST], true !+- 'UnresolvedHaving (count(tempresolvedcolumn(product#261, product, false)) > cast(2 as bigint)) +- Project [product#269, region#270, s#264L] ! +- Aggregate [product#269, region#270, spark_grouping_id#268L], [product#269, region#270, sum(amount#263) AS s#264L] +- Filter (count(product)#272L > cast(2 as bigint)) ! +- Expand [[product#261, region#262, amount#263, product#266, region#267, 0], [product#261, region#262, amount#263, product#266, null, 1], [product#261, region#262, amount#263, null, region#267, 2], [product#261, region#262, amount#263, null, null, 3]], [product#261, region#262, amount#263, product#269, region#270, spark_grouping_id#268L] +- Aggregate [product#269, region#270, spark_grouping_id#268L], [product#269, region#270, sum(amount#263) AS s#264L, count(product#261) AS count(product)#272L] ! +- Project [product#261, region#262, amount#263, product#261 AS product#266, region#262 AS region#267] +- Expand [[product#261, region#262, amount#263, product#266, region#267, 0], [product#261, region#262, amount#263, product#266, null, 1], [product#261, region#262, amount#263, null, region#267, 2], [product#261, region#262, amount#263, null, null, 3]], [product#261, region#262, amount#263, product#269, region#270, spark_grouping_id#268L] ! +- SubqueryAlias t +- Project [product#261, region#262, amount#263, product#261 AS product#266, region#262 AS region#267] ! +- LocalRelation [product#261, region#262, amount#263] +- SubqueryAlias t ! +- LocalRelation [product#261, region#262, amount#263] ``` ### Why are the changes needed? Fix a correctness isue described in https://github.com/apache/spark/pull/51810. ### Does this PR introduce _any_ user-facing change? Yes, it fixes a correctness issue. ### How was this patch tested? Added new UT from https://github.com/apache/spark/pull/51810. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #51855 from peter-toth/SPARK-53094-fix-cube-having-3.5. 
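A minimal sketch of the affected query shape, mirroring the regression test added below in `SQLQuerySuite` (assumes a `SparkSession` bound to `spark`; the data is illustrative):

```scala
// HAVING uses an aggregate, count(product), that is not in the SELECT list, so its
// resolution is now deferred from ResolveGroupingAnalytics to ResolveAggregateFunctions.
val df = spark.sql(
  """SELECT product, region, sum(amount) AS s
    |FROM VALUES
    |  ('a', 'east', 100), ('b', 'east', 200), ('a', 'west', 150),
    |  ('b', 'west', 250), ('a', 'east', 120) AS t(product, region, amount)
    |GROUP BY product, region WITH CUBE
    |HAVING count(product) > 2
    |ORDER BY s DESC""".stripMargin)
// Expected with the fix: (null, null, 820), (null, east, 420), (a, null, 370)
df.show()
```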
Authored-by: Peter Toth Signed-off-by: Peter Toth --- .../spark/sql/catalyst/analysis/Analyzer.scala | 4 ++++ .../analyzer-results/grouping_set.sql.out | 2 +- .../org/apache/spark/sql/SQLQuerySuite.scala | 16 ++++++++++++++++ 3 files changed, 21 insertions(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index a8f80cfd69a80..a7ebde57b863e 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -723,6 +723,10 @@ class Analyzer(override val catalogManager: CatalogManager) extends RuleExecutor } else { colResolved.havingCondition } + // `cond` might contain unresolved aggregate functions so defer its resolution to + // `ResolveAggregateFunctions` rule if needed. + if (!cond.resolved) return colResolved + // Try resolving the condition of the filter as though it is in the aggregate clause val (extraAggExprs, Seq(resolvedHavingCond)) = ResolveAggregateFunctions.resolveExprsWithAggregate(Seq(cond), aggForResolving) diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/grouping_set.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/grouping_set.sql.out index bb453923ce954..e9a31fa780db8 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/grouping_set.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/grouping_set.sql.out @@ -86,7 +86,7 @@ FROM (VALUES ('x', 'a', 10), ('y', 'b', 20) ) AS t (c1, c2, c3) GROUP BY GROUPING SETS ( ( c1 ), ( c2 ) ) HAVING GROUPING__ID > 1 -- !query analysis -Filter (grouping__id#xL > cast(1 as bigint)) +Filter (GROUPING__ID#xL > cast(1 as bigint)) +- Aggregate [c1#x, c2#x, spark_grouping_id#xL], [c1#x, c2#x, sum(c3#x) AS sum(c3)#xL, spark_grouping_id#xL AS grouping__id#xL] +- Expand [[c1#x, c2#x, c3#x, c1#x, null, 1], [c1#x, c2#x, c3#x, null, c2#x, 2]], [c1#x, c2#x, c3#x, c1#x, c2#x, spark_grouping_id#xL] +- Project [c1#x, c2#x, c3#x, c1#x AS c1#x, c2#x AS c2#x] diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala index 793a0da6a8622..3cf2bfd17ab12 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala @@ -4724,6 +4724,22 @@ class SQLQuerySuite extends QueryTest with SharedSparkSession with AdaptiveSpark Row(Array(0), Array(0)), Row(Array(1), Array(1)), Row(Array(2), Array(2))) checkAnswer(df, expectedAnswer) } + + test("SPARK-53094: Fix cube-related data quality problem") { + val df = sql( + """SELECT product, region, sum(amount) AS s + |FROM VALUES + | ('a', 'east', 100), + | ('b', 'east', 200), + | ('a', 'west', 150), + | ('b', 'west', 250), + | ('a', 'east', 120) AS t(product, region, amount) + |GROUP BY product, region WITH CUBE + |HAVING count(product) > 2 + |ORDER BY s DESC""".stripMargin) + + checkAnswer(df, Seq(Row(null, null, 820), Row(null, "east", 420), Row("a", null, 370))) + } } case class Foo(bar: Option[String]) From 71ab2cc10936925fa5c6d7141c4e3c99904e3f0b Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Thu, 7 Aug 2025 08:11:51 -0700 Subject: [PATCH 51/86] [SPARK-53155][SQL] Global lower agggregation should not be replaced with a project This patch fixes the optimization rule `RemoveRedundantAggregates`. 
The optimizer rule `RemoveRedundantAggregates` removes redundant lower aggregation from a query plan and replace it with a project of referred non-aggregate expressions. However, if the removed aggregation is a global one, that is not correct because a project is different with a global aggregation in semantics. For example, if the input relation is empty, a project might be optimized to an empty relation, while a global aggregation will return a single row. Yes, this fixes a user-facing bug. Previously, a global aggregation under another aggregation might be treated as redundant and replaced as a project with non-aggregation expressions. If the input relation is empty, the replacement is incorrect and might produce incorrect result. This patch adds a new unit test to show the difference. Unit test, manual test. No Closes #51884 from viirya/fix_remove_redundant_agg. Authored-by: Liang-Chi Hsieh Signed-off-by: Liang-Chi Hsieh (cherry picked from commit 3aa8c9dbc1c0d4622cd62a65db510d6feac31ba3) Signed-off-by: Liang-Chi Hsieh --- .../optimizer/RemoveRedundantAggregates.scala | 8 ++++++- .../RemoveRedundantAggregatesSuite.scala | 21 ++++++++++++++++++- .../spark/sql/DataFrameAggregateSuite.scala | 7 +++++++ 3 files changed, 34 insertions(+), 2 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/RemoveRedundantAggregates.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/RemoveRedundantAggregates.scala index badf4065f5fb5..08423a6856404 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/RemoveRedundantAggregates.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/RemoveRedundantAggregates.scala @@ -54,7 +54,13 @@ object RemoveRedundantAggregates extends Rule[LogicalPlan] with AliasHelper { .map(_.toAttribute) )) - upperHasNoDuplicateSensitiveAgg && upperRefsOnlyDeterministicNonAgg + // If the lower aggregation is global, it is not redundant because a project with + // non-aggregate expressions is different with global aggregation in semantics. + // E.g., if the input relation is empty, a project might be optimized to an empty + // relation, while a global aggregation will return a single row. 
+ lazy val lowerIsGlobalAgg = lower.groupingExpressions.isEmpty + + upperHasNoDuplicateSensitiveAgg && upperRefsOnlyDeterministicNonAgg && !lowerIsGlobalAgg } private def isDuplicateSensitive(ae: AggregateExpression): Boolean = { diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/RemoveRedundantAggregatesSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/RemoveRedundantAggregatesSuite.scala index 2af3057c0b856..40b3d36d4bfc7 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/RemoveRedundantAggregatesSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/RemoveRedundantAggregatesSuite.scala @@ -19,7 +19,7 @@ package org.apache.spark.sql.catalyst.optimizer import org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.catalyst.dsl.plans._ -import org.apache.spark.sql.catalyst.expressions.{Expression, PythonUDAF} +import org.apache.spark.sql.catalyst.expressions.{Expression, Literal, PythonUDAF} import org.apache.spark.sql.catalyst.expressions.Literal.TrueLiteral import org.apache.spark.sql.catalyst.plans.{LeftAnti, LeftSemi, PlanTest} import org.apache.spark.sql.catalyst.plans.logical.{Distinct, LocalRelation, LogicalPlan} @@ -289,4 +289,23 @@ class RemoveRedundantAggregatesSuite extends PlanTest { val originalQuery = Distinct(x.groupBy($"a", $"b")($"a", TrueLiteral)).analyze comparePlans(Optimize.execute(originalQuery), originalQuery) } + + test("SPARK-53155: global lower aggregation should not be removed") { + object OptimizeNonRemovedRedundantAgg extends RuleExecutor[LogicalPlan] { + val batches = Batch("RemoveRedundantAggregates", FixedPoint(10), + PropagateEmptyRelation, + RemoveRedundantAggregates) :: Nil + } + + val query = relation + .groupBy()(Literal(1).as("col1"), Literal(2).as("col2"), Literal(3).as("col3")) + .groupBy($"col1")(max($"col1")) + .analyze + val expected = relation + .groupBy()(Literal(1).as("col1"), Literal(2).as("col2"), Literal(3).as("col3")) + .groupBy($"col1")(max($"col1")) + .analyze + val optimized = OptimizeNonRemovedRedundantAgg.execute(query) + comparePlans(optimized, expected) + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala index 5a8681aed973a..6f3090d89088b 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala @@ -2282,6 +2282,13 @@ class DataFrameAggregateSuite extends QueryTest checkAnswer(df, Row(1.001d, 1, 1) :: Row(6.002d, 1, 1) :: Nil) } } + + test("SPARK-53155: global lower aggregation should not be removed") { + val df = emptyTestData + .groupBy().agg(lit(1).as("col1"), lit(2).as("col2"), lit(3).as("col3")) + .groupBy($"col1").agg(max("col1")) + checkAnswer(df, Seq(Row(1, 1))) + } } case class B(c: Option[Double]) From 4e9dbc861c803288140fd7b4ccc754f0de3fb384 Mon Sep 17 00:00:00 2001 From: Victor Jouffrey Date: Fri, 8 Aug 2025 14:07:03 +0200 Subject: [PATCH 52/86] Bump hopsfs --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index b346f76012b2a..609fee113dc45 100644 --- a/pom.xml +++ b/pom.xml @@ -122,7 +122,7 @@ 2.0.17 2.24.3 - 3.2.0.16-EE-SNAPSHOT + 3.2.0.18-EE-SNAPSHOT io.hops 3.23.4 From 3f87386679dccec565e4600f1ceebb4d8c5598e2 Mon Sep 17 00:00:00 2001 From: Victor Jouffrey Date: Tue, 12 Aug 2025 14:03:23 +0200 Subject: 
[PATCH 53/86] Use JIRA tag version for hopsfs --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 609fee113dc45..c8472b2fd6210 100644 --- a/pom.xml +++ b/pom.xml @@ -122,7 +122,7 @@ 2.0.17 2.24.3 - 3.2.0.18-EE-SNAPSHOT + 3.2.0.18-EE-HWORKS-2203-SNAPSHOT io.hops 3.23.4 From 33a2aa823f7144ff3207a786c208b2c64daac2a4 Mon Sep 17 00:00:00 2001 From: Hyukjin Kwon Date: Fri, 28 Jun 2024 16:28:47 +0900 Subject: [PATCH 54/86] [SPARK-48746][PYTHON][SS][TESTS] Avoid using global temp view in foreachBatch test case For regular foreachBatch tests, it's better to avoid a global temp view in their tests. Using temp views sometimes [confuse users](https://stackoverflow.com/questions/62709024/temporary-view-in-spark-structure-streaming) so in standard tests, should be better to avoid using them unless we explicitly test a global temp view. No, test-only. Manually ran the tests. No. Closes #47140 from HyukjinKwon/SPARK-48746. Authored-by: Hyukjin Kwon Signed-off-by: Hyukjin Kwon (cherry picked from commit fc98ccdf8f7e1287726fb40cf08e7ed2f3864ef6) Signed-off-by: Hyukjin Kwon --- .../tests/streaming/test_streaming_foreach_batch.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/python/pyspark/sql/tests/streaming/test_streaming_foreach_batch.py b/python/pyspark/sql/tests/streaming/test_streaming_foreach_batch.py index 393101a096ea4..f5e98dbf41ee0 100644 --- a/python/pyspark/sql/tests/streaming/test_streaming_foreach_batch.py +++ b/python/pyspark/sql/tests/streaming/test_streaming_foreach_batch.py @@ -25,17 +25,18 @@ def test_streaming_foreach_batch(self): q = None def collectBatch(batch_df, batch_id): - batch_df.createOrReplaceGlobalTempView("test_view") + batch_df.write.format("parquet").saveAsTable("test_table") try: df = self.spark.readStream.format("text").load("python/test_support/sql/streaming") q = df.writeStream.foreachBatch(collectBatch).start() q.processAllAvailable() - collected = self.spark.sql("select * from global_temp.test_view").collect() + collected = self.spark.sql("select * from test_table").collect() self.assertTrue(len(collected), 2) finally: if q: q.stop() + self.spark.sql("DROP TABLE IF EXISTS test_table") def test_streaming_foreach_batch_tempview(self): q = None @@ -46,18 +47,19 @@ def collectBatch(batch_df, batch_id): # clone the session which is no longer same with the session used to start the # streaming query assert len(batch_df.sparkSession.sql("SELECT * FROM updates").collect()) == 2 - # Write to a global view verify on the repl/client side. - batch_df.createOrReplaceGlobalTempView("temp_view") + # Write a table to verify on the repl/client side. 
+ batch_df.write.format("parquet").saveAsTable("test_table") try: df = self.spark.readStream.format("text").load("python/test_support/sql/streaming") q = df.writeStream.foreachBatch(collectBatch).start() q.processAllAvailable() - collected = self.spark.sql("SELECT * FROM global_temp.temp_view").collect() + collected = self.spark.sql("SELECT * FROM test_table").collect() self.assertTrue(len(collected[0]), 2) finally: if q: q.stop() + self.spark.sql("DROP TABLE IF EXISTS test_table") def test_streaming_foreach_batch_propagates_python_errors(self): from pyspark.errors import StreamingQueryException From 2eeeac8a21dcacf49694af4c79cc31553758403c Mon Sep 17 00:00:00 2001 From: Hyukjin Kwon Date: Wed, 13 Aug 2025 12:24:30 +0800 Subject: [PATCH 55/86] [MINOR][PYTHON][TESTS] Use different temp table name in foreachBatch tests This PR proposes to use different temp table name in foreachBatch tests. I think it fixes https://github.com/apache/spark/actions/runs/16920826594/job/47946275434 . The same names affect the test when they are async executed. No, test-only. Manually. No. Closes #52002 from HyukjinKwon/minor-diff-table-name. Authored-by: Hyukjin Kwon Signed-off-by: Ruifeng Zheng (cherry picked from commit ebf4dd1889f3404e856723f66851af48751b469f) Signed-off-by: Hyukjin Kwon --- .../tests/streaming/test_streaming_foreach_batch.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/python/pyspark/sql/tests/streaming/test_streaming_foreach_batch.py b/python/pyspark/sql/tests/streaming/test_streaming_foreach_batch.py index f5e98dbf41ee0..c10db1330b46c 100644 --- a/python/pyspark/sql/tests/streaming/test_streaming_foreach_batch.py +++ b/python/pyspark/sql/tests/streaming/test_streaming_foreach_batch.py @@ -25,18 +25,18 @@ def test_streaming_foreach_batch(self): q = None def collectBatch(batch_df, batch_id): - batch_df.write.format("parquet").saveAsTable("test_table") + batch_df.write.format("parquet").saveAsTable("test_table1") try: df = self.spark.readStream.format("text").load("python/test_support/sql/streaming") q = df.writeStream.foreachBatch(collectBatch).start() q.processAllAvailable() - collected = self.spark.sql("select * from test_table").collect() + collected = self.spark.sql("select * from test_table1").collect() self.assertTrue(len(collected), 2) finally: if q: q.stop() - self.spark.sql("DROP TABLE IF EXISTS test_table") + self.spark.sql("DROP TABLE IF EXISTS test_table1") def test_streaming_foreach_batch_tempview(self): q = None @@ -48,18 +48,18 @@ def collectBatch(batch_df, batch_id): # streaming query assert len(batch_df.sparkSession.sql("SELECT * FROM updates").collect()) == 2 # Write a table to verify on the repl/client side. 
- batch_df.write.format("parquet").saveAsTable("test_table") + batch_df.write.format("parquet").saveAsTable("test_table2") try: df = self.spark.readStream.format("text").load("python/test_support/sql/streaming") q = df.writeStream.foreachBatch(collectBatch).start() q.processAllAvailable() - collected = self.spark.sql("SELECT * FROM test_table").collect() + collected = self.spark.sql("SELECT * FROM test_table2").collect() self.assertTrue(len(collected[0]), 2) finally: if q: q.stop() - self.spark.sql("DROP TABLE IF EXISTS test_table") + self.spark.sql("DROP TABLE IF EXISTS test_table2") def test_streaming_foreach_batch_propagates_python_errors(self): from pyspark.errors import StreamingQueryException From a20cd2e9c0eab54fe464a589ed5b2d69f26654ac Mon Sep 17 00:00:00 2001 From: Victor Jouffrey Date: Thu, 14 Aug 2025 15:39:15 +0200 Subject: [PATCH 56/86] snapshots distrib management --- pom.xml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pom.xml b/pom.xml index c8472b2fd6210..3b538ee9b2ae2 100644 --- a/pom.xml +++ b/pom.xml @@ -4025,5 +4025,10 @@ Hops Release Repository https://nexus.hops.works/repository/hops-artifacts + + HopsEE + Hops Repo + https://nexus.hops.works/repository/hops-artifacts/ + From f9a5c8cc92e2ce2e22613d7fde77427fb1699269 Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Tue, 19 Aug 2025 08:05:03 +0800 Subject: [PATCH 57/86] [SPARK-49872][CORE] Remove jackson JSON string length limitation This is a surgical fix extracted from https://github.com/apache/spark/pull/49163 The default jackson string limit introduced in jackson 2.15 can be too small for certain workloads, and this PR removes this limitation to avoid any regression. fix regression Yes, users won't hit this size limitation anymore. https://github.com/apache/spark/pull/49163 tested it. We won't add a test in this PR as generating a super large JSON will make the CI unstable. no Closes #52049 from cloud-fan/json. Lead-authored-by: Wenchen Fan Co-authored-by: Wenchen Fan Signed-off-by: Wenchen Fan (cherry picked from commit 076618a46cabfb6358f2c1a696edf38db9ae7a6f) Signed-off-by: Wenchen Fan --- .../main/scala/org/apache/spark/util/JsonProtocol.scala | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala b/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala index 3b4bc242b4668..8fde30fd18024 100644 --- a/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala +++ b/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala @@ -22,7 +22,7 @@ import java.util.{Properties, UUID} import scala.collection.JavaConverters._ import scala.collection.Map -import com.fasterxml.jackson.core.JsonGenerator +import com.fasterxml.jackson.core.{JsonGenerator, StreamReadConstraints} import com.fasterxml.jackson.databind.JsonNode import org.json4s.jackson.JsonMethods.compact @@ -54,6 +54,11 @@ import org.apache.spark.util.Utils.weakIntern private[spark] object JsonProtocol extends JsonUtils { // TODO: Remove this file and put JSON serialization into each individual class. + // SPARK-49872: Remove jackson JSON string length limitation. 
+ mapper.getFactory.setStreamReadConstraints( + StreamReadConstraints.builder().maxStringLength(Int.MaxValue).build() + ) + /** ------------------------------------------------- * * JSON serialization methods for SparkListenerEvents | * -------------------------------------------------- */ From f7e85e04e2818fe58b43767b49c802c29ef0c0bf Mon Sep 17 00:00:00 2001 From: Bruce Robbins Date: Tue, 26 Aug 2025 19:02:33 -0700 Subject: [PATCH 58/86] [SPARK-52873][SQL][3.5] Further restrict when SHJ semi/anti join can ignore duplicate keys on the build side ### What changes were proposed in this pull request? This is a back-port of both #52067 and #52128. After https://github.com/apache/spark/commit/e861b0d93722f76cc103c05c7992c22c7fa23ad6, shuffle hash join for left semi/anti/existence will ignore duplicate keys if the join condition is empty or refers to the same parent attributes as the join keys. This PR proposes that duplicate keys should be ignored only when the join condition has these properties: 1. a subtree that is a semantic match to a build-side key, and/or 1. all attributes, outside of any subtree that is a semantic match to a build-side join key, should be from the stream-side. ### Why are the changes needed? https://github.com/apache/spark/commit/e861b0d93722f76cc103c05c7992c22c7fa23ad6 causes a correctness issue when a column is transformed in the build-side join keys and also transformed, but differently, in a join condition. As an example: ``` create or replace temp view data(a) as values ("xxxx1111"), ("yyyy2222"); create or replace temp view lookup(k) as values ("xxxx22"), ("xxxx33"), ("xxxx11"); -- this returns one row select * from data left semi join lookup on substring(a, 1, 4) = substring(k, 1, 4) and substring(a, 1, 6) >= k; -- this is the same query as above, but with a shuffle hash join hint, and returns no rows select /*+ SHUFFLE_HASH(lookup) */ * from data left semi join lookup on substring(a, 1, 4) = substring(k, 1, 4) and substring(a, 1, 6) >= k; ``` When the join uses broadcast hash join, the hashrelation of lookup has the following key -> values: ``` Key xxxx: xxxx11 xxxx33 xxxx22 ``` The join condition matches on the build side row with the value `xxxx11`. When the join uses shuffle hash join, on the other hand, the hash relation of lookup has the following key -> values: ``` Key xxxx: xxxx22 ``` Because the keys must be unique, an arbitrary row is chosen to represent the key, and that row does not match the join condition. After https://github.com/apache/spark/commit/1f35577a3ead9c6268b5ba47c2e3aec60484e3cc, a similar issue happens with integer keys: ``` create or replace temp view data(a) as values (10000), (30000); create or replace temp view lookup(k) as values (1000), (1001), (1002), (1003), (1004); -- this query returns one row select * from data left semi join lookup on a/10000 = cast(k/1000 as int) and k >= a/10 + 3; -- this is the same query as above, but with a shuffle hash join hint, and returns no rows select /*+ SHUFFLE_HASH(lookup) */ * from data left semi join lookup on a/10000 = cast(k/1000 as int) and k >= a/10 + 3; ``` ### Does this PR introduce _any_ user-facing change? No, except for fixing the correctness issue. ### How was this patch tested? Modified an existing unit test. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #52135 from bersprockets/shj_semi_issue_br35. 
Authored-by: Bruce Robbins Signed-off-by: Dongjoon Hyun --- .../joins/ShuffledHashJoinExec.scala | 31 +++++++++++--- .../org/apache/spark/sql/JoinSuite.scala | 40 ++++++++++++++++--- 2 files changed, 60 insertions(+), 11 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/ShuffledHashJoinExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/ShuffledHashJoinExec.scala index 974f6f9e50c2e..97ca74aee30c0 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/ShuffledHashJoinExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/ShuffledHashJoinExec.scala @@ -65,14 +65,35 @@ case class ShuffledHashJoinExec( case _ => super.outputOrdering } + private def validCondForIgnoreDupKey(cond: Expression): Boolean = { + // to ignore duplicate keys on the build side, the join condition must + // have the following properties: + // 1) a subtree that is a semantic match to a build-side key, and/or + // 2) outside any subtree that is a semantic match to a build-side key, + // all attributes should be from the stream-side. + val buildKeysSet = ExpressionSet(buildKeys) + val streamedOutputAttrs = AttributeSet(streamedOutput) + + def validCond(cond: Expression): Boolean = { + cond match { + // don't bother traversing any subtree that has a semantic match to a build key + case e: Expression if buildKeysSet.contains(e) => true + // all attributes (outside any subtree that matches a build key) should be + // from the stream side + case a: Attribute if !streamedOutputAttrs.contains(a) => false + case e: Expression => + e.children.forall(validCond(_)) + case _ => true + } + } + + validCond(cond) + } + // Exposed for testing @transient lazy val ignoreDuplicatedKey = joinType match { case LeftExistence(_) => - // For building hash relation, ignore duplicated rows with same join keys if: - // 1. Join condition is empty, or - // 2. Join condition only references streamed attributes and build join keys. - val streamedOutputAndBuildKeys = AttributeSet(streamedOutput ++ buildKeys) - condition.forall(_.references.subsetOf(streamedOutputAndBuildKeys)) + condition.forall(validCondForIgnoreDupKey(_)) case _ => false } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala index 4d256154c8574..44c8cb92fc3fd 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala @@ -1556,30 +1556,58 @@ class JoinSuite extends QueryTest with SharedSparkSession with AdaptiveSparkPlan spark.range(10).map(i => (i.toString, i + 1)).toDF("c1", "c2").write.saveAsTable("t1") spark.range(10).map(i => ((i % 5).toString, i % 3)).toDF("c1", "c2").write.saveAsTable("t2") + spark.range(10).map(i => (i, i + 1)).toDF("c1", "c2").write.saveAsTable("t1a") + spark.range(10).map(i => (i % 5, i % 3)).toDF("c1", "c2").write.saveAsTable("t2a") + + val semiExpected1 = Seq(Row("0"), Row("1"), Row("2"), Row("3"), Row("4")) + val antiExpected1 = Seq(Row("5"), Row("6"), Row("7"), Row("8"), Row("9")) + val semiExpected2 = Seq(Row(0)) + val antiExpected2 = Seq.tabulate(9) { x => Row(x + 1) } + val semiJoinQueries = Seq( // No join condition, ignore duplicated key. (s"SELECT /*+ SHUFFLE_HASH(t2) */ t1.c1 FROM t1 LEFT SEMI JOIN t2 ON t1.c1 = t2.c1", - true), + true, semiExpected1, antiExpected1), // Have join condition on build join key only, ignore duplicated key. 
(s""" |SELECT /*+ SHUFFLE_HASH(t2) */ t1.c1 FROM t1 LEFT SEMI JOIN t2 |ON t1.c1 = t2.c1 AND CAST(t1.c2 * 2 AS STRING) != t2.c1 """.stripMargin, - true), + true, semiExpected1, antiExpected1), // Have join condition on other build attribute beside join key, do not ignore // duplicated key. (s""" |SELECT /*+ SHUFFLE_HASH(t2) */ t1.c1 FROM t1 LEFT SEMI JOIN t2 |ON t1.c1 = t2.c1 AND t1.c2 * 100 != t2.c2 """.stripMargin, - false) + false, semiExpected1, antiExpected1), + // SPARK-52873: Have a join condition that references attributes from the build-side + // join key, but those attributes are contained by a different expression than that + // used as the build-side join key (that is, CAST((t2.c2+10000)/1000 AS INT) is not + // the same as t2.c2). In this case, ignoreDuplicatedKey should be false + ( + s""" + |SELECT /*+ SHUFFLE_HASH(t2a) */ t1a.c1 FROM t1a LEFT SEMI JOIN t2a + |ON CAST((t1a.c2+10000)/1000 AS INT) = CAST((t2a.c2+10000)/1000 AS INT) + |AND t2a.c2 >= t1a.c2 + 1 + |""".stripMargin, + false, semiExpected2, antiExpected2), + // SPARK-52873: Have a join condition that contains the same expression as the + // build-side join key,and does not violate any other rules for the join condition. + // In this case, ignoreDuplicatedKey should be true + ( + s""" + |SELECT /*+ SHUFFLE_HASH(t2a) */ t1a.c1 FROM t1a LEFT SEMI JOIN t2a + |ON t1a.c1 * 10000 = t2a.c1 * 1000 AND t2a.c1 * 1000 >= t1a.c1 + |""".stripMargin, + true, semiExpected2, antiExpected2) ) semiJoinQueries.foreach { - case (query, ignoreDuplicatedKey) => + case (query, ignoreDuplicatedKey, semiExpected, antiExpected) => val semiJoinDF = sql(query) val antiJoinDF = sql(query.replaceAll("SEMI", "ANTI")) - checkAnswer(semiJoinDF, Seq(Row("0"), Row("1"), Row("2"), Row("3"), Row("4"))) - checkAnswer(antiJoinDF, Seq(Row("5"), Row("6"), Row("7"), Row("8"), Row("9"))) + checkAnswer(semiJoinDF, semiExpected) + checkAnswer(antiJoinDF, antiExpected) Seq(semiJoinDF, antiJoinDF).foreach { df => assert(collect(df.queryExecution.executedPlan) { case j: ShuffledHashJoinExec if j.ignoreDuplicatedKey == ignoreDuplicatedKey => true From 5afcc0a6ba22212af22ff3b09fa1e9533615d05d Mon Sep 17 00:00:00 2001 From: Victor Jouffrey Date: Wed, 27 Aug 2025 10:14:17 +0200 Subject: [PATCH 59/86] Test with arrow upgrade --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 3b538ee9b2ae2..d8eaac897b336 100644 --- a/pom.xml +++ b/pom.xml @@ -227,7 +227,7 @@ If you are changing Arrow version specification, please check ./python/pyspark/sql/pandas/utils.py, and ./python/setup.py too. 
--> - 12.0.1 + 18.3.0 2.5.9 From 8a51801cb4e6d7ee1242e935ddb123a43953f1aa Mon Sep 17 00:00:00 2001 From: Victor Jouffrey Date: Wed, 27 Aug 2025 14:39:24 +0200 Subject: [PATCH 60/86] Use hopsfs 3.2.0.17-EE-RC1 --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index d8eaac897b336..69cb8efcab50f 100644 --- a/pom.xml +++ b/pom.xml @@ -122,7 +122,7 @@ 2.0.17 2.24.3 - 3.2.0.18-EE-HWORKS-2203-SNAPSHOT + 3.2.0.17-EE-RC1 io.hops 3.23.4 From 11e3f3b27f1fbdda23e786e5ac57e95c12555a91 Mon Sep 17 00:00:00 2001 From: Victor Jouffrey Date: Mon, 1 Sep 2025 13:48:10 +0200 Subject: [PATCH 61/86] Use arrow jar compatible with java8 --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 69cb8efcab50f..3a9c26051d215 100644 --- a/pom.xml +++ b/pom.xml @@ -227,7 +227,7 @@ If you are changing Arrow version specification, please check ./python/pyspark/sql/pandas/utils.py, and ./python/setup.py too. --> - 18.3.0 + 17.0.0 2.5.9 From 494fd9c2a517994c5cc9453c8b31f6040772d4e2 Mon Sep 17 00:00:00 2001 From: Victor Jouffrey Date: Tue, 2 Sep 2025 14:54:13 +0200 Subject: [PATCH 62/86] Add build workflow --- .../workflows/build_spark_with_hopsfs.yaml | 135 ++++++++++++++++++ 1 file changed, 135 insertions(+) create mode 100644 .github/workflows/build_spark_with_hopsfs.yaml diff --git a/.github/workflows/build_spark_with_hopsfs.yaml b/.github/workflows/build_spark_with_hopsfs.yaml new file mode 100644 index 0000000000000..42278b842f13c --- /dev/null +++ b/.github/workflows/build_spark_with_hopsfs.yaml @@ -0,0 +1,135 @@ +name: Build Spark with hopsfs + +on: + workflow_call: + inputs: + ref: + description: 'The ref to checkout for the spark repo, default is branch-3.5' + required: false + type: string + default: 'branch-3.5' + jira_tag: + description: 'The tag to use for the jira release, default is the version from version.txt' + required: false + type: string + default: 'NOJIRA' + nexus_user: + description: 'Nexus user to upload the spark artifact' + required: false + type: string + default: harbordev + nexus_dev_spark_url: + description: 'Nexus URL to upload the spark artifact' + required: true + type: string + runner: + description: 'The type of runner to use, default is ghrunner-ee8' + required: false + type: string + default: 'ghrunner-ee8' + secrets: + NEXUS_HARBOR_PASSWORD: + required: true + +concurrency: + group: build-spark-${{ github.workflow }}-${{ github.job }}-${{ inputs.jira_tag || 'NOJIRA' }} + cancel-in-progress: true + +# Used to avoid error on PRs +env: + # SPARK_REF: ${{ inputs.ref || 'branch-3.5' }} + SPARK_REF: ${{ inputs.ref || 'HWORKS-2203-vatj' }} + JIRA_TAG: ${{ inputs.jira_tag || 'NOJIRA' }} + +jobs: + build-spark: + runs-on: ${{ inputs.runner }} + outputs: + pom_version_no_jira: ${{ steps.prep_version.outputs.pom_version_no_jira }} + pom_version: ${{ steps.prep_version.outputs.pom_version }} + commit_hash: ${{ steps.prep_version.outputs.commit_hash }} + jira_tag: ${{ env.JIRA_TAG }} + spark_tar_name: ${{ steps.prep_version.outputs.spark_tar_name }} + spark_tar_url: ${{ steps.prep_version.outputs.spark_tar_url }} + hopsfs_version: ${{ steps.prep_version.outputs.hopsfs_version }} + steps: + - name: Checkout spark repo + uses: actions/checkout@v4 + with: + repository: logicalclocks/spark + ref: ${{ env.SPARK_REF }} + path: ${{ github.workspace }}/spark + + - name: Prep step version + shell: bash + id: prep_version + working-directory: ${{ github.workspace }}/spark + run: | + COMMIT_HASH=$(git rev-parse --short 
HEAD) + POM_VERSION_NO_JIRA=$(mvn -q -Dexec.executable="echo" -Dexec.args='${project.version}' --non-recursive exec:exec) + find . -name "pom.xml" -exec sed -i "s|${POM_VERSION_NO_JIRA}|${POM_VERSION_NO_JIRA}-${JIRA_TAG}-SNAPSHOT|g" {} \; + POM_VERSION=$(mvn -q -Dexec.executable="echo" -Dexec.args='${project.version}' --non-recursive exec:exec) + SPARK_TAR_NAME=spark-${POM_VERSION}-bin-without-hadoop-with-hive.tgz + SPARK_TAR_URL="${{ inputs.nexus_dev_spark_url }}/${JIRA_TAG}/${SPARK_TAR_NAME}" + HOPSFS_VERSION=$(mvn -q -Dexec.executable="echo" -Dexec.args='${hadoop.version}' --non-recursive exec:exec) + + echo "POM_VERSION_NO_JIRA=${POM_VERSION_NO_JIRA}" >> $GITHUB_ENV + echo "POM_VERSION=${POM_VERSION}" >> $GITHUB_ENV + echo "COMMIT_HASH=$COMMIT_HASH" >> $GITHUB_ENV + echo "SPARK_TAR_NAME=${SPARK_TAR_NAME}" >> $GITHUB_ENV + echo "SPARK_TAR_URL=${SPARK_TAR_URL}" >> $GITHUB_ENV + echo "HOPSFS_VERSION=${HOPSFS_VERSION}" >> $GITHUB_ENV + + echo "# Build Spark" >> $GITHUB_STEP_SUMMARY + echo "POM_VERSION_NO_JIRA=${POM_VERSION_NO_JIRA}" >> $GITHUB_STEP_SUMMARY + echo "POM_VERSION=${POM_VERSION}" >> $GITHUB_STEP_SUMMARY + echo "COMMIT_HASH=$COMMIT_HASH" >> $GITHUB_STEP_SUMMARY + echo "SPARK_TAR_NAME=${SPARK_TAR_NAME}" >> $GITHUB_STEP_SUMMARY + echo "SPARK_TAR_URL=${SPARK_TAR_URL}" >> $GITHUB_STEP_SUMMARY + echo "HOPSFS_VERSION=${HOPSFS_VERSION}" >> $GITHUB_STEP_SUMMARY + + echo "POM_VERSION=${POM_VERSION}" >> $GITHUB_OUTPUT + echo "POM_VERSION_NO_JIRA=${POM_VERSION_NO_JIRA}" >> $GITHUB_OUTPUT + echo "COMMIT_HASH=$COMMIT_HASH" >> $GITHUB_OUTPUT + echo "SPARK_TAR_NAME=${SPARK_TAR_NAME}" >> $GITHUB_OUTPUT + echo "SPARK_TAR_URL=${SPARK_TAR_URL}" >> $GITHUB_OUTPUT + echo "HOPSFS_VERSION=${HOPSFS_VERSION}" >> $GITHUB_OUTPUT + + - name: Set up .m2 settings.xml + shell: bash + env: + M2_HOME: ~/.m2 + run: | + echo "M2_HOME var is $M2_HOME" >> $GITHUB_STEP_SUMMARY + mkdir -p ~/.m2 && echo "HopsEE${{ inputs.nexus_user }}${{ secrets.NEXUS_HARBOR_PASSWORD }}" > ~/.m2/settings.xml + + - name: Cache maven + id: cache-maven + uses: actions/cache@v4 + with: + path: | + ~/.m2 + !~/.m2/settings.xml + key: ${{ runner.os }}-maven-spark-${{ hashFiles('spark/**/pom.xml') }} + restore-keys: | + ${{ runner.os }}-maven-spark- + + + - name: Build spark and spark-packaging + shell: bash + working-directory: ${{ github.workspace }}/spark + env: + POM_VERSION: ${{ env.POM_VERSION }} + M2_HOME: ~/.m2 + run: | + ./dev/make-distribution.sh --name without-hadoop-with-hive --tgz "-Pkubernetes,hadoop-provided,parquet-provided,hive,hadoop-cloud,bigtop-dist" + + - name: Upload spark-packaging artifact to Nexus + shell: bash + working-directory: ${{ github.workspace }}/spark + env: + M2_HOME: ~/.m2 + run: | + curl -u ${{ inputs.nexus_user }}:${{ secrets.NEXUS_HARBOR_PASSWORD }} --upload-file spark-$POM_VERSION-bin-without-hadoop-with-hive.tgz "${SPARK_TAR_URL}" + export MAVEN_OPTS="${MAVEN_OPTS:--Xss128m -Xmx4g -XX:ReservedCodeCacheSize=128m}" + ./build/mvn deploy -DskipTests -Dmaven.javadoc.skip=true -Dmaven.scaladoc.skip=true -Dmaven.source.skip -Dcyclonedx.skip=true -Pkubernetes,hadoop-provided,parquet-provided,hive,hadoop-cloud \ No newline at end of file From 2cb458a8617e1c0338214f37e481b7a276ef933f Mon Sep 17 00:00:00 2001 From: Victor Jouffrey Date: Tue, 2 Sep 2025 16:21:14 +0200 Subject: [PATCH 63/86] Add workflow dispatch --- .../workflows/build_spark_with_hopsfs.yaml | 28 ++++++++++++------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/.github/workflows/build_spark_with_hopsfs.yaml 
b/.github/workflows/build_spark_with_hopsfs.yaml index 42278b842f13c..0fe15c6968f67 100644 --- a/.github/workflows/build_spark_with_hopsfs.yaml +++ b/.github/workflows/build_spark_with_hopsfs.yaml @@ -13,23 +13,31 @@ on: required: false type: string default: 'NOJIRA' - nexus_user: - description: 'Nexus user to upload the spark artifact' + runner: + description: 'The type of runner to use, default is ghrunner-ee8' required: false type: string - default: harbordev - nexus_dev_spark_url: - description: 'Nexus URL to upload the spark artifact' + default: 'ghrunner-ee8' + secrets: + NEXUS_HARBOR_PASSWORD: required: true + workflow_dispatch: + inputs: + ref: + description: 'The ref to checkout for the spark repo, default is branch-3.5' + required: false type: string + default: 'branch-3.5' + jira_tag: + description: 'The tag to use for the jira release, default is the version from version.txt' + required: false + type: string + default: 'NOJIRA' runner: description: 'The type of runner to use, default is ghrunner-ee8' required: false type: string default: 'ghrunner-ee8' - secrets: - NEXUS_HARBOR_PASSWORD: - required: true concurrency: group: build-spark-${{ github.workflow }}-${{ github.job }}-${{ inputs.jira_tag || 'NOJIRA' }} @@ -70,7 +78,7 @@ jobs: find . -name "pom.xml" -exec sed -i "s|${POM_VERSION_NO_JIRA}|${POM_VERSION_NO_JIRA}-${JIRA_TAG}-SNAPSHOT|g" {} \; POM_VERSION=$(mvn -q -Dexec.executable="echo" -Dexec.args='${project.version}' --non-recursive exec:exec) SPARK_TAR_NAME=spark-${POM_VERSION}-bin-without-hadoop-with-hive.tgz - SPARK_TAR_URL="${{ inputs.nexus_dev_spark_url }}/${JIRA_TAG}/${SPARK_TAR_NAME}" + SPARK_TAR_URL="${{ vars.NEXUS_DEV_SPARK_URL }}/${JIRA_TAG}/${SPARK_TAR_NAME}" HOPSFS_VERSION=$(mvn -q -Dexec.executable="echo" -Dexec.args='${hadoop.version}' --non-recursive exec:exec) echo "POM_VERSION_NO_JIRA=${POM_VERSION_NO_JIRA}" >> $GITHUB_ENV @@ -130,6 +138,6 @@ jobs: env: M2_HOME: ~/.m2 run: | - curl -u ${{ inputs.nexus_user }}:${{ secrets.NEXUS_HARBOR_PASSWORD }} --upload-file spark-$POM_VERSION-bin-without-hadoop-with-hive.tgz "${SPARK_TAR_URL}" + curl -u ${{ vars.NEXUS_DEV_SPARK_URL }}:${{ secrets.NEXUS_HARBOR_PASSWORD }} --upload-file spark-$POM_VERSION-bin-without-hadoop-with-hive.tgz "${SPARK_TAR_URL}" export MAVEN_OPTS="${MAVEN_OPTS:--Xss128m -Xmx4g -XX:ReservedCodeCacheSize=128m}" ./build/mvn deploy -DskipTests -Dmaven.javadoc.skip=true -Dmaven.scaladoc.skip=true -Dmaven.source.skip -Dcyclonedx.skip=true -Pkubernetes,hadoop-provided,parquet-provided,hive,hadoop-cloud \ No newline at end of file From 3a449e8e1baa991199dcd491b17d9dc73eafbb35 Mon Sep 17 00:00:00 2001 From: Victor Jouffrey Date: Tue, 2 Sep 2025 17:48:03 +0200 Subject: [PATCH 64/86] add build arg --- .../workflows/build_spark_with_hopsfs.yaml | 36 ++++++++++++++++++- 1 file changed, 35 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build_spark_with_hopsfs.yaml b/.github/workflows/build_spark_with_hopsfs.yaml index 0fe15c6968f67..98ccbde903cf6 100644 --- a/.github/workflows/build_spark_with_hopsfs.yaml +++ b/.github/workflows/build_spark_with_hopsfs.yaml @@ -18,6 +18,11 @@ on: required: false type: string default: 'ghrunner-ee8' + build: + description: 'Whether to build spark or not, default is false. If this is false then the workflow will only prepare the versioning related outputs.' 
+ required: false + type: boolean + default: true secrets: NEXUS_HARBOR_PASSWORD: required: true @@ -38,6 +43,12 @@ on: required: false type: string default: 'ghrunner-ee8' + build: + description: 'Whether to build spark or not, default is false. If this is false then the workflow will only prepare the versioning related outputs.' + required: false + type: boolean + default: true + pull_request: concurrency: group: build-spark-${{ github.workflow }}-${{ github.job }}-${{ inputs.jira_tag || 'NOJIRA' }} @@ -68,6 +79,26 @@ jobs: ref: ${{ env.SPARK_REF }} path: ${{ github.workspace }}/spark + - name: To build or not to build + id: to_build_or_not_to_build + shell: bash + env: + BUILD_SPARK: ${{ (github.event_name == 'pull_request' && contains(join(github.event.pull_request.labels.*.name, ','), 'build-spark')) || inputs.build }} + run: | + if [[ "${{ env.BUILD_SPARK }}" != "true" ]]; then + echo "# :recycle: Not building Spark" >> $GITHUB_STEP_SUMMARY + if [[ "${{ github.event_name }}" == "pull_request" ]]; then + echo "This is a pull request and the 'build-spark' label is not present." >> $GITHUB_STEP_SUMMARY + echo "pull_request_labels=${{ join(github.event.pull_request.labels.*.name, ', ') }}" >> $GITHUB_STEP_SUMMARY + elif [[ "${{ inputs.build || 'false'}}" != "true" ]]; then + echo "The input 'build' is set to false." >> $GITHUB_STEP_SUMMARY + fi + echo "BUILD_SPARK=$BUILD_SPARK" >> $GITHUB_OUTPUT + else + echo "# :white_check_mark: Building Spark" >> $GITHUB_STEP_SUMMARY + echo "BUILD_SPARK=$BUILD_SPARK" >> $GITHUB_OUTPUT + fi + - name: Prep step version shell: bash id: prep_version @@ -88,7 +119,6 @@ jobs: echo "SPARK_TAR_URL=${SPARK_TAR_URL}" >> $GITHUB_ENV echo "HOPSFS_VERSION=${HOPSFS_VERSION}" >> $GITHUB_ENV - echo "# Build Spark" >> $GITHUB_STEP_SUMMARY echo "POM_VERSION_NO_JIRA=${POM_VERSION_NO_JIRA}" >> $GITHUB_STEP_SUMMARY echo "POM_VERSION=${POM_VERSION}" >> $GITHUB_STEP_SUMMARY echo "COMMIT_HASH=$COMMIT_HASH" >> $GITHUB_STEP_SUMMARY @@ -105,6 +135,7 @@ jobs: - name: Set up .m2 settings.xml shell: bash + if: steps.to_build_or_not_to_build.outputs.BUILD_SPARK == 'true' env: M2_HOME: ~/.m2 run: | @@ -113,6 +144,7 @@ jobs: - name: Cache maven id: cache-maven + if: steps.to_build_or_not_to_build.outputs.BUILD_SPARK == 'true' uses: actions/cache@v4 with: path: | @@ -125,6 +157,7 @@ jobs: - name: Build spark and spark-packaging shell: bash + if: steps.to_build_or_not_to_build.outputs.BUILD_SPARK == 'true' working-directory: ${{ github.workspace }}/spark env: POM_VERSION: ${{ env.POM_VERSION }} @@ -134,6 +167,7 @@ jobs: - name: Upload spark-packaging artifact to Nexus shell: bash + if: steps.to_build_or_not_to_build.outputs.BUILD_SPARK == 'true' working-directory: ${{ github.workspace }}/spark env: M2_HOME: ~/.m2 From c6a95be5b7d17600d36d2f6f8268d464c8a7c55d Mon Sep 17 00:00:00 2001 From: ziqi liu Date: Tue, 2 Sep 2025 12:26:34 -0700 Subject: [PATCH 65/86] [SPARK-53435][SQL][3.5] Fix race condition in CachedRDDBuilder ### What changes were proposed in this pull request? backport https://github.com/apache/spark/commit/871fe3ded668048a9b23aa447be04cb2a7109300 There is race condition between `CachedRDDBuilder.cachedColumnBuffers` and `CachedRDDBuilder.clearCache`: when they interleave each other, `cachedColumnBuffers` might return a `nullptr`. This looks like a day-1 bug introduced from https://github.com/apache/spark/commit/20ca208bcda6f22fe7d9fb54144de435b4237536#diff-4068fce361a50e3d32af2ba2d4231905f500e7b2da9f46d5ddd99b758c30fd43 ### Why are the changes needed? 
The race condition might lead to NPE from [here](https://github.com/apache/spark/blob/master/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/QueryStageExec.scala#L303) which is basically a null `RDD` returned from `CachedRDDBuilder.cachedColumnBuffers` ### Does this PR introduce _any_ user-facing change? NO ### How was this patch tested? Theoretically this race condition might be triggered as long as cache materialization and unpersistence happen on different thread. But there is no reliable way to construct unit test. ### Was this patch authored or co-authored using generative AI tooling? NO Closes #52199 from liuzqt/SPARK-53435-3.5. Authored-by: ziqi liu Signed-off-by: Dongjoon Hyun --- .../execution/columnar/InMemoryRelation.scala | 27 +++++-------------- 1 file changed, 7 insertions(+), 20 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryRelation.scala index f750a4503be16..e26e6d1b1ddc6 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryRelation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryRelation.scala @@ -217,35 +217,22 @@ case class CachedRDDBuilder( val cachedName = tableName.map(n => s"In-memory table $n") .getOrElse(StringUtils.abbreviate(cachedPlan.toString, 1024)) - def cachedColumnBuffers: RDD[CachedBatch] = { + def cachedColumnBuffers: RDD[CachedBatch] = synchronized { if (_cachedColumnBuffers == null) { - synchronized { - if (_cachedColumnBuffers == null) { - _cachedColumnBuffers = buildBuffers() - } - } + _cachedColumnBuffers = buildBuffers() } _cachedColumnBuffers } - def clearCache(blocking: Boolean = false): Unit = { + def clearCache(blocking: Boolean = false): Unit = synchronized { if (_cachedColumnBuffers != null) { - synchronized { - if (_cachedColumnBuffers != null) { - _cachedColumnBuffers.unpersist(blocking) - _cachedColumnBuffers = null - } - } + _cachedColumnBuffers.unpersist(blocking) + _cachedColumnBuffers = null } } - def isCachedColumnBuffersLoaded: Boolean = { - if (_cachedColumnBuffers != null) { - synchronized { - return _cachedColumnBuffers != null && isCachedRDDLoaded - } - } - false + def isCachedColumnBuffersLoaded: Boolean = synchronized { + _cachedColumnBuffers != null && isCachedRDDLoaded } private def isCachedRDDLoaded: Boolean = { From 55f75df25a7a8b0bb19a16cf451d8999c4488dc8 Mon Sep 17 00:00:00 2001 From: Hyukjin Kwon Date: Wed, 3 Sep 2025 09:53:51 +0900 Subject: [PATCH 66/86] [MINOR][BUILD] Fix download of preview releases in the news ### What changes were proposed in this pull request? This PR proposes to fix download of preview releases in the news when releasing. ### Why are the changes needed? To have the correct download links for previews when they are released. ### Does this PR introduce _any_ user-facing change? No, dev-only. ### How was this patch tested? Manually. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #52208 from HyukjinKwon/fix-download-links. 
Authored-by: Hyukjin Kwon Signed-off-by: Hyukjin Kwon (cherry picked from commit 6476dbc92afa08a168a189d0a83201d07ece60ad) Signed-off-by: Hyukjin Kwon --- dev/create-release/release-build.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dev/create-release/release-build.sh b/dev/create-release/release-build.sh index 8c7056e4d1918..e7cf4d0c9bcb2 100755 --- a/dev/create-release/release-build.sh +++ b/dev/create-release/release-build.sh @@ -303,10 +303,10 @@ meta: _wpas_done_all: '1' --- To enable wide-scale community testing of the upcoming Spark ${BASE_VERSION} release, the Apache Spark community has posted a -Spark ${RELEASE_VERSION} release. +Spark ${RELEASE_VERSION} release. This preview is not a stable release in terms of either API or functionality, but it is meant to give the community early access to try the code that will become Spark ${BASE_VERSION}. If you would like to test the release, -please download it, and send feedback using either +please download it, and send feedback using either mailing lists or JIRA. The documentation is available at the link. From 4508f9ca24b61b954d613822c5e6840f61075434 Mon Sep 17 00:00:00 2001 From: Hyukjin Kwon Date: Wed, 3 Sep 2025 09:54:49 +0900 Subject: [PATCH 67/86] [MINOR][BUILD] Remove todos for testing in the real releases ### What changes were proposed in this pull request? This PR proposes to remove todos (that are tested). ### Why are the changes needed? To note what's tested or not for developement. ### Does this PR introduce _any_ user-facing change? No, dev-only. ### How was this patch tested? Manually tested. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #52207 from HyukjinKwon/remove-what-is-tested. Authored-by: Hyukjin Kwon Signed-off-by: Hyukjin Kwon (cherry picked from commit fb22d371f0dd3c537ed3d6c8b493ac49a11447dd) Signed-off-by: Hyukjin Kwon --- dev/create-release/release-build.sh | 2 -- 1 file changed, 2 deletions(-) diff --git a/dev/create-release/release-build.sh b/dev/create-release/release-build.sh index e7cf4d0c9bcb2..950bb48ca7b44 100755 --- a/dev/create-release/release-build.sh +++ b/dev/create-release/release-build.sh @@ -147,7 +147,6 @@ if [[ "$1" == "finalize" ]]; then echo "Uploading release docs to spark-website" cd spark-website - # TODO: Test it in the actual release # 1. Add download link to documentation.md python3 < Date: Wed, 3 Sep 2025 09:55:37 +0900 Subject: [PATCH 68/86] [MINOR][BUILD] Remove `preview` postfix in documentation.md when releasing ### What changes were proposed in this pull request? This PR proposes to remove `preview` postfix in `documentation.md` when releasing ### Why are the changes needed? To be consistent. `preview` postfix is not needed, see https://github.com/apache/spark-website/blob/asf-site/documentation.md ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Manually tested. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #52206 from HyukjinKwon/remove-preview-postfix. 
Authored-by: Hyukjin Kwon Signed-off-by: Hyukjin Kwon (cherry picked from commit 41c43466db5c221fb3a56a39b44c59cc215f4d54) Signed-off-by: Hyukjin Kwon --- dev/create-release/release-build.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev/create-release/release-build.sh b/dev/create-release/release-build.sh index 950bb48ca7b44..a9dd229f6c69e 100755 --- a/dev/create-release/release-build.sh +++ b/dev/create-release/release-build.sh @@ -156,7 +156,7 @@ is_preview = bool(re.search(r'-preview\d*$', release_version)) base_version = re.sub(r'-preview\d*$', '', release_version) stable_newline = f'
  • Spark {release_version}
  • ' -preview_newline = f'
  • Spark {release_version} preview
  • ' +preview_newline = f'
  • Spark {release_version}
  • ' inserted = False From e93e70cd6a3f7d569e836caa7992ed2e33f79fca Mon Sep 17 00:00:00 2001 From: Victor Jouffrey Date: Wed, 3 Sep 2025 11:33:31 +0200 Subject: [PATCH 69/86] Fix typos --- .github/workflows/build_spark_with_hopsfs.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build_spark_with_hopsfs.yaml b/.github/workflows/build_spark_with_hopsfs.yaml index 98ccbde903cf6..bbe1f38856dce 100644 --- a/.github/workflows/build_spark_with_hopsfs.yaml +++ b/.github/workflows/build_spark_with_hopsfs.yaml @@ -140,7 +140,7 @@ jobs: M2_HOME: ~/.m2 run: | echo "M2_HOME var is $M2_HOME" >> $GITHUB_STEP_SUMMARY - mkdir -p ~/.m2 && echo "HopsEE${{ inputs.nexus_user }}${{ secrets.NEXUS_HARBOR_PASSWORD }}" > ~/.m2/settings.xml + mkdir -p ~/.m2 && echo "HopsEE${{ vars.NEXUS_HARBOR_USER }}${{ secrets.NEXUS_HARBOR_PASSWORD }}" > ~/.m2/settings.xml - name: Cache maven id: cache-maven @@ -172,6 +172,6 @@ jobs: env: M2_HOME: ~/.m2 run: | - curl -u ${{ vars.NEXUS_DEV_SPARK_URL }}:${{ secrets.NEXUS_HARBOR_PASSWORD }} --upload-file spark-$POM_VERSION-bin-without-hadoop-with-hive.tgz "${SPARK_TAR_URL}" + curl -u ${{ vars.NEXUS_HARBOR_USER }}:${{ secrets.NEXUS_HARBOR_PASSWORD }} --upload-file spark-$POM_VERSION-bin-without-hadoop-with-hive.tgz "${SPARK_TAR_URL}" export MAVEN_OPTS="${MAVEN_OPTS:--Xss128m -Xmx4g -XX:ReservedCodeCacheSize=128m}" ./build/mvn deploy -DskipTests -Dmaven.javadoc.skip=true -Dmaven.scaladoc.skip=true -Dmaven.source.skip -Dcyclonedx.skip=true -Pkubernetes,hadoop-provided,parquet-provided,hive,hadoop-cloud \ No newline at end of file From cf7f364ccea7a101c0da642acc4f78b2ca7a4a13 Mon Sep 17 00:00:00 2001 From: Victor Jouffrey Date: Wed, 3 Sep 2025 12:11:14 +0200 Subject: [PATCH 70/86] Add workflow call outputs --- .../workflows/build_spark_with_hopsfs.yaml | 22 +++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/.github/workflows/build_spark_with_hopsfs.yaml b/.github/workflows/build_spark_with_hopsfs.yaml index bbe1f38856dce..ad70bdf622b86 100644 --- a/.github/workflows/build_spark_with_hopsfs.yaml +++ b/.github/workflows/build_spark_with_hopsfs.yaml @@ -26,6 +26,28 @@ on: secrets: NEXUS_HARBOR_PASSWORD: required: true + outputs: + pom_version_no_jira: + value: ${{ jobs.build-spark.outputs.pom_version_no_jira }} + description: 'The pom version without the jira tag' + pom_version: + value: ${{ jobs.build-spark.outputs.pom_version }} + description: 'The pom version with the jira tag' + commit_hash: + value: ${{ jobs.build-spark.outputs.commit_hash }} + description: 'The commit hash of the spark repo' + jira_tag: + value: ${{ jobs.build-spark.outputs.jira_tag }} + description: 'The jira tag used for the build' + spark_tar_name: + value: ${{ jobs.build-spark.outputs.spark_tar_name }} + description: 'The name of the spark tar file' + spark_tar_url: + value: ${{ jobs.build-spark.outputs.spark_tar_url }} + description: 'The url of the spark tar file' + hopsfs_version: + value: ${{ jobs.build-spark.outputs.hopsfs_version }} + description: 'The version of hopsfs used in the build' workflow_dispatch: inputs: ref: From 2807c0e9e6028d73622d10e17450f793757428ee Mon Sep 17 00:00:00 2001 From: Victor Jouffrey Date: Wed, 3 Sep 2025 14:24:51 +0200 Subject: [PATCH 71/86] Test --- connector/connect/client/jvm/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/connector/connect/client/jvm/pom.xml b/connector/connect/client/jvm/pom.xml index f2630bfb9303f..9011c7ddcf699 100644 --- a/connector/connect/client/jvm/pom.xml +++ 
b/connector/connect/client/jvm/pom.xml @@ -122,7 +122,7 @@ com.google.errorprone:* com.google.j2objc:* com.google.protobuf:* - com.google.flatbuffers:* + io.grpc:* io.netty:* io.perfmark:* From 1356f001d68904a7c46e9de9539158528162633a Mon Sep 17 00:00:00 2001 From: Victor Jouffrey Date: Wed, 3 Sep 2025 15:00:50 +0200 Subject: [PATCH 72/86] Build spark with arrow12 --- connector/connect/client/jvm/pom.xml | 2 +- pom.xml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/connector/connect/client/jvm/pom.xml b/connector/connect/client/jvm/pom.xml index 9011c7ddcf699..f2630bfb9303f 100644 --- a/connector/connect/client/jvm/pom.xml +++ b/connector/connect/client/jvm/pom.xml @@ -122,7 +122,7 @@ com.google.errorprone:* com.google.j2objc:* com.google.protobuf:* - + com.google.flatbuffers:* io.grpc:* io.netty:* io.perfmark:* diff --git a/pom.xml b/pom.xml index 3a9c26051d215..05795462c6d2a 100644 --- a/pom.xml +++ b/pom.xml @@ -227,7 +227,7 @@ If you are changing Arrow version specification, please check ./python/pyspark/sql/pandas/utils.py, and ./python/setup.py too. --> - 17.0.0 + 12.0.1 2.5.9 From 1e2bd686bc82e038e85af7da90fd754ec433b831 Mon Sep 17 00:00:00 2001 From: Victor Jouffrey Date: Wed, 3 Sep 2025 15:03:09 +0200 Subject: [PATCH 73/86] Use 31 --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 05795462c6d2a..6e9396f54b153 100644 --- a/pom.xml +++ b/pom.xml @@ -22,7 +22,7 @@ org.apache apache - 18 + 31 org.apache.spark spark-parent_2.12 From cf221657f2ed2f14bd6ef2afae831a0c2cd50b8c Mon Sep 17 00:00:00 2001 From: Victor Jouffrey Date: Wed, 3 Sep 2025 15:17:45 +0200 Subject: [PATCH 74/86] Test explicit --- connector/connect/client/jvm/pom.xml | 6 +++++- pom.xml | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/connector/connect/client/jvm/pom.xml b/connector/connect/client/jvm/pom.xml index f2630bfb9303f..f67c0d9053c4b 100644 --- a/connector/connect/client/jvm/pom.xml +++ b/connector/connect/client/jvm/pom.xml @@ -59,6 +59,11 @@ protobuf-java compile
    + + com.google.flatbuffers + flatbuffers-java + 24.3.25 + com.google.guava guava @@ -122,7 +127,6 @@ com.google.errorprone:* com.google.j2objc:* com.google.protobuf:* - com.google.flatbuffers:* io.grpc:* io.netty:* io.perfmark:* diff --git a/pom.xml b/pom.xml index 6e9396f54b153..51169ab613d48 100644 --- a/pom.xml +++ b/pom.xml @@ -227,7 +227,7 @@ If you are changing Arrow version specification, please check ./python/pyspark/sql/pandas/utils.py, and ./python/setup.py too. --> - 12.0.1 + 17.0.0 2.5.9 From c8f26653cc53979895543b371e31405666f96232 Mon Sep 17 00:00:00 2001 From: Victor Jouffrey Date: Wed, 3 Sep 2025 19:52:56 +0200 Subject: [PATCH 75/86] undo flatbuffers --- connector/connect/client/jvm/pom.xml | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/connector/connect/client/jvm/pom.xml b/connector/connect/client/jvm/pom.xml index f67c0d9053c4b..f2630bfb9303f 100644 --- a/connector/connect/client/jvm/pom.xml +++ b/connector/connect/client/jvm/pom.xml @@ -59,11 +59,6 @@ protobuf-java compile - - com.google.flatbuffers - flatbuffers-java - 24.3.25 - com.google.guava guava @@ -127,6 +122,7 @@ com.google.errorprone:* com.google.j2objc:* com.google.protobuf:* + com.google.flatbuffers:* io.grpc:* io.netty:* io.perfmark:* From 6c1c512bbf5c8d1c5263b23c219bf3dcab7f3a3c Mon Sep 17 00:00:00 2001 From: Kent Yao Date: Thu, 4 Sep 2025 10:37:34 +0800 Subject: [PATCH 76/86] [SPARK-53472][DOCS] Fix jekyll-redirect-from template and generated html files ### Why are the changes needed? `page.redirect.to` defaults to pages with the absolute site root. In this PR, we revise it to the docs relative. ### Does this PR introduce _any_ user-facing change? doc fix Check https://dist.apache.org/repos/dist/dev/spark/v4.0.1-rc1-docs/_site/ for https://dist.apache.org/repos/dist/dev/spark/v4.0.1-rc1-docs/_site/building-with-maven.html ### How was this patch tested? build docs locally. ### Was this patch authored or co-authored using generative AI tooling? no Closes #52217 from yaooqinn/SPARK-53472. Authored-by: Kent Yao Signed-off-by: Kent Yao (cherry picked from commit 76352046c8903c5fe5eb632a0b9ddd13ede27b97) Signed-off-by: Kent Yao --- docs/_layouts/redirect.html | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/docs/_layouts/redirect.html b/docs/_layouts/redirect.html index 72a0462fc6a30..6177f91b7d793 100644 --- a/docs/_layouts/redirect.html +++ b/docs/_layouts/redirect.html @@ -19,10 +19,11 @@ Redirecting… - - - +{% assign redirect_url = page.redirect.to | replace_first: '/', '' | prepend: rel_path_to_root | append: '.html' %} + + +

    Redirecting…

    -Click here if you are not redirected. - \ No newline at end of file +Click here if you are not redirected. + From 0aa117d06d00c6a1defb161d55a0670ca4985332 Mon Sep 17 00:00:00 2001 From: Victor Jouffrey Date: Thu, 4 Sep 2025 14:28:08 +0200 Subject: [PATCH 77/86] Remove snapshot --- .github/workflows/build_spark_with_hopsfs.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_spark_with_hopsfs.yaml b/.github/workflows/build_spark_with_hopsfs.yaml index ad70bdf622b86..782d5247793cf 100644 --- a/.github/workflows/build_spark_with_hopsfs.yaml +++ b/.github/workflows/build_spark_with_hopsfs.yaml @@ -128,7 +128,7 @@ jobs: run: | COMMIT_HASH=$(git rev-parse --short HEAD) POM_VERSION_NO_JIRA=$(mvn -q -Dexec.executable="echo" -Dexec.args='${project.version}' --non-recursive exec:exec) - find . -name "pom.xml" -exec sed -i "s|${POM_VERSION_NO_JIRA}|${POM_VERSION_NO_JIRA}-${JIRA_TAG}-SNAPSHOT|g" {} \; + find . -name "pom.xml" -exec sed -i "s|${POM_VERSION_NO_JIRA}|${POM_VERSION_NO_JIRA%-SNAPSHOT}-${JIRA_TAG}-SNAPSHOT|g" {} \; POM_VERSION=$(mvn -q -Dexec.executable="echo" -Dexec.args='${project.version}' --non-recursive exec:exec) SPARK_TAR_NAME=spark-${POM_VERSION}-bin-without-hadoop-with-hive.tgz SPARK_TAR_URL="${{ vars.NEXUS_DEV_SPARK_URL }}/${JIRA_TAG}/${SPARK_TAR_NAME}" From 440903580c2f922d380e04c616b6f42a2a77d814 Mon Sep 17 00:00:00 2001 From: Victor Jouffrey Date: Thu, 4 Sep 2025 14:59:42 +0200 Subject: [PATCH 78/86] Save all --- R/pkg/DESCRIPTION | 4 ---- repl/pom.xml | 4 ---- resource-managers/kubernetes/core/pom.xml | 4 ---- resource-managers/kubernetes/integration-tests/pom.xml | 4 ---- resource-managers/mesos/pom.xml | 4 ---- resource-managers/yarn/pom.xml | 4 ---- 6 files changed, 24 deletions(-) diff --git a/R/pkg/DESCRIPTION b/R/pkg/DESCRIPTION index f6ca6a532e6fc..53f40d803e1a0 100644 --- a/R/pkg/DESCRIPTION +++ b/R/pkg/DESCRIPTION @@ -1,10 +1,6 @@ Package: SparkR Type: Package -<<<<<<< HEAD -Version: 3.5.5 -======= Version: 3.5.7 ->>>>>>> branch-3.5 Title: R Front End for 'Apache Spark' Description: Provides an R Front end for 'Apache Spark' . 
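The reworked redirect template above computes its target with a chain of Liquid filters: strip the leading slash from page.redirect.to, prefix the page's path back to the docs root, and append ".html", so generated redirect pages stay inside the docs tree instead of pointing at the absolute site root. A minimal Scala sketch of that string transformation, only to make the filter chain concrete (relPathToRoot stands in for Jekyll's rel_path_to_root variable, and the sample inputs are illustrative, not taken from the patch):

    // Rough equivalent of:
    //   page.redirect.to | replace_first: '/', '' | prepend: rel_path_to_root | append: '.html'
    object RedirectUrlSketch {
      def redirectUrl(redirectTo: String, relPathToRoot: String): String =
        relPathToRoot + redirectTo.replaceFirst("/", "") + ".html"

      def main(args: Array[String]): Unit = {
        println(redirectUrl("/building-with-maven", ""))    // building-with-maven.html
        println(redirectUrl("/building-with-maven", "../")) // ../building-with-maven.html
      }
    }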
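The snapshot fix above rewrites the POM version with ${POM_VERSION_NO_JIRA%-SNAPSHOT} so that a version already ending in -SNAPSHOT is not tagged twice. A hedged Scala sketch of the intended string rewrite (the JIRA tag value below is made up for illustration):

    // Mirrors the shell parameter expansion ${POM_VERSION_NO_JIRA%-SNAPSHOT}:
    // drop a trailing "-SNAPSHOT" if present, then append "-<JIRA_TAG>-SNAPSHOT".
    object VersionTagSketch {
      def tagVersion(pomVersion: String, jiraTag: String): String = {
        val base = pomVersion.stripSuffix("-SNAPSHOT")
        s"$base-$jiraTag-SNAPSHOT"
      }

      def main(args: Array[String]): Unit = {
        println(tagVersion("3.5.7-SNAPSHOT", "HWORKS-0000")) // 3.5.7-HWORKS-0000-SNAPSHOT
        println(tagVersion("3.5.7", "HWORKS-0000"))          // 3.5.7-HWORKS-0000-SNAPSHOT
      }
    }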
Authors@R: diff --git a/repl/pom.xml b/repl/pom.xml index a076560a703ec..3cdc95fdaff9b 100644 --- a/repl/pom.xml +++ b/repl/pom.xml @@ -21,11 +21,7 @@ org.apache.spark spark-parent_2.12 -<<<<<<< HEAD - 3.5.5 -======= 3.5.7-SNAPSHOT ->>>>>>> branch-3.5 ../pom.xml diff --git a/resource-managers/kubernetes/core/pom.xml b/resource-managers/kubernetes/core/pom.xml index 5f68b053d6be4..9473a1aac3123 100644 --- a/resource-managers/kubernetes/core/pom.xml +++ b/resource-managers/kubernetes/core/pom.xml @@ -20,11 +20,7 @@ org.apache.spark spark-parent_2.12 -<<<<<<< HEAD - 3.5.5 -======= 3.5.7-SNAPSHOT ->>>>>>> branch-3.5 ../../../pom.xml diff --git a/resource-managers/kubernetes/integration-tests/pom.xml b/resource-managers/kubernetes/integration-tests/pom.xml index 4484f8c23d991..2d43f57af8080 100644 --- a/resource-managers/kubernetes/integration-tests/pom.xml +++ b/resource-managers/kubernetes/integration-tests/pom.xml @@ -20,11 +20,7 @@ org.apache.spark spark-parent_2.12 -<<<<<<< HEAD - 3.5.5 -======= 3.5.7-SNAPSHOT ->>>>>>> branch-3.5 ../../../pom.xml diff --git a/resource-managers/mesos/pom.xml b/resource-managers/mesos/pom.xml index 24f7495ce9042..1ff3e3b92e511 100644 --- a/resource-managers/mesos/pom.xml +++ b/resource-managers/mesos/pom.xml @@ -20,11 +20,7 @@ org.apache.spark spark-parent_2.12 -<<<<<<< HEAD - 3.5.5 -======= 3.5.7-SNAPSHOT ->>>>>>> branch-3.5 ../../pom.xml diff --git a/resource-managers/yarn/pom.xml b/resource-managers/yarn/pom.xml index 23f1a04525b20..4780c62532234 100644 --- a/resource-managers/yarn/pom.xml +++ b/resource-managers/yarn/pom.xml @@ -20,11 +20,7 @@ org.apache.spark spark-parent_2.12 -<<<<<<< HEAD - 3.5.5 -======= 3.5.7-SNAPSHOT ->>>>>>> branch-3.5 ../../pom.xml From 3a269727c90458e3f87c4b82747887d5f5baf69a Mon Sep 17 00:00:00 2001 From: Victor Jouffrey Date: Fri, 5 Sep 2025 09:08:16 +0200 Subject: [PATCH 79/86] implicit path error --- .../spark/sql/execution/datasources/InMemoryFileIndex.scala | 1 + 1 file changed, 1 insertion(+) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InMemoryFileIndex.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InMemoryFileIndex.scala index 2180c941aac4f..71cc0597aeea3 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InMemoryFileIndex.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InMemoryFileIndex.scala @@ -32,6 +32,7 @@ import org.apache.spark.sql.execution.streaming.FileStreamSink import org.apache.spark.sql.types.StructType import org.apache.spark.util.HadoopFSUtils +implicit val pathOrdering: Ordering[Path] = Ordering.by(_.toString) /** * A [[FileIndex]] that generates the list of files to process by recursively listing all the From 107151d253caa4d3849f5ac6bc96f8b3830c2cf5 Mon Sep 17 00:00:00 2001 From: Victor Jouffrey Date: Fri, 5 Sep 2025 09:18:09 +0200 Subject: [PATCH 80/86] Move implicit path ordering --- .../spark/sql/execution/datasources/InMemoryFileIndex.scala | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InMemoryFileIndex.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InMemoryFileIndex.scala index 71cc0597aeea3..205367a1853d7 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InMemoryFileIndex.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InMemoryFileIndex.scala @@ -32,8 +32,6 @@ import 
org.apache.spark.sql.execution.streaming.FileStreamSink import org.apache.spark.sql.types.StructType import org.apache.spark.util.HadoopFSUtils -implicit val pathOrdering: Ordering[Path] = Ordering.by(_.toString) - /** * A [[FileIndex]] that generates the list of files to process by recursively listing all the * files present in `paths`. @@ -145,6 +143,7 @@ class InMemoryFileIndex( } object InMemoryFileIndex extends Logging { + implicit val pathOrdering: Ordering[Path] = Ordering.by(_.toString) private[sql] def bulkListLeafFiles( paths: Seq[Path], From a18909c31200a44fa95d5942a6fc24c31ea8c821 Mon Sep 17 00:00:00 2001 From: Victor Jouffrey Date: Fri, 5 Sep 2025 09:37:21 +0200 Subject: [PATCH 81/86] Exclude google flatbuffers --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 87c63971d3dc0..65829db88fb90 100644 --- a/pom.xml +++ b/pom.xml @@ -2499,7 +2499,7 @@ jasper-runtime - com.vlkan + com.google flatbuffers From 7142ee9d05f5e0f70222060938fcd0c70cd43822 Mon Sep 17 00:00:00 2001 From: Victor Jouffrey Date: Fri, 5 Sep 2025 10:16:09 +0200 Subject: [PATCH 82/86] Remove InMemoryFileIndex fix --- .../spark/sql/execution/datasources/InMemoryFileIndex.scala | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InMemoryFileIndex.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InMemoryFileIndex.scala index 205367a1853d7..6bf8f6f9f5cae 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InMemoryFileIndex.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InMemoryFileIndex.scala @@ -100,8 +100,7 @@ class InMemoryFileIndex( } override def equals(other: Any): Boolean = other match { - case hdfs: InMemoryFileIndex if rootPaths.size == hdfs.rootPaths.size => - rootPaths.sorted == hdfs.rootPaths.sorted + case hdfs: InMemoryFileIndex => rootPaths.toSet == hdfs.rootPaths.toSet case _ => false } @@ -143,8 +142,6 @@ class InMemoryFileIndex( } object InMemoryFileIndex extends Logging { - implicit val pathOrdering: Ordering[Path] = Ordering.by(_.toString) - private[sql] def bulkListLeafFiles( paths: Seq[Path], hadoopConf: Configuration, From 97a7daa0587c330b6d63c58498a404c8661d060f Mon Sep 17 00:00:00 2001 From: Victor Jouffrey Date: Fri, 5 Sep 2025 13:05:52 +0200 Subject: [PATCH 83/86] Test with new hive --- pom.xml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pom.xml b/pom.xml index 65829db88fb90..e358b5439d821 100644 --- a/pom.xml +++ b/pom.xml @@ -133,8 +133,8 @@ io.hops.hive core - 3.0.0.13.5 - 3.0.0.13.5 + 3.0.0.13.10-HWORKS-2203-SNAPSHOT + 3.0.0.13.10-HWORKS-2203-SNAPSHOT 3.0 From 760012f0ce4465686aae55732681ae9b9658ff6d Mon Sep 17 00:00:00 2001 From: Victor Jouffrey Date: Fri, 5 Sep 2025 13:39:21 +0200 Subject: [PATCH 84/86] Add HiveEE to known repositories --- .github/workflows/build_spark_with_hopsfs.yaml | 9 +++++++-- pom.xml | 11 +++++++++++ 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build_spark_with_hopsfs.yaml b/.github/workflows/build_spark_with_hopsfs.yaml index 782d5247793cf..10d432dce97c1 100644 --- a/.github/workflows/build_spark_with_hopsfs.yaml +++ b/.github/workflows/build_spark_with_hopsfs.yaml @@ -162,8 +162,13 @@ jobs: M2_HOME: ~/.m2 run: | echo "M2_HOME var is $M2_HOME" >> $GITHUB_STEP_SUMMARY - mkdir -p ~/.m2 && echo "HopsEE${{ vars.NEXUS_HARBOR_USER }}${{ secrets.NEXUS_HARBOR_PASSWORD }}" > ~/.m2/settings.xml 
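The InMemoryFileIndex detour above is worth spelling out: the first attempt declared implicit val pathOrdering at the top level of the source file, which Scala 2 rejects (a val must live inside a class, object, or trait); the follow-up moved it into the InMemoryFileIndex companion object so that rootPaths.sorted could resolve an Ordering[Path]; and the later patch drops the ordering entirely and compares the root paths as sets. A compact sketch of both comparison shapes, assuming org.apache.hadoop.fs.Path is on the classpath as it is in Spark itself:

    import org.apache.hadoop.fs.Path

    object PathComparisonSketch {
      // Shape used by the intermediate patch: an Ordering so Seq[Path].sorted compiles.
      // (The patch writes Ordering.by(_.toString); the parameter type is spelled out here.)
      implicit val pathOrdering: Ordering[Path] = Ordering.by((p: Path) => p.toString)

      // Comparison via sorting (respects duplicates); needs the implicit Ordering above.
      def sameRootsSorted(a: Seq[Path], b: Seq[Path]): Boolean =
        a.size == b.size && a.sorted == b.sorted

      // Shape the series settles on: compare as sets, no Ordering required.
      def sameRootsAsSets(a: Seq[Path], b: Seq[Path]): Boolean =
        a.toSet == b.toSet
    }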
- + mkdir -p ~/.m2 + echo "" > ~/.m2/settings.xml + echo "HopsEE${{ vars.NEXUS_HARBOR_USER }}${{ secrets.NEXUS_HARBOR_PASSWORD }}" >> ~/.m2/settings.xml + echo "HiveEE${{ vars.NEXUS_HARBOR_USER }}${{ secrets.NEXUS_HARBOR_PASSWORD }}" >> ~/.m2/settings.xml + echo "" >> ~/.m2/settings.xml + + - name: Cache maven id: cache-maven if: steps.to_build_or_not_to_build.outputs.BUILD_SPARK == 'true' diff --git a/pom.xml b/pom.xml index e358b5439d821..e373a290253d5 100644 --- a/pom.xml +++ b/pom.xml @@ -373,6 +373,17 @@ true + + HiveEE + HiveEE Repo + https://nexus.hops.works/repository/hive-artifacts + + true + + + true + + From b9b2c5cea5bff06df095d59361128494c637070a Mon Sep 17 00:00:00 2001 From: Victor Jouffrey Date: Fri, 5 Sep 2025 13:57:53 +0200 Subject: [PATCH 85/86] Edit workflow --- .github/workflows/build_spark_with_hopsfs.yaml | 8 ++++---- pom.xml | 18 +++++++++--------- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/.github/workflows/build_spark_with_hopsfs.yaml b/.github/workflows/build_spark_with_hopsfs.yaml index 10d432dce97c1..434162a796428 100644 --- a/.github/workflows/build_spark_with_hopsfs.yaml +++ b/.github/workflows/build_spark_with_hopsfs.yaml @@ -163,10 +163,10 @@ jobs: run: | echo "M2_HOME var is $M2_HOME" >> $GITHUB_STEP_SUMMARY mkdir -p ~/.m2 - echo "" > ~/.m2/settings.xml - echo "HopsEE${{ vars.NEXUS_HARBOR_USER }}${{ secrets.NEXUS_HARBOR_PASSWORD }}" >> ~/.m2/settings.xml - echo "HiveEE${{ vars.NEXUS_HARBOR_USER }}${{ secrets.NEXUS_HARBOR_PASSWORD }}" >> ~/.m2/settings.xml - echo "" >> ~/.m2/settings.xml + echo "" > ~/.m2/settings.xml + echo "HopsEE${{ vars.NEXUS_HARBOR_USER }}${{ secrets.NEXUS_HARBOR_PASSWORD }}" >> ~/.m2/settings.xml + echo "HiveEE${{ vars.NEXUS_HARBOR_USER }}${{ secrets.NEXUS_HARBOR_PASSWORD }}" >> ~/.m2/settings.xml + echo "" >> ~/.m2/settings.xml - name: Cache maven diff --git a/pom.xml b/pom.xml index e373a290253d5..f369a06cfa21e 100644 --- a/pom.xml +++ b/pom.xml @@ -374,15 +374,15 @@ - HiveEE - HiveEE Repo - https://nexus.hops.works/repository/hive-artifacts - - true - - - true - + HiveEE + HiveEE Repo + https://nexus.hops.works/repository/hive-artifacts + + true + + + true + From 1aadfbaf58371b582d7fa076705f428586e4dd53 Mon Sep 17 00:00:00 2001 From: Victor Jouffrey Date: Mon, 8 Sep 2025 12:12:43 +0200 Subject: [PATCH 86/86] Update spark version to 3.5.5 --- assembly/pom.xml | 2 +- common/kvstore/pom.xml | 2 +- common/network-common/pom.xml | 2 +- common/network-shuffle/pom.xml | 2 +- common/network-yarn/pom.xml | 2 +- common/sketch/pom.xml | 2 +- common/tags/pom.xml | 2 +- common/unsafe/pom.xml | 2 +- common/utils/pom.xml | 2 +- connector/avro/pom.xml | 2 +- connector/connect/client/jvm/pom.xml | 2 +- connector/connect/common/pom.xml | 2 +- connector/connect/server/pom.xml | 2 +- connector/docker-integration-tests/pom.xml | 2 +- connector/kafka-0-10-assembly/pom.xml | 2 +- connector/kafka-0-10-sql/pom.xml | 2 +- connector/kafka-0-10-token-provider/pom.xml | 2 +- connector/kafka-0-10/pom.xml | 2 +- connector/kinesis-asl-assembly/pom.xml | 2 +- connector/kinesis-asl/pom.xml | 2 +- connector/protobuf/pom.xml | 2 +- connector/spark-ganglia-lgpl/pom.xml | 2 +- core/pom.xml | 2 +- examples/pom.xml | 2 +- graphx/pom.xml | 2 +- hadoop-cloud/pom.xml | 2 +- launcher/pom.xml | 2 +- mllib-local/pom.xml | 2 +- mllib/pom.xml | 2 +- pom.xml | 2 +- repl/pom.xml | 2 +- resource-managers/kubernetes/core/pom.xml | 2 +- resource-managers/kubernetes/integration-tests/pom.xml | 2 +- resource-managers/mesos/pom.xml | 2 +- 
resource-managers/yarn/pom.xml | 2 +- sql/api/pom.xml | 2 +- sql/catalyst/pom.xml | 2 +- sql/core/pom.xml | 2 +- sql/hive-thriftserver/pom.xml | 2 +- sql/hive/pom.xml | 2 +- streaming/pom.xml | 2 +- tools/pom.xml | 2 +- version.log | 1 - 43 files changed, 42 insertions(+), 43 deletions(-) delete mode 100644 version.log diff --git a/assembly/pom.xml b/assembly/pom.xml index b8d3e7e4cecab..6eb1c9c341b81 100644 --- a/assembly/pom.xml +++ b/assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.12 - 3.5.7-SNAPSHOT + 3.5.5 ../pom.xml diff --git a/common/kvstore/pom.xml b/common/kvstore/pom.xml index 7ce97474ac11e..a1ec2748329b9 100644 --- a/common/kvstore/pom.xml +++ b/common/kvstore/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.12 - 3.5.7-SNAPSHOT + 3.5.5 ../../pom.xml diff --git a/common/network-common/pom.xml b/common/network-common/pom.xml index 1569686c98d75..6ae7863161b1e 100644 --- a/common/network-common/pom.xml +++ b/common/network-common/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.12 - 3.5.7-SNAPSHOT + 3.5.5 ../../pom.xml diff --git a/common/network-shuffle/pom.xml b/common/network-shuffle/pom.xml index 997d7bc46eb71..7537e39d93ea5 100644 --- a/common/network-shuffle/pom.xml +++ b/common/network-shuffle/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.12 - 3.5.7-SNAPSHOT + 3.5.5 ../../pom.xml diff --git a/common/network-yarn/pom.xml b/common/network-yarn/pom.xml index 673beefc522d0..f458b6c4e7e15 100644 --- a/common/network-yarn/pom.xml +++ b/common/network-yarn/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.12 - 3.5.7-SNAPSHOT + 3.5.5 ../../pom.xml diff --git a/common/sketch/pom.xml b/common/sketch/pom.xml index 5e626fde4b76c..43313bd0ec28f 100644 --- a/common/sketch/pom.xml +++ b/common/sketch/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.12 - 3.5.7-SNAPSHOT + 3.5.5 ../../pom.xml diff --git a/common/tags/pom.xml b/common/tags/pom.xml index 56e2817a495cc..471b499c37297 100644 --- a/common/tags/pom.xml +++ b/common/tags/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.12 - 3.5.7-SNAPSHOT + 3.5.5 ../../pom.xml diff --git a/common/unsafe/pom.xml b/common/unsafe/pom.xml index 3fd75f673acc9..4dac8356b77b6 100644 --- a/common/unsafe/pom.xml +++ b/common/unsafe/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.12 - 3.5.7-SNAPSHOT + 3.5.5 ../../pom.xml diff --git a/common/utils/pom.xml b/common/utils/pom.xml index f4c4eea09c6c8..2e4e0dcdaa2eb 100644 --- a/common/utils/pom.xml +++ b/common/utils/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.12 - 3.5.7-SNAPSHOT + 3.5.5 ../../pom.xml diff --git a/connector/avro/pom.xml b/connector/avro/pom.xml index 601ffa552cf94..11811ed080bca 100644 --- a/connector/avro/pom.xml +++ b/connector/avro/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.12 - 3.5.7-SNAPSHOT + 3.5.5 ../../pom.xml diff --git a/connector/connect/client/jvm/pom.xml b/connector/connect/client/jvm/pom.xml index 01945546a976e..f2630bfb9303f 100644 --- a/connector/connect/client/jvm/pom.xml +++ b/connector/connect/client/jvm/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.12 - 3.5.7-SNAPSHOT + 3.5.5 ../../../../pom.xml diff --git a/connector/connect/common/pom.xml b/connector/connect/common/pom.xml index 68c14857dc26b..a4f010f7076d4 100644 --- a/connector/connect/common/pom.xml +++ b/connector/connect/common/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.12 - 3.5.7-SNAPSHOT + 3.5.5 ../../../pom.xml diff --git a/connector/connect/server/pom.xml 
b/connector/connect/server/pom.xml index c5961b69bbdbf..54c63a6f6ded8 100644 --- a/connector/connect/server/pom.xml +++ b/connector/connect/server/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.12 - 3.5.7-SNAPSHOT + 3.5.5 ../../../pom.xml diff --git a/connector/docker-integration-tests/pom.xml b/connector/docker-integration-tests/pom.xml index bdf3f16d187a6..d35c4809b529f 100644 --- a/connector/docker-integration-tests/pom.xml +++ b/connector/docker-integration-tests/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.12 - 3.5.7-SNAPSHOT + 3.5.5 ../../pom.xml diff --git a/connector/kafka-0-10-assembly/pom.xml b/connector/kafka-0-10-assembly/pom.xml index e6b2198c87440..4cd2f31a94645 100644 --- a/connector/kafka-0-10-assembly/pom.xml +++ b/connector/kafka-0-10-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.12 - 3.5.7-SNAPSHOT + 3.5.5 ../../pom.xml diff --git a/connector/kafka-0-10-sql/pom.xml b/connector/kafka-0-10-sql/pom.xml index 0c875477f9f08..ebb6f15ad3697 100644 --- a/connector/kafka-0-10-sql/pom.xml +++ b/connector/kafka-0-10-sql/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.12 - 3.5.7-SNAPSHOT + 3.5.5 ../../pom.xml diff --git a/connector/kafka-0-10-token-provider/pom.xml b/connector/kafka-0-10-token-provider/pom.xml index 61e9b2a1b4f3b..cde5d2c4c1348 100644 --- a/connector/kafka-0-10-token-provider/pom.xml +++ b/connector/kafka-0-10-token-provider/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.12 - 3.5.7-SNAPSHOT + 3.5.5 ../../pom.xml diff --git a/connector/kafka-0-10/pom.xml b/connector/kafka-0-10/pom.xml index db1572705304f..b59e6401191be 100644 --- a/connector/kafka-0-10/pom.xml +++ b/connector/kafka-0-10/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.12 - 3.5.7-SNAPSHOT + 3.5.5 ../../pom.xml diff --git a/connector/kinesis-asl-assembly/pom.xml b/connector/kinesis-asl-assembly/pom.xml index 038e370d5b099..1b5ee194c268a 100644 --- a/connector/kinesis-asl-assembly/pom.xml +++ b/connector/kinesis-asl-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.12 - 3.5.7-SNAPSHOT + 3.5.5 ../../pom.xml diff --git a/connector/kinesis-asl/pom.xml b/connector/kinesis-asl/pom.xml index bd50098da9224..608671f47a0c3 100644 --- a/connector/kinesis-asl/pom.xml +++ b/connector/kinesis-asl/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.12 - 3.5.7-SNAPSHOT + 3.5.5 ../../pom.xml diff --git a/connector/protobuf/pom.xml b/connector/protobuf/pom.xml index 856873fd3389f..91df2118e6092 100644 --- a/connector/protobuf/pom.xml +++ b/connector/protobuf/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.12 - 3.5.7-SNAPSHOT + 3.5.5 ../../pom.xml diff --git a/connector/spark-ganglia-lgpl/pom.xml b/connector/spark-ganglia-lgpl/pom.xml index 06f653738c51c..572766941ed93 100644 --- a/connector/spark-ganglia-lgpl/pom.xml +++ b/connector/spark-ganglia-lgpl/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.12 - 3.5.7-SNAPSHOT + 3.5.5 ../../pom.xml diff --git a/core/pom.xml b/core/pom.xml index b6fe09adc8cad..29747b80a431e 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.12 - 3.5.7-SNAPSHOT + 3.5.5 ../pom.xml diff --git a/examples/pom.xml b/examples/pom.xml index a9a19b20d12c6..4cf1847cc16c3 100644 --- a/examples/pom.xml +++ b/examples/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.12 - 3.5.7-SNAPSHOT + 3.5.5 ../pom.xml diff --git a/graphx/pom.xml b/graphx/pom.xml index f8fd503746a51..287116cca802a 100644 --- a/graphx/pom.xml +++ b/graphx/pom.xml @@ -21,7 
+21,7 @@ org.apache.spark spark-parent_2.12 - 3.5.7-SNAPSHOT + 3.5.5 ../pom.xml diff --git a/hadoop-cloud/pom.xml b/hadoop-cloud/pom.xml index a8cf9298173e6..94cb21db3a01d 100644 --- a/hadoop-cloud/pom.xml +++ b/hadoop-cloud/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.12 - 3.5.7-SNAPSHOT + 3.5.5 ../pom.xml diff --git a/launcher/pom.xml b/launcher/pom.xml index 602ab5e749c5a..97e74a0998958 100644 --- a/launcher/pom.xml +++ b/launcher/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.12 - 3.5.7-SNAPSHOT + 3.5.5 ../pom.xml diff --git a/mllib-local/pom.xml b/mllib-local/pom.xml index cf7bba8c1bc90..a7020fabd259a 100644 --- a/mllib-local/pom.xml +++ b/mllib-local/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.12 - 3.5.7-SNAPSHOT + 3.5.5 ../pom.xml diff --git a/mllib/pom.xml b/mllib/pom.xml index 58ce9d07f99af..a9e8fca6e1b19 100644 --- a/mllib/pom.xml +++ b/mllib/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.12 - 3.5.7-SNAPSHOT + 3.5.5 ../pom.xml diff --git a/pom.xml b/pom.xml index f369a06cfa21e..b09b6345f7cc3 100644 --- a/pom.xml +++ b/pom.xml @@ -26,7 +26,7 @@ org.apache.spark spark-parent_2.12 - 3.5.7-SNAPSHOT + 3.5.5 pom Spark Project Parent POM https://spark.apache.org/ diff --git a/repl/pom.xml b/repl/pom.xml index 3cdc95fdaff9b..1efb8b8fbe1f1 100644 --- a/repl/pom.xml +++ b/repl/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.12 - 3.5.7-SNAPSHOT + 3.5.5 ../pom.xml diff --git a/resource-managers/kubernetes/core/pom.xml b/resource-managers/kubernetes/core/pom.xml index 9473a1aac3123..7a97c4c1ff06c 100644 --- a/resource-managers/kubernetes/core/pom.xml +++ b/resource-managers/kubernetes/core/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.12 - 3.5.7-SNAPSHOT + 3.5.5 ../../../pom.xml diff --git a/resource-managers/kubernetes/integration-tests/pom.xml b/resource-managers/kubernetes/integration-tests/pom.xml index 2d43f57af8080..a004dd12fedec 100644 --- a/resource-managers/kubernetes/integration-tests/pom.xml +++ b/resource-managers/kubernetes/integration-tests/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.12 - 3.5.7-SNAPSHOT + 3.5.5 ../../../pom.xml diff --git a/resource-managers/mesos/pom.xml b/resource-managers/mesos/pom.xml index 1ff3e3b92e511..566e0baf8e23c 100644 --- a/resource-managers/mesos/pom.xml +++ b/resource-managers/mesos/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.12 - 3.5.7-SNAPSHOT + 3.5.5 ../../pom.xml diff --git a/resource-managers/yarn/pom.xml b/resource-managers/yarn/pom.xml index 4780c62532234..f7bfa6d6ee6e7 100644 --- a/resource-managers/yarn/pom.xml +++ b/resource-managers/yarn/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.12 - 3.5.7-SNAPSHOT + 3.5.5 ../../pom.xml diff --git a/sql/api/pom.xml b/sql/api/pom.xml index d11d496811b20..4a21574462c3a 100644 --- a/sql/api/pom.xml +++ b/sql/api/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.12 - 3.5.7-SNAPSHOT + 3.5.5 ../../pom.xml diff --git a/sql/catalyst/pom.xml b/sql/catalyst/pom.xml index 981ac6fffc1c3..ff0992e7b21c7 100644 --- a/sql/catalyst/pom.xml +++ b/sql/catalyst/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.12 - 3.5.7-SNAPSHOT + 3.5.5 ../../pom.xml diff --git a/sql/core/pom.xml b/sql/core/pom.xml index 4b55b4200c4c9..5d678c9464593 100644 --- a/sql/core/pom.xml +++ b/sql/core/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.12 - 3.5.7-SNAPSHOT + 3.5.5 ../../pom.xml diff --git a/sql/hive-thriftserver/pom.xml b/sql/hive-thriftserver/pom.xml index 9d13436104e61..176ad85f71237 100644 --- 
a/sql/hive-thriftserver/pom.xml +++ b/sql/hive-thriftserver/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.12 - 3.5.7-SNAPSHOT + 3.5.5 ../../pom.xml diff --git a/sql/hive/pom.xml b/sql/hive/pom.xml index 68c38523b72cb..a3ebd9f98c96a 100644 --- a/sql/hive/pom.xml +++ b/sql/hive/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.12 - 3.5.7-SNAPSHOT + 3.5.5 ../../pom.xml diff --git a/streaming/pom.xml b/streaming/pom.xml index d355b773c3a7e..92d716fa5c09d 100644 --- a/streaming/pom.xml +++ b/streaming/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.12 - 3.5.7-SNAPSHOT + 3.5.5 ../pom.xml diff --git a/tools/pom.xml b/tools/pom.xml index 8dd903ce0d1e0..f2e171b2dfe97 100644 --- a/tools/pom.xml +++ b/tools/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.12 - 3.5.7-SNAPSHOT + 3.5.5 ../pom.xml diff --git a/version.log b/version.log deleted file mode 100644 index 7d280e2cd476e..0000000000000 --- a/version.log +++ /dev/null @@ -1 +0,0 @@ -3.5.5