Commit 76a1ca5

xupefei authored and HyukjinKwon committed
[SPARK-49060][CONNECT] Clean up Mima rules for SQL-Connect binary compatibility checks
### What changes were proposed in this pull request?

This PR modifies some Mima rules which are used for checking the binary compatibility between the `sql` and `connect` modules. Major changes include:
- Removed unnecessary filters for specific `private[sql]` constructors; there is a wildcard rule which filters out all of them.
- Removed outdated filters for APIs that are already consistent.
- Added a warning about unused filters. Current output:

```bash
$ ./dev/connect-jvm-client-mima-check
Do connect-client-jvm module mima check ...
Warning: ExcludeByName[Problem]("org.apache.spark.sql.Dataset.queryExecution") did not filter out any problems.
Warning: ExcludeByName[Problem]("org.apache.spark.sql.Dataset.sqlContext") did not filter out any problems.
Warning: ExcludeByName[Problem]("org.apache.spark.sql.Dataset.selectUntyped") did not filter out any problems.
Warning: ExcludeByName[Problem]("org.apache.spark.sql.Dataset.rdd") did not filter out any problems.
Warning: ExcludeByName[Problem]("org.apache.spark.sql.Dataset.toJavaRDD") did not filter out any problems.
Warning: ExcludeByName[Problem]("org.apache.spark.sql.Dataset.javaRDD") did not filter out any problems.
finish connect-client-jvm module mima check ...
connect-client-jvm module mima check passed.
```

I manually audited all rules defined in the list. One issue I found is that none of the APIs in `Dataset` are being checked at all, likely due to it having a `private[sql]` companion object in `spark-core`. Changing the object's visibility from `private[sql]` to `public` will resolve this issue. The exact reason is unknown and is to be investigated.

### Why are the changes needed?

To make sure Mima is really working.

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

Not needed.

### Was this patch authored or co-authored using generative AI tooling?

No.

Closes #47487 from xupefei/mima-refactor.

Authored-by: Paddy Xu <[email protected]>
Signed-off-by: Hyukjin Kwon <[email protected]>
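For context: a Mima exclusion is just a `ProblemFilter` built from a name pattern, and a `*` wildcard in the pattern matches many members at once. A minimal sketch of the redundancy removed here, assuming Mima's documented wildcard matching (the rules shown are illustrative, not the checker's actual list):

```scala
import com.typesafe.tools.mima.core._

// One wildcard rule excludes every problem reported against a
// constructor ("this") anywhere under org.apache.spark.sql...
val wildcardRule: ProblemFilter =
  ProblemFilters.exclude[Problem]("org.apache.spark.sql.*.this")

// ...so a narrower rule like this one can never filter anything the
// wildcard has not already filtered, and can be deleted.
val redundantRule: ProblemFilter =
  ProblemFilters.exclude[Problem]("org.apache.spark.sql.Dataset.this")
```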
1 parent 682eb1b · commit 76a1ca5

3 files changed: +31 -53 lines changed

connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/KeyValueGroupedDataset.scala

Lines changed: 3 additions & 3 deletions
```diff
@@ -824,7 +824,7 @@ class KeyValueGroupedDataset[K, V] private[sql] () extends Serializable {
    * @param outputMode
    *   The output mode of the stateful processor.
    */
-  def transformWithState[U: Encoder](
+  private[sql] def transformWithState[U: Encoder](
       statefulProcessor: StatefulProcessor[K, V, U],
       timeMode: TimeMode,
       outputMode: OutputMode): Dataset[U] = {
@@ -850,7 +850,7 @@ class KeyValueGroupedDataset[K, V] private[sql] () extends Serializable {
    * @param outputEncoder
    *   Encoder for the output type.
    */
-  def transformWithState[U: Encoder](
+  private[sql] def transformWithState[U: Encoder](
       statefulProcessor: StatefulProcessor[K, V, U],
       timeMode: TimeMode,
       outputMode: OutputMode,
@@ -879,7 +879,7 @@ class KeyValueGroupedDataset[K, V] private[sql] () extends Serializable {
    *
    * See [[Encoder]] for more details on what types are encodable to Spark SQL.
    */
-  def transformWithState[U: Encoder, S: Encoder](
+  private[sql] def transformWithState[U: Encoder, S: Encoder](
       statefulProcessor: StatefulProcessorWithInitialState[K, V, U, S],
       timeMode: TimeMode,
       outputMode: OutputMode,
```
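The effect of the `private[sql]` markers above, in a minimal sketch (the class below is illustrative, not part of Spark): a member restricted to the `org.apache.spark.sql` package leaves the public API surface, so the sql/connect compatibility check no longer needs a dedicated exclusion for it.

```scala
package org.apache.spark.sql

// Illustrative example, not a real Spark class.
class VisibilityExample {
  // Public: part of the surface compared between sql and connect.
  def publicApi(): Unit = ()

  // Visible only inside org.apache.spark.sql: not part of the public
  // surface, so the checker's wildcard rules already ignore it.
  private[sql] def internalApi(): Unit = ()
}
```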

connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/connect/client/CheckConnectJvmClientCompatibility.scala

Lines changed: 27 additions & 49 deletions
```diff
@@ -21,6 +21,8 @@ import java.nio.charset.StandardCharsets
 import java.nio.file.{Files, Paths}
 import java.util.regex.Pattern
 
+import scala.collection.mutable.{Set => MutableSet}
+
 import com.typesafe.tools.mima.core._
 import com.typesafe.tools.mima.lib.MiMaLib
 
```

```diff
@@ -207,13 +209,13 @@ object CheckConnectJvmClientCompatibility {
     ProblemFilters.exclude[MissingClassProblem](
       "org.apache.spark.sql.Dataset$" // private[sql]
     ),
-    ProblemFilters.exclude[Problem]("org.apache.spark.sql.Dataset.ofRows"),
-    ProblemFilters.exclude[Problem]("org.apache.spark.sql.Dataset.DATASET_ID_TAG"),
-    ProblemFilters.exclude[Problem]("org.apache.spark.sql.Dataset.COL_POS_KEY"),
-    ProblemFilters.exclude[Problem]("org.apache.spark.sql.Dataset.DATASET_ID_KEY"),
-    ProblemFilters.exclude[Problem]("org.apache.spark.sql.Dataset.curId"),
     ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.ObservationListener"),
     ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.ObservationListener$"),
+    // TODO (SPARK-49096):
+    // Mima check might complain that the following Dataset rules do not filter any problem.
+    // This is due to a potential bug in Mima where all methods in `class Dataset` are not
+    // being checked for problems, due to the presence of a private[sql] companion object.
+    // Further investigation is needed.
     ProblemFilters.exclude[Problem]("org.apache.spark.sql.Dataset.queryExecution"),
     ProblemFilters.exclude[Problem]("org.apache.spark.sql.Dataset.sqlContext"),
     ProblemFilters.exclude[Problem]("org.apache.spark.sql.Dataset.selectUntyped"), // protected
```
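The shape of the code the TODO above refers to is roughly the following, reconstructed from the commit message as a simplified, hypothetical sketch:

```scala
// Simplified sketch of the suspected Mima issue: a public class whose
// companion object is package-private. Per the commit message, Mima
// appears to skip problem checks for all of the class's methods in
// this configuration; making the companion public restores the checks.
class Dataset[T] private[sql] () {
  def queryExecution: AnyRef = ??? // reportedly never checked by Mima
}

private[sql] object Dataset // suspected trigger of the skipped checks
```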
```diff
@@ -232,7 +234,6 @@ object CheckConnectJvmClientCompatibility {
     ProblemFilters.exclude[MissingClassProblem](
       "org.apache.spark.sql.RelationalGroupedDataset$*" // private[sql]
     ),
-    ProblemFilters.exclude[Problem]("org.apache.spark.sql.RelationalGroupedDataset.apply"),
 
     // SparkSession
     ProblemFilters.exclude[Problem]("org.apache.spark.sql.SparkSession.sparkContext"),
@@ -241,18 +242,14 @@ object CheckConnectJvmClientCompatibility {
     ProblemFilters.exclude[Problem]("org.apache.spark.sql.SparkSession.sqlContext"),
     ProblemFilters.exclude[Problem]("org.apache.spark.sql.SparkSession.listenerManager"),
     ProblemFilters.exclude[Problem]("org.apache.spark.sql.SparkSession.experimental"),
-    ProblemFilters.exclude[Problem]("org.apache.spark.sql.SparkSession.udtf"),
     ProblemFilters.exclude[Problem]("org.apache.spark.sql.SparkSession.dataSource"),
     ProblemFilters.exclude[Problem]("org.apache.spark.sql.SparkSession.createDataFrame"),
     ProblemFilters.exclude[Problem](
       "org.apache.spark.sql.SparkSession.baseRelationToDataFrame"),
     ProblemFilters.exclude[Problem]("org.apache.spark.sql.SparkSession.createDataset"),
     ProblemFilters.exclude[Problem]("org.apache.spark.sql.SparkSession.executeCommand"),
-    ProblemFilters.exclude[Problem]("org.apache.spark.sql.SparkSession.this"),
 
     // SparkSession#implicits
-    ProblemFilters.exclude[DirectMissingMethodProblem](
-      "org.apache.spark.sql.SparkSession#implicits._sqlContext"),
     ProblemFilters.exclude[DirectMissingMethodProblem](
       "org.apache.spark.sql.SparkSession#implicits.session"),
 
@@ -285,26 +282,9 @@ object CheckConnectJvmClientCompatibility {
       "org.apache.spark.sql.streaming.PythonStreamingQueryListenerWrapper"),
     ProblemFilters.exclude[MissingTypesProblem](
       "org.apache.spark.sql.streaming.StreamingQueryListener$Event"),
-    ProblemFilters.exclude[MissingTypesProblem](
-      "org.apache.spark.sql.streaming.StreamingQueryListener$QueryIdleEvent"),
-    ProblemFilters.exclude[DirectMissingMethodProblem](
-      "org.apache.spark.sql.streaming.StreamingQueryListener#QueryIdleEvent.logEvent"),
-    ProblemFilters.exclude[MissingTypesProblem](
-      "org.apache.spark.sql.streaming.StreamingQueryListener$QueryProgressEvent"),
-    ProblemFilters.exclude[DirectMissingMethodProblem](
-      "org.apache.spark.sql.streaming.StreamingQueryListener#QueryProgressEvent.logEvent"),
-    ProblemFilters.exclude[MissingTypesProblem](
-      "org.apache.spark.sql.streaming.StreamingQueryListener$QueryStartedEvent"),
-    ProblemFilters.exclude[DirectMissingMethodProblem](
-      "org.apache.spark.sql.streaming.StreamingQueryListener#QueryStartedEvent.logEvent"),
-    ProblemFilters.exclude[MissingTypesProblem](
-      "org.apache.spark.sql.streaming.StreamingQueryListener$QueryTerminatedEvent"),
-    ProblemFilters.exclude[DirectMissingMethodProblem](
-      "org.apache.spark.sql.streaming.StreamingQueryListener#QueryTerminatedEvent.logEvent"),
 
     // SQLImplicits
     ProblemFilters.exclude[Problem]("org.apache.spark.sql.SQLImplicits.rddToDatasetHolder"),
-    ProblemFilters.exclude[Problem]("org.apache.spark.sql.SQLImplicits._sqlContext"),
     ProblemFilters.exclude[Problem]("org.apache.spark.sql.SQLImplicits.session"),
 
     // Artifact Manager
@@ -341,24 +321,8 @@ object CheckConnectJvmClientCompatibility {
       "org.apache.spark.sql.KeyValueGroupedDatasetImpl"),
     ProblemFilters.exclude[MissingClassProblem](
       "org.apache.spark.sql.KeyValueGroupedDatasetImpl$"),
-    ProblemFilters.exclude[ReversedMissingMethodProblem](
-      "org.apache.spark.sql.SQLImplicits._sqlContext" // protected
-    ),
     ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.internal.SessionCleaner"),
 
-    // private
-    ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.internal.CleanupTask"),
-    ProblemFilters.exclude[MissingClassProblem](
-      "org.apache.spark.sql.internal.CleanupTaskWeakReference"),
-    ProblemFilters.exclude[MissingClassProblem](
-      "org.apache.spark.sql.internal.CleanupCachedRemoteRelation"),
-    ProblemFilters.exclude[MissingClassProblem](
-      "org.apache.spark.sql.internal.CleanupCachedRemoteRelation$"),
-
-    // Catalyst Refactoring
-    ProblemFilters.exclude[Problem]("org.apache.spark.sql.catalyst.util.SparkCollectionUtils"),
-    ProblemFilters.exclude[Problem]("org.apache.spark.sql.catalyst.util.SparkCollectionUtils$"),
-
     // New public APIs added in the client
     // ScalaUserDefinedFunction
     ProblemFilters
```
```diff
@@ -490,13 +454,24 @@ object CheckConnectJvmClientCompatibility {
       excludeRules: Seq[ProblemFilter]): List[Problem] = {
     val mima = new MiMaLib(Seq(newJar, oldJar))
     val allProblems = mima.collectProblems(oldJar, newJar, List.empty)
+
+    val effectiveExcludeRules = MutableSet.empty[ProblemFilter]
     val problems = allProblems
       .filter { p =>
         includedRules.exists(rule => rule(p))
       }
       .filter { p =>
-        excludeRules.forall(rule => rule(p))
+        excludeRules.forall { rule =>
+          val passedRule = rule(p)
+          if (!passedRule) {
+            effectiveExcludeRules += rule
+          }
+          passedRule
+        }
       }
+    excludeRules.filterNot(effectiveExcludeRules.contains).foreach { rule =>
+      println(s"Warning: $rule did not filter out any problems.")
+    }
     problems
   }
```
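The hunk above is the heart of the new warning: a rule "fires" when `rule(p)` returns `false` (that is, it excludes problem `p`), and any rule that never fires is reported as unused. The same pattern in a self-contained sketch with generic names (not Spark's actual API):

```scala
import scala.collection.mutable

// Filters keep an item when they return true; returning false means
// the filter excluded the item, i.e. the filter "fired".
def filterAndWarnUnused[A](items: Seq[A], filters: Seq[A => Boolean]): Seq[A] = {
  val fired = mutable.Set.empty[A => Boolean]
  val kept = items.filter { item =>
    filters.forall { f =>
      val keep = f(item)
      if (!keep) fired += f // this filter excluded at least one item
      keep
    }
  }
  // A filter that never excluded anything is likely stale.
  filters.filterNot(fired.contains)
    .foreach(f => println(s"Warning: $f did not filter out any problems."))
  kept
}
```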

```diff
@@ -511,11 +486,14 @@ object CheckConnectJvmClientCompatibility {
       resultWriter.write(
         s"ERROR: Comparing Client jar: $clientModule and $targetName jar: $targetModule \n")
       resultWriter.write(s"problems with $targetName module: \n")
-      resultWriter.write(s"${problems.map(p => p.description(description)).mkString("\n")}")
-      resultWriter.write("\n")
-      resultWriter.write(
-        "Exceptions to binary compatibility can be added in " +
-          s"'CheckConnectJvmClientCompatibility#checkMiMaCompatibilityWith${targetName}Module'\n")
+      val problemDescriptions =
+        problems.map(p => s"${p.getClass.getSimpleName}: ${p.description(description)}")
+      resultWriter.write(problemDescriptions.mkString("\n"))
+      resultWriter.write("\n\n")
+      resultWriter.write("Exceptions to binary compatibility can be added in " +
+        s"'CheckConnectJvmClientCompatibility#checkMiMaCompatibilityWith${targetName}Module':\n")
+      resultWriter.write(problems.flatMap(_.howToFilter).distinct.mkString(",\n"))
+      resultWriter.write("\n\n")
     }
   }
```
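With `howToFilter` appended to the report, a failing check now ends with ready-to-paste exclusion rules. Roughly what the new output looks like (illustrative; the exact problem text and rendering depend on Mima):

```
DirectMissingMethodProblem: method example()Unit in class org.apache.spark.sql.SparkSession does not have a correspondent in client version

Exceptions to binary compatibility can be added in 'CheckConnectJvmClientCompatibility#checkMiMaCompatibilityWithSqlModule':
ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.sql.SparkSession.example"),
```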

sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala

Lines changed: 1 addition & 1 deletion
```diff
@@ -229,7 +229,7 @@ class SparkSession private(
    */
   def udf: UDFRegistration = sessionState.udfRegistration
 
-  def udtf: UDTFRegistration = sessionState.udtfRegistration
+  private[sql] def udtf: UDTFRegistration = sessionState.udtfRegistration
 
   /**
    * A collection of methods for registering user-defined data sources.
```
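Note the knock-on effect of this last hunk: with `udtf` now `private[sql]` on the sql side, the `SparkSession.udtf` exclusion deleted earlier in this commit is no longer needed, because the member has left the public surface entirely. From client code the difference looks like this (hypothetical snippet):

```scala
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().master("local[*]").getOrCreate()
spark.udf    // still public API: compiles outside org.apache.spark.sql
// spark.udtf // no longer compiles here after this change
```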
