
Commit 0177265

jackywang-db authored and sryza committed
[SPARK-52432][SDP][SQL] Scope DataflowGraphRegistry to Session
### What changes were proposed in this pull request?

Scope `DataflowGraphRegistry` to the Spark Connect session. This is done by adding it as a member of the Spark Connect [SessionHolder](https://github.com/apache/spark/blob/master/sql/connect/server/src/main/scala/org/apache/spark/sql/connect/service/SessionHolder.scala#L54); it lives there because pipeline executions are also [scoped](https://github.com/apache/spark/blob/master/sql/connect/server/src/main/scala/org/apache/spark/sql/connect/service/SessionHolder.scala#L125) to that class. Getter/setter methods were added to access the dataflow graphs for the session, along with logic to drop all dataflow graphs when the session is closed.

### Why are the changes needed?

Currently `DataflowGraphRegistry` is a singleton, but it should instead be scoped to a single SparkSession for proper isolation between pipelines that run on the same cluster. This also allows proper cleanup of pipeline resources when the session is closed.

### Does this PR introduce _any_ user-facing change?

No

### How was this patch tested?

Added new test cases covering dataflow graph session isolation and proper cleanup.

### Was this patch authored or co-authored using generative AI tooling?

No

Closes apache#51544 from JiaqiWang18/SPARK-52432-session-graphRegistry.

Authored-by: Jacky Wang <[email protected]>
Signed-off-by: Sandy Ryza <[email protected]>
1 parent 689e458 commit 0177265

File tree

6 files changed: +309 −37 lines changed

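Before the per-file diffs, a minimal sketch of the access pattern they introduce: handlers reach the dataflow graph registry through the `SessionHolder` that owns the Spark Connect session instead of the removed singleton. This sketch is illustrative only; it assumes code living in the `org.apache.spark.sql.connect` package (since `dataflowGraphRegistry` is `private[connect]`), and the catalog/database values are placeholders rather than values from this PR.

```scala
import org.apache.spark.sql.connect.pipelines.DataflowGraphRegistry
import org.apache.spark.sql.connect.service.SessionHolder
import org.apache.spark.sql.pipelines.graph.GraphRegistrationContext

// Sketch only: per-session registry lifecycle after this change.
// The sessionHolder is assumed to be provided by the caller, as in PipelinesHandler.
def registryLifecycleSketch(sessionHolder: SessionHolder): GraphRegistrationContext = {
  // One registry per Spark Connect session, owned by the SessionHolder.
  val registry: DataflowGraphRegistry = sessionHolder.dataflowGraphRegistry

  // Create a graph (placeholder defaults) and resolve it again by id, as the handlers do.
  val graphId: String = registry.createDataflowGraph(
    defaultCatalog = "spark_catalog",
    defaultDatabase = "default",
    defaultSqlConf = Map.empty[String, String])
  val graphCtx = registry.getDataflowGraphOrThrow(graphId)

  // On session close, SessionHolder now calls this to release all graphs for the session.
  registry.dropAllDataflowGraphs()
  graphCtx
}
```

The same `dropAllDataflowGraphs()` call runs automatically when the `SessionHolder` is closed, which is what the new test coverage exercises.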

sql/connect/server/src/main/scala/org/apache/spark/sql/connect/pipelines/DataflowGraphRegistry.scala

Lines changed: 2 additions & 5 deletions
```diff
@@ -28,10 +28,7 @@ import org.apache.spark.sql.pipelines.graph.GraphRegistrationContext
  * PipelinesHandler when CreateDataflowGraph is called, and the PipelinesHandler also supports
  * attaching flows/datasets to a graph.
  */
-// TODO(SPARK-51727): Currently DataflowGraphRegistry is a singleton, but it should instead be
-// scoped to a single SparkSession for proper isolation between pipelines that are run on the
-// same cluster.
-object DataflowGraphRegistry {
+class DataflowGraphRegistry {
 
   private val dataflowGraphs = new ConcurrentHashMap[String, GraphRegistrationContext]()
 
@@ -55,7 +52,7 @@
 
   /** Retrieves the graph for a given id, and throws if the id could not be found. */
   def getDataflowGraphOrThrow(dataflowGraphId: String): GraphRegistrationContext =
-    DataflowGraphRegistry.getDataflowGraph(dataflowGraphId).getOrElse {
+    getDataflowGraph(dataflowGraphId).getOrElse {
       throw new SparkException(
         errorClass = "DATAFLOW_GRAPH_NOT_FOUND",
         messageParameters = Map("graphId" -> dataflowGraphId),
```

sql/connect/server/src/main/scala/org/apache/spark/sql/connect/pipelines/PipelinesHandler.scala

Lines changed: 33 additions & 22 deletions
```diff
@@ -28,7 +28,6 @@ import org.apache.spark.internal.Logging
 import org.apache.spark.sql.AnalysisException
 import org.apache.spark.sql.catalyst.TableIdentifier
 import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
-import org.apache.spark.sql.classic.SparkSession
 import org.apache.spark.sql.connect.common.DataTypeProtoConverter
 import org.apache.spark.sql.connect.service.SessionHolder
 import org.apache.spark.sql.pipelines.Language.Python
@@ -68,7 +67,7 @@ private[connect] object PipelinesHandler extends Logging {
     cmd.getCommandTypeCase match {
       case proto.PipelineCommand.CommandTypeCase.CREATE_DATAFLOW_GRAPH =>
         val createdGraphId =
-          createDataflowGraph(cmd.getCreateDataflowGraph, sessionHolder.session)
+          createDataflowGraph(cmd.getCreateDataflowGraph, sessionHolder)
         PipelineCommandResult
           .newBuilder()
           .setCreateDataflowGraphResult(
@@ -78,73 +77,81 @@
           .build()
       case proto.PipelineCommand.CommandTypeCase.DROP_DATAFLOW_GRAPH =>
         logInfo(s"Drop pipeline cmd received: $cmd")
-        DataflowGraphRegistry.dropDataflowGraph(cmd.getDropDataflowGraph.getDataflowGraphId)
+        sessionHolder.dataflowGraphRegistry
+          .dropDataflowGraph(cmd.getDropDataflowGraph.getDataflowGraphId)
         defaultResponse
       case proto.PipelineCommand.CommandTypeCase.DEFINE_DATASET =>
         logInfo(s"Define pipelines dataset cmd received: $cmd")
-        defineDataset(cmd.getDefineDataset, sessionHolder.session)
+        defineDataset(cmd.getDefineDataset, sessionHolder)
         defaultResponse
       case proto.PipelineCommand.CommandTypeCase.DEFINE_FLOW =>
         logInfo(s"Define pipelines flow cmd received: $cmd")
-        defineFlow(cmd.getDefineFlow, transformRelationFunc, sessionHolder.session)
+        defineFlow(cmd.getDefineFlow, transformRelationFunc, sessionHolder)
         defaultResponse
       case proto.PipelineCommand.CommandTypeCase.START_RUN =>
         logInfo(s"Start pipeline cmd received: $cmd")
         startRun(cmd.getStartRun, responseObserver, sessionHolder)
         defaultResponse
       case proto.PipelineCommand.CommandTypeCase.DEFINE_SQL_GRAPH_ELEMENTS =>
         logInfo(s"Register sql datasets cmd received: $cmd")
-        defineSqlGraphElements(cmd.getDefineSqlGraphElements, sessionHolder.session)
+        defineSqlGraphElements(cmd.getDefineSqlGraphElements, sessionHolder)
         defaultResponse
       case other => throw new UnsupportedOperationException(s"$other not supported")
     }
   }
 
   private def createDataflowGraph(
       cmd: proto.PipelineCommand.CreateDataflowGraph,
-      spark: SparkSession): String = {
+      sessionHolder: SessionHolder): String = {
     val defaultCatalog = Option
       .when(cmd.hasDefaultCatalog)(cmd.getDefaultCatalog)
       .getOrElse {
         logInfo(s"No default catalog was supplied. Falling back to the current catalog.")
-        spark.catalog.currentCatalog()
+        sessionHolder.session.catalog.currentCatalog()
       }
 
     val defaultDatabase = Option
       .when(cmd.hasDefaultDatabase)(cmd.getDefaultDatabase)
       .getOrElse {
         logInfo(s"No default database was supplied. Falling back to the current database.")
-        spark.catalog.currentDatabase
+        sessionHolder.session.catalog.currentDatabase
      }
 
     val defaultSqlConf = cmd.getSqlConfMap.asScala.toMap
 
-    DataflowGraphRegistry.createDataflowGraph(
+    sessionHolder.dataflowGraphRegistry.createDataflowGraph(
       defaultCatalog = defaultCatalog,
       defaultDatabase = defaultDatabase,
       defaultSqlConf = defaultSqlConf)
   }
 
   private def defineSqlGraphElements(
       cmd: proto.PipelineCommand.DefineSqlGraphElements,
-      session: SparkSession): Unit = {
+      sessionHolder: SessionHolder): Unit = {
     val dataflowGraphId = cmd.getDataflowGraphId
 
-    val graphElementRegistry = DataflowGraphRegistry.getDataflowGraphOrThrow(dataflowGraphId)
+    val graphElementRegistry =
+      sessionHolder.dataflowGraphRegistry.getDataflowGraphOrThrow(dataflowGraphId)
     val sqlGraphElementRegistrationContext = new SqlGraphRegistrationContext(graphElementRegistry)
-    sqlGraphElementRegistrationContext.processSqlFile(cmd.getSqlText, cmd.getSqlFilePath, session)
+    sqlGraphElementRegistrationContext.processSqlFile(
+      cmd.getSqlText,
+      cmd.getSqlFilePath,
+      sessionHolder.session)
   }
 
   private def defineDataset(
       dataset: proto.PipelineCommand.DefineDataset,
-      sparkSession: SparkSession): Unit = {
+      sessionHolder: SessionHolder): Unit = {
     val dataflowGraphId = dataset.getDataflowGraphId
-    val graphElementRegistry = DataflowGraphRegistry.getDataflowGraphOrThrow(dataflowGraphId)
+    val graphElementRegistry =
+      sessionHolder.dataflowGraphRegistry.getDataflowGraphOrThrow(dataflowGraphId)
 
     dataset.getDatasetType match {
       case proto.DatasetType.MATERIALIZED_VIEW | proto.DatasetType.TABLE =>
         val tableIdentifier =
-          GraphIdentifierManager.parseTableIdentifier(dataset.getDatasetName, sparkSession)
+          GraphIdentifierManager.parseTableIdentifier(
+            dataset.getDatasetName,
+            sessionHolder.session)
         graphElementRegistry.registerTable(
           Table(
             identifier = tableIdentifier,
@@ -165,7 +172,9 @@
             isStreamingTable = dataset.getDatasetType == proto.DatasetType.TABLE))
       case proto.DatasetType.TEMPORARY_VIEW =>
         val viewIdentifier =
-          GraphIdentifierManager.parseTableIdentifier(dataset.getDatasetName, sparkSession)
+          GraphIdentifierManager.parseTableIdentifier(
+            dataset.getDatasetName,
+            sessionHolder.session)
 
         graphElementRegistry.registerView(
           TemporaryView(
@@ -184,14 +193,15 @@
   private def defineFlow(
       flow: proto.PipelineCommand.DefineFlow,
       transformRelationFunc: Relation => LogicalPlan,
-      sparkSession: SparkSession): Unit = {
+      sessionHolder: SessionHolder): Unit = {
     val dataflowGraphId = flow.getDataflowGraphId
-    val graphElementRegistry = DataflowGraphRegistry.getDataflowGraphOrThrow(dataflowGraphId)
+    val graphElementRegistry =
+      sessionHolder.dataflowGraphRegistry.getDataflowGraphOrThrow(dataflowGraphId)
 
     val isImplicitFlow = flow.getFlowName == flow.getTargetDatasetName
 
     val flowIdentifier = GraphIdentifierManager
-      .parseTableIdentifier(name = flow.getFlowName, spark = sparkSession)
+      .parseTableIdentifier(name = flow.getFlowName, spark = sessionHolder.session)
 
     // If the flow is not an implicit flow (i.e. one defined as part of dataset creation), then
     // it must be a single-part identifier.
@@ -205,7 +215,7 @@
       new UnresolvedFlow(
         identifier = flowIdentifier,
         destinationIdentifier = GraphIdentifierManager
-          .parseTableIdentifier(name = flow.getTargetDatasetName, spark = sparkSession),
+          .parseTableIdentifier(name = flow.getTargetDatasetName, spark = sessionHolder.session),
         func =
           FlowAnalysis.createFlowFunctionFromLogicalPlan(transformRelationFunc(flow.getRelation)),
         sqlConf = flow.getSqlConfMap.asScala.toMap,
@@ -224,7 +234,8 @@
       responseObserver: StreamObserver[ExecutePlanResponse],
       sessionHolder: SessionHolder): Unit = {
     val dataflowGraphId = cmd.getDataflowGraphId
-    val graphElementRegistry = DataflowGraphRegistry.getDataflowGraphOrThrow(dataflowGraphId)
+    val graphElementRegistry =
+      sessionHolder.dataflowGraphRegistry.getDataflowGraphOrThrow(dataflowGraphId)
     val tableFiltersResult = createTableFilters(cmd, graphElementRegistry, sessionHolder)
 
     // We will use this variable to store the run failure event if it occurs. This will be set
```

sql/connect/server/src/main/scala/org/apache/spark/sql/connect/service/SessionHolder.scala

Lines changed: 7 additions & 0 deletions
```diff
@@ -38,6 +38,7 @@ import org.apache.spark.sql.classic.SparkSession
 import org.apache.spark.sql.connect.common.InvalidPlanInput
 import org.apache.spark.sql.connect.config.Connect
 import org.apache.spark.sql.connect.ml.MLCache
+import org.apache.spark.sql.connect.pipelines.DataflowGraphRegistry
 import org.apache.spark.sql.connect.planner.PythonStreamingQueryListener
 import org.apache.spark.sql.connect.planner.StreamingForeachBatchHelper
 import org.apache.spark.sql.connect.service.SessionHolder.{ERROR_CACHE_SIZE, ERROR_CACHE_TIMEOUT_SEC}
@@ -125,6 +126,9 @@ case class SessionHolder(userId: String, sessionId: String, session: SparkSession)
   private lazy val pipelineExecutions =
     new ConcurrentHashMap[String, PipelineUpdateContext]()
 
+  // Registry for dataflow graphs specific to this session
+  private[connect] lazy val dataflowGraphRegistry = new DataflowGraphRegistry()
+
   // Handles Python process clean up for streaming queries. Initialized on first use in a query.
   private[connect] lazy val streamingForeachBatchRunnerCleanerCache =
     new StreamingForeachBatchHelper.CleanerCache(this)
@@ -320,6 +324,9 @@ case class SessionHolder(userId: String, sessionId: String, session: SparkSession)
     // Stops all pipeline execution and clears the pipeline execution cache
     removeAllPipelineExecutions()
 
+    // Clean up dataflow graphs
+    dataflowGraphRegistry.dropAllDataflowGraphs()
+
     // if there is a server side listener, clean up related resources
     if (streamingServersideListenerHolder.isServerSideListenerRegistered) {
       streamingServersideListenerHolder.cleanUp()
```

sql/connect/server/src/test/scala/org/apache/spark/sql/connect/pipelines/PythonPipelineSuite.scala

Lines changed: 15 additions & 1 deletion
```diff
@@ -20,6 +20,7 @@ package org.apache.spark.sql.connect.pipelines
 import java.io.{BufferedReader, InputStreamReader}
 import java.nio.charset.StandardCharsets
 import java.nio.file.Paths
+import java.util.UUID
 import java.util.concurrent.TimeUnit
 
 import scala.collection.mutable.ArrayBuffer
@@ -28,6 +29,7 @@ import scala.util.Try
 import org.apache.spark.api.python.PythonUtils
 import org.apache.spark.sql.AnalysisException
 import org.apache.spark.sql.catalyst.TableIdentifier
+import org.apache.spark.sql.connect.service.SparkConnectService
 import org.apache.spark.sql.pipelines.graph.DataflowGraph
 import org.apache.spark.sql.pipelines.utils.{EventVerificationTestHelpers, TestPipelineUpdateContextMixin}
 
@@ -42,6 +44,8 @@ class PythonPipelineSuite
 
   def buildGraph(pythonText: String): DataflowGraph = {
     val indentedPythonText = pythonText.linesIterator.map(" " + _).mkString("\n")
+    // create a unique identifier to allow identifying the session and dataflow graph
+    val customSessionIdentifier = UUID.randomUUID().toString
     val pythonCode =
       s"""
         |from pyspark.sql import SparkSession
@@ -57,6 +61,7 @@
         |spark = SparkSession.builder \\
         |  .remote("sc://localhost:$serverPort") \\
         |  .config("spark.connect.grpc.channel.timeout", "5s") \\
+        |  .config("spark.custom.identifier", "$customSessionIdentifier") \\
         |  .create()
         |
         |dataflow_graph_id = create_dataflow_graph(
@@ -78,8 +83,17 @@
       throw new RuntimeException(
         s"Python process failed with exit code $exitCode. Output: ${output.mkString("\n")}")
     }
+    val activeSessions = SparkConnectService.sessionManager.listActiveSessions
 
-    val dataflowGraphContexts = DataflowGraphRegistry.getAllDataflowGraphs
+    // get the session holder by finding the session with the custom UUID set in the conf
+    val sessionHolder = activeSessions
+      .map(info => SparkConnectService.sessionManager.getIsolatedSession(info.key, None))
+      .find(_.session.conf.get("spark.custom.identifier") == customSessionIdentifier)
+      .getOrElse(
+        throw new RuntimeException(s"Session with identifier $customSessionIdentifier not found"))
+
+    // get all dataflow graphs from the session holder
+    val dataflowGraphContexts = sessionHolder.dataflowGraphRegistry.getAllDataflowGraphs
     assert(dataflowGraphContexts.size == 1)
 
     dataflowGraphContexts.head.toDataflowGraph
```
