Commit 0e33195

[SPARK-39834][SQL][SS] Include the origin stats and constraints for LogicalRDD if it comes from DataFrame
Credit to juliuszsompolski for figuring out the issues and proposing the alternative.

### What changes were proposed in this pull request?

This PR effectively reverts SPARK-39748, but instead includes the origin stats and constraints in LogicalRDD if it comes from a DataFrame, to help the optimizer figure out a better plan.

### Why are the changes needed?

We identified several issues with [SPARK-39748](https://issues.apache.org/jira/browse/SPARK-39748):

1. One of the major use cases for DataFrame.checkpoint is ML, especially "iterative algorithms", where the purpose of calling checkpoint is to "prune" the logical plan. Including the origin logical plan defeats that purpose and risks nesting LogicalRDDs, which grows the size of the logical plan without bound.
2. We leverage the logical plan to carry over stats, but the correct stats information is in the optimized plan.
3. (Not an issue, but a missed spot) constraints are also something we can carry over.

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

Existing and new UTs.

Closes #37248 from HeartSaVioR/SPARK-39834.

Authored-by: Jungtaek Lim <[email protected]>
Signed-off-by: Jungtaek Lim <[email protected]>
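For readers skimming the diff below, a minimal sketch of the "iterative algorithm" scenario the description refers to (the column names and update step are hypothetical, not taken from this PR): each checkpoint() truncates the accumulated lineage into a LogicalRDD, so embedding the full origin plan there, as SPARK-39748 did, would re-grow the plan on every iteration, while carrying only stats and constraints keeps it pruned.

```scala
// Hypothetical iterative-refinement loop (spark-shell style; names are made up for illustration).
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.sql.functions._

val spark = SparkSession.builder().master("local[*]").getOrCreate()
import spark.implicits._
spark.sparkContext.setCheckpointDir("/tmp/spark-checkpoints")

var scores: DataFrame = spark.range(0, 1000).select($"id", lit(0.0).as("score"))
for (_ <- 1 to 10) {
  scores = scores.withColumn("score", $"score" + rand())
  // Each checkpoint() replaces the lineage with a LogicalRDD; after this PR the node
  // carries only the optimized plan's stats and constraints, not the plan itself.
  scores = scores.checkpoint()
}
scores.count()
```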
1 parent 869fc21 · commit 0e33195

File tree: 5 files changed, +141 / -115 lines


sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala

Lines changed: 2 additions & 22 deletions
@@ -46,7 +46,6 @@ import org.apache.spark.sql.catalyst.optimizer.CombineUnions
 import org.apache.spark.sql.catalyst.parser.{ParseException, ParserUtils}
 import org.apache.spark.sql.catalyst.plans._
 import org.apache.spark.sql.catalyst.plans.logical._
-import org.apache.spark.sql.catalyst.plans.physical.{Partitioning, PartitioningCollection}
 import org.apache.spark.sql.catalyst.trees.TreeNodeTag
 import org.apache.spark.sql.catalyst.util.IntervalUtils
 import org.apache.spark.sql.errors.{QueryCompilationErrors, QueryExecutionErrors}
@@ -710,29 +709,10 @@ class Dataset[T] private[sql](
         internalRdd.doCheckpoint()
       }
 
-      // Takes the first leaf partitioning whenever we see a `PartitioningCollection`. Otherwise the
-      // size of `PartitioningCollection` may grow exponentially for queries involving deep inner
-      // joins.
-      @scala.annotation.tailrec
-      def firstLeafPartitioning(partitioning: Partitioning): Partitioning = {
-        partitioning match {
-          case p: PartitioningCollection => firstLeafPartitioning(p.partitionings.head)
-          case p => p
-        }
-      }
-
-      val outputPartitioning = firstLeafPartitioning(physicalPlan.outputPartitioning)
-
       Dataset.ofRows(
         sparkSession,
-        LogicalRDD(
-          logicalPlan.output,
-          internalRdd,
-          Some(queryExecution.analyzed),
-          outputPartitioning,
-          physicalPlan.outputOrdering,
-          isStreaming
-        )(sparkSession)).as[T]
+        LogicalRDD.fromDataset(rdd = internalRdd, originDataset = this, isStreaming = isStreaming)
+      ).as[T]
     }
   }

sql/core/src/main/scala/org/apache/spark/sql/execution/ExistingRDD.scala

Lines changed: 63 additions & 17 deletions
@@ -18,12 +18,12 @@
 package org.apache.spark.sql.execution
 
 import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.{Encoder, SparkSession}
+import org.apache.spark.sql.{Dataset, Encoder, SparkSession}
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.plans.logical._
-import org.apache.spark.sql.catalyst.plans.physical.{Partitioning, UnknownPartitioning}
+import org.apache.spark.sql.catalyst.plans.physical.{Partitioning, PartitioningCollection, UnknownPartitioning}
 import org.apache.spark.sql.catalyst.util.truncatedString
 import org.apache.spark.sql.execution.metric.SQLMetrics
 
@@ -86,19 +86,24 @@ case class ExternalRDDScanExec[T](
 /**
  * Logical plan node for scanning data from an RDD of InternalRow.
  *
- * It is advised to set the field `originLogicalPlan` if the RDD is directly built from DataFrame,
- * as the stat can be inherited from `originLogicalPlan`.
+ * It is advised to set the field `originStats` and `originConstraints` if the RDD is directly
+ * built from DataFrame, so that Spark can make better optimizations.
 */
 case class LogicalRDD(
     output: Seq[Attribute],
     rdd: RDD[InternalRow],
-    originLogicalPlan: Option[LogicalPlan] = None,
     outputPartitioning: Partitioning = UnknownPartitioning(0),
    override val outputOrdering: Seq[SortOrder] = Nil,
-    override val isStreaming: Boolean = false)(session: SparkSession)
+    override val isStreaming: Boolean = false)(
+    session: SparkSession,
+    // originStats and originConstraints are intentionally placed to "second" parameter list,
+    // to prevent catalyst rules to mistakenly transform and rewrite them. Do not change this.
+    originStats: Option[Statistics] = None,
+    originConstraints: Option[ExpressionSet] = None)
   extends LeafNode with MultiInstanceRelation {
 
-  override protected final def otherCopyArgs: Seq[AnyRef] = session :: Nil
+  override protected final def otherCopyArgs: Seq[AnyRef] =
+    session :: originStats :: originConstraints :: Nil
 
   override def newInstance(): LogicalRDD.this.type = {
     val rewrite = output.zip(output.map(_.newInstance())).toMap
@@ -116,37 +121,78 @@ case class LogicalRDD(
       case e: Attribute => rewrite.getOrElse(e, e)
     }.asInstanceOf[SortOrder])
 
-    val rewrittenOriginLogicalPlan = originLogicalPlan.map { plan =>
-      assert(output == plan.output, "The output columns are expected to the same for output " +
-        s"and originLogicalPlan. output: $output / output in originLogicalPlan: ${plan.output}")
+    val rewrittenStatistics = originStats.map { s =>
+      Statistics(
+        s.sizeInBytes,
+        s.rowCount,
+        AttributeMap[ColumnStat](s.attributeStats.map {
+          case (attr, v) => (rewrite.getOrElse(attr, attr), v)
+        }),
+        s.isRuntime
+      )
+    }
 
-      val projectList = output.map { attr =>
-        Alias(attr, attr.name)(exprId = rewrite(attr).exprId)
-      }
-      Project(projectList, plan)
+    val rewrittenConstraints = originConstraints.map { c =>
+      c.map(_.transform {
+        case e: Attribute => rewrite.getOrElse(e, e)
+      })
     }
 
     LogicalRDD(
       output.map(rewrite),
       rdd,
-      rewrittenOriginLogicalPlan,
      rewrittenPartitioning,
      rewrittenOrdering,
      isStreaming
-    )(session).asInstanceOf[this.type]
+    )(session, rewrittenStatistics, rewrittenConstraints).asInstanceOf[this.type]
  }
 
   override protected def stringArgs: Iterator[Any] = Iterator(output, isStreaming)
 
   override def computeStats(): Statistics = {
-    originLogicalPlan.map(_.stats).getOrElse {
+    originStats.getOrElse {
       Statistics(
         // TODO: Instead of returning a default value here, find a way to return a meaningful size
         // estimate for RDDs. See PR 1238 for more discussions.
         sizeInBytes = BigInt(session.sessionState.conf.defaultSizeInBytes)
       )
     }
   }
+
+  override lazy val constraints: ExpressionSet = originConstraints.getOrElse(ExpressionSet())
+}
+
+object LogicalRDD {
+  /**
+   * Create a new LogicalRDD based on existing Dataset. Stats and constraints are inherited from
+   * origin Dataset.
+   */
+  private[sql] def fromDataset(
+      rdd: RDD[InternalRow],
+      originDataset: Dataset[_],
+      isStreaming: Boolean): LogicalRDD = {
+    // Takes the first leaf partitioning whenever we see a `PartitioningCollection`. Otherwise the
+    // size of `PartitioningCollection` may grow exponentially for queries involving deep inner
+    // joins.
+    @scala.annotation.tailrec
+    def firstLeafPartitioning(partitioning: Partitioning): Partitioning = {
+      partitioning match {
+        case p: PartitioningCollection => firstLeafPartitioning(p.partitionings.head)
+        case p => p
+      }
+    }
+
+    val optimizedPlan = originDataset.queryExecution.optimizedPlan
+    val executedPlan = originDataset.queryExecution.executedPlan
+
+    LogicalRDD(
+      originDataset.logicalPlan.output,
+      rdd,
+      firstLeafPartitioning(executedPlan.outputPartitioning),
+      executedPlan.outputOrdering,
+      isStreaming
+    )(originDataset.sparkSession, Some(optimizedPlan.stats), Some(optimizedPlan.constraints))
+  }
 }
 
 /** Physical plan node for scanning data from an RDD of InternalRow. */
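
The inline comment about the "second" parameter list is the load-bearing design choice here: Catalyst rules transform nodes through their primary constructor arguments, while curried arguments are only carried along via otherCopyArgs. A standalone sketch of that Scala behavior, using a made-up Node class rather than Spark's actual TreeNode:

```scala
// Standalone illustration (not Spark's TreeNode): only the first parameter list of a
// case class participates in productIterator, equals, and copy.
case class Node(children: Seq[Node], label: String)(val metadata: Option[String] = None)

val n = Node(Nil, "leaf")(Some("rowCount=2"))

// A generic "visit every constructor field" pass sees only the first list...
assert(n.productIterator.toList == List(Nil, "leaf"))

// ...so data in the curried list is never rewritten by such a pass; it has to be
// threaded through explicitly, which is what LogicalRDD.otherCopyArgs does for
// originStats and originConstraints.
println(n.metadata) // Some(rowCount=2)
```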

sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/ForeachBatchSink.scala

Lines changed: 2 additions & 23 deletions
@@ -19,7 +19,6 @@ package org.apache.spark.sql.execution.streaming.sources
 
 import org.apache.spark.sql._
 import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
-import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
 import org.apache.spark.sql.execution.LogicalRDD
 import org.apache.spark.sql.execution.streaming.Sink
 import org.apache.spark.sql.streaming.DataStreamWriter
@@ -28,33 +27,13 @@ class ForeachBatchSink[T](batchWriter: (Dataset[T], Long) => Unit, encoder: Expr
   extends Sink {
 
   override def addBatch(batchId: Long, data: DataFrame): Unit = {
-    val rdd = data.queryExecution.toRdd
-    val executedPlan = data.queryExecution.executedPlan
-    val analyzedPlanWithoutMarkerNode = eliminateWriteMarkerNode(data.queryExecution.analyzed)
-    // assertion on precondition
-    assert(data.logicalPlan.output == analyzedPlanWithoutMarkerNode.output)
-    val node = LogicalRDD(
-      data.logicalPlan.output,
-      rdd,
-      Some(analyzedPlanWithoutMarkerNode),
-      executedPlan.outputPartitioning,
-      executedPlan.outputOrdering)(data.sparkSession)
+    val node = LogicalRDD.fromDataset(rdd = data.queryExecution.toRdd, originDataset = data,
+      isStreaming = false)
     implicit val enc = encoder
     val ds = Dataset.ofRows(data.sparkSession, node).as[T]
     batchWriter(ds, batchId)
   }
 
-  /**
-   * ForEachBatchSink implementation reuses the logical plan of `data` which breaks the contract
-   * of Sink.addBatch, which `data` should be just used to "collect" the output data.
-   * We have to deal with eliminating marker node here which we do this in streaming specific
-   * optimization rule.
-   */
-  private def eliminateWriteMarkerNode(plan: LogicalPlan): LogicalPlan = plan match {
-    case node: WriteToMicroBatchDataSourceV1 => node.child
-    case node => node
-  }
-
   override def toString(): String = "ForeachBatchSink"
 }
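
For context, a hedged sketch of how user code reaches ForeachBatchSink.addBatch (the source, sink logic, and names below are illustrative, not from this PR): the Dataset handed to the user function is now built with LogicalRDD.fromDataset, so it carries the micro-batch's stats and constraints without re-using the streaming query's logical plan.

```scala
import org.apache.spark.sql.{DataFrame, SparkSession}

val spark = SparkSession.builder().master("local[*]").getOrCreate()

// Hypothetical batch handler: `batch` wraps the already-computed RDD for this
// micro-batch, backed by LogicalRDD.fromDataset, so further DataFrame operations
// on it can use the origin stats and constraints.
def processBatch(batch: DataFrame, batchId: Long): Unit = {
  batch.groupBy().count().show()
}

val query = spark.readStream
  .format("rate")
  .load()
  .writeStream
  .foreachBatch(processBatch _)
  .start()

query.awaitTermination()
```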

sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala

Lines changed: 73 additions & 9 deletions
@@ -31,10 +31,10 @@ import org.scalatest.matchers.should.Matchers._
 
 import org.apache.spark.SparkException
 import org.apache.spark.scheduler.{SparkListener, SparkListenerJobEnd}
-import org.apache.spark.sql.catalyst.TableIdentifier
+import org.apache.spark.sql.catalyst.{InternalRow, TableIdentifier}
 import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation
 import org.apache.spark.sql.catalyst.encoders.{ExpressionEncoder, RowEncoder}
-import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeMap, AttributeReference, Uuid}
+import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeMap, AttributeReference, EqualTo, ExpressionSet, GreaterThan, Literal, Uuid}
 import org.apache.spark.sql.catalyst.optimizer.ConvertToLocalRelation
 import org.apache.spark.sql.catalyst.plans.logical.{ColumnStat, LeafNode, LocalRelation, LogicalPlan, OneRowRelation, Statistics}
 import org.apache.spark.sql.catalyst.util.DateTimeUtils
@@ -2011,7 +2011,7 @@ class DataFrameSuite extends QueryTest
     }
   }
 
-  test("SPARK-39748: build the stats for LogicalRDD based on originLogicalPlan") {
+  test("SPARK-39834: build the stats for LogicalRDD based on origin stats") {
     def buildExpectedColumnStats(attrs: Seq[Attribute]): AttributeMap[ColumnStat] = {
       AttributeMap(
         attrs.map {
@@ -2040,7 +2040,8 @@ class DataFrameSuite extends QueryTest
 
     val outputList = Seq(
       AttributeReference("cbool", BooleanType)(),
-      AttributeReference("cbyte", BooleanType)()
+      AttributeReference("cbyte", ByteType)(),
+      AttributeReference("cint", IntegerType)()
     )
 
     val expectedSize = 16
@@ -2052,9 +2053,11 @@ class DataFrameSuite extends QueryTest
     withSQLConf(SQLConf.CBO_ENABLED.key -> "true") {
       val df = Dataset.ofRows(spark, statsPlan)
 
+      // We can't leverage LogicalRDD.fromDataset here, since it triggers physical planning and
+      // there is no matching physical node for OutputListAwareStatsTestPlan.
       val logicalRDD = LogicalRDD(
-        df.logicalPlan.output, spark.sparkContext.emptyRDD, Some(df.queryExecution.analyzed),
-        isStreaming = true)(spark)
+        df.logicalPlan.output, spark.sparkContext.emptyRDD[InternalRow], isStreaming = true)(
+        spark, Some(df.queryExecution.optimizedPlan.stats), None)
 
       val stats = logicalRDD.computeStats()
       val expectedStats = Statistics(sizeInBytes = expectedSize, rowCount = Some(2),
@@ -2065,14 +2068,52 @@ class DataFrameSuite extends QueryTest
       // reflected as well.
       val newLogicalRDD = logicalRDD.newInstance()
       val newStats = newLogicalRDD.computeStats()
-      // LogicalRDD.newInstance adds projection to originLogicalPlan, which triggers estimation
-      // on sizeInBytes. We don't intend to check the estimated value.
-      val newExpectedStats = Statistics(sizeInBytes = newStats.sizeInBytes, rowCount = Some(2),
+      val newExpectedStats = Statistics(sizeInBytes = expectedSize, rowCount = Some(2),
         attributeStats = buildExpectedColumnStats(newLogicalRDD.output))
       assert(newStats === newExpectedStats)
     }
   }
 
+  test("SPARK-39834: build the constraints for LogicalRDD based on origin constraints") {
+    def buildExpectedConstraints(attrs: Seq[Attribute]): ExpressionSet = {
+      val exprs = attrs.flatMap { attr =>
+        attr.dataType match {
+          case BooleanType => Some(EqualTo(attr, Literal(true, BooleanType)))
+          case IntegerType => Some(GreaterThan(attr, Literal(5, IntegerType)))
+          case _ => None
+        }
+      }
+      ExpressionSet(exprs)
+    }
+
+    val outputList = Seq(
+      AttributeReference("cbool", BooleanType)(),
+      AttributeReference("cbyte", ByteType)(),
+      AttributeReference("cint", IntegerType)()
+    )
+
+    val statsPlan = OutputListAwareConstraintsTestPlan(outputList = outputList)
+
+    val df = Dataset.ofRows(spark, statsPlan)
+
+    // We can't leverage LogicalRDD.fromDataset here, since it triggers physical planning and
+    // there is no matching physical node for OutputListAwareConstraintsTestPlan.
+    val logicalRDD = LogicalRDD(
+      df.logicalPlan.output, spark.sparkContext.emptyRDD[InternalRow], isStreaming = true)(
+      spark, None, Some(df.queryExecution.optimizedPlan.constraints))
+
+    val constraints = logicalRDD.constraints
+    val expectedConstraints = buildExpectedConstraints(logicalRDD.output)
+    assert(constraints === expectedConstraints)
+
+    // This method re-issues expression IDs for all outputs. We expect constraints to be
+    // reflected as well.
+    val newLogicalRDD = logicalRDD.newInstance()
+    val newConstraints = newLogicalRDD.constraints
+    val newExpectedConstraints = buildExpectedConstraints(newLogicalRDD.output)
+    assert(newConstraints === newExpectedConstraints)
+  }
+
   test("SPARK-10656: completely support special chars") {
     val df = Seq(1 -> "a").toDF("i_$.a", "d^'a.")
     checkAnswer(df.select(df("*")), Row(1, "a"))
@@ -3356,3 +3397,26 @@ case class OutputListAwareStatsTestPlan(
   }
   override def newInstance(): LogicalPlan = copy(outputList = outputList.map(_.newInstance()))
 }
+
+/**
+ * This class is used for unit-testing. It's a logical plan whose output is passed in.
+ */
+case class OutputListAwareConstraintsTestPlan(
+    outputList: Seq[Attribute]) extends LeafNode with MultiInstanceRelation {
+  override def output: Seq[Attribute] = outputList
+
+  override lazy val constraints: ExpressionSet = {
+    val exprs = outputList.flatMap { attr =>
+      attr.dataType match {
+        case BooleanType => Some(EqualTo(attr, Literal(true, BooleanType)))
+        case IntegerType => Some(GreaterThan(attr, Literal(5, IntegerType)))
+        case _ => None
+      }
+    }
+    ExpressionSet(exprs)
+  }
+
+  override def newInstance(): LogicalPlan = copy(outputList = outputList.map(_.newInstance()))
+}
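
Beyond the unit tests above, a rough end-to-end sketch of why the carried-over constraints can matter (not part of this PR's test suite; whether the redundant filter is actually pruned depends on the optimizer rules in the running Spark version):

```scala
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().master("local[*]").getOrCreate()
import spark.implicits._
spark.sparkContext.setCheckpointDir("/tmp/spark-checkpoints")

// The filter establishes the constraint `id > 5` on the origin (optimized) plan.
val filtered = spark.range(0, 100).filter($"id" > 5)

// checkpoint() cuts the lineage, but the resulting LogicalRDD keeps that constraint.
val pruned = filtered.checkpoint()

// A downstream filter already implied by the carried-over constraint can be
// recognized as redundant by the optimizer (e.g. PruneFilters) instead of
// being re-evaluated against every row.
pruned.filter($"id" > 5).explain(true)
```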
