
Commit dff28ea

wangyum and cloud-fan authored and committed
[HADP-56882][SPARK-51281][SQL] DataFrameWriterV2 should respect the path option (apache#705)
* [SPARK-51281][SQL] DataFrameWriterV2 should respect the path option

Unlike `DataFrameWriter.saveAsTable`, where we explicitly get the "path" option and treat it as the table location, `DataFrameWriterV2` does not: it treats the "path" option as a normal option that has no real effect. This PR fixes that and adds a legacy config to restore the old behavior.

Why are the changes needed: bug fix.
Does this PR introduce any user-facing change: yes, `DataFrameWriterV2` can now correctly write data to the specified path for file source tables.
How was this patch tested: new test.
Was this patch authored or co-authored using generative AI tooling: no.

Closes apache#50040 from cloud-fan/prop.

Lead-authored-by: Wenchen Fan <[email protected]>
Co-authored-by: Wenchen Fan <[email protected]>
Signed-off-by: Wenchen Fan <[email protected]>
(cherry picked from commit a3671e5)
Signed-off-by: Wenchen Fan <[email protected]>
1 parent 25b586e commit dff28ea
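
With the fix, a `writeTo(...).create()` call that carries a "path" option places the table data at that path rather than at the default table location. A minimal sketch of the user-facing behavior (the table name and path are illustrative, and an existing `spark` session is assumed):

  // Creates table `demo` using the JSON file source, with its data stored
  // under /tmp/demo_path instead of the default warehouse location.
  spark.range(10)
    .writeTo("demo")
    .using("json")
    .option("path", "/tmp/demo_path")
    .create()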

3 files changed: +53, −18 lines


sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala

Lines changed: 9 additions & 0 deletions
@@ -5097,6 +5097,15 @@ object SQLConf {
       .booleanConf
       .createWithDefault(false)
 
+  val LEGACY_DF_WRITER_V2_IGNORE_PATH_OPTION =
+    buildConf("spark.sql.legacy.dataFrameWriterV2IgnorePathOption")
+      .internal()
+      .doc("When set to true, DataFrameWriterV2 ignores the 'path' option and always write data " +
+        "to the default table location.")
+      .version("3.5.0")
+      .booleanConf
+      .createWithDefault(false)
+
   val MAX_REPAIR_PARTITION_NUM =
     buildConf("spark.carmel.sql.repairPartition.maxNum")
       .internal()
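
If the old behavior is required, the new legacy flag can be enabled before writing; a minimal sketch (an existing `spark` session is assumed):

  // With the flag on, DataFrameWriterV2 ignores the 'path' option and writes
  // to the default table location, as it did before this change.
  spark.conf.set("spark.sql.legacy.dataFrameWriterV2IgnorePathOption", "true")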

sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriterV2.scala

Lines changed: 18 additions & 18 deletions
@@ -24,9 +24,11 @@ import org.apache.spark.annotation.Experimental
 import org.apache.spark.sql.catalyst.analysis.{CannotReplaceMissingTableException, NoSuchTableException, TableAlreadyExistsException, UnresolvedIdentifier, UnresolvedRelation}
 import org.apache.spark.sql.catalyst.expressions.{Attribute, Bucket, Days, Hours, Literal, Months, Years}
 import org.apache.spark.sql.catalyst.plans.logical.{AppendData, CreateTableAsSelect, LogicalPlan, OptionList, OverwriteByExpression, OverwritePartitionsDynamic, ReplaceTableAsSelect, UnresolvedTableSpec}
+import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap
 import org.apache.spark.sql.connector.expressions.{LogicalExpressions, NamedReference, Transform}
 import org.apache.spark.sql.errors.QueryCompilationErrors
 import org.apache.spark.sql.execution.QueryExecution
+import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.types.IntegerType
 
 /**
@@ -108,24 +110,30 @@ final class DataFrameWriterV2[T] private[sql](table: String, ds: Dataset[T])
   }
 
   override def create(): Unit = {
-    val tableSpec = UnresolvedTableSpec(
-      properties = properties.toMap,
-      provider = provider,
-      optionExpression = OptionList(Seq.empty),
-      location = None,
-      comment = None,
-      serde = None,
-      external = false)
     runCommand(
       CreateTableAsSelect(
         UnresolvedIdentifier(tableName),
         partitioning.getOrElse(Seq.empty),
         logicalPlan,
-        tableSpec,
+        buildTableSpec(),
        options.toMap,
         false))
   }
 
+  private def buildTableSpec(): UnresolvedTableSpec = {
+    val ignorePathOption = sparkSession.sessionState.conf.getConf(
+      SQLConf.LEGACY_DF_WRITER_V2_IGNORE_PATH_OPTION)
+    UnresolvedTableSpec(
+      properties = properties.toMap,
+      provider = provider,
+      optionExpression = OptionList(Seq.empty),
+      location = if (ignorePathOption) None else CaseInsensitiveMap(options.toMap).get("path"),
+      comment = None,
+      serde = None,
+      external = false)
+  }
+
+  /** @inheritdoc */
   override def replace(): Unit = {
     internalReplace(orCreate = false)
   }
@@ -197,19 +205,11 @@ final class DataFrameWriterV2[T] private[sql](table: String, ds: Dataset[T])
   }
 
   private def internalReplace(orCreate: Boolean): Unit = {
-    val tableSpec = UnresolvedTableSpec(
-      properties = properties.toMap,
-      provider = provider,
-      optionExpression = OptionList(Seq.empty),
-      location = None,
-      comment = None,
-      serde = None,
-      external = false)
     runCommand(ReplaceTableAsSelect(
       UnresolvedIdentifier(tableName),
       partitioning.getOrElse(Seq.empty),
       logicalPlan,
-      tableSpec,
+      buildTableSpec(),
       writeOptions = options.toMap,
       orCreate = orCreate))
   }
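
The new `buildTableSpec()` resolves the table location through `CaseInsensitiveMap`, so the "path" option is picked up regardless of how the caller capitalized the key. A small sketch of that lookup (the map contents are illustrative; `CaseInsensitiveMap` is an internal catalyst utility, not a public API):

  import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap

  // "PATH", "Path" and "path" all resolve to the same entry.
  val opts = CaseInsensitiveMap(Map("PATH" -> "/tmp/demo_path"))
  assert(opts.get("path") == Some("/tmp/demo_path"))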

sql/core/src/test/scala/org/apache/spark/sql/DataFrameWriterV2Suite.scala

Lines changed: 26 additions & 0 deletions
@@ -789,4 +789,30 @@ class DataFrameWriterV2Suite extends QueryTest with SharedSparkSession with Befo
       errorClass = "CALL_ON_STREAMING_DATASET_UNSUPPORTED",
       parameters = Map("methodName" -> "`writeTo`"))
   }
+
+  test("SPARK-51281: DataFrameWriterV2 should respect the path option") {
+    def checkResults(df: DataFrame): Unit = {
+      checkAnswer(df, spark.range(10).toDF())
+    }
+
+    Seq(true, false).foreach { ignorePath =>
+      withSQLConf(SQLConf.LEGACY_DF_WRITER_V2_IGNORE_PATH_OPTION.key -> ignorePath.toString) {
+        withTable("t1", "t2") {
+          spark.range(10).writeTo("t1").using("json").create()
+          checkResults(spark.table("t1"))
+
+          withTempPath { p =>
+            val path = p.getCanonicalPath
+            spark.range(10).writeTo("t2").using("json").option("path", path).create()
+            checkResults(spark.table("t2"))
+            if (ignorePath) {
+              assert(!p.exists())
+            } else {
+              checkResults(spark.read.json(path))
+            }
+          }
+        }
+      }
+    }
+  }
 }
