
Commit 5497b9f

Fix
1 parent 82e8ec7 commit 5497b9f

2 files changed: +26 -23 lines


sql/core/src/main/scala/org/apache/spark/sql/execution/python/UserDefinedPythonDataSource.scala

Lines changed: 13 additions & 15 deletions
@@ -56,8 +56,11 @@ class PythonTableProvider(shortName: String) extends TableProvider {
       schema: StructType,
       partitioning: Array[Transform],
       properties: java.util.Map[String, String]): Table = {
+    assert(partitioning.isEmpty)
     new PythonTable(shortName, source, schema)
   }
+
+  override def supportsExternalMetadata(): Boolean = true
 }
 
 class PythonTable(shortName: String, source: UserDefinedPythonDataSource, givenSchema: StructType)
@@ -70,8 +73,8 @@ class PythonTable(shortName: String, source: UserDefinedPythonDataSource, givenSchema: StructType)
   override def newScanBuilder(options: CaseInsensitiveStringMap): ScanBuilder = {
     new ScanBuilder with Batch with Scan {
 
-      private lazy val pythonFunc: PythonFunction = source.createPythonFunction(
-        shortName, options, Some(givenSchema))
+      private lazy val pythonFunc: PythonFunction =
+        source.createPythonFunction(shortName, options, givenSchema)
 
       private lazy val info: PythonDataSourceReadInfo =
         new UserDefinedPythonDataSourceReadRunner(
@@ -163,37 +166,32 @@ class PythonPartitionReaderFactory(
  */
 case class UserDefinedPythonDataSource(dataSourceCls: PythonFunction) {
 
-  private var pythonResult: PythonDataSourceCreationResult = _
-
-  private def getOrCreatePythonResult(
+  private def createPythonResult(
       shortName: String,
       options: CaseInsensitiveStringMap,
       userSpecifiedSchema: Option[StructType]): PythonDataSourceCreationResult = {
-    if (pythonResult != null) return pythonResult
-    val runner = new UserDefinedPythonDataSourceRunner(
+    new UserDefinedPythonDataSourceRunner(
       dataSourceCls,
       shortName,
       userSpecifiedSchema,
-      CaseInsensitiveMap(options.asCaseSensitiveMap().asScala.toMap))
-    pythonResult = runner.runInPython()
-    pythonResult
+      CaseInsensitiveMap(options.asCaseSensitiveMap().asScala.toMap)).runInPython()
   }
 
   def inferSchema(
       shortName: String,
       options: CaseInsensitiveStringMap): StructType = {
-    getOrCreatePythonResult(shortName, options, None).schema
+    createPythonResult(shortName, options, None).schema
   }
 
   def createPythonFunction(
       shortName: String,
       options: CaseInsensitiveStringMap,
-      userSpecifiedSchema: Option[StructType]): PythonFunction = {
-    val pickledDataSourceInstance = getOrCreatePythonResult(
-      shortName, options, userSpecifiedSchema).dataSource
+      givenSchema: StructType): PythonFunction = {
+    val dataSource = createPythonResult(
+      shortName, options, Some(givenSchema)).dataSource
 
     SimplePythonFunction(
-      command = pickledDataSourceInstance.toImmutableArraySeq,
+      command = dataSource.toImmutableArraySeq,
       envVars = dataSourceCls.envVars,
       pythonIncludes = dataSourceCls.pythonIncludes,
       pythonExec = dataSourceCls.pythonExec,
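The refactor above removes the shared pythonResult memo from UserDefinedPythonDataSource and makes createPythonResult stateless; caching now happens at the call site through the scan builder's lazy val pythonFunc. Below is a minimal, Spark-free sketch of that pattern; every name in it (CreationResult, ScanBuilderLike, createResult) is a hypothetical stand-in, not a Spark API.

// Sketch only: hypothetical stand-ins, not Spark code.
object LazyValCachingSketch {
  final case class CreationResult(schema: String, pickled: Array[Byte])

  // Stateless, like createPythonResult: every call pays the full cost
  // (a println here stands in for launching a Python worker).
  def createResult(shortName: String): CreationResult = {
    println(s"creating data source instance for '$shortName'")
    CreationResult(schema = "id INT, partition INT", pickled = Array[Byte](1, 2, 3))
  }

  // Each builder caches its own result; `lazy val` evaluates at most once
  // per instance, replacing the old shared `private var` memo.
  final class ScanBuilderLike(shortName: String) {
    private lazy val result: CreationResult = createResult(shortName)
    def schema: String = result.schema
    def pickledSize: Int = result.pickled.length
  }

  def main(args: Array[String]): Unit = {
    val builder = new ScanBuilderLike("my_source")
    println(builder.schema)      // triggers createResult once
    println(builder.pickledSize) // served from the cached lazy val
  }
}

One consequence, visible in the diff itself: inferSchema and createPythonFunction each trigger a separate runInPython() call; reuse within a scan comes only from the lazy val.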

sql/core/src/test/scala/org/apache/spark/sql/execution/python/PythonDataSourceSuite.scala

Lines changed: 13 additions & 8 deletions
@@ -18,6 +18,7 @@
 package org.apache.spark.sql.execution.python
 
 import org.apache.spark.sql.{AnalysisException, IntegratedUDFTestUtils, QueryTest, Row}
+import org.apache.spark.sql.execution.datasources.v2.DataSourceV2ScanRelation
 import org.apache.spark.sql.test.SharedSparkSession
 import org.apache.spark.sql.types.StructType

@@ -55,6 +56,10 @@ class PythonDataSourceSuite extends QueryTest with SharedSparkSession {
     val df = spark.read.format(dataSourceName).schema(schema).load()
     assert(df.rdd.getNumPartitions == 2)
     val plan = df.queryExecution.optimizedPlan
+    plan match {
+      case s: DataSourceV2ScanRelation if s.relation.table.isInstanceOf[PythonTable] =>
+      case _ => fail(s"Plan did not match the expected pattern. Actual plan:\n$plan")
+    }
     checkAnswer(df, Seq(Row(0, 0), Row(0, 1), Row(1, 0), Row(1, 1), Row(2, 0), Row(2, 1)))
   }

@@ -164,12 +169,12 @@ class PythonDataSourceSuite extends QueryTest with SharedSparkSession {
        |            paths = []
        |        return [InputPartition(p) for p in paths]
        |
-       |    def read(self, path):
-       |        if path is not None:
-       |            assert isinstance(path, InputPartition)
-       |            yield (path.value, 1)
+       |    def read(self, part):
+       |        if part is not None:
+       |            assert isinstance(part, InputPartition)
+       |            yield (part.value, 1)
        |        else:
-       |            yield (path, 1)
+       |            yield (part, 1)
        |
        |class $dataSourceName(DataSource):
        |    @classmethod
@@ -256,7 +261,7 @@ class PythonDataSourceSuite extends QueryTest with SharedSparkSession {
   }
 
   test("data source read with custom partitions") {
-    assume(shouldTestPythonUDFs)
+    assume(shouldTestPandasUDFs)
     val dataSourceScript =
       s"""
         |from pyspark.sql.datasource import DataSource, DataSourceReader, InputPartition
@@ -288,7 +293,7 @@ class PythonDataSourceSuite extends QueryTest with SharedSparkSession {
   }
 
   test("data source read with empty partitions") {
-    assume(shouldTestPythonUDFs)
+    assume(shouldTestPandasUDFs)
     val dataSourceScript =
       s"""
         |from pyspark.sql.datasource import DataSource, DataSourceReader
@@ -316,7 +321,7 @@ class PythonDataSourceSuite extends QueryTest with SharedSparkSession {
   }
 
   test("data source read with invalid partitions") {
-    assume(shouldTestPythonUDFs)
+    assume(shouldTestPandasUDFs)
     val reader1 =
       s"""
         |class SimpleDataSourceReader(DataSourceReader):
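The plan check added to the first test uses a match-or-fail idiom: pattern-match the optimized plan against the expected node and fail with the actual plan printed otherwise. Below is a minimal, Spark-free sketch of the same idiom; the plan nodes here (ScanRelation, Filter) are hypothetical stand-ins for Spark's DataSourceV2ScanRelation and friends.

// Sketch only: toy plan nodes, not Spark's Catalyst plan types.
object PlanMatchSketch {
  sealed trait Plan
  final case class ScanRelation(tableClass: String) extends Plan
  final case class Filter(child: Plan) extends Plan

  // Match the expected shape; otherwise fail with the actual plan in the
  // message, mirroring the test's fail(...) branch.
  def assertPythonScan(plan: Plan): Unit = plan match {
    case ScanRelation("PythonTable") => // expected shape: nothing more to check
    case other =>
      throw new AssertionError(
        s"Plan did not match the expected pattern. Actual plan:\n$other")
  }

  def main(args: Array[String]): Unit = {
    assertPythonScan(ScanRelation("PythonTable")) // passes silently
    try assertPythonScan(Filter(ScanRelation("PythonTable")))
    catch { case e: AssertionError => println(e.getMessage) } // prints the plan
  }
}

An empty case body on a successful match is the usual way to express "this shape is acceptable" in such assertions; only the fallback branch fails.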
