[SPARK-23817][SQL] Migrate ORC file format read path to data source V2 #20933
Changes from all commits. The diff touches `DataFrameWriter.scala` (the file name follows from the `DataFrameWriter` class shown in the hunk header below).
```diff
@@ -30,8 +30,7 @@ import org.apache.spark.sql.catalyst.plans.logical.{AnalysisBarrier, InsertIntoT
 import org.apache.spark.sql.execution.SQLExecution
 import org.apache.spark.sql.execution.command.DDLUtils
 import org.apache.spark.sql.execution.datasources.{CreateTable, DataSource, LogicalRelation}
-import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Utils
-import org.apache.spark.sql.execution.datasources.v2.WriteToDataSourceV2
+import org.apache.spark.sql.execution.datasources.v2.{DataSourceV2Utils, FileDataSourceV2, WriteToDataSourceV2}
 import org.apache.spark.sql.sources.BaseRelation
 import org.apache.spark.sql.sources.v2._
 import org.apache.spark.sql.types.StructType
```
```diff
@@ -241,39 +240,47 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) {
     val cls = DataSource.lookupDataSource(source, df.sparkSession.sessionState.conf)
     if (classOf[DataSourceV2].isAssignableFrom(cls)) {
       val ds = cls.newInstance()
-      ds match {
-        case ws: WriteSupport =>
-          val options = new DataSourceOptions((extraOptions ++
-            DataSourceV2Utils.extractSessionConfigs(
-              ds = ds.asInstanceOf[DataSourceV2],
-              conf = df.sparkSession.sessionState.conf)).asJava)
-          // Using a timestamp and a random UUID to distinguish different writing jobs. This is good
-          // enough as there won't be tons of writing jobs created at the same second.
-          val jobId = new SimpleDateFormat("yyyyMMddHHmmss", Locale.US)
-            .format(new Date()) + "-" + UUID.randomUUID()
-          val writer = ws.createWriter(jobId, df.logicalPlan.schema, mode, options)
-          if (writer.isPresent) {
-            runCommand(df.sparkSession, "save") {
-              WriteToDataSourceV2(writer.get(), df.logicalPlan)
-            }
-          }
+      val (needToFallBackFileDataSourceV2, fallBackFileFormat) = ds match {
+        case f: FileDataSourceV2 =>
+          val disabledV2Readers =
+            df.sparkSession.sessionState.conf.disabledV2FileDataSourceWriter.split(",")
+          (disabledV2Readers.contains(f.shortName), f.fallBackFileFormat.getCanonicalName)
+        case _ => (false, source)
+      }

-        // Streaming also uses the data source V2 API. So it may be that the data source implements
-        // v2, but has no v2 implementation for batch writes. In that case, we fall back to saving
-        // as though it's a V1 source.
-        case _ => saveToV1Source()
+      if (ds.isInstanceOf[WriteSupport] && !needToFallBackFileDataSourceV2) {
+        val options = new DataSourceOptions((extraOptions ++
+          DataSourceV2Utils.extractSessionConfigs(
+            ds = ds.asInstanceOf[DataSourceV2],
+            conf = df.sparkSession.sessionState.conf)).asJava)
+        // Using a timestamp and a random UUID to distinguish different writing jobs. This is good
+        // enough as there won't be tons of writing jobs created at the same second.
+        val jobId = new SimpleDateFormat("yyyyMMddHHmmss", Locale.US)
+          .format(new Date()) + "-" + UUID.randomUUID()
+        val writer = ds.asInstanceOf[WriteSupport]
+          .createWriter(jobId, df.logicalPlan.schema, mode, options)
```
An inline review thread is attached to the `createWriter` call:

**Contributor:** I am not sure I understand this: why do we use […]

**Contributor:** It is. We're still evolving the v2 API and integration with Spark. This problem is addressed in PR #21305, which is the first of a series of changes to standardize the logical plans and fix problems like this one. There's also an open proposal for those changes.
The hunk continues below the thread:

```diff
+        if (writer.isPresent) {
+          runCommand(df.sparkSession, "save") {
+            WriteToDataSourceV2(writer.get(), df.logicalPlan)
+          }
+        }
+      } else {
+        // In the following cases, we fall back to saving with V1:
+        // 1. The data source implements v2, but has no v2 implementation for write path.
+        // 2. The v2 writer of the data source is configured as disabled.
+        saveToV1Source(fallBackFileFormat)
+      }
     } else {
-      saveToV1Source()
+      saveToV1Source(source)
     }
   }

-  private def saveToV1Source(): Unit = {
+  private def saveToV1Source(className: String): Unit = {
     // Code path for data source v1.
     runCommand(df.sparkSession, "save") {
       DataSource(
         sparkSession = df.sparkSession,
-        className = source,
+        className = className,
         partitionColumns = partitioningColumns.getOrElse(Nil),
         options = extraOptions.toMap).planForWriting(mode, AnalysisBarrier(df.logicalPlan))
     }
```
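Taken together, the change computes a fallback decision up front: a source that implements `FileDataSourceV2` is written through the v2 path only if it also implements `WriteSupport` and its short name is not listed in the session's disabled-v2-writers conf; otherwise the save is routed to its v1 fallback `FileFormat`. Below is a minimal, self-contained sketch of just that decision logic; every trait and class here is a simplified stand-in for Spark's internals, not the real API:

```scala
// Simplified stand-ins for Spark's interfaces; not the real API.
trait DataSourceV2
trait WriteSupport extends DataSourceV2
trait FileDataSourceV2 extends DataSourceV2 {
  def shortName: String
  def fallBackFileFormat: Class[_]
}

// Hypothetical ORC v2 source; the fallback class is a placeholder here.
class OrcV2Stub extends FileDataSourceV2 with WriteSupport {
  def shortName: String = "orc"
  def fallBackFileFormat: Class[_] = classOf[Object] // stands in for OrcFileFormat
}

object FallbackDemo {
  // Mirrors the tuple computed in the patch:
  // (needToFallBackFileDataSourceV2, fallback class name to hand to the v1 path).
  def fallbackInfo(ds: DataSourceV2, source: String, disabledWriters: String): (Boolean, String) =
    ds match {
      case f: FileDataSourceV2 =>
        val disabled = disabledWriters.split(",").map(_.trim)
        (disabled.contains(f.shortName), f.fallBackFileFormat.getCanonicalName)
      case _ => (false, source)
    }

  def main(args: Array[String]): Unit = {
    val orc = new OrcV2Stub
    // "orc" listed as disabled: the save must go through the v1 fallback class.
    println(fallbackInfo(orc, "orc", disabledWriters = "orc,csv")) // (true,java.lang.Object)
    // Nothing disabled and WriteSupport implemented: the v2 write path is used.
    println(fallbackInfo(orc, "orc", disabledWriters = ""))        // (false,java.lang.Object)
  }
}
```

In the patch itself the same boolean then gates `ds.isInstanceOf[WriteSupport] && !needToFallBackFileDataSourceV2`, so a disabled v2 file source ends up in `saveToV1Source(fallBackFileFormat)` while everything else keeps its previous behavior.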
A second review comment, attached to the new `saveToV1Source(className: String)` signature, survives only as a fragment:

**Contributor:** […] param doc for them.
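The fragment appears to request parameter documentation for the changed signature. A sketch of what such a doc comment could look like; the wording below is ours, not taken from the PR, and the host class exists only to make the snippet compile:

```scala
// Sketch only: a stand-in class to host the documented signature.
class DataFrameWriterDocSketch {
  /**
   * Saves the content of the DataFrame via the data source v1 code path.
   *
   * @param className the v1 data source to write with: either the original
   *                  `source` string, or, when a FileDataSourceV2 is disabled
   *                  or lacks v2 write support, the canonical class name of
   *                  its fallback FileFormat.
   */
  private def saveToV1Source(className: String): Unit = {
    // Body elided; see the diff above.
  }
}
```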