[SPARK-18021][SQL] Refactor file name specification for data sources #15562
Changes from all commits: 426ed1f, 6b79d88, 34b3cb1, 229682f, 7aaded1
```diff
@@ -26,7 +26,7 @@ import org.apache.parquet.hadoop.util.ContextUtil
 import org.apache.spark.sql.Row
 import org.apache.spark.sql.catalyst.InternalRow
-import org.apache.spark.sql.execution.datasources.{BucketingUtils, OutputWriter, OutputWriterFactory, WriterContainer}
+import org.apache.spark.sql.execution.datasources.{OutputWriter, OutputWriterFactory}
 import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.types.StructType
 import org.apache.spark.util.SerializableConfiguration
@@ -122,47 +122,29 @@ private[parquet] class ParquetOutputWriterFactory(
   }
 
   /** Disable the use of the older API. */
-  def newInstance(
+  override def newInstance(
       path: String,
-      bucketId: Option[Int],
+      fileNamePrefix: String,
       dataSchema: StructType,
       context: TaskAttemptContext): OutputWriter = {
-    throw new UnsupportedOperationException(
-      "this version of newInstance not supported for " +
+    throw new UnsupportedOperationException("this version of newInstance not supported for " +
       "ParquetOutputWriterFactory")
   }
 }
 
 // NOTE: This class is instantiated and used on executor side only, no need to be serializable.
 private[parquet] class ParquetOutputWriter(
-    path: String,
-    bucketId: Option[Int],
+    stagingDir: String,
+    fileNamePrefix: String,
     context: TaskAttemptContext)
   extends OutputWriter {
 
   private val recordWriter: RecordWriter[Void, InternalRow] = {
     val outputFormat = {
       new ParquetOutputFormat[InternalRow]() {
-        // Here we override `getDefaultWorkFile` for two reasons:
-        //
-        //  1. To allow appending. We need to generate unique output file names to avoid
-        //     overwriting existing files (either exist before the write job, or are just written
-        //     by other tasks within the same write job).
-        //
-        //  2. To allow dynamic partitioning. Default `getDefaultWorkFile` uses
-        //     `FileOutputCommitter.getWorkPath()`, which points to the base directory of all
-        //     partitions in the case of dynamic partitioning.
         override def getDefaultWorkFile(context: TaskAttemptContext, extension: String): Path = {
-          val configuration = context.getConfiguration
-          val uniqueWriteJobId = configuration.get(WriterContainer.DATASOURCE_WRITEJOBUUID)
-          val taskAttemptId = context.getTaskAttemptID
-          val split = taskAttemptId.getTaskID.getId
-          val bucketString = bucketId.map(BucketingUtils.bucketIdToString).getOrElse("")
-          // It has the `.parquet` extension at the end because (de)compression tools
-          // such as gunzip would not be able to decompress this as the compression
-          // is not applied on this whole file but on each "page" in Parquet format.
-          new Path(path, f"part-r-$split%05d-$uniqueWriteJobId$bucketString$extension")
+          new Path(stagingDir, fileNamePrefix + extension)
         }
       }
     }
```
Contributor (on `new Path(stagingDir, fileNamePrefix + extension)`): why remove this comment?

Author: Basically the only contract now is that the prefix needs to be enforced, and it is not the concern of these classes to think about dynamic partitioning or appending.
Contributor (on the removed `getDefaultWorkFile` comment): We should probably preserve this comment and move it to the new place where we generate the UUID.

Author: It's actually there already in WriteJobDescription. I shortened it to a single line.
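For context, a hedged sketch of the centralized naming the reviewers refer to (the object and method names are invented for illustration; only the `part-r-...` pattern, the zero-padded bucket suffix, and the per-job UUID idea come from the removed code above):

```scala
import java.util.UUID

// Hypothetical sketch: the unique pieces of a file name are generated once,
// upstream of any OutputWriter, and handed down as a plain prefix string.
object FileNamePrefixSketch {
  // One UUID per write job, so appends and concurrent jobs never collide.
  private val jobUUID: String = UUID.randomUUID().toString

  // Mirrors the removed pattern: part-r-$split%05d-$uniqueWriteJobId$bucketString
  def prefix(taskId: Int, bucketId: Option[Int]): String = {
    val bucketString = bucketId.map(id => f"_$id%05d").getOrElse("")
    f"part-r-$taskId%05d-$jobUUID$bucketString"
  }
}

// e.g. FileNamePrefixSketch.prefix(3, None) returns "part-r-00003-<uuid>",
// which a writer turns into "part-r-00003-<uuid>.parquet".
```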