
Commit 70b13e0

Delete more code
1 parent ed5e5bc commit 70b13e0

File tree

3 files changed: +2, -139 lines changed


sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormat.scala

Lines changed: 0 additions & 14 deletions
@@ -55,20 +55,6 @@ trait FileFormat {
       options: Map[String, String],
       dataSchema: StructType): OutputWriterFactory
 
-  /**
-   * Returns a [[OutputWriterFactory]] for generating output writers that can write data.
-   * This method is current used only by FileStreamSinkWriter to generate output writers that
-   * does not use output committers to write data. The OutputWriter generated by the returned
-   * [[OutputWriterFactory]] must implement the method `newWriter(path)`..
-   */
-  def buildWriter(
-      sqlContext: SQLContext,
-      dataSchema: StructType,
-      options: Map[String, String]): OutputWriterFactory = {
-    // TODO: Remove this default implementation when the other formats have been ported
-    throw new UnsupportedOperationException(s"buildWriter is not supported for $this")
-  }
-
   /**
    * Returns whether this format support returning columnar batch or not.
   *
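
The removed hook was documented as existing only for FileStreamSinkWriter, which needed writers that bypass output committers. As a rough illustration of the call pattern that documentation describes (not code from this commit; `format`, `sqlContext`, `schema`, `options`, `rows`, and the output path are all hypothetical placeholders), a sink-side caller would have looked roughly like:

    // Illustrative sketch only; identifiers here are placeholders, not Spark source.
    val factory = format.buildWriter(sqlContext, schema, options)       // the API deleted above
    val writer = factory.newWriter("/output/part-00000.snappy.parquet") // no OutputCommitter
    try {
      rows.foreach(writer.writeInternal)  // rows: Iterator[InternalRow]
    } finally {
      writer.close()  // the caller decides whether the written file counts as committed
    }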

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala

Lines changed: 0 additions & 11 deletions
@@ -415,17 +415,6 @@ class ParquetFileFormat
       }
     }
   }
-
-  override def buildWriter(
-      sqlContext: SQLContext,
-      dataSchema: StructType,
-      options: Map[String, String]): OutputWriterFactory = {
-    new ParquetOutputWriterFactory(
-      sqlContext.conf,
-      dataSchema,
-      sqlContext.sessionState.newHadoopConf(),
-      options)
-  }
 }
 
 object ParquetFileFormat extends Logging {

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetOutputWriter.scala

Lines changed: 2 additions & 114 deletions
@@ -17,125 +17,13 @@
 
 package org.apache.spark.sql.execution.datasources.parquet
 
-import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs.Path
 import org.apache.hadoop.mapreduce._
-import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl
-import org.apache.parquet.hadoop.{ParquetOutputFormat, ParquetRecordWriter}
-import org.apache.parquet.hadoop.codec.CodecConfig
-import org.apache.parquet.hadoop.util.ContextUtil
+import org.apache.parquet.hadoop.ParquetOutputFormat
 
 import org.apache.spark.sql.Row
 import org.apache.spark.sql.catalyst.InternalRow
-import org.apache.spark.sql.execution.datasources.{OutputWriter, OutputWriterFactory}
-import org.apache.spark.sql.internal.SQLConf
-import org.apache.spark.sql.types.StructType
-import org.apache.spark.util.SerializableConfiguration
-
-
-/**
- * A factory for generating OutputWriters for writing parquet files. This implemented is different
- * from the [[ParquetOutputWriter]] as this does not use any [[OutputCommitter]]. It simply
- * writes the data to the path used to generate the output writer. Callers of this factory
- * has to ensure which files are to be considered as committed.
- */
-private[parquet] class ParquetOutputWriterFactory(
-    sqlConf: SQLConf,
-    dataSchema: StructType,
-    hadoopConf: Configuration,
-    options: Map[String, String])
-  extends OutputWriterFactory {
-
-  private val serializableConf: SerializableConfiguration = {
-    val job = Job.getInstance(hadoopConf)
-    val conf = ContextUtil.getConfiguration(job)
-    val parquetOptions = new ParquetOptions(options, sqlConf)
-
-    // We're not really using `ParquetOutputFormat[Row]` for writing data here, because we override
-    // it in `ParquetOutputWriter` to support appending and dynamic partitioning. The reason why
-    // we set it here is to setup the output committer class to `ParquetOutputCommitter`, which is
-    // bundled with `ParquetOutputFormat[Row]`.
-    job.setOutputFormatClass(classOf[ParquetOutputFormat[Row]])
-
-    ParquetOutputFormat.setWriteSupportClass(job, classOf[ParquetWriteSupport])
-
-    // We want to clear this temporary metadata from saving into Parquet file.
-    // This metadata is only useful for detecting optional columns when pushing down filters.
-    val dataSchemaToWrite = StructType.removeMetadata(
-      StructType.metadataKeyForOptionalField,
-      dataSchema).asInstanceOf[StructType]
-    ParquetWriteSupport.setSchema(dataSchemaToWrite, conf)
-
-    // Sets flags for `CatalystSchemaConverter` (which converts Catalyst schema to Parquet schema)
-    // and `CatalystWriteSupport` (writing actual rows to Parquet files).
-    conf.set(
-      SQLConf.PARQUET_BINARY_AS_STRING.key,
-      sqlConf.isParquetBinaryAsString.toString)
-
-    conf.set(
-      SQLConf.PARQUET_INT96_AS_TIMESTAMP.key,
-      sqlConf.isParquetINT96AsTimestamp.toString)
-
-    conf.set(
-      SQLConf.PARQUET_WRITE_LEGACY_FORMAT.key,
-      sqlConf.writeLegacyParquetFormat.toString)
-
-    // Sets compression scheme
-    conf.set(ParquetOutputFormat.COMPRESSION, parquetOptions.compressionCodecClassName)
-    new SerializableConfiguration(conf)
-  }
-
-  /**
-   * Returns a [[OutputWriter]] that writes data to the give path without using
-   * [[OutputCommitter]].
-   */
-  override def newWriter(path: String): OutputWriter = new OutputWriter {
-
-    // Create TaskAttemptContext that is used to pass on Configuration to the ParquetRecordWriter
-    private val hadoopTaskAttemptId = new TaskAttemptID(new TaskID(new JobID, TaskType.MAP, 0), 0)
-    private val hadoopAttemptContext = new TaskAttemptContextImpl(
-      serializableConf.value, hadoopTaskAttemptId)
-
-    // Instance of ParquetRecordWriter that does not use OutputCommitter
-    private val recordWriter = createNoCommitterRecordWriter(path, hadoopAttemptContext)
-
-    override def write(row: Row): Unit = {
-      throw new UnsupportedOperationException("call writeInternal")
-    }
-
-    protected[sql] override def writeInternal(row: InternalRow): Unit = {
-      recordWriter.write(null, row)
-    }
-
-    override def close(): Unit = recordWriter.close(hadoopAttemptContext)
-  }
-
-  /** Create a [[ParquetRecordWriter]] that writes the given path without using OutputCommitter */
-  private def createNoCommitterRecordWriter(
-      path: String,
-      hadoopAttemptContext: TaskAttemptContext): RecordWriter[Void, InternalRow] = {
-    // Custom ParquetOutputFormat that disable use of committer and writes to the given path
-    val outputFormat = new ParquetOutputFormat[InternalRow]() {
-      override def getOutputCommitter(c: TaskAttemptContext): OutputCommitter = { null }
-      override def getDefaultWorkFile(c: TaskAttemptContext, ext: String): Path = { new Path(path) }
-    }
-    outputFormat.getRecordWriter(hadoopAttemptContext)
-  }
-
-  /** Disable the use of the older API. */
-  override def newInstance(
-      path: String,
-      dataSchema: StructType,
-      context: TaskAttemptContext): OutputWriter = {
-    throw new UnsupportedOperationException("this version of newInstance not supported for " +
-      "ParquetOutputWriterFactory")
-  }
-
-  override def getFileExtension(context: TaskAttemptContext): String = {
-    CodecConfig.from(context).getCodec.getExtension + ".parquet"
-  }
-}
-
+import org.apache.spark.sql.execution.datasources.OutputWriter
 
 // NOTE: This class is instantiated and used on executor side only, no need to be serializable.
 private[parquet] class ParquetOutputWriter(path: String, context: TaskAttemptContext)
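
The essential trick in the deleted ParquetOutputWriterFactory is the small ParquetOutputFormat subclass inside createNoCommitterRecordWriter: it returns no OutputCommitter and points the "work file" directly at the caller-supplied path, so records land in the final file with no temporary directory or commit step. Condensed from the deleted code above, and assuming a TaskAttemptContext already configured with the write-support class and schema (as the deleted serializableConf block arranged):

    // Condensed sketch of the deleted no-committer pattern; configuration of `ctx` is assumed.
    def noCommitterRecordWriter(
        path: String,
        ctx: TaskAttemptContext): RecordWriter[Void, InternalRow] = {
      val outputFormat = new ParquetOutputFormat[InternalRow]() {
        // No committer: nothing is promoted from a temporary location at commit time.
        override def getOutputCommitter(c: TaskAttemptContext): OutputCommitter = null
        // Write straight to the caller-chosen file instead of a per-task work file.
        override def getDefaultWorkFile(c: TaskAttemptContext, ext: String): Path = new Path(path)
      }
      outputFormat.getRecordWriter(ctx)
    }

The trade-off is that nothing cleans up partial files on task failure, which is why the deleted class comment makes callers responsible for deciding which files count as committed.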
