
Commit 56667bd

Author: Nathan Howell (committed)
[SPARK-18658][SQL] Write text records directly to a FileOutputStream
1 parent c51c772 commit 56667bd

File tree

8 files changed, +120 -143 lines changed

mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala

Lines changed: 8 additions & 20 deletions
@@ -21,9 +21,7 @@ import java.io.IOException
 
 import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs.{FileStatus, Path}
-import org.apache.hadoop.io.{NullWritable, Text}
-import org.apache.hadoop.mapreduce.{Job, RecordWriter, TaskAttemptContext}
-import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat
+import org.apache.hadoop.mapreduce.{Job, TaskAttemptContext}
 
 import org.apache.spark.TaskContext
 import org.apache.spark.ml.feature.LabeledPoint
@@ -35,7 +33,6 @@ import org.apache.spark.sql.catalyst.encoders.RowEncoder
 import org.apache.spark.sql.catalyst.expressions.AttributeReference
 import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeProjection
 import org.apache.spark.sql.execution.datasources._
-import org.apache.spark.sql.execution.datasources.text.TextOutputWriter
 import org.apache.spark.sql.sources._
 import org.apache.spark.sql.types._
 import org.apache.spark.util.SerializableConfiguration
@@ -46,30 +43,21 @@ private[libsvm] class LibSVMOutputWriter(
     context: TaskAttemptContext)
   extends OutputWriter {
 
-  private[this] val buffer = new Text()
-
-  private val recordWriter: RecordWriter[NullWritable, Text] = {
-    new TextOutputFormat[NullWritable, Text]() {
-      override def getDefaultWorkFile(context: TaskAttemptContext, extension: String): Path = {
-        new Path(path)
-      }
-    }.getRecordWriter(context)
-  }
+  private val writer = CodecStreams.getOutputStreamWriter(context, new Path(path))
 
   override def write(row: Row): Unit = {
     val label = row.get(0)
     val vector = row.get(1).asInstanceOf[Vector]
-    val sb = new StringBuilder(label.toString)
+    writer.write(label.toString)
     vector.foreachActive { case (i, v) =>
-      sb += ' '
-      sb ++= s"${i + 1}:$v"
+      writer.write(s" ${i + 1}:$v")
     }
-    buffer.set(sb.mkString)
-    recordWriter.write(NullWritable.get(), buffer)
+
+    writer.write('\n')
   }
 
   override def close(): Unit = {
-    recordWriter.close(context)
+    writer.close()
   }
 }
 
@@ -136,7 +124,7 @@ private[libsvm] class LibSVMFileFormat extends TextBasedFileFormat with DataSour
       }
 
      override def getFileExtension(context: TaskAttemptContext): String = {
-       ".libsvm" + TextOutputWriter.getCompressionExtension(context)
+       ".libsvm" + CodecStreams.getCompressionExtension(context)
      }
    }
  }
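Note: the rewritten LibSVMOutputWriter emits one record per line, i.e. the label followed by space-separated 1-based index:value pairs for the vector's active entries, terminated by '\n'. A minimal standalone sketch of that line format (the object name and sample values are illustrative, not part of the commit):

import org.apache.spark.ml.linalg.Vectors

object LibSVMLineSketch {
  def main(args: Array[String]): Unit = {
    val label = 1.0
    // A sparse vector of size 4 with active entries at indices 0 and 2.
    val vector = Vectors.sparse(4, Array(0, 2), Array(0.5, 2.0))

    val sb = new StringBuilder(label.toString)
    // LIBSVM indices are 1-based, so each active index is shifted by one.
    vector.foreachActive { case (i, v) => sb ++= s" ${i + 1}:$v" }

    println(sb.toString) // prints: 1.0 1:0.5 3:2.0
  }
}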

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonGenerator.scala

Lines changed: 4 additions & 0 deletions
@@ -194,4 +194,8 @@ private[sql] class JacksonGenerator(
       writeFields(row, schema, rootFieldWriters)
     }
   }
+
+  private[sql] def writeLineEnding(): Unit = {
+    gen.writeRaw('\n')
+  }
 }
sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/CodecStreams.scala

Lines changed: 73 additions & 0 deletions
@@ -0,0 +1,73 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution.datasources
+
+import java.io.{OutputStream, OutputStreamWriter}
+import java.nio.charset.{Charset, StandardCharsets}
+
+import org.apache.hadoop.fs.Path
+import org.apache.hadoop.io.compress._
+import org.apache.hadoop.mapreduce.JobContext
+import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat
+import org.apache.hadoop.util.ReflectionUtils
+
+object CodecStreams {
+  private def getCompressionCodec(
+      context: JobContext,
+      file: Option[Path] = None): Option[CompressionCodec] = {
+    if (FileOutputFormat.getCompressOutput(context)) {
+      val compressorClass = FileOutputFormat.getOutputCompressorClass(
+        context,
+        classOf[GzipCodec])
+
+      Some(ReflectionUtils.newInstance(compressorClass, context.getConfiguration))
+    } else {
+      file.flatMap { path =>
+        val compressionCodecs = new CompressionCodecFactory(context.getConfiguration)
+        Option(compressionCodecs.getCodec(path))
+      }
+    }
+  }
+
+  /** Create a new file and open it for writing.
+   * If compression is enabled in the [[JobContext]] the stream will write compressed data to disk.
+   * An exception will be thrown if the file already exists.
+   */
+  def getOutputStream(context: JobContext, file: Path): OutputStream = {
+    val fs = file.getFileSystem(context.getConfiguration)
+    val outputStream: OutputStream = fs.create(file, false)
+
+    getCompressionCodec(context, Some(file)).fold(outputStream) { codec =>
+      codec.createOutputStream(outputStream)
+    }
+  }
+
+  def getOutputStreamWriter(
+      context: JobContext,
+      file: Path,
+      charset: Charset = StandardCharsets.UTF_8): OutputStreamWriter = {
+    new OutputStreamWriter(getOutputStream(context, file), charset)
+  }
+
+  /** Returns the compression codec extension to be used in a file name, e.g. ".gzip"). */
+  def getCompressionExtension(context: JobContext): String = {
+    getCompressionCodec(context).fold("") { code =>
+      code.getDefaultExtension
+    }
+  }
+}
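Note: CodecStreams centralizes creation of the (optionally compressed) output stream and writer used by the text-based writers in this commit. A minimal sketch of exercising it outside Spark's write path, assuming a hand-built Hadoop task context and a local target path (both illustrative); note that fs.create(file, false) throws if the file already exists:

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapreduce.TaskAttemptID
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl

import org.apache.spark.sql.execution.datasources.CodecStreams

object CodecStreamsSketch {
  def main(args: Array[String]): Unit = {
    val conf = new Configuration()
    // In Spark this context is supplied by the write path; here it is built by hand.
    val context = new TaskAttemptContextImpl(conf, new TaskAttemptID())

    // Compression is off by default, so getCompressionExtension(context) returns "".
    val file = new Path("/tmp/codec-streams-sketch.txt" + CodecStreams.getCompressionExtension(context))

    val writer = CodecStreams.getOutputStreamWriter(context, file)
    try {
      writer.write("one record per line")
      writer.write('\n')
    } finally {
      writer.close()
    }
  }
}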

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVParser.scala

Lines changed: 8 additions & 11 deletions
@@ -17,7 +17,8 @@
 
 package org.apache.spark.sql.execution.datasources.csv
 
-import java.io.{CharArrayWriter, StringReader}
+import java.io.{CharArrayWriter, OutputStream, StringReader}
+import java.nio.charset.StandardCharsets
 
 import com.univocity.parsers.csv._
 
@@ -64,7 +65,10 @@ private[csv] class CsvReader(params: CSVOptions) {
  * @param params Parameters object for configuration
  * @param headers headers for columns
  */
-private[csv] class LineCsvWriter(params: CSVOptions, headers: Seq[String]) extends Logging {
+private[csv] class LineCsvWriter(
+    params: CSVOptions,
+    headers: Seq[String],
+    output: OutputStream) extends Logging {
   private val writerSettings = new CsvWriterSettings
   private val format = writerSettings.getFormat
 
@@ -80,21 +84,14 @@ private[csv] class LineCsvWriter(params: CSVOptions, headers: Seq[String]) exten
   writerSettings.setHeaders(headers: _*)
   writerSettings.setQuoteEscapingEnabled(params.escapeQuotes)
 
-  private val buffer = new CharArrayWriter()
-  private val writer = new CsvWriter(buffer, writerSettings)
+  private val writer = new CsvWriter(output, StandardCharsets.UTF_8, writerSettings)
 
   def writeRow(row: Seq[String], includeHeader: Boolean): Unit = {
     if (includeHeader) {
       writer.writeHeaders()
     }
-    writer.writeRow(row.toArray: _*)
-  }
 
-  def flush(): String = {
-    writer.flush()
-    val lines = buffer.toString.stripLineEnd
-    buffer.reset()
-    lines
+    writer.writeRow(row.toArray: _*)
   }
 
   def close(): Unit = {
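Note: LineCsvWriter now hands the destination stream directly to univocity's CsvWriter instead of buffering rows in a CharArrayWriter and periodically flushing them through a Hadoop RecordWriter. A small standalone sketch of that direct-to-stream pattern, with a ByteArrayOutputStream standing in for the file stream (object name and sample values are illustrative):

import java.io.ByteArrayOutputStream
import java.nio.charset.StandardCharsets

import com.univocity.parsers.csv.{CsvWriter, CsvWriterSettings}

object CsvDirectWriteSketch {
  def main(args: Array[String]): Unit = {
    val out = new ByteArrayOutputStream()
    val settings = new CsvWriterSettings()
    settings.setHeaders("label", "feature")

    // Rows are serialized straight to the stream; no intermediate character buffer.
    val writer = new CsvWriter(out, StandardCharsets.UTF_8, settings)
    writer.writeHeaders()
    writer.writeRow("1.0", "0.5")
    writer.close()

    print(out.toString("UTF-8"))
    // label,feature
    // 1.0,0.5
  }
}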

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVRelation.scala

Lines changed: 8 additions & 35 deletions
@@ -20,19 +20,15 @@ package org.apache.spark.sql.execution.datasources.csv
 import scala.util.control.NonFatal
 
 import org.apache.hadoop.fs.Path
-import org.apache.hadoop.io.{NullWritable, Text}
-import org.apache.hadoop.mapreduce.RecordWriter
 import org.apache.hadoop.mapreduce.TaskAttemptContext
-import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat
 
 import org.apache.spark.internal.Logging
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql._
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions.GenericInternalRow
 import org.apache.spark.sql.catalyst.util.DateTimeUtils
-import org.apache.spark.sql.execution.datasources.{OutputWriter, OutputWriterFactory, PartitionedFile}
-import org.apache.spark.sql.execution.datasources.text.TextOutputWriter
+import org.apache.spark.sql.execution.datasources.{CodecStreams, OutputWriter, OutputWriterFactory, PartitionedFile}
 import org.apache.spark.sql.types._
 
 object CSVRelation extends Logging {
@@ -179,7 +175,7 @@ private[csv] class CSVOutputWriterFactory(params: CSVOptions) extends OutputWrit
   }
 
   override def getFileExtension(context: TaskAttemptContext): String = {
-    ".csv" + TextOutputWriter.getCompressionExtension(context)
+    ".csv" + CodecStreams.getCompressionExtension(context)
   }
 }
 
@@ -189,9 +185,6 @@ private[csv] class CsvOutputWriter(
     context: TaskAttemptContext,
     params: CSVOptions) extends OutputWriter with Logging {
 
-  // create the Generator without separator inserted between 2 records
-  private[this] val text = new Text()
-
   // A `ValueConverter` is responsible for converting a value of an `InternalRow` to `String`.
   // When the value is null, this converter should not be called.
   private type ValueConverter = (InternalRow, Int) => String
@@ -200,17 +193,9 @@ private[csv] class CsvOutputWriter(
   private val valueConverters: Array[ValueConverter] =
     dataSchema.map(_.dataType).map(makeConverter).toArray
 
-  private val recordWriter: RecordWriter[NullWritable, Text] = {
-    new TextOutputFormat[NullWritable, Text]() {
-      override def getDefaultWorkFile(context: TaskAttemptContext, extension: String): Path = {
-        new Path(path)
-      }
-    }.getRecordWriter(context)
-  }
-
-  private val FLUSH_BATCH_SIZE = 1024L
-  private var records: Long = 0L
-  private val csvWriter = new LineCsvWriter(params, dataSchema.fieldNames.toSeq)
+  private var printHeader: Boolean = params.headerFlag
+  private val writer = CodecStreams.getOutputStream(context, new Path(path))
+  private val csvWriter = new LineCsvWriter(params, dataSchema.fieldNames.toSeq, writer)
 
   private def rowToString(row: InternalRow): Seq[String] = {
     var i = 0
@@ -245,24 +230,12 @@ private[csv] class CsvOutputWriter(
   override def write(row: Row): Unit = throw new UnsupportedOperationException("call writeInternal")
 
   override protected[sql] def writeInternal(row: InternalRow): Unit = {
-    csvWriter.writeRow(rowToString(row), records == 0L && params.headerFlag)
-    records += 1
-    if (records % FLUSH_BATCH_SIZE == 0) {
-      flush()
-    }
-  }
-
-  private def flush(): Unit = {
-    val lines = csvWriter.flush()
-    if (lines.nonEmpty) {
-      text.set(lines)
-      recordWriter.write(NullWritable.get(), text)
-    }
+    csvWriter.writeRow(rowToString(row), printHeader)
+    printHeader = false
   }
 
   override def close(): Unit = {
-    flush()
     csvWriter.close()
-    recordWriter.close(context)
+    writer.close()
   }
 }

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JsonFileFormat.scala

Lines changed: 7 additions & 24 deletions
@@ -17,15 +17,12 @@
 
 package org.apache.spark.sql.execution.datasources.json
 
-import java.io.CharArrayWriter
-
 import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs.{FileStatus, Path}
-import org.apache.hadoop.io.{LongWritable, NullWritable, Text}
+import org.apache.hadoop.io.{LongWritable, Text}
 import org.apache.hadoop.mapred.{JobConf, TextInputFormat}
-import org.apache.hadoop.mapreduce.{Job, RecordWriter, TaskAttemptContext}
+import org.apache.hadoop.mapreduce.{Job, TaskAttemptContext}
 import org.apache.hadoop.mapreduce.lib.input.FileInputFormat
-import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat
 
 import org.apache.spark.TaskContext
 import org.apache.spark.internal.Logging
@@ -35,7 +32,6 @@ import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.json.{JacksonGenerator, JacksonParser, JSONOptions}
 import org.apache.spark.sql.catalyst.util.CompressionCodecs
 import org.apache.spark.sql.execution.datasources._
-import org.apache.spark.sql.execution.datasources.text.TextOutputWriter
 import org.apache.spark.sql.sources._
 import org.apache.spark.sql.types.StructType
 import org.apache.spark.util.SerializableConfiguration
@@ -90,7 +86,7 @@ class JsonFileFormat extends TextBasedFileFormat with DataSourceRegister {
       }
 
      override def getFileExtension(context: TaskAttemptContext): String = {
-       ".json" + TextOutputWriter.getCompressionExtension(context)
+       ".json" + CodecStreams.getCompressionExtension(context)
      }
    }
  }
@@ -163,33 +159,20 @@ private[json] class JsonOutputWriter(
     context: TaskAttemptContext)
   extends OutputWriter with Logging {
 
-  private[this] val writer = new CharArrayWriter()
+  private val writer = CodecStreams.getOutputStreamWriter(context, new Path(path))
+
   // create the Generator without separator inserted between 2 records
   private[this] val gen = new JacksonGenerator(dataSchema, writer, options)
-  private[this] val result = new Text()
-
-  private val recordWriter: RecordWriter[NullWritable, Text] = {
-    new TextOutputFormat[NullWritable, Text]() {
-      override def getDefaultWorkFile(context: TaskAttemptContext, extension: String): Path = {
-        new Path(path)
-      }
-    }.getRecordWriter(context)
-  }
 
   override def write(row: Row): Unit = throw new UnsupportedOperationException("call writeInternal")
 
   override protected[sql] def writeInternal(row: InternalRow): Unit = {
     gen.write(row)
-    gen.flush()
-
-    result.set(writer.toString)
-    writer.reset()
-
-    recordWriter.write(NullWritable.get(), result)
+    gen.writeLineEnding()
   }
 
   override def close(): Unit = {
     gen.close()
-    recordWriter.close(context)
+    writer.close()
   }
 }
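Note: JsonOutputWriter now lets JacksonGenerator write each record straight to the file's OutputStreamWriter and separates records with a raw '\n' via the new writeLineEnding() helper, rather than round-tripping through a CharArrayWriter and Text. A minimal sketch of the underlying Jackson calls, assuming only jackson-core on the classpath (object name and field are illustrative):

import java.io.StringWriter

import com.fasterxml.jackson.core.JsonFactory

object JsonLineEndingSketch {
  def main(args: Array[String]): Unit = {
    val out = new StringWriter()
    val gen = new JsonFactory().createGenerator(out)

    // One JSON object per record, followed by a raw newline, as writeLineEnding() does.
    gen.writeStartObject()
    gen.writeNumberField("a", 1)
    gen.writeEndObject()
    gen.writeRaw('\n')
    gen.flush()

    print(out.toString) // {"a":1} followed by a newline
  }
}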
