@@ -24,7 +24,6 @@ import scala.util.control.NonFatal
 import com.google.common.base.Objects
 import org.apache.hadoop.fs.{FileStatus, Path}
 import org.apache.hadoop.io.{LongWritable, NullWritable, Text}
-import org.apache.hadoop.io.SequenceFile.CompressionType
 import org.apache.hadoop.mapred.TextInputFormat
 import org.apache.hadoop.mapreduce.{Job, TaskAttemptContext}
 import org.apache.hadoop.mapreduce.RecordWriter
@@ -34,6 +33,7 @@ import org.apache.spark.Logging
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql._
 import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.execution.datasources.CompressionCodecs
 import org.apache.spark.sql.sources._
 import org.apache.spark.sql.types._
 
@@ -50,16 +50,16 @@ private[sql] class CSVRelation(
     case None => inferSchema(paths)
   }
 
-  private val params = new CSVOptions(parameters)
+  private val options = new CSVOptions(parameters)
 
   @transient
   private var cachedRDD: Option[RDD[String]] = None
 
   private def readText(location: String): RDD[String] = {
-    if (Charset.forName(params.charset) == Charset.forName("UTF-8")) {
+    if (Charset.forName(options.charset) == Charset.forName("UTF-8")) {
       sqlContext.sparkContext.textFile(location)
     } else {
-      val charset = params.charset
+      val charset = options.charset
       sqlContext.sparkContext.hadoopFile[LongWritable, Text, TextInputFormat](location)
         .mapPartitions { _.map { pair =>
           new String(pair._2.getBytes, 0, pair._2.getLength, charset)
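
Note on the non-UTF-8 branch above: decoding with new String(pair._2.getBytes, 0, pair._2.getLength, charset) rather than pair._2.toString is deliberate. Hadoop's Text reuses its backing byte array, so getBytes can return a buffer longer than the valid payload; the explicit getLength bound avoids reading stale trailing bytes, and toString would always decode as UTF-8 regardless of the requested charset.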
@@ -81,8 +81,8 @@ private[sql] class CSVRelation(
   private def tokenRdd(header: Array[String], inputPaths: Array[String]): RDD[Array[String]] = {
     val rdd = baseRdd(inputPaths)
     // Make sure firstLine is materialized before sending to executors
-    val firstLine = if (params.headerFlag) findFirstLine(rdd) else null
-    CSVRelation.univocityTokenizer(rdd, header, firstLine, params)
+    val firstLine = if (options.headerFlag) findFirstLine(rdd) else null
+    CSVRelation.univocityTokenizer(rdd, header, firstLine, options)
   }
 
   /**
@@ -96,20 +96,16 @@ private[sql] class CSVRelation(
     val pathsString = inputs.map(_.getPath.toUri.toString)
     val header = schema.fields.map(_.name)
     val tokenizedRdd = tokenRdd(header, pathsString)
-    CSVRelation.parseCsv(tokenizedRdd, schema, requiredColumns, inputs, sqlContext, params)
+    CSVRelation.parseCsv(tokenizedRdd, schema, requiredColumns, inputs, sqlContext, options)
   }
 
   override def prepareJobForWrite(job: Job): OutputWriterFactory = {
     val conf = job.getConfiguration
-    params.compressionCodec.foreach { codec =>
-      conf.set("mapreduce.output.fileoutputformat.compress", "true")
-      conf.set("mapreduce.output.fileoutputformat.compress.type", CompressionType.BLOCK.toString)
-      conf.set("mapreduce.output.fileoutputformat.compress.codec", codec)
-      conf.set("mapreduce.map.output.compress", "true")
-      conf.set("mapreduce.map.output.compress.codec", codec)
+    options.compressionCodec.foreach { codec =>
+      CompressionCodecs.setCodecConfiguration(conf, codec)
     }
 
-    new CSVOutputWriterFactory(params)
+    new CSVOutputWriterFactory(options)
   }
 
   override def hashCode(): Int = Objects.hashCode(paths.toSet, dataSchema, schema, partitionColumns)
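
Note: the five mapreduce.* settings deleted in this hunk are not dropped; they are consolidated behind the CompressionCodecs helper imported at the top of the file. A minimal sketch of what such a helper plausibly does, assuming it simply replays the removed conf.set calls (the object and method names mirror the import and call site in the diff; the body is inferred from the removed lines, not from the helper's actual source):

    // Sketch only: assumed to mirror the five removed conf.set calls.
    import org.apache.hadoop.conf.Configuration
    import org.apache.hadoop.io.SequenceFile.CompressionType

    object CompressionCodecs {
      def setCodecConfiguration(conf: Configuration, codec: String): Unit = {
        // Compress job output as block-compressed files with the given codec class.
        conf.set("mapreduce.output.fileoutputformat.compress", "true")
        conf.set("mapreduce.output.fileoutputformat.compress.type", CompressionType.BLOCK.toString)
        conf.set("mapreduce.output.fileoutputformat.compress.codec", codec)
        // Compress intermediate map output with the same codec.
        conf.set("mapreduce.map.output.compress", "true")
        conf.set("mapreduce.map.output.compress.codec", codec)
      }
    }

With this shape, one call site covers every text-based data source that needs output compression, instead of each relation repeating the five keys.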
@@ -129,17 +125,17 @@ private[sql] class CSVRelation(
   private def inferSchema(paths: Array[String]): StructType = {
     val rdd = baseRdd(paths)
     val firstLine = findFirstLine(rdd)
-    val firstRow = new LineCsvReader(params).parseLine(firstLine)
+    val firstRow = new LineCsvReader(options).parseLine(firstLine)
 
-    val header = if (params.headerFlag) {
+    val header = if (options.headerFlag) {
       firstRow
     } else {
       firstRow.zipWithIndex.map { case (value, index) => s"C$index" }
     }
 
     val parsedRdd = tokenRdd(header, paths)
-    if (params.inferSchemaFlag) {
-      CSVInferSchema.infer(parsedRdd, header, params.nullValue)
+    if (options.inferSchemaFlag) {
+      CSVInferSchema.infer(parsedRdd, header, options.nullValue)
     } else {
       // By default fields are assumed to be StringType
       val schemaFields = header.map { fieldName =>
@@ -153,8 +149,8 @@ private[sql] class CSVRelation(
    * Returns the first line of the first non-empty file in path
    */
   private def findFirstLine(rdd: RDD[String]): String = {
-    if (params.isCommentSet) {
-      val comment = params.comment.toString
+    if (options.isCommentSet) {
+      val comment = options.comment.toString
       rdd.filter { line =>
         line.trim.nonEmpty && !line.startsWith(comment)
       }.first()
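
For context, a hypothetical end-to-end use of the compression path refactored above. The "compression" option key is an assumption about CSVOptions (its parsing is not shown in this diff), which presumably resolves it to the codec class name that prepareJobForWrite hands to CompressionCodecs:

    // Hypothetical usage; the "compression" option key is assumed, not taken
    // from this diff. Writing gzip-compressed CSV exercises
    // options.compressionCodec -> CompressionCodecs.setCodecConfiguration.
    sqlContext.read
      .format("csv")
      .option("header", "true")
      .load("/data/in")
      .write
      .format("csv")
      .option("compression", "gzip")
      .save("/data/out")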