Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions core/src/main/scala/org/apache/spark/SparkContext.scala
Original file line number Diff line number Diff line change
Expand Up @@ -863,10 +863,10 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli
new WholeTextFileRDD(
this,
classOf[WholeTextFileInputFormat],
classOf[String],
classOf[String],
classOf[Text],
classOf[Text],
updateConf,
minPartitions).setName(path)
minPartitions).setName(path).map(record => (record._1.toString, record._2.toString))
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ package org.apache.spark.input
import scala.collection.JavaConverters._

import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.Text
import org.apache.hadoop.mapreduce.InputSplit
import org.apache.hadoop.mapreduce.JobContext
import org.apache.hadoop.mapreduce.lib.input.CombineFileInputFormat
Expand All @@ -33,14 +34,13 @@ import org.apache.hadoop.mapreduce.TaskAttemptContext
*/

private[spark] class WholeTextFileInputFormat
extends CombineFileInputFormat[String, String] with Configurable {
extends CombineFileInputFormat[Text, Text] with Configurable {

override protected def isSplitable(context: JobContext, file: Path): Boolean = false

override def createRecordReader(
split: InputSplit,
context: TaskAttemptContext): RecordReader[String, String] = {

context: TaskAttemptContext): RecordReader[Text, Text] = {
val reader =
new ConfigurableCombineFileRecordReader(split, context, classOf[WholeTextFileRecordReader])
reader.setConf(getConf)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ private[spark] class WholeTextFileRecordReader(
split: CombineFileSplit,
context: TaskAttemptContext,
index: Integer)
extends RecordReader[String, String] with Configurable {
extends RecordReader[Text, Text] with Configurable {

private[this] val path = split.getPath(index)
private[this] val fs = path.getFileSystem(
Expand All @@ -58,18 +58,18 @@ private[spark] class WholeTextFileRecordReader(
// True means the current file has been processed, then skip it.
private[this] var processed = false

private[this] val key = path.toString
private[this] var value: String = null
private[this] val key: Text = new Text(path.toString)
private[this] var value: Text = null

override def initialize(split: InputSplit, context: TaskAttemptContext): Unit = {}

override def close(): Unit = {}

override def getProgress: Float = if (processed) 1.0f else 0.0f

override def getCurrentKey: String = key
override def getCurrentKey: Text = key

override def getCurrentValue: String = value
override def getCurrentValue: Text = value

override def nextKeyValue(): Boolean = {
if (!processed) {
Expand All @@ -83,7 +83,7 @@ private[spark] class WholeTextFileRecordReader(
ByteStreams.toByteArray(fileIn)
}

value = new Text(innerBuffer).toString
value = new Text(innerBuffer)
Closeables.close(fileIn, false)
processed = true
true
Expand Down
33 changes: 1 addition & 32 deletions core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala
Original file line number Diff line number Diff line change
Expand Up @@ -28,12 +28,11 @@ import org.apache.hadoop.mapreduce._
import org.apache.hadoop.mapreduce.lib.input.{CombineFileSplit, FileSplit}

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.input.WholeTextFileInputFormat
import org.apache.spark._
import org.apache.spark.executor.DataReadMethod
import org.apache.spark.mapreduce.SparkHadoopMapReduceUtil
import org.apache.spark.rdd.NewHadoopRDD.NewHadoopMapPartitionsWithSplitRDD
import org.apache.spark.util.{SerializableConfiguration, ShutdownHookManager, Utils}
import org.apache.spark.util.{SerializableConfiguration, ShutdownHookManager}
import org.apache.spark.deploy.SparkHadoopUtil
import org.apache.spark.storage.StorageLevel

Expand All @@ -59,7 +58,6 @@ private[spark] class NewHadoopPartition(
* @param inputFormatClass Storage format of the data to be read.
* @param keyClass Class of the key associated with the inputFormatClass.
* @param valueClass Class of the value associated with the inputFormatClass.
* @param conf The Hadoop configuration.
*/
@DeveloperApi
class NewHadoopRDD[K, V](
Expand Down Expand Up @@ -282,32 +280,3 @@ private[spark] object NewHadoopRDD {
}
}
}

/**
 * A [[NewHadoopRDD]] of (String, String) records backed by a [[WholeTextFileInputFormat]].
 * Partitions are derived from the splits reported by that input format.
 */
private[spark] class WholeTextFileRDD(
    sc : SparkContext,
    inputFormatClass: Class[_ <: WholeTextFileInputFormat],
    keyClass: Class[String],
    valueClass: Class[String],
    conf: Configuration,
    minPartitions: Int)
  extends NewHadoopRDD[String, String](sc, inputFormatClass, keyClass, valueClass, conf) {

  /**
   * Builds one [[NewHadoopPartition]] per split returned by the input format, after handing
   * the input format its configuration and the requested `minPartitions`.
   */
  override def getPartitions: Array[Partition] = {
    val format = inputFormatClass.newInstance
    // Configuration comes from getConf, not from the constructor argument.
    val hadoopConf = getConf
    format match {
      case needsConf: Configurable => needsConf.setConf(hadoopConf)
      case _ =>
    }

    val jobContext = newJobContext(hadoopConf, jobId)
    format.setMinPartitions(jobContext, minPartitions)

    val splits = format.getSplits(jobContext).toArray
    // The partition index is the position of the split in the list returned by getSplits.
    splits.indices.map { idx =>
      new NewHadoopPartition(id, idx, splits(idx).asInstanceOf[InputSplit with Writable])
    }.toArray[Partition]
  }
}

56 changes: 56 additions & 0 deletions core/src/main/scala/org/apache/spark/rdd/WholeTextFileRDD.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.rdd

import org.apache.hadoop.conf.{Configurable, Configuration}
import org.apache.hadoop.io.{Text, Writable}
import org.apache.hadoop.mapreduce.InputSplit

import org.apache.spark.{Partition, SparkContext}
import org.apache.spark.input.WholeTextFileInputFormat

/**
 * An RDD that reads a bunch of text files in, where each text file becomes a single record.
 * Keys and values are Hadoop [[Text]] objects produced by a [[WholeTextFileInputFormat]].
 */
private[spark] class WholeTextFileRDD(
    sc : SparkContext,
    inputFormatClass: Class[_ <: WholeTextFileInputFormat],
    keyClass: Class[Text],
    valueClass: Class[Text],
    conf: Configuration,
    minPartitions: Int)
  extends NewHadoopRDD[Text, Text](sc, inputFormatClass, keyClass, valueClass, conf) {

  /**
   * Computes the partitions of this RDD: one [[NewHadoopPartition]] for every split the
   * input format reports, numbered in the order the splits are returned.
   */
  override def getPartitions: Array[Partition] = {
    val format = inputFormatClass.newInstance
    // Configuration comes from getConf, not from the constructor argument.
    val hadoopConf = getConf
    format match {
      case needsConf: Configurable => needsConf.setConf(hadoopConf)
      case _ =>
    }

    val jobContext = newJobContext(hadoopConf, jobId)
    // Pass the requested minimum partition count to the input format before asking for splits.
    format.setMinPartitions(jobContext, minPartitions)

    val splits = format.getSplits(jobContext).toArray
    Array.tabulate[Partition](splits.length) { idx =>
      new NewHadoopPartition(id, idx, splits(idx).asInstanceOf[InputSplit with Writable])
    }
  }
}