port RDD API to use commit protocol.

jiangxb1987 · jiangxb1987 · commit a0426c8d5fda · 2016-11-04T22:27:17.000+08:00
diff --git a/core/src/main/scala/org/apache/spark/SparkHadoopWriter.scala b/core/src/main/scala/org/apache/spark/SparkHadoopWriter.scala
@@ -162,11 +162,14 @@ class SparkHadoopWriter(jobConf: JobConf) extends Logging with Serializable {
 private[spark]
 object SparkHadoopWriter {
   def createJobID(time: Date, id: Int): JobID = {
-    val formatter = new SimpleDateFormat("yyyyMMddHHmmss", Locale.US)
-    val jobtrackerID = formatter.format(time)
+    val jobtrackerID = createJobTrackerID(time)
     new JobID(jobtrackerID, id)
   }
 
+  def createJobTrackerID(time: Date): String = {
+    new SimpleDateFormat("yyyyMMddHHmmss", Locale.US).format(time)
+  }
+
   def createPathFromString(path: String, conf: JobConf): Path = {
     if (path == null) {
       throw new IllegalArgumentException("Output path is null")
diff --git a/core/src/main/scala/org/apache/spark/SparkNewHadoopWriter.scala b/core/src/main/scala/org/apache/spark/SparkNewHadoopWriter.scala
@@ -0,0 +1,132 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark
+
+import java.util.Date
+
+import org.apache.hadoop.conf.Configuration
+import org.apache.hadoop.mapreduce._
+import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl
+import org.apache.spark.internal.Logging
+import org.apache.spark.internal.io.HadoopMapReduceCommitProtocol
+import org.apache.spark.util.SerializableConfiguration
+
+/**
+ * Internal helper class that saves an RDD using a Hadoop OutputFormat
+ * (from the newer mapreduce API, not the old mapred API).
+ *
+ * Saves the RDD using a JobConf, which should contain an output key class, an output value class,
+ * a filename to write to, etc, exactly like in a Hadoop MapReduce job.
+ *
+ * Use a [[HadoopMapReduceCommitProtocol]] to handle output commit, which, unlike Hadoop's
+ * OutputCommitter, is serializable.
+ */
+private[spark]
+class SparkNewHadoopWriter(
+    jobConf: Configuration,
+    committer: HadoopMapReduceCommitProtocol) extends Logging with Serializable {
+
+  private val now = new Date()
+  private val conf = new SerializableConfiguration(jobConf)
+
+  private val jobtrackerID = SparkHadoopWriter.createJobTrackerID(new Date())
+  private var jobId = 0
+  private var splitId = 0
+  private var attemptId = 0
+
+  @transient private var writer: RecordWriter[AnyRef, AnyRef] = null
+  @transient private var jobContext: JobContext = null
+  @transient private var taskContext: TaskAttemptContext = null
+
+  def setupJob(): Unit = {
+    // Committer setup a job
+    committer.setupJob(getJobContext)
+  }
+
+  def setupTask(context: TaskContext): Unit = {
+    // Set jobID/taskID
+    jobId = context.stageId
+    splitId = context.partitionId
+    attemptId = (context.taskAttemptId % Int.MaxValue).toInt
+    // Committer setup a task
+    committer.setupTask(getTaskContext(context))
+  }
+
+  def write(context: TaskContext, key: AnyRef, value: AnyRef): Unit = {
+    getWriter(context).write(key, value)
+  }
+
+  def abortTask(context: TaskContext): Unit = {
+    // Close writer
+    getWriter(context).close(getTaskContext(context))
+    // Committer abort a task
+    committer.abortTask(getTaskContext(context))
+  }
+
+  def commitTask(context: TaskContext): Unit = {
+    // Close writer
+    getWriter(context).close(getTaskContext(context))
+    // Committer commit a task
+    committer.commitTask(getTaskContext(context))
+  }
+
+  def abortJob(): Unit = {
+    committer.abortJob(getJobContext)
+  }
+
+  def commitJob() {
+    committer.commitJob(getJobContext, Seq.empty)
+  }
+
+  // ********* Private Functions *********
+
+  /*
+   * Generate jobContext. Since jobContext is transient, it may be null after serialization.
+   */
+  private def getJobContext(): JobContext = {
+    if (jobContext == null) {
+      val jobAttemptId = new TaskAttemptID(jobtrackerID, jobId, TaskType.MAP, 0, 0)
+      jobContext = new TaskAttemptContextImpl(conf.value, jobAttemptId)
+    }
+    jobContext
+  }
+
+  /*
+   * Generate taskContext. Since taskContext is transient, it may be null after serialization.
+   */
+  private def getTaskContext(context: TaskContext): TaskAttemptContext = {
+    if (taskContext == null) {
+      val attemptId = new TaskAttemptID(jobtrackerID, jobId, TaskType.REDUCE, splitId,
+        context.attemptNumber)
+      taskContext = new TaskAttemptContextImpl(conf.value, attemptId)
+    }
+    taskContext
+  }
+
+  /*
+   * Generate writer. Since writer is transient, it may be null after serialization.
+   */
+  private def getWriter(context: TaskContext): RecordWriter[AnyRef, AnyRef] = {
+    if (writer == null) {
+      val format = getJobContext.getOutputFormatClass.newInstance
+      writer = format.getRecordWriter(getTaskContext(context))
+        .asInstanceOf[RecordWriter[AnyRef, AnyRef]]
+    }
+    writer
+  }
+}
diff --git a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala
@@ -21,19 +21,21 @@ import java.nio.ByteBuffer
 import java.text.SimpleDateFormat
 import java.util.{Date, HashMap => JHashMap, Locale}
 
+import org.apache.spark.internal.io.{HadoopMapReduceCommitProtocol, FileCommitProtocol}
+
 import scala.collection.{mutable, Map}
 import scala.collection.JavaConverters._
 import scala.collection.mutable.ArrayBuffer
 import scala.reflect.ClassTag
 import scala.util.DynamicVariable
 
 import com.clearspring.analytics.stream.cardinality.HyperLogLogPlus
-import org.apache.hadoop.conf.{Configurable, Configuration}
+import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs.FileSystem
 import org.apache.hadoop.io.SequenceFile.CompressionType
 import org.apache.hadoop.io.compress.CompressionCodec
 import org.apache.hadoop.mapred.{FileOutputCommitter, FileOutputFormat, JobConf, OutputFormat}
-import org.apache.hadoop.mapreduce.{Job => NewAPIHadoopJob, OutputFormat => NewOutputFormat, RecordWriter => NewRecordWriter, TaskAttemptID, TaskType}
+import org.apache.hadoop.mapreduce.{Job => NewAPIHadoopJob, OutputFormat => NewOutputFormat, TaskAttemptID, TaskType}
 import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl
 
 import org.apache.spark._
@@ -1092,37 +1094,38 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)])
       jobFormat.checkOutputSpecs(job)
     }
 
+    // Instantiate writer
+    val committer = FileCommitProtocol.instantiate(
+      className = classOf[HadoopMapReduceCommitProtocol].getName,
+      jobId = stageId.toString,
+      outputPath = jobConfiguration.get("mapred.output.dir"),
+      isAppend = false
+    ).asInstanceOf[HadoopMapReduceCommitProtocol]
+    val writer = new SparkNewHadoopWriter(hadoopConf, committer)
+
     val writeShard = (context: TaskContext, iter: Iterator[(K, V)]) => {
-      val config = wrappedConf.value
-      /* "reduce task" <split #> <attempt # = spark task #> */
-      val attemptId = new TaskAttemptID(jobtrackerID, stageId, TaskType.REDUCE, context.partitionId,
-        context.attemptNumber)
-      val hadoopContext = new TaskAttemptContextImpl(config, attemptId)
-      val format = outfmt.newInstance
-      format match {
-        case c: Configurable => c.setConf(config)
-        case _ => ()
-      }
-      val committer = format.getOutputCommitter(hadoopContext)
-      committer.setupTask(hadoopContext)
+      writer.setupTask(context)
 
       val outputMetricsAndBytesWrittenCallback: Option[(OutputMetrics, () => Long)] =
         initHadoopOutputMetrics(context)
 
-      val writer = format.getRecordWriter(hadoopContext).asInstanceOf[NewRecordWriter[K, V]]
       require(writer != null, "Unable to obtain RecordWriter")
       var recordsWritten = 0L
       Utils.tryWithSafeFinallyAndFailureCallbacks {
         while (iter.hasNext) {
           val pair = iter.next()
-          writer.write(pair._1, pair._2)
+          writer.write(context, pair._1.asInstanceOf[AnyRef], pair._2.asInstanceOf[AnyRef])
 
           // Update bytes written metric every few records
           maybeUpdateOutputMetrics(outputMetricsAndBytesWrittenCallback, recordsWritten)
           recordsWritten += 1
         }
-      }(finallyBlock = writer.close(hadoopContext))
-      committer.commitTask(hadoopContext)
+
+        writer.commitTask(context)
+      }(catchBlock = {
+        writer.abortTask(context)
+        writer.abortJob()
+      })
       outputMetricsAndBytesWrittenCallback.foreach { case (om, callback) =>
         om.setBytesWritten(callback())
         om.setRecordsWritten(recordsWritten)
@@ -1147,9 +1150,9 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)])
       logWarning(warningMessage)
     }
 
-    jobCommitter.setupJob(jobTaskContext)
+    writer.setupJob()
     self.context.runJob(self, writeShard)
-    jobCommitter.commitJob(jobTaskContext)
+    writer.commitJob()
   }
 
   /**
diff --git a/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala b/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala
@@ -509,21 +509,6 @@ class PairRDDFunctionsSuite extends SparkFunSuite with SharedSparkContext {
       (2, ArrayBuffer(1))))
   }
 
-  test("saveNewAPIHadoopFile should call setConf if format is configurable") {
-    val pairs = sc.parallelize(Array((new Integer(1), new Integer(1))))
-
-    // No error, non-configurable formats still work
-    pairs.saveAsNewAPIHadoopFile[NewFakeFormat]("ignored")
-
-    /*
-      Check that configurable formats get configured:
-      ConfigTestFormat throws an exception if we try to write
-      to it when setConf hasn't been called first.
-      Assertion is in ConfigTestFormat.getRecordWriter.
-     */
-    pairs.saveAsNewAPIHadoopFile[ConfigTestFormat]("ignored")
-  }
-
   test("saveAsHadoopFile should respect configured output committers") {
     val pairs = sc.parallelize(Array((new Integer(1), new Integer(1))))
     val conf = new JobConf()