
Commit 83adb2f

Merge remote-tracking branch 'upstream/master' into SPARK-2177
2 parents 366f891 + 640c294

File tree

10 files changed: +43 -27 lines changed


core/src/main/scala/org/apache/spark/rdd/RDD.scala

Lines changed: 1 addition & 1 deletion
@@ -446,7 +446,7 @@ abstract class RDD[T: ClassTag](
    * Return this RDD sorted by the given key function.
    */
   def sortBy[K](
-      f: (T) ⇒ K,
+      f: (T) => K,
       ascending: Boolean = true,
       numPartitions: Int = this.partitions.size)
       (implicit ord: Ordering[K], ctag: ClassTag[K]): RDD[T] =
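
A minimal usage sketch of the corrected signature (not part of the commit; assumes an existing SparkContext named sc):

  val words = sc.parallelize(Seq("banana", "fig", "apple"))
  // The key function matches f: (T) => K, here String => Int: sort by length, longest first.
  val byLength = words.sortBy(w => w.length, ascending = false)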

core/src/test/scala/org/apache/spark/util/FileAppenderSuite.scala

Lines changed: 6 additions & 4 deletions
@@ -120,7 +120,7 @@ class FileAppenderSuite extends FunSuite with BeforeAndAfter with Logging {
     // on SparkConf settings.

     def testAppenderSelection[ExpectedAppender: ClassTag, ExpectedRollingPolicy](
-        properties: Seq[(String, String)], expectedRollingPolicyParam: Long = -1): FileAppender = {
+        properties: Seq[(String, String)], expectedRollingPolicyParam: Long = -1): Unit = {

       // Set spark conf properties
       val conf = new SparkConf
@@ -129,8 +129,9 @@ class FileAppenderSuite extends FunSuite with BeforeAndAfter with Logging {
       }

       // Create and test file appender
-      val inputStream = new PipedInputStream(new PipedOutputStream())
-      val appender = FileAppender(inputStream, new File("stdout"), conf)
+      val testOutputStream = new PipedOutputStream()
+      val testInputStream = new PipedInputStream(testOutputStream)
+      val appender = FileAppender(testInputStream, testFile, conf)
       assert(appender.isInstanceOf[ExpectedAppender])
       assert(appender.getClass.getSimpleName ===
         classTag[ExpectedAppender].runtimeClass.getSimpleName)
@@ -144,7 +145,8 @@ class FileAppenderSuite extends FunSuite with BeforeAndAfter with Logging {
       }
       assert(policyParam === expectedRollingPolicyParam)
     }
-      appender
+      testOutputStream.close()
+      appender.awaitTermination()
     }

   import RollingFileAppender._

mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala

Lines changed: 1 addition & 1 deletion
@@ -419,7 +419,7 @@ class RowMatrix(
   /** Updates or verifies the number of rows. */
   private def updateNumRows(m: Long) {
     if (nRows <= 0) {
-      nRows == m
+      nRows = m
     } else {
       require(nRows == m,
         s"The number of rows $m is different from what specified or previously computed: ${nRows}.")

python/pyspark/java_gateway.py

Lines changed: 11 additions & 4 deletions
@@ -43,12 +43,19 @@ def launch_gateway():
         # Don't send ctrl-c / SIGINT to the Java gateway:
         def preexec_func():
             signal.signal(signal.SIGINT, signal.SIG_IGN)
-        proc = Popen(command, stdout=PIPE, stdin=PIPE, preexec_fn=preexec_func)
+        proc = Popen(command, stdout=PIPE, stdin=PIPE, stderr=PIPE, preexec_fn=preexec_func)
     else:
         # preexec_fn not supported on Windows
-        proc = Popen(command, stdout=PIPE, stdin=PIPE)
-    # Determine which ephemeral port the server started on:
-    gateway_port = int(proc.stdout.readline())
+        proc = Popen(command, stdout=PIPE, stdin=PIPE, stderr=PIPE)
+
+    try:
+        # Determine which ephemeral port the server started on:
+        gateway_port = int(proc.stdout.readline())
+    except:
+        error_code = proc.poll()
+        raise Exception("Launching GatewayServer failed with exit code %d: %s" %
+                        (error_code, "".join(proc.stderr.readlines())))
+
     # Create a thread to echo output from the GatewayServer, which is required
     # for Java log output to show up:
     class EchoOutputThread(Thread):

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/BoundAttribute.scala

Lines changed: 2 additions & 2 deletions
@@ -68,7 +68,7 @@ class BindReferences[TreeNode <: QueryPlan[TreeNode]] extends Rule[TreeNode] {
 }

 object BindReferences extends Logging {
-  def bindReference(expression: Expression, input: Seq[Attribute]): Expression = {
+  def bindReference[A <: Expression](expression: A, input: Seq[Attribute]): A = {
     expression.transform { case a: AttributeReference =>
       attachTree(a, "Binding attribute") {
         val ordinal = input.indexWhere(_.exprId == a.exprId)
@@ -83,6 +83,6 @@ object BindReferences extends Logging {
           BoundReference(ordinal, a)
         }
       }
-    }
+    }.asInstanceOf[A] // Kind of a hack, but safe.  TODO: Tighten return type when possible.
   }
 }
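
The point of the new type parameter: Expression.transform returns the general Expression type, so the old signature lost the argument's static type at every call site; with A <: Expression (plus the asInstanceOf noted in the diff) binding a SortOrder yields a SortOrder, which the new RowOrdering constructor below depends on. A REPL-style sketch with stand-in types (not catalyst's):

  trait Expr { def transformed: Expr = this }
  case class Sort(child: String) extends Expr

  def bindOld(e: Expr): Expr = e.transformed                      // result widened to Expr
  def bindNew[A <: Expr](e: A): A = e.transformed.asInstanceOf[A] // result keeps the caller's type

  val s: Sort = bindNew(Sort("a"))   // compiles without an explicit cast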

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Row.scala

Lines changed: 3 additions & 0 deletions
@@ -208,6 +208,9 @@ class GenericMutableRow(size: Int) extends GenericRow(size) with MutableRow {


 class RowOrdering(ordering: Seq[SortOrder]) extends Ordering[Row] {
+  def this(ordering: Seq[SortOrder], inputSchema: Seq[Attribute]) =
+    this(ordering.map(BindReferences.bindReference(_, inputSchema)))
+
   def compare(a: Row, b: Row): Int = {
     var i = 0
     while (i < ordering.size) {
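
The auxiliary constructor binds each SortOrder against an input schema before rows are compared; the Exchange change below uses it roughly as:

  implicit val ordering = new RowOrdering(sortingExpressions, child.output)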

sql/core/src/main/scala/org/apache/spark/sql/execution/Exchange.scala

Lines changed: 4 additions & 4 deletions
@@ -22,7 +22,7 @@ import org.apache.spark.{HashPartitioner, RangePartitioner, SparkConf}
 import org.apache.spark.rdd.ShuffledRDD
 import org.apache.spark.sql.{SQLConf, SQLContext, Row}
 import org.apache.spark.sql.catalyst.errors.attachTree
-import org.apache.spark.sql.catalyst.expressions.{MutableProjection, RowOrdering}
+import org.apache.spark.sql.catalyst.expressions.{NoBind, MutableProjection, RowOrdering}
 import org.apache.spark.sql.catalyst.plans.physical._
 import org.apache.spark.sql.catalyst.rules.Rule
 import org.apache.spark.util.MutablePair
@@ -31,7 +31,7 @@ import org.apache.spark.util.MutablePair
  * :: DeveloperApi ::
  */
 @DeveloperApi
-case class Exchange(newPartitioning: Partitioning, child: SparkPlan) extends UnaryNode {
+case class Exchange(newPartitioning: Partitioning, child: SparkPlan) extends UnaryNode with NoBind {

   override def outputPartitioning = newPartitioning

@@ -42,7 +42,7 @@ case class Exchange(newPartitioning: Partitioning, child: SparkPlan) extends Una
       case HashPartitioning(expressions, numPartitions) =>
         // TODO: Eliminate redundant expressions in grouping key and value.
         val rdd = child.execute().mapPartitions { iter =>
-          val hashExpressions = new MutableProjection(expressions)
+          val hashExpressions = new MutableProjection(expressions, child.output)
           val mutablePair = new MutablePair[Row, Row]()
           iter.map(r => mutablePair.update(hashExpressions(r), r))
         }
@@ -53,7 +53,7 @@ case class Exchange(newPartitioning: Partitioning, child: SparkPlan) extends Una

       case RangePartitioning(sortingExpressions, numPartitions) =>
         // TODO: RangePartitioner should take an Ordering.
-        implicit val ordering = new RowOrdering(sortingExpressions)
+        implicit val ordering = new RowOrdering(sortingExpressions, child.output)

         val rdd = child.execute().mapPartitions { iter =>
           val mutablePair = new MutablePair[Row, Null](null, null)

sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala

Lines changed: 2 additions & 3 deletions
@@ -250,9 +250,8 @@ private[sql] abstract class SparkStrategies extends QueryPlanner[SparkPlan] {
     def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match {
       case logical.SetCommand(key, value) =>
         Seq(execution.SetCommand(key, value, plan.output)(context))
-      case logical.ExplainCommand(child) =>
-        val sparkPlan = context.executePlan(child).sparkPlan
-        Seq(execution.ExplainCommand(sparkPlan, plan.output)(context))
+      case logical.ExplainCommand(logicalPlan) =>
+        Seq(execution.ExplainCommand(logicalPlan, plan.output)(context))
       case logical.CacheCommand(tableName, cache) =>
         Seq(execution.CacheCommand(tableName, cache)(context))
       case _ => Nil

sql/core/src/main/scala/org/apache/spark/sql/execution/commands.scala

Lines changed: 12 additions & 4 deletions
@@ -21,6 +21,7 @@ import org.apache.spark.annotation.DeveloperApi
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.{SQLContext, Row}
 import org.apache.spark.sql.catalyst.expressions.{GenericRow, Attribute}
+import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan

 trait Command {
   /**
@@ -71,16 +72,23 @@ case class SetCommand(
 }

 /**
+ * An explain command for users to see how a command will be executed.
+ *
+ * Note that this command takes in a logical plan, runs the optimizer on the logical plan
+ * (but do NOT actually execute it).
+ *
  * :: DeveloperApi ::
  */
 @DeveloperApi
 case class ExplainCommand(
-    child: SparkPlan, output: Seq[Attribute])(
+    logicalPlan: LogicalPlan, output: Seq[Attribute])(
     @transient context: SQLContext)
-  extends UnaryNode with Command {
+  extends LeafNode with Command {

-  // Actually "EXPLAIN" command doesn't cause any side effect.
-  override protected[sql] lazy val sideEffectResult: Seq[String] = this.toString.split("\n")
+  // Run through the optimizer to generate the physical plan.
+  override protected[sql] lazy val sideEffectResult: Seq[String] = {
+    "Physical execution plan:" +: context.executePlan(logicalPlan).executedPlan.toString.split("\n")
+  }

   def execute(): RDD[Row] = {
     val explanation = sideEffectResult.map(row => new GenericRow(Array[Any](row)))
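
With this change EXPLAIN runs the logical plan through the optimizer and planner but never executes it, and the first result row is the "Physical execution plan:" header (which the HiveQuerySuite change below checks for). A hedged sketch, assuming a HiveContext named hiveContext and an existing table src:

  val explanation = hiveContext.hql("EXPLAIN SELECT key FROM src").collect()
  // Each element is a single-column Row; the first one's value starts with
  // "Physical execution plan:", followed by one row per line of the plan tree.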

sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala

Lines changed: 1 addition & 4 deletions
@@ -202,12 +202,9 @@ class HiveQuerySuite extends HiveComparisonTest {
     }
   }

-  private val explainCommandClassName =
-    classOf[execution.ExplainCommand].getSimpleName.stripSuffix("$")
-
   def isExplanation(result: SchemaRDD) = {
     val explanation = result.select('plan).collect().map { case Row(plan: String) => plan }
-    explanation.size > 1 && explanation.head.startsWith(explainCommandClassName)
+    explanation.size > 1 && explanation.head.startsWith("Physical execution plan")
   }

   test("SPARK-1704: Explain commands as a SchemaRDD") {
