
Commit b6a9aa1: merge master

2 parents: c8b4554 + 237b96b

15 files changed: +210 -106 lines


docs/running-on-yarn.md

Lines changed: 11 additions & 0 deletions

@@ -79,6 +79,17 @@ Most of the configs are the same for Spark on YARN as for other deployment modes
   <td>(none)</td>
   <td>
     Comma-separated list of files to be placed in the working directory of each executor.
+  <td><code>spark.yarn.executor.memoryOverhead</code></td>
+  <td>384</code></td>
+  <td>
+    The amount of off heap memory (in megabytes) to be allocated per executor. This is memory that accounts for things like VM overheads, interned strings, other native overheads, etc.
+  </td>
+</tr>
+<tr>
+  <td><code>spark.yarn.driver.memoryOverhead</code></td>
+  <td>384</code></td>
+  <td>
+    The amount of off heap memory (in megabytes) to be allocated per driver. This is memory that accounts for things like VM overheads, interned strings, other native overheads, etc.
   </td>
 </tr>
 </table>
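
The two new settings documented above are ordinary Spark configuration keys, so they can be supplied through SparkConf like any other value. A minimal sketch of how an application might set them, assuming a YARN deployment; the application name and memory figures below are illustrative, not taken from this commit:

import org.apache.spark.{SparkConf, SparkContext}

// Ask YARN for extra off-heap headroom beyond the JVM heap sizes.
// 512 MB per executor and 384 MB for the driver are example values only.
val conf = new SparkConf()
  .setAppName("memory-overhead-example")
  .set("spark.executor.memory", "4g")
  .set("spark.yarn.executor.memoryOverhead", "512")
  .set("spark.yarn.driver.memoryOverhead", "384")

val sc = new SparkContext(conf)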

python/pyspark/sql.py

Lines changed: 21 additions & 1 deletion

@@ -77,12 +77,25 @@ def inferSchema(self, rdd):
         """Infer and apply a schema to an RDD of L{dict}s.

         We peek at the first row of the RDD to determine the fields names
-        and types, and then use that to extract all the dictionaries.
+        and types, and then use that to extract all the dictionaries. Nested
+        collections are supported, which include array, dict, list, set, and
+        tuple.

         >>> srdd = sqlCtx.inferSchema(rdd)
         >>> srdd.collect() == [{"field1" : 1, "field2" : "row1"}, {"field1" : 2, "field2": "row2"},
         ... {"field1" : 3, "field2": "row3"}]
         True
+
+        >>> from array import array
+        >>> srdd = sqlCtx.inferSchema(nestedRdd1)
+        >>> srdd.collect() == [{"f1" : array('i', [1, 2]), "f2" : {"row1" : 1.0}},
+        ... {"f1" : array('i', [2, 3]), "f2" : {"row2" : 2.0}}]
+        True
+
+        >>> srdd = sqlCtx.inferSchema(nestedRdd2)
+        >>> srdd.collect() == [{"f1" : [[1, 2], [2, 3]], "f2" : set([1, 2]), "f3" : (1, 2)},
+        ... {"f1" : [[2, 3], [3, 4]], "f2" : set([2, 3]), "f3" : (2, 3)}]
+        True
         """
         if (rdd.__class__ is SchemaRDD):
             raise ValueError("Cannot apply schema to %s" % SchemaRDD.__name__)

@@ -413,6 +426,7 @@ def subtract(self, other, numPartitions=None):

 def _test():
     import doctest
+    from array import array
     from pyspark.context import SparkContext
     globs = globals().copy()
     # The small batch size here ensures that we see multiple batches,

@@ -422,6 +436,12 @@ def _test():
     globs['sqlCtx'] = SQLContext(sc)
     globs['rdd'] = sc.parallelize([{"field1" : 1, "field2" : "row1"},
         {"field1" : 2, "field2": "row2"}, {"field1" : 3, "field2": "row3"}])
+    globs['nestedRdd1'] = sc.parallelize([
+        {"f1" : array('i', [1, 2]), "f2" : {"row1" : 1.0}},
+        {"f1" : array('i', [2, 3]), "f2" : {"row2" : 2.0}}])
+    globs['nestedRdd2'] = sc.parallelize([
+        {"f1" : [[1, 2], [2, 3]], "f2" : set([1, 2]), "f3" : (1, 2)},
+        {"f1" : [[2, 3], [3, 4]], "f2" : set([2, 3]), "f3" : (2, 3)}])
     (failure_count, test_count) = doctest.testmod(globs=globs,optionflags=doctest.ELLIPSIS)
     globs['sc'].stop()
     if failure_count:

sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala

Lines changed: 19 additions & 10 deletions

@@ -298,19 +298,28 @@ class SQLContext(@transient val sparkContext: SparkContext)

   /**
    * Peek at the first row of the RDD and infer its schema.
-   * TODO: We only support primitive types, add support for nested types.
+   * TODO: consolidate this with the type system developed in SPARK-2060.
    */
   private[sql] def inferSchema(rdd: RDD[Map[String, _]]): SchemaRDD = {
+    import scala.collection.JavaConversions._
+    def typeFor(obj: Any): DataType = obj match {
+      case c: java.lang.String => StringType
+      case c: java.lang.Integer => IntegerType
+      case c: java.lang.Long => LongType
+      case c: java.lang.Double => DoubleType
+      case c: java.lang.Boolean => BooleanType
+      case c: java.util.List[_] => ArrayType(typeFor(c.head))
+      case c: java.util.Set[_] => ArrayType(typeFor(c.head))
+      case c: java.util.Map[_, _] =>
+        val (key, value) = c.head
+        MapType(typeFor(key), typeFor(value))
+      case c if c.getClass.isArray =>
+        val elem = c.asInstanceOf[Array[_]].head
+        ArrayType(typeFor(elem))
+      case c => throw new Exception(s"Object of type $c cannot be used")
+    }
     val schema = rdd.first().map { case (fieldName, obj) =>
-      val dataType = obj.getClass match {
-        case c: Class[_] if c == classOf[java.lang.String] => StringType
-        case c: Class[_] if c == classOf[java.lang.Integer] => IntegerType
-        case c: Class[_] if c == classOf[java.lang.Long] => LongType
-        case c: Class[_] if c == classOf[java.lang.Double] => DoubleType
-        case c: Class[_] if c == classOf[java.lang.Boolean] => BooleanType
-        case c => throw new Exception(s"Object of type $c cannot be used")
-      }
-      AttributeReference(fieldName, dataType, true)()
+      AttributeReference(fieldName, typeFor(obj), true)()
     }.toSeq

     val rowRdd = rdd.mapPartitions { iter =>
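
The new typeFor helper replaces the earlier switch on obj.getClass: it pattern-matches on the sample value itself and recurses into the elements of lists, sets, maps, and arrays, so nested collections map onto ArrayType and MapType. Below is a self-contained sketch of the same recursive idea, using stand-in type names rather than Spark's actual DataType hierarchy; every name in it is illustrative only.

// Minimal stand-in type model; these case classes are NOT Spark's DataType classes.
sealed trait DType
case object StrType extends DType
case object IntType extends DType
case object DblType extends DType
case class ArrType(element: DType) extends DType
case class MapDType(key: DType, value: DType) extends DType

// Recursively inspect a sample value: primitives map directly, collections
// recurse into their first element, mirroring the typeFor helper above.
def inferType(value: Any): DType = value match {
  case _: String => StrType
  case _: Int    => IntType
  case _: Double => DblType
  case s: Seq[_] => ArrType(inferType(s.head))
  case m: Map[_, _] =>
    val (k, v) = m.head
    MapDType(inferType(k), inferType(v))
  case other => throw new IllegalArgumentException(s"Unsupported value: $other")
}

// One sample row is enough to derive a field-name -> type mapping.
val row = Map("f1" -> Seq(1, 2), "f2" -> Map("row1" -> 1.0))
val schema = row.map { case (name, value) => name -> inferType(value) }
// schema: Map(f1 -> ArrType(IntType), f2 -> MapDType(StrType, DblType))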

sql/core/src/main/scala/org/apache/spark/sql/api/java/JavaSQLContext.scala

Lines changed: 4 additions & 12 deletions

@@ -40,19 +40,13 @@ class JavaSQLContext(val sqlContext: SQLContext) {
   /**
    * Executes a query expressed in SQL, returning the result as a JavaSchemaRDD
    */
-  def sql(sqlQuery: String): JavaSchemaRDD = {
-    val result = new JavaSchemaRDD(sqlContext, sqlContext.parseSql(sqlQuery))
-    // We force query optimization to happen right away instead of letting it happen lazily like
-    // when using the query DSL. This is so DDL commands behave as expected. This is only
-    // generates the RDD lineage for DML queries, but do not perform any execution.
-    result.queryExecution.toRdd
-    result
-  }
+  def sql(sqlQuery: String): JavaSchemaRDD =
+    new JavaSchemaRDD(sqlContext, sqlContext.parseSql(sqlQuery))

   /**
    * :: Experimental ::
    * Creates an empty parquet file with the schema of class `beanClass`, which can be registered as
-   * a table. This registered table can be used as the target of future insertInto` operations.
+   * a table. This registered table can be used as the target of future `insertInto` operations.
    *
    * {{{
    *   JavaSQLContext sqlCtx = new JavaSQLContext(...)

@@ -62,7 +56,7 @@ class JavaSQLContext(val sqlContext: SQLContext) {
    * }}}
    *
    * @param beanClass A java bean class object that will be used to determine the schema of the
-   *                  parquet file. s
+   *                  parquet file.
    * @param path The path where the directory containing parquet metadata should be created.
    *             Data inserted into this table will also be stored at this location.
    * @param allowExisting When false, an exception will be thrown if this directory already exists.

@@ -100,14 +94,12 @@ class JavaSQLContext(val sqlContext: SQLContext) {
     new JavaSchemaRDD(sqlContext, SparkLogicalPlan(ExistingRdd(schema, rowRdd)))
   }

-
   /**
    * Loads a parquet file, returning the result as a [[JavaSchemaRDD]].
    */
   def parquetFile(path: String): JavaSchemaRDD =
     new JavaSchemaRDD(sqlContext, ParquetRelation(path))

-
   /**
    * Registers the given RDD as a temporary table in the catalog. Temporary tables exist only
    * during the lifetime of this instance of SQLContext.

sql/core/src/main/scala/org/apache/spark/sql/execution/commands.scala

Lines changed: 2 additions & 2 deletions

@@ -83,8 +83,8 @@ case class ExplainCommand(
   override protected[sql] lazy val sideEffectResult: Seq[String] = this.toString.split("\n")

   def execute(): RDD[Row] = {
-    val explanation = sideEffectResult.mkString("\n")
-    context.sparkContext.parallelize(Seq(new GenericRow(Array[Any](explanation))), 1)
+    val explanation = sideEffectResult.map(row => new GenericRow(Array[Any](row)))
+    context.sparkContext.parallelize(explanation, 1)
   }

   override def otherCopyArgs = context :: Nil

sql/hive/src/main/scala/org/apache/spark/sql/hive/api/java/JavaHiveContext.scala

Lines changed: 2 additions & 8 deletions

@@ -31,12 +31,6 @@ class JavaHiveContext(sparkContext: JavaSparkContext) extends JavaSQLContext(spa
   /**
    * Executes a query expressed in HiveQL, returning the result as a JavaSchemaRDD.
    */
-  def hql(hqlQuery: String): JavaSchemaRDD = {
-    val result = new JavaSchemaRDD(sqlContext, HiveQl.parseSql(hqlQuery))
-    // We force query optimization to happen right away instead of letting it happen lazily like
-    // when using the query DSL. This is so DDL commands behave as expected. This is only
-    // generates the RDD lineage for DML queries, but do not perform any execution.
-    result.queryExecution.toRdd
-    result
-  }
+  def hql(hqlQuery: String): JavaSchemaRDD =
+    new JavaSchemaRDD(sqlContext, HiveQl.parseSql(hqlQuery))
 }
sql/hive/src/test/scala/org/apache/spark/sql/hive/api/java/JavaHiveQLSuite.scala (new file)

Lines changed: 101 additions & 0 deletions

@@ -0,0 +1,101 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.hive.api.java
+
+import scala.util.Try
+
+import org.scalatest.FunSuite
+
+import org.apache.spark.api.java.JavaSparkContext
+import org.apache.spark.sql.api.java.JavaSchemaRDD
+import org.apache.spark.sql.execution.ExplainCommand
+import org.apache.spark.sql.hive.test.TestHive
+import org.apache.spark.sql.test.TestSQLContext
+
+// Implicits
+import scala.collection.JavaConversions._
+
+class JavaHiveQLSuite extends FunSuite {
+  lazy val javaCtx = new JavaSparkContext(TestSQLContext.sparkContext)
+
+  // There is a little trickery here to avoid instantiating two HiveContexts in the same JVM
+  lazy val javaHiveCtx = new JavaHiveContext(javaCtx) {
+    override val sqlContext = TestHive
+  }
+
+  ignore("SELECT * FROM src") {
+    assert(
+      javaHiveCtx.hql("SELECT * FROM src").collect().map(_.getInt(0)) ===
+        TestHive.sql("SELECT * FROM src").collect().map(_.getInt(0)).toSeq)
+  }
+
+  private val explainCommandClassName =
+    classOf[ExplainCommand].getSimpleName.stripSuffix("$")
+
+  def isExplanation(result: JavaSchemaRDD) = {
+    val explanation = result.collect().map(_.getString(0))
+    explanation.size > 1 && explanation.head.startsWith(explainCommandClassName)
+  }
+
+  ignore("Query Hive native command execution result") {
+    val tableName = "test_native_commands"
+
+    assertResult(0) {
+      javaHiveCtx.hql(s"DROP TABLE IF EXISTS $tableName").count()
+    }
+
+    assertResult(0) {
+      javaHiveCtx.hql(s"CREATE TABLE $tableName(key INT, value STRING)").count()
+    }
+
+    javaHiveCtx.hql("SHOW TABLES").registerAsTable("show_tables")
+
+    assert(
+      javaHiveCtx
+        .hql("SELECT result FROM show_tables")
+        .collect()
+        .map(_.getString(0))
+        .contains(tableName))
+
+    assertResult(Array(Array("key", "int", "None"), Array("value", "string", "None"))) {
+      javaHiveCtx.hql(s"DESCRIBE $tableName").registerAsTable("describe_table")
+
+      javaHiveCtx
+        .hql("SELECT result FROM describe_table")
+        .collect()
+        .map(_.getString(0).split("\t").map(_.trim))
+        .toArray
+    }
+
+    assert(isExplanation(javaHiveCtx.hql(
+      s"EXPLAIN SELECT key, COUNT(*) FROM $tableName GROUP BY key")))
+
+    TestHive.reset()
+  }
+
+  ignore("Exactly once semantics for DDL and command statements") {
+    val tableName = "test_exactly_once"
+    val q0 = javaHiveCtx.hql(s"CREATE TABLE $tableName(key INT, value STRING)")
+
+    // If the table was not created, the following assertion would fail
+    assert(Try(TestHive.table(tableName)).isSuccess)
+
+    // If the CREATE TABLE command got executed again, the following assertion would fail
+    assert(Try(q0.count()).isSuccess)
+  }
+}

sql/hive/src/test/scala/org/apache/spark/sql/hive/api/java/JavaHiveSuite.scala

Lines changed: 0 additions & 41 deletions
This file was deleted.

sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala

Lines changed: 18 additions & 14 deletions

@@ -169,7 +169,7 @@ class HiveQuerySuite extends HiveComparisonTest {

   def isExplanation(result: SchemaRDD) = {
     val explanation = result.select('plan).collect().map { case Row(plan: String) => plan }
-    explanation.size == 1 && explanation.head.startsWith(explainCommandClassName)
+    explanation.size > 1 && explanation.head.startsWith(explainCommandClassName)
   }

   test("SPARK-1704: Explain commands as a SchemaRDD") {

@@ -184,25 +184,29 @@ class HiveQuerySuite extends HiveComparisonTest {
   test("Query Hive native command execution result") {
     val tableName = "test_native_commands"

-    val q0 = hql(s"DROP TABLE IF EXISTS $tableName")
-    assert(q0.count() == 0)
+    assertResult(0) {
+      hql(s"DROP TABLE IF EXISTS $tableName").count()
+    }

-    val q1 = hql(s"CREATE TABLE $tableName(key INT, value STRING)")
-    assert(q1.count() == 0)
+    assertResult(0) {
+      hql(s"CREATE TABLE $tableName(key INT, value STRING)").count()
+    }

-    val q2 = hql("SHOW TABLES")
-    val tables = q2.select('result).collect().map { case Row(table: String) => table }
-    assert(tables.contains(tableName))
+    assert(
+      hql("SHOW TABLES")
+        .select('result)
+        .collect()
+        .map(_.getString(0))
+        .contains(tableName))

-    val q3 = hql(s"DESCRIBE $tableName")
     assertResult(Array(Array("key", "int", "None"), Array("value", "string", "None"))) {
-      q3.select('result).collect().map { case Row(fieldDesc: String) =>
-        fieldDesc.split("\t").map(_.trim)
-      }
+      hql(s"DESCRIBE $tableName")
+        .select('result)
+        .collect()
+        .map(_.getString(0).split("\t").map(_.trim))
     }

-    val q4 = hql(s"EXPLAIN SELECT key, COUNT(*) FROM $tableName GROUP BY key")
-    assert(isExplanation(q4))
+    assert(isExplanation(hql(s"EXPLAIN SELECT key, COUNT(*) FROM $tableName GROUP BY key")))

     TestHive.reset()
   }

yarn/alpha/src/main/scala/org/apache/spark/deploy/yarn/Client.scala

Lines changed: 2 additions & 2 deletions

@@ -71,7 +71,7 @@ class Client(clientArgs: ClientArguments, hadoopConf: Configuration, spConf: Spa

     val capability = Records.newRecord(classOf[Resource]).asInstanceOf[Resource]
     // Memory for the ApplicationMaster.
-    capability.setMemory(args.amMemory + YarnAllocationHandler.MEMORY_OVERHEAD)
+    capability.setMemory(args.amMemory + memoryOverhead)
     amContainer.setResource(capability)

     appContext.setQueue(args.amQueue)

@@ -115,7 +115,7 @@ class Client(clientArgs: ClientArguments, hadoopConf: Configuration, spConf: Spa
     val minResMemory = newApp.getMinimumResourceCapability().getMemory()
     val amMemory = ((args.amMemory / minResMemory) * minResMemory) +
       ((if ((args.amMemory % minResMemory) == 0) 0 else minResMemory) -
-        YarnAllocationHandler.MEMORY_OVERHEAD)
+        memoryOverhead)
     amMemory
   }
