Commit 8d7b82f

Merge branch 'master' of https://github.com/apache/spark into yarn_ClientBase
2 parents 1048549 + 7dd9fc6 commit 8d7b82f

40 files changed, +326 -199 lines

core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala

Lines changed: 7 additions & 4 deletions
@@ -139,10 +139,13 @@ class HadoopRDD[K, V](
       // Create a JobConf that will be cached and used across this RDD's getJobConf() calls in the
       // local process. The local cache is accessed through HadoopRDD.putCachedMetadata().
       // The caching helps minimize GC, since a JobConf can contain ~10KB of temporary objects.
-      val newJobConf = new JobConf(broadcastedConf.value.value)
-      initLocalJobConfFuncOpt.map(f => f(newJobConf))
-      HadoopRDD.putCachedMetadata(jobConfCacheKey, newJobConf)
-      newJobConf
+      // synchronize to prevent ConcurrentModificationException (Spark-1097, Hadoop-10456)
+      broadcastedConf.synchronized {
+        val newJobConf = new JobConf(broadcastedConf.value.value)
+        initLocalJobConfFuncOpt.map(f => f(newJobConf))
+        HadoopRDD.putCachedMetadata(jobConfCacheKey, newJobConf)
+        newJobConf
+      }
     }
   }
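
The change above serializes JobConf construction on the broadcast variable. As a rough illustration of why the lock is needed (this sketch is not part of the commit; the object and method names are made up), Hadoop's Configuration/JobConf copy constructor iterates over the source object's properties, so cloning a configuration that another thread may be touching can throw ConcurrentModificationException (HADOOP-10456); taking a shared lock around the clone removes the race:

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.mapred.JobConf

// Illustrative sketch only: mirrors the pattern HadoopRDD now uses, where every
// clone of the broadcast configuration happens under the same monitor so that no
// two threads walk the source's properties at the same time.
object SafeJobConfClone {
  def cloneConf(shared: Configuration): JobConf =
    shared.synchronized {
      new JobConf(shared) // the copy constructor reads all of shared's entries
    }
}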

core/src/main/scala/org/apache/spark/rdd/ParallelCollectionRDD.scala

Lines changed: 19 additions & 12 deletions
@@ -117,6 +117,15 @@ private object ParallelCollectionRDD {
     if (numSlices < 1) {
       throw new IllegalArgumentException("Positive number of slices required")
     }
+    // Sequences need to be sliced at the same set of index positions for operations
+    // like RDD.zip() to behave as expected
+    def positions(length: Long, numSlices: Int): Iterator[(Int, Int)] = {
+      (0 until numSlices).iterator.map(i => {
+        val start = ((i * length) / numSlices).toInt
+        val end = (((i + 1) * length) / numSlices).toInt
+        (start, end)
+      })
+    }
     seq match {
       case r: Range.Inclusive => {
         val sign = if (r.step < 0) {
@@ -128,30 +137,28 @@
           r.start, r.end + sign, r.step).asInstanceOf[Seq[T]], numSlices)
       }
       case r: Range => {
-        (0 until numSlices).map(i => {
-          val start = ((i * r.length.toLong) / numSlices).toInt
-          val end = (((i + 1) * r.length.toLong) / numSlices).toInt
-          new Range(r.start + start * r.step, r.start + end * r.step, r.step)
-        }).asInstanceOf[Seq[Seq[T]]]
+        positions(r.length, numSlices).map({
+          case (start, end) =>
+            new Range(r.start + start * r.step, r.start + end * r.step, r.step)
+        }).toSeq.asInstanceOf[Seq[Seq[T]]]
       }
       case nr: NumericRange[_] => {
         // For ranges of Long, Double, BigInteger, etc
         val slices = new ArrayBuffer[Seq[T]](numSlices)
-        val sliceSize = (nr.size + numSlices - 1) / numSlices // Round up to catch everything
         var r = nr
-        for (i <- 0 until numSlices) {
+        for ((start, end) <- positions(nr.length, numSlices)) {
+          val sliceSize = end - start
           slices += r.take(sliceSize).asInstanceOf[Seq[T]]
           r = r.drop(sliceSize)
         }
         slices
       }
       case _ => {
         val array = seq.toArray // To prevent O(n^2) operations for List etc
-        (0 until numSlices).map(i => {
-          val start = ((i * array.length.toLong) / numSlices).toInt
-          val end = (((i + 1) * array.length.toLong) / numSlices).toInt
-          array.slice(start, end).toSeq
-        })
+        positions(array.length, numSlices).map({
+          case (start, end) =>
+            array.slice(start, end).toSeq
+        }).toSeq
       }
     }
   }
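
To make the slicing arithmetic concrete, here is a standalone copy of the positions helper introduced above (the real one is local to ParallelCollectionRDD.slice), together with the boundaries it produces for a length-7 sequence split into 4 slices; every Seq type now slices at these same boundaries, which is what keeps RDD.zip() partitions aligned:

// Standalone sketch of the helper added in this commit.
def positions(length: Long, numSlices: Int): Iterator[(Int, Int)] =
  (0 until numSlices).iterator.map { i =>
    (((i * length) / numSlices).toInt, (((i + 1) * length) / numSlices).toInt)
  }

// For length = 7 and numSlices = 4 the boundaries are (0,1), (1,3), (3,5), (5,7),
// i.e. slice sizes 1, 2, 2, 2, whether the data is a Range, a NumericRange or a List.
positions(7L, 4).foreach(println)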

core/src/test/scala/org/apache/spark/rdd/ParallelCollectionSplitSuite.scala

Lines changed: 18 additions & 0 deletions
@@ -111,6 +111,24 @@ class ParallelCollectionSplitSuite extends FunSuite with Checkers {
     assert(slices.forall(_.isInstanceOf[Range]))
   }
 
+  test("identical slice sizes between Range and NumericRange") {
+    val r = ParallelCollectionRDD.slice(1 to 7, 4)
+    val nr = ParallelCollectionRDD.slice(1L to 7L, 4)
+    assert(r.size === 4)
+    for (i <- 0 until r.size) {
+      assert(r(i).size === nr(i).size)
+    }
+  }
+
+  test("identical slice sizes between List and NumericRange") {
+    val r = ParallelCollectionRDD.slice(List(1, 2), 4)
+    val nr = ParallelCollectionRDD.slice(1L to 2L, 4)
+    assert(r.size === 4)
+    for (i <- 0 until r.size) {
+      assert(r(i).size === nr(i).size)
+    }
+  }
+
   test("large ranges don't overflow") {
     val N = 100 * 1000 * 1000
     val data = 0 until N

docs/programming-guide.md

Lines changed: 7 additions & 5 deletions
@@ -377,13 +377,15 @@ Some notes on reading files with Spark:
 
 * The `textFile` method also takes an optional second argument for controlling the number of slices of the file. By default, Spark creates one slice for each block of the file (blocks being 64MB by default in HDFS), but you can also ask for a higher number of slices by passing a larger value. Note that you cannot have fewer slices than blocks.
 
-Apart from reading files as a collection of lines,
-`SparkContext.wholeTextFiles` lets you read a directory containing multiple small text files, and returns each of them as (filename, content) pairs. This is in contrast with `textFile`, which would return one record per line in each file.
+Apart from text files, Spark's Python API also supports several other data formats:
 
-### SequenceFile and Hadoop InputFormats
+* `SparkContext.wholeTextFiles` lets you read a directory containing multiple small text files, and returns each of them as (filename, content) pairs. This is in contrast with `textFile`, which would return one record per line in each file.
+
+* `RDD.saveAsPickleFile` and `SparkContext.pickleFile` support saving an RDD in a simple format consisting of pickled Python objects. Batching is used on pickle serialization, with default batch size 10.
 
-In addition to reading text files, PySpark supports reading ```SequenceFile```
-and any arbitrary ```InputFormat```.
+* Details on reading `SequenceFile` and arbitrary Hadoop `InputFormat` are given below.
+
+### SequenceFile and Hadoop InputFormats
 
 **Note** this feature is currently marked ```Experimental``` and is intended for advanced users. It may be replaced in future with read/write support based on SparkSQL, in which case SparkSQL is the preferred approach.
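
The (filename, content) behaviour described above works the same way in the Scala API; a minimal Scala sketch for orientation (the paths are placeholders and `sc` is assumed to be an existing SparkContext; `saveAsPickleFile`/`pickleFile` are Python-only and are not shown):

// Hypothetical paths, for illustration only.
val perLine = sc.textFile("hdfs://ns/logs/2014-06-03")        // one record per line
val perFile = sc.wholeTextFiles("hdfs://ns/logs/small-files") // (filename, content) pairs
val preview = perFile.map { case (name, content) => (name, content.take(80)) }.take(2)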

docs/streaming-programming-guide.md

Lines changed: 1 addition & 1 deletion
@@ -950,7 +950,7 @@ is 200 milliseconds.
 
 An alternative to receiving data with multiple input streams / receivers is to explicitly repartition
 the input data stream (using `inputStream.repartition(<number of partitions>)`).
-This distributes the received batches of data across all the machines in the cluster
+This distributes the received batches of data across specified number of machines in the cluster
 before further processing.
 
 ### Level of Parallelism in Data Processing
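
A short Scala sketch of the repartition call being documented (host name, port and partition count are placeholders; `ssc` is assumed to be an existing StreamingContext):

// One socket receiver produces relatively few partitions per batch; repartition(10)
// spreads each received batch over 10 partitions before the heavy processing runs.
val lines = ssc.socketTextStream("stream-host", 9999)
val words = lines.repartition(10).flatMap(_.split(" "))
words.count().print()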

python/pyspark/sql.py

Lines changed: 3 additions & 1 deletion
@@ -16,6 +16,7 @@
 #
 
 from pyspark.rdd import RDD
+from pyspark.serializers import BatchedSerializer, PickleSerializer
 
 from py4j.protocol import Py4JError
 
@@ -346,7 +347,8 @@ def _toPython(self):
        # TODO: This is inefficient, we should construct the Python Row object
        # in Java land in the javaToPython function. May require a custom
        # pickle serializer in Pyrolite
-        return RDD(jrdd, self._sc, self._sc.serializer).map(lambda d: Row(d))
+        return RDD(jrdd, self._sc, BatchedSerializer(
+            PickleSerializer())).map(lambda d: Row(d))
 
    # We override the default cache/persist/checkpoint behavior as we want to cache the underlying
    # SchemaRDD object in the JVM, not the PythonRDD checkpointed by the super class

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/commands.scala

Lines changed: 10 additions & 8 deletions
@@ -17,7 +17,7 @@
 
 package org.apache.spark.sql.catalyst.plans.logical
 
-import org.apache.spark.sql.catalyst.expressions.{AttributeReference, Attribute}
+import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, BoundReference}
 import org.apache.spark.sql.catalyst.types.StringType
 
 /**
@@ -26,35 +26,37 @@ import org.apache.spark.sql.catalyst.types.StringType
  */
 abstract class Command extends LeafNode {
   self: Product =>
-  def output: Seq[Attribute] = Seq.empty // TODO: SPARK-2081 should fix this
+  def output: Seq[Attribute] = Seq.empty
 }
 
 /**
  * Returned for commands supported by a given parser, but not catalyst. In general these are DDL
 * commands that are passed directly to another system.
 */
-case class NativeCommand(cmd: String) extends Command
+case class NativeCommand(cmd: String) extends Command {
+  override def output =
+    Seq(BoundReference(0, AttributeReference("result", StringType, nullable = false)()))
+}
 
 /**
 * Commands of the form "SET (key) (= value)".
 */
 case class SetCommand(key: Option[String], value: Option[String]) extends Command {
   override def output = Seq(
-    AttributeReference("key", StringType, nullable = false)(),
-    AttributeReference("value", StringType, nullable = false)()
-  )
+    BoundReference(0, AttributeReference("key", StringType, nullable = false)()),
+    BoundReference(1, AttributeReference("value", StringType, nullable = false)()))
 }
 
 /**
 * Returned by a parser when the users only wants to see what query plan would be executed, without
 * actually performing the execution.
 */
 case class ExplainCommand(plan: LogicalPlan) extends Command {
-  override def output = Seq(AttributeReference("plan", StringType, nullable = false)())
+  override def output =
+    Seq(BoundReference(0, AttributeReference("plan", StringType, nullable = false)()))
 }
 
 /**
 * Returned for the "CACHE TABLE tableName" and "UNCACHE TABLE tableName" command.
 */
 case class CacheCommand(tableName: String, doCache: Boolean) extends Command
-

sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/FilterPushdownSuite.scala

Lines changed: 1 addition & 1 deletion
@@ -161,7 +161,7 @@ class FilterPushdownSuite extends OptimizerTest {
 
     comparePlans(optimized, correctAnswer)
   }
-  
+
   test("joins: push down left outer join #1") {
     val x = testRelation.subquery('x)
     val y = testRelation.subquery('y)

sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala

Lines changed: 5 additions & 34 deletions
@@ -31,7 +31,7 @@ import org.apache.spark.sql.catalyst.{ScalaReflection, dsl}
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.types._
 import org.apache.spark.sql.catalyst.optimizer.Optimizer
-import org.apache.spark.sql.catalyst.plans.logical.{SetCommand, LogicalPlan}
+import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
 import org.apache.spark.sql.catalyst.rules.RuleExecutor
 
 import org.apache.spark.sql.columnar.InMemoryRelation
@@ -147,14 +147,7 @@ class SQLContext(@transient val sparkContext: SparkContext)
   *
   * @group userf
   */
-  def sql(sqlText: String): SchemaRDD = {
-    val result = new SchemaRDD(this, parseSql(sqlText))
-    // We force query optimization to happen right away instead of letting it happen lazily like
-    // when using the query DSL. This is so DDL commands behave as expected. This is only
-    // generates the RDD lineage for DML queries, but do not perform any execution.
-    result.queryExecution.toRdd
-    result
-  }
+  def sql(sqlText: String): SchemaRDD = new SchemaRDD(this, parseSql(sqlText))
 
   /** Returns the specified table as a SchemaRDD */
   def table(tableName: String): SchemaRDD =
@@ -259,8 +252,7 @@ class SQLContext(@transient val sparkContext: SparkContext)
   protected[sql] val planner = new SparkPlanner
 
   @transient
-  protected[sql] lazy val emptyResult =
-    sparkContext.parallelize(Seq(new GenericRow(Array[Any]()): Row), 1)
+  protected[sql] lazy val emptyResult = sparkContext.parallelize(Seq.empty[Row], 1)
 
   /**
   * Prepares a planned SparkPlan for execution by binding references to specific ordinals, and
@@ -280,35 +272,14 @@ class SQLContext(@transient val sparkContext: SparkContext)
   protected abstract class QueryExecution {
     def logical: LogicalPlan
 
-    def eagerlyProcess(plan: LogicalPlan): RDD[Row] = plan match {
-      case SetCommand(key, value) =>
-        // Only this case needs to be executed eagerly. The other cases will
-        // be taken care of when the actual results are being extracted.
-        // In the case of HiveContext, sqlConf is overridden to also pass the
-        // pair into its HiveConf.
-        if (key.isDefined && value.isDefined) {
-          set(key.get, value.get)
-        }
-        // It doesn't matter what we return here, since this is only used
-        // to force the evaluation to happen eagerly. To query the results,
-        // one must use SchemaRDD operations to extract them.
-        emptyResult
-      case _ => executedPlan.execute()
-    }
-
     lazy val analyzed = analyzer(logical)
     lazy val optimizedPlan = optimizer(analyzed)
     // TODO: Don't just pick the first one...
     lazy val sparkPlan = planner(optimizedPlan).next()
     lazy val executedPlan: SparkPlan = prepareForExecution(sparkPlan)
 
     /** Internal version of the RDD. Avoids copies and has no schema */
-    lazy val toRdd: RDD[Row] = {
-      logical match {
-        case s: SetCommand => eagerlyProcess(s)
-        case _ => executedPlan.execute()
-      }
-    }
+    lazy val toRdd: RDD[Row] = executedPlan.execute()
 
     protected def stringOrError[A](f: => A): String =
       try f.toString catch { case e: Throwable => e.toString }
@@ -330,7 +301,7 @@ class SQLContext(@transient val sparkContext: SparkContext)
   * TODO: We only support primitive types, add support for nested types.
   */
  private[sql] def inferSchema(rdd: RDD[Map[String, _]]): SchemaRDD = {
-    val schema = rdd.first.map { case (fieldName, obj) =>
+    val schema = rdd.first().map { case (fieldName, obj) =>
      val dataType = obj.getClass match {
        case c: Class[_] if c == classOf[java.lang.String] => StringType
        case c: Class[_] if c == classOf[java.lang.Integer] => IntegerType
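
In effect, sql() no longer forces eager execution: it only parses the statement and wraps the logical plan, and commands such as SET are handled by their own operators when the RDD is actually evaluated. A hedged usage sketch (the table name and query are made up; `sqlContext` is assumed to be an existing SQLContext):

// Nothing is planned or executed here; only parsing happens.
val adults = sqlContext.sql("SELECT name FROM people WHERE age >= 21")

// Planning and execution are triggered by the first action on the SchemaRDD.
val names = adults.collect()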

sql/core/src/main/scala/org/apache/spark/sql/SchemaRDD.scala

Lines changed: 3 additions & 8 deletions
@@ -97,7 +97,7 @@ import java.util.{Map => JMap}
 @AlphaComponent
 class SchemaRDD(
     @transient val sqlContext: SQLContext,
-    @transient protected[spark] val logicalPlan: LogicalPlan)
+    @transient val baseLogicalPlan: LogicalPlan)
   extends RDD[Row](sqlContext.sparkContext, Nil) with SchemaRDDLike {
 
   def baseSchemaRDD = this
@@ -347,16 +347,11 @@ class SchemaRDD(
       val pickle = new Pickler
       iter.map { row =>
         val map: JMap[String, Any] = new java.util.HashMap
-        // TODO: We place the map in an ArrayList so that the object is pickled to a List[Dict].
-        // Ideally we should be able to pickle an object directly into a Python collection so we
-        // don't have to create an ArrayList every time.
-        val arr: java.util.ArrayList[Any] = new java.util.ArrayList
         row.zip(fieldNames).foreach { case (obj, name) =>
           map.put(name, obj)
         }
-        arr.add(map)
-        pickle.dumps(arr)
-      }
+        map
+      }.grouped(10).map(batched => pickle.dumps(batched.toArray))
     }
   }
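
The javaToPython side now emits one pickled payload per group of 10 rows instead of one per row, matching the BatchedSerializer(PickleSerializer()) introduced in pyspark/sql.py above. A small standalone sketch of the grouping step (plain Scala collections, not the SchemaRDD code itself):

// grouped(10) turns an iterator of per-row maps into an iterator of 10-element
// batches, so each dumps() call would serialize a list of dicts on the Python side.
val rows    = Iterator.range(0, 25).map(i => Map("id" -> i))
val batches = rows.grouped(10).map(_.toArray)
println(batches.map(_.length).toList) // List(10, 10, 5)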
