42 | 42 | from pyspark.rddsampler import RDDSampler |
43 | 43 | from pyspark.storagelevel import StorageLevel |
44 | 44 | from pyspark.resultiterable import ResultIterable |
45 | | -from pyspark.shuffle import MapMerger, ExternalHashMapMerger |
| 45 | +from pyspark.shuffle import Aggregator, InMemoryMerger, ExternalMerger, \ |
| 46 | + get_used_memory |
46 | 47 |
47 | 48 | from py4j.java_collections import ListConverter, MapConverter |
48 | 49 |
@@ -171,18 +172,20 @@ def _replaceRoot(self, value): |
171 | 172 |
172 | 173 | def _parse_memory(s): |
173 | 174 | """ |
174 | | - It returns a number in MB |
| 175 | + Parse a memory string in the format supported by Java (e.g. 1g, 200m) and |
| 176 | + return the value in MB |
175 | 177 |
176 | 178 | >>> _parse_memory("256m") |
177 | 179 | 256 |
178 | 180 | >>> _parse_memory("2g") |
179 | 181 | 2048 |
180 | 182 | """ |
181 | | - units = {'g': 1024, 'm': 1, 't': 1<<20, 'k':1.0/1024} |
| 183 | + units = {'g': 1024, 'm': 1, 't': 1 << 20, 'k': 1.0 / 1024} |
182 | 184 | if s[-1] not in units: |
183 | 185 | raise ValueError("invalid format: " + s) |
184 | 186 | return int(float(s[:-1]) * units[s[-1].lower()]) |
185 | 187 |
|
| 188 | + |
186 | 189 | class RDD(object): |
187 | 190 |
188 | 191 | """ |
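
For reference, a few extra examples of what _parse_memory accepts, evaluated against the function as shown in this hunk (illustrations only, not doctests added by the diff); note that the membership check runs before lowercasing, so only lowercase unit suffixes pass:

    >>> _parse_memory("1t")
    1048576
    >>> _parse_memory("1024k")
    1
    >>> _parse_memory("2G")
    Traceback (most recent call last):
        ...
    ValueError: invalid format: 2G
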
@@ -1198,15 +1201,25 @@ def partitionBy(self, numPartitions, partitionFunc=None): |
1198 | 1201 | # to Java. Each object is a (splitNumber, [objects]) pair. |
1199 | 1202 | outputSerializer = self.ctx._unbatched_serializer |
1200 | 1203 |
| 1204 | + limit = _parse_memory(self.ctx._conf.get("spark.python.worker.memory") |
| 1205 | + or "512m") |
1201 | 1206 | def add_shuffle_key(split, iterator): |
1202 | 1207 |
1203 | 1208 | buckets = defaultdict(list) |
1204 | | - |
| 1209 | + c, batch = 0, 1000 |
1205 | 1210 | for (k, v) in iterator: |
1206 | 1211 | buckets[partitionFunc(k) % numPartitions].append((k, v)) |
| 1212 | + c += 1 |
| 1213 | + if c % batch == 0 and get_used_memory() > limit: |
| 1214 | + for split in buckets.keys(): |
| 1215 | + yield pack_long(split) |
| 1216 | + yield outputSerializer.dumps(buckets[split]) |
| 1217 | + del buckets[split] |
| 1218 | + |
1207 | 1219 | for (split, items) in buckets.iteritems(): |
1208 | 1220 | yield pack_long(split) |
1209 | 1221 | yield outputSerializer.dumps(items) |
| 1222 | + |
1210 | 1223 | keyed = PipelinedRDD(self, add_shuffle_key) |
1211 | 1224 | keyed._bypass_serializer = True |
1212 | 1225 | with _JavaStackTrace(self.context) as st: |
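
To make the new early-flush logic in add_shuffle_key easier to follow, here is a minimal standalone sketch of the same pattern: bucket records by partition, check memory only once per batch of records, and flush every bucket as soon as the soft limit is exceeded. The get_used_memory stand-in below (resident set size via the resource module, assumed to be reported in KB as on Linux) and the plain hash-based partitioner are simplifying assumptions, not the pyspark implementations.

    import resource
    from collections import defaultdict

    def get_used_memory():
        # Simplified stand-in for pyspark.shuffle.get_used_memory:
        # peak RSS in MB (ru_maxrss is reported in KB on Linux).
        return resource.getrusage(resource.RUSAGE_SELF).ru_maxrss >> 10

    def bucketed(iterator, num_partitions, limit_mb, batch=1000):
        """Yield (split, [records]) pairs, flushing early under memory pressure."""
        buckets = defaultdict(list)
        for c, (k, v) in enumerate(iterator, 1):
            buckets[hash(k) % num_partitions].append((k, v))
            # Checking memory on every record would be expensive,
            # so only look once per `batch` records.
            if c % batch == 0 and get_used_memory() > limit_mb:
                for split in list(buckets):
                    yield split, buckets.pop(split)
        # Emit whatever remains at the end of the partition.
        for split, items in buckets.items():
            yield split, items

The only visible effect of an early flush is that a bucket may be emitted as several smaller batches instead of one large one, which the downstream grouping handles the same way.
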
@@ -1251,27 +1264,26 @@ def combineByKey(self, createCombiner, mergeValue, mergeCombiners, |
1251 | 1264 | if numPartitions is None: |
1252 | 1265 | numPartitions = self._defaultReducePartitions() |
1253 | 1266 |
| 1267 | + serializer = self.ctx.serializer |
| 1268 | + spill = (self.ctx._conf.get("spark.shuffle.spill") or 'True').lower() == 'true' |
| 1269 | + memory = _parse_memory(self.ctx._conf.get("spark.python.worker.memory") or "512m") |
| 1270 | + agg = Aggregator(createCombiner, mergeValue, mergeCombiners) |
| 1271 | + |
1254 | 1272 | def combineLocally(iterator): |
1255 | | - combiners = {} |
1256 | | - for x in iterator: |
1257 | | - (k, v) = x |
1258 | | - if k not in combiners: |
1259 | | - combiners[k] = createCombiner(v) |
1260 | | - else: |
1261 | | - combiners[k] = mergeValue(combiners[k], v) |
1262 | | - return combiners.iteritems() |
| 1273 | + merger = ExternalMerger(agg, memory, serializer) \ |
| 1274 | + if spill else InMemoryMerger(agg) |
| 1275 | + merger.combine(iterator) |
| 1276 | + return merger.iteritems() |
| 1277 | + |
1263 | 1278 | locally_combined = self.mapPartitions(combineLocally) |
1264 | 1279 | shuffled = locally_combined.partitionBy(numPartitions) |
1265 | 1280 |
1266 | | - serializer = self.ctx.serializer |
1267 | | - spill = ((self.ctx._conf.get("spark.shuffle.spill") or 'True').lower() |
1268 | | - in ('true', '1', 'yes')) |
1269 | | - memory = _parse_memory(self.ctx._conf.get("spark.python.worker.memory") or "512m") |
1270 | 1281 | def _mergeCombiners(iterator): |
1271 | | - merger = ExternalHashMapMerger(mergeCombiners, memory, serializer)\ |
1272 | | - if spill else MapMerger(mergeCombiners) |
| 1282 | + merger = ExternalMerger(agg, memory, serializer) \ |
| 1283 | + if spill else InMemoryMerger(agg) |
1273 | 1284 | merger.merge(iterator) |
1274 | 1285 | return merger.iteritems() |
| 1286 | + |
1275 | 1287 | return shuffled.mapPartitions(_mergeCombiners) |
1276 | 1288 |
1277 | 1289 | def aggregateByKey(self, zeroValue, seqFunc, combFunc, numPartitions=None): |
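
The removed dict-based combineLocally survives conceptually as the non-spilling merger. Below is a rough sketch of the contract the two mergers are assumed to satisfy here: combine() folds raw values on the map side, merge() folds already-built combiners after the shuffle, and iteritems() drains the result. It mirrors the constructor and method names used in this hunk, but it is an illustration of the expected semantics, not the pyspark.shuffle implementation, and it ignores the spilling ExternalMerger entirely.

    class Aggregator(object):
        # Same constructor shape as used above:
        # Aggregator(createCombiner, mergeValue, mergeCombiners).
        def __init__(self, createCombiner, mergeValue, mergeCombiners):
            self.createCombiner = createCombiner
            self.mergeValue = mergeValue
            self.mergeCombiners = mergeCombiners

    class InMemoryMerger(object):
        def __init__(self, aggregator):
            self.agg = aggregator
            self.data = {}

        def combine(self, iterator):
            # Map side: fold raw values into combiners, as the removed
            # dict-based combineLocally did.
            d = self.data
            create, merge = self.agg.createCombiner, self.agg.mergeValue
            for k, v in iterator:
                d[k] = merge(d[k], v) if k in d else create(v)

        def merge(self, iterator):
            # Reduce side: merge combiners coming back from the shuffle.
            d, comb = self.data, self.agg.mergeCombiners
            for k, v in iterator:
                d[k] = comb(d[k], v) if k in d else v

        def iteritems(self):
            return iter(self.data.items())

    # Word-count style usage of the assumed contract:
    agg = Aggregator(lambda v: v, lambda c, v: c + v, lambda c1, c2: c1 + c2)
    m = InMemoryMerger(agg)
    m.combine([("a", 1), ("b", 1), ("a", 1)])
    assert dict(m.iteritems()) == {"a": 2, "b": 1}
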