From d8e51f9dd21cdffb5f8eb1f6312b761529dbcb9b Mon Sep 17 00:00:00 2001 From: Ken Date: Tue, 8 Jul 2014 18:31:41 -0700 Subject: [PATCH 001/347] initial commit for pySparkStreaming --- bin/spark-submit | 6 + core/pom.xml | 2 +- .../apache/spark/api/python/PythonRDD.scala | 2 +- .../apache/spark/deploy/PythonRunner.scala | 1 + .../src/main/python/streaming/wordcount.py | 22 ++ python/pyspark/java_gateway.py | 3 + python/pyspark/streaming/__init__.py | 1 + python/pyspark/streaming/context.py | 133 ++++++++ python/pyspark/streaming/dstream.py | 315 ++++++++++++++++++ python/pyspark/streaming/duration.py | 171 ++++++++++ python/pyspark/streaming/jtime.py | 116 +++++++ python/pyspark/streaming/pyprint.py | 28 ++ python/pyspark/streaming/utils.py | 18 + streaming/pom.xml | 14 +- .../streaming/api/java/JavaDStreamLike.scala | 8 + .../streaming/api/python/PythonDStream.scala | 152 +++++++++ .../spark/streaming/dstream/DStream.scala | 68 +++- 17 files changed, 1050 insertions(+), 10 deletions(-) create mode 100644 examples/src/main/python/streaming/wordcount.py create mode 100644 python/pyspark/streaming/__init__.py create mode 100644 python/pyspark/streaming/context.py create mode 100644 python/pyspark/streaming/dstream.py create mode 100644 python/pyspark/streaming/duration.py create mode 100644 python/pyspark/streaming/jtime.py create mode 100644 python/pyspark/streaming/pyprint.py create mode 100644 python/pyspark/streaming/utils.py create mode 100644 streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala diff --git a/bin/spark-submit b/bin/spark-submit index 9e7cecedd0325..ac275b7696d5c 100755 --- a/bin/spark-submit +++ b/bin/spark-submit @@ -37,6 +37,12 @@ done DEPLOY_MODE=${DEPLOY_MODE:-"client"} +# Figure out which Python executable to use +if [[ -z "$PYSPARK_PYTHON" ]]; then + PYSPARK_PYTHON="python" +fi +export PYSPARK_PYTHON + if [ -n "$DRIVER_MEMORY" ] && [ $DEPLOY_MODE == "client" ]; then export SPARK_DRIVER_MEMORY=$DRIVER_MEMORY fi diff --git a/core/pom.xml b/core/pom.xml index 8c23842730e37..43633dcb63f54 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent - 1.1.0-SNAPSHOT + 1.0.0 ../pom.xml diff --git a/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala b/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala index f6570d335757a..e88a54d2086ea 100644 --- a/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala +++ b/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala @@ -252,7 +252,7 @@ private class PythonException(msg: String, cause: Exception) extends RuntimeExce * Form an RDD[(Array[Byte], Array[Byte])] from key-value pairs returned from Python. * This is used by PySpark's shuffle operations. 
*/ -private class PairwiseRDD(prev: RDD[Array[Byte]]) extends +private[spark] class PairwiseRDD(prev: RDD[Array[Byte]]) extends RDD[(Long, Array[Byte])](prev) { override def getPartitions = prev.partitions override def compute(split: Partition, context: TaskContext) = diff --git a/core/src/main/scala/org/apache/spark/deploy/PythonRunner.scala b/core/src/main/scala/org/apache/spark/deploy/PythonRunner.scala index 0d6751f3fa6d2..89f3fd47724fe 100644 --- a/core/src/main/scala/org/apache/spark/deploy/PythonRunner.scala +++ b/core/src/main/scala/org/apache/spark/deploy/PythonRunner.scala @@ -57,6 +57,7 @@ object PythonRunner { val builder = new ProcessBuilder(Seq(pythonExec, "-u", formattedPythonFile) ++ otherArgs) val env = builder.environment() env.put("PYTHONPATH", pythonPath) + env.put("PYSPARK_PYTHON", pythonExec) env.put("PYSPARK_GATEWAY_PORT", "" + gatewayServer.getListeningPort) builder.redirectErrorStream(true) // Ugly but needed for stdout and stderr to synchronize val process = builder.start() diff --git a/examples/src/main/python/streaming/wordcount.py b/examples/src/main/python/streaming/wordcount.py new file mode 100644 index 0000000000000..f44cd696894ba --- /dev/null +++ b/examples/src/main/python/streaming/wordcount.py @@ -0,0 +1,22 @@ +import sys +from operator import add + +from pyspark.streaming.context import StreamingContext +from pyspark.streaming.duration import * + +if __name__ == "__main__": + if len(sys.argv) != 2: + print >> sys.stderr, "Usage: wordcount " + exit(-1) + ssc = StreamingContext(appName="PythonStreamingWordCount", duration=Seconds(1)) + + lines = ssc.textFileStream(sys.argv[1]) + fm_lines = lines.flatMap(lambda x: x.split(" ")) + filtered_lines = fm_lines.filter(lambda line: "Spark" in line) + mapped_lines = fm_lines.map(lambda x: (x, 1)) + + fm_lines.pyprint() + filtered_lines.pyprint() + mapped_lines.pyprint() + ssc.start() + ssc.awaitTermination() diff --git a/python/pyspark/java_gateway.py b/python/pyspark/java_gateway.py index 0dbead4415b02..7038c6422be47 100644 --- a/python/pyspark/java_gateway.py +++ b/python/pyspark/java_gateway.py @@ -82,6 +82,9 @@ def run(self): java_import(gateway.jvm, "org.apache.spark.SparkConf") java_import(gateway.jvm, "org.apache.spark.api.java.*") java_import(gateway.jvm, "org.apache.spark.api.python.*") + java_import(gateway.jvm, "org.apache.spark.streaming.*") + java_import(gateway.jvm, "org.apache.spark.streaming.api.java.*") + java_import(gateway.jvm, "org.apache.spark.streaming.api.python.*") java_import(gateway.jvm, "org.apache.spark.mllib.api.python.*") java_import(gateway.jvm, "org.apache.spark.sql.SQLContext") java_import(gateway.jvm, "org.apache.spark.sql.hive.HiveContext") diff --git a/python/pyspark/streaming/__init__.py b/python/pyspark/streaming/__init__.py new file mode 100644 index 0000000000000..719592912e80c --- /dev/null +++ b/python/pyspark/streaming/__init__.py @@ -0,0 +1 @@ +__author__ = 'ktakagiw' diff --git a/python/pyspark/streaming/context.py b/python/pyspark/streaming/context.py new file mode 100644 index 0000000000000..c8ae9c4af85c9 --- /dev/null +++ b/python/pyspark/streaming/context.py @@ -0,0 +1,133 @@ +__author__ = 'ktakagiw' + + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. 
+# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import os +import shutil +import sys +from threading import Lock +from tempfile import NamedTemporaryFile + +from pyspark import accumulators +from pyspark.accumulators import Accumulator +from pyspark.broadcast import Broadcast +from pyspark.conf import SparkConf +from pyspark.files import SparkFiles +from pyspark.java_gateway import launch_gateway +from pyspark.serializers import PickleSerializer, BatchedSerializer, UTF8Deserializer +from pyspark.storagelevel import StorageLevel +from pyspark.rdd import RDD +from pyspark.context import SparkContext + +from py4j.java_collections import ListConverter + +from pyspark.streaming.dstream import DStream + +class StreamingContext(object): + """ + Main entry point for Spark functionality. A StreamingContext represents the + connection to a Spark cluster, and can be used to create L{RDD}s and + broadcast variables on that cluster. + """ + + def __init__(self, master=None, appName=None, sparkHome=None, pyFiles=None, + environment=None, batchSize=1024, serializer=PickleSerializer(), conf=None, + gateway=None, duration=None): + """ + Create a new StreamingContext. At least the master and app name and duration + should be set, either through the named parameters here or through C{conf}. + + @param master: Cluster URL to connect to + (e.g. mesos://host:port, spark://host:port, local[4]). + @param appName: A name for your job, to display on the cluster web UI. + @param sparkHome: Location where Spark is installed on cluster nodes. + @param pyFiles: Collection of .zip or .py files to send to the cluster + and add to PYTHONPATH. These can be paths on the local file + system or HDFS, HTTP, HTTPS, or FTP URLs. + @param environment: A dictionary of environment variables to set on + worker nodes. + @param batchSize: The number of Python objects represented as a single + Java object. Set 1 to disable batching or -1 to use an + unlimited batch size. + @param serializer: The serializer for RDDs. + @param conf: A L{SparkConf} object setting Spark properties. + @param gateway: Use an existing gateway and JVM, otherwise a new JVM + will be instatiated. 
+ @param duration: A L{Duration} Duration for SparkStreaming + + """ + # Create the Python Sparkcontext + self._sc = SparkContext(master=master, appName=appName, sparkHome=sparkHome, + pyFiles=pyFiles, environment=environment, batchSize=batchSize, + serializer=serializer, conf=conf, gateway=gateway) + self._jvm = self._sc._jvm + self._jssc = self._initialize_context(self._sc._jsc, duration._jduration) + + # Initialize StremaingContext in function to allow subclass specific initialization + def _initialize_context(self, jspark_context, jduration): + return self._jvm.JavaStreamingContext(jspark_context, jduration) + + def actorStream(self, props, name, storageLevel, supervisorStrategy): + raise NotImplementedError + + def addStreamingListener(self, streamingListener): + raise NotImplementedError + + def awaitTermination(self, timeout=None): + if timeout: + self._jssc.awaitTermination(timeout) + else: + self._jssc.awaitTermination() + + def checkpoint(self, directory): + raise NotImplementedError + + def fileStream(self, directory, filter=None, newFilesOnly=None): + raise NotImplementedError + + def networkStream(self, receiver): + raise NotImplementedError + + def queueStream(self, queue, oneAtATime=True, defaultRDD=None): + raise NotImplementedError + + def rawSocketStream(self, hostname, port, storagelevel): + raise NotImplementedError + + def remember(self, duration): + raise NotImplementedError + + def socketStream(hostname, port, converter,storageLevel): + raise NotImplementedError + + def start(self): + self._jssc.start() + + def stop(self, stopSparkContext=True): + raise NotImplementedError + + def textFileStream(self, directory): + return DStream(self._jssc.textFileStream(directory), self, UTF8Deserializer()) + + def transform(self, seq): + raise NotImplementedError + + def union(self, seq): + raise NotImplementedError + diff --git a/python/pyspark/streaming/dstream.py b/python/pyspark/streaming/dstream.py new file mode 100644 index 0000000000000..b422b147d11e1 --- /dev/null +++ b/python/pyspark/streaming/dstream.py @@ -0,0 +1,315 @@ +from base64 import standard_b64encode as b64enc +import copy +from collections import defaultdict +from collections import namedtuple +from itertools import chain, ifilter, imap +import operator +import os +import sys +import shlex +import traceback +from subprocess import Popen, PIPE +from tempfile import NamedTemporaryFile +from threading import Thread +import warnings +import heapq +from random import Random + +from pyspark.serializers import NoOpSerializer, CartesianDeserializer, \ + BatchedSerializer, CloudPickleSerializer, PairDeserializer, pack_long +from pyspark.join import python_join, python_left_outer_join, \ + python_right_outer_join, python_cogroup +from pyspark.statcounter import StatCounter +from pyspark.rddsampler import RDDSampler +from pyspark.storagelevel import StorageLevel +#from pyspark.resultiterable import ResultIterable +from pyspark.rdd import _JavaStackTrace + +from py4j.java_collections import ListConverter, MapConverter + +__all__ = ["DStream"] + +class DStream(object): + def __init__(self, jdstream, ssc, jrdd_deserializer): + self._jdstream = jdstream + self._ssc = ssc + self.ctx = ssc._sc + self._jrdd_deserializer = jrdd_deserializer + + def generatedRDDs(self): + """ + // RDDs generated, marked as private[streaming] so that testsuites can access it + @transient + """ + pass + + def print_(self): + """ + """ + # print is a resrved name of Python. 
We cannot give print to function name + getattr(self._jdstream, "print")() + + def pyprint(self): + """ + """ + self._jdstream.pyprint() + + def cache(self): + """ + """ + raise NotImplementedError + + def checkpoint(self): + """ + """ + raise NotImplementedError + + def compute(self, time): + """ + """ + raise NotImplementedError + + def context(self): + """ + """ + raise NotImplementedError + + def count(self): + """ + """ + raise NotImplementedError + + def countByValue(self, numPartitions=None): + """ + """ + raise NotImplementedError + + def countByValueAndWindow(self, duration, slideDuration=None): + """ + """ + raise NotImplementedError + + def countByWindow(self, duration, slideDuration=None): + """ + """ + raise NotImplementedError + + def dstream(self): + """ + """ + raise NotImplementedError + + def filter(self, f): + """ + """ + def func(iterator): return ifilter(f, iterator) + return self.mapPartitions(func) + + def flatMap(self, f, preservesPartitioning=False): + """ + """ + def func(s, iterator): return chain.from_iterable(imap(f, iterator)) + return self.mapPartitionsWithIndex(func, preservesPartitioning) + + def foreachRDD(self, f, time): + """ + """ + raise NotImplementedError + + def glom(self): + """ + """ + raise NotImplementedError + + def map(self, f, preservesPartitioning=False): + """ + """ + def func(split, iterator): return imap(f, iterator) + return PipelinedDStream(self, func, preservesPartitioning) + + def mapPartitions(self, f): + """ + """ + def func(s, iterator): return f(iterator) + return self.mapPartitionsWithIndex(func) + + def perist(self, storageLevel): + """ + """ + raise NotImplementedError + + def reduce(self, func, numPartitions=None): + """ + + """ + return self._combineByKey(lambda x:x, func, func, numPartitions) + + def _combineByKey(self, createCombiner, mergeValue, mergeCombiners, + numPartitions = None): + """ + """ + if numPartitions is None: + numPartitions = self.ctx._defaultParallelism() + def combineLocally(iterator): + combiners = {} + for x in iterator: + (k, v) = x + if k not in combiners: + combiners[k] = createCombiner(v) + else: + combiners[k] = mergeValue(combiners[k], v) + return combiners.iteritems() + locally_combined = self.mapPartitions(combineLocally) + shuffled = locally_combined.partitionBy(numPartitions) + def _mergeCombiners(iterator): + combiners = {} + for (k, v) in iterator: + if not k in combiners: + combiners[k] = v + else: + combiners[k] = mergeCombiners(combiners[k], v) + return combiners.iteritems() + return shuffled.mapPartitions(_mergeCombiners) + + + def partitionBy(self, numPartitions, partitionFunc=None): + """ + Return a copy of the DStream partitioned using the specified partitioner. + + """ + if numPartitions is None: + numPartitions = self.ctx._defaultReducePartitions() + + if partitionFunc is None: + partitionFunc = lambda x: 0 if x is None else hash(x) + # Transferring O(n) objects to Java is too expensive. Instead, we'll + # form the hash buckets in Python, transferring O(numPartitions) objects + # to Java. Each object is a (splitNumber, [objects]) pair. 
+ outputSerializer = self.ctx._unbatched_serializer + def add_shuffle_key(split, iterator): + + buckets = defaultdict(list) + + for (k, v) in iterator: + buckets[partitionFunc(k) % numPartitions].append((k, v)) + for (split, items) in buckets.iteritems(): + yield pack_long(split) + yield outputSerializer.dumps(items) + keyed = PipelinedDStream(self, add_shuffle_key) + keyed._bypass_serializer = True + with _JavaStackTrace(self.ctx) as st: + #JavaDStream + #pairRDD = self.ctx._jvm.PairwiseDStream(keyed._jdstream.dstream()).asJavaPairRDD() + pairDStream = self.ctx._jvm.PairwiseDStream(keyed._jdstream.dstream()).asJavaPairDStream() + partitioner = self.ctx._jvm.PythonPartitioner(numPartitions, + id(partitionFunc)) + jdstream = pairDStream.partitionBy(partitioner).values() + dstream = DStream(jdstream, self._ssc, BatchedSerializer(outputSerializer)) + # This is required so that id(partitionFunc) remains unique, even if + # partitionFunc is a lambda: + dstream._partitionFunc = partitionFunc + return dstream + + + + def reduceByWindow(self, reduceFunc, windowDuration, slideDuration, inReduceTunc): + """ + """ + + raise NotImplementedError + + def repartition(self, numPartitions): + """ + """ + raise NotImplementedError + + def slice(self, fromTime, toTime): + """ + """ + raise NotImplementedError + + def transform(self, transformFunc): + """ + """ + raise NotImplementedError + + def transformWith(self, other, transformFunc): + """ + """ + raise NotImplementedError + + def union(self, that): + """ + """ + raise NotImplementedError + + def window(self, windowDuration, slideDuration=None): + """ + """ + raise NotImplementedError + + def wrapRDD(self, rdd): + """ + """ + raise NotImplementedError + + def mapPartitionsWithIndex(self, f, preservesPartitioning=False): + return PipelinedDStream(self, f, preservesPartitioning) + + +class PipelinedDStream(DStream): + def __init__(self, prev, func, preservesPartitioning=False): + if not isinstance(prev, PipelinedDStream) or not prev._is_pipelinable(): + # This transformation is the first in its stage: + self.func = func + self.preservesPartitioning = preservesPartitioning + self._prev_jdstream = prev._jdstream + self._prev_jrdd_deserializer = prev._jrdd_deserializer + else: + prev_func = prev.func + def pipeline_func(split, iterator): + return func(split, prev_func(split, iterator)) + self.func = pipeline_func + self.preservesPartitioning = \ + prev.preservesPartitioning and preservesPartitioning + self._prev_jdstream = prev._prev_jdstream # maintain the pipeline + self._prev_jrdd_deserializer = prev._prev_jrdd_deserializer + self.is_cached = False + self.is_checkpointed = False + self._ssc = prev._ssc + self.ctx = prev.ctx + self.prev = prev + self._jdstream_val = None + self._jrdd_deserializer = self.ctx.serializer + self._bypass_serializer = False + + @property + def _jdstream(self): + if self._jdstream_val: + return self._jdstream_val + if self._bypass_serializer: + serializer = NoOpSerializer() + else: + serializer = self.ctx.serializer + + command = (self.func, self._prev_jrdd_deserializer, serializer) + pickled_command = CloudPickleSerializer().dumps(command) + broadcast_vars = ListConverter().convert( + [x._jbroadcast for x in self.ctx._pickled_broadcast_vars], + self.ctx._gateway._gateway_client) + self.ctx._pickled_broadcast_vars.clear() + class_tag = self._prev_jdstream.classTag() + env = MapConverter().convert(self.ctx.environment, + self.ctx._gateway._gateway_client) + includes = ListConverter().convert(self.ctx._python_includes, + 
self.ctx._gateway._gateway_client) + python_dstream = self.ctx._jvm.PythonDStream(self._prev_jdstream.dstream(), + bytearray(pickled_command), + env, includes, self.preservesPartitioning, + self.ctx.pythonExec, broadcast_vars, self.ctx._javaAccumulator, + class_tag) + self._jdstream_val = python_dstream.asJavaDStream() + return self._jdstream_val + + def _is_pipelinable(self): + return not (self.is_cached or self.is_checkpointed) diff --git a/python/pyspark/streaming/duration.py b/python/pyspark/streaming/duration.py new file mode 100644 index 0000000000000..ef1b4f6cef237 --- /dev/null +++ b/python/pyspark/streaming/duration.py @@ -0,0 +1,171 @@ +__author__ = 'ktakagiw' + +from pyspark.streaming import utils + +class Duration(object): + """ + Duration for Spark Streaming application. Used to set duration + + Most of the time, you would create a Duration object with + C{Duration()}, which will load values from C{spark.streaming.*} Java system + properties as well. In this case, any parameters you set directly on + the C{Duration} object take priority over system properties. + + """ + def __init__(self, millis, _jvm=None): + """ + Create new Duration. + + @param millis: milisecond + + """ + self._millis = millis + + from pyspark.context import SparkContext + SparkContext._ensure_initialized() + _jvm = _jvm or SparkContext._jvm + self._jduration = _jvm.Duration(millis) + + def toString(self): + """ Return duration as string """ + return str(self._millis) + " ms" + + def isZero(self): + """ Check if millis is zero """ + return self._millis == 0 + + def prettyPrint(self): + """ + Return a human-readable string representing a duration + """ + return utils.msDurationToString(self._millis) + + def milliseconds(self): + """ Return millisecond """ + return self._millis + + def toFormattedString(self): + """ Return millisecond """ + return str(self._millis) + + def max(self, other): + """ Return higher Duration """ + Duration._is_duration(other) + if self > other: + return self + else: + return other + + def min(self, other): + """ Return lower Durattion """ + Duration._is_duration(other) + if self < other: + return self + else: + return other + + def __str__(self): + return self.toString() + + def __add__(self, other): + """ Add Duration and Duration """ + Duration._is_duration(other) + return Duration(self._millis + other._millis) + + def __sub__(self, other): + """ Subtract Duration by Duration """ + Duration._is_duration(other) + return Duration(self._millis - other._millis) + + def __mul__(self, other): + """ Multiple Duration by Duration """ + Duration._is_duration(other) + return Duration(self._millis * other._millis) + + def __div__(self, other): + """ + Divide Duration by Duration + for Python 2.X + """ + Duration._is_duration(other) + return Duration(self._millis / other._millis) + + def __truediv__(self, other): + """ + Divide Duration by Duration + for Python 3.0 + """ + Duration._is_duration(other) + return Duration(self._millis / other._millis) + + def __floordiv__(self, other): + """ Divide Duration by Duration """ + Duration._is_duration(other) + return Duration(self._millis // other._millis) + + def __len__(self): + """ Length of miilisecond in Duration """ + return len(self._millis) + + def __lt__(self, other): + """ Duration < Duration """ + Duration._is_duration(other) + return self._millis < other._millis + + def __le__(self, other): + """ Duration <= Duration """ + Duration._is_duration(other) + return self.millis <= other._millis + + def __eq__(self, other): + """ Duration == 
Duration """ + Duration._is_duration(other) + return self._millis == other._millis + + def __ne__(self, other): + """ Duration != Duration """ + Duration._is_duration(other) + return self._millis != other._millis + + def __gt__(self, other): + """ Duration > Duration """ + Duration._is_duration(other) + return self._millis > other._millis + + def __ge__(self, other): + """ Duration >= Duration """ + Duration._is_duration(other) + return self._millis >= other._millis + + @classmethod + def _is_duration(self, instance): + """ is instance Duration """ + if not isinstance(instance, Duration): + raise TypeError("This should be Duration") + +def Milliseconds(milliseconds): + """ + Helper function that creates instance of [[pysparkstreaming.duration]] representing + a given number of milliseconds. + """ + return Duration(milliseconds) + +def Seconds(seconds): + """ + Helper function that creates instance of [[pysparkstreaming.duration]] representing + a given number of seconds. + """ + return Duration(seconds * 1000) + +def Minites(minites): + """ + Helper function that creates instance of [[pysparkstreaming.duration]] representing + a given number of minutes. + """ + return Duration(minutes * 60000) + +if __name__ == "__main__": + d = Duration(1) + print d + print d.milliseconds() + diff --git a/python/pyspark/streaming/jtime.py b/python/pyspark/streaming/jtime.py new file mode 100644 index 0000000000000..41670af659ea3 --- /dev/null +++ b/python/pyspark/streaming/jtime.py @@ -0,0 +1,116 @@ +__author__ = 'ktakagiw' + +from pyspark.streaming import utils +from pyspark.streaming.duration import Duration + +class Time(object): + """ + Time for Spark Streaming application. Used to set Time + + Most of the time, you would create a Duration object with + C{Time()}, which will load values from C{spark.streaming.*} Java system + properties as well. In this case, any parameters you set directly on + the C{Time} object take priority over system properties. + + """ + def __init__(self, millis, _jvm=None): + """ + Create new Time. 
+ + @param millis: milisecond + + @param _jvm: internal parameter used to pass a handle to the + Java VM; does not need to be set by users + + """ + self._millis = millis + + from pyspark.context import StreamingContext + StreamingContext._ensure_initialized() + _jvm = _jvm or StreamingContext._jvm + self._jtime = _jvm.Time(millis) + + def toString(self): + """ Return time as string """ + return str(self._millis) + " ms" + + def milliseconds(self): + """ Return millisecond """ + return self._millis + + def max(self, other): + """ Return higher Time """ + Time._is_time(other) + if self > other: + return self + else: + return other + + def min(self, other): + """ Return lower Time """ + Time._is_time(other) + if self < other: + return self + else: + return other + + def __add__(self, other): + """ Add Time and Time """ + Duration._is_duration(other) + return Time(self._millis + other._millis) + + def __sub__(self, other): + """ Subtract Time by Duration or Time """ + if isinstance(other, Duration): + return Time(self._millis - other._millis) + elif isinstance(other, Time): + return Duration(self._mills, other._millis) + else: + raise TypeError + + def __lt__(self, other): + """ Time < Time """ + Time._is_time(other) + return self._millis < other._millis + + def __le__(self, other): + """ Time <= Time """ + Time._is_time(other) + return self.millis <= other._millis + + def __eq__(self, other): + """ Time == Time """ + Time._is_time(other) + return self._millis == other._millis + + def __ne__(self, other): + """ Time != Time """ + Time._is_time(other) + return self._millis != other._millis + + def __gt__(self, other): + """ Time > Time """ + Time._is_time(other) + return self._millis > other._millis + + def __ge__(self, other): + """ Time >= Time """ + Time._is_time(other) + return self._millis >= other._millis + + def isMultipbleOf(duration): + """ is multiple by Duration """ + Duration._is_duration(duration) + return self._millis % duration._millis == 0 + + def until(time, interval): + raise NotImplementedError + + def to(time, interval): + raise NotImplementedError + + @classmethod + def _is_time(self, instance): + """ is instance Time """ + if not isinstance(instance, Time): + raise TypeError diff --git a/python/pyspark/streaming/pyprint.py b/python/pyspark/streaming/pyprint.py new file mode 100644 index 0000000000000..fcdaca510812c --- /dev/null +++ b/python/pyspark/streaming/pyprint.py @@ -0,0 +1,28 @@ +import sys +from itertools import chain +from pyspark.serializers import PickleSerializer, BatchedSerializer, UTF8Deserializer + +def collect(binary_file_path): + dse = PickleSerializer() + with open(binary_file_path, 'rb') as tempFile: + for item in dse.load_stream(tempFile): + yield item +def main(): + try: + binary_file_path = sys.argv[1] + except: + print "Missed FilePath in argement" + + if not binary_file_path: + return + + counter = 0 + for rdd in chain.from_iterable(collect(binary_file_path)): + print rdd + counter = counter + 1 + if counter >= 10: + print "..." 
+ break + +if __name__ =="__main__": + exit(main()) diff --git a/python/pyspark/streaming/utils.py b/python/pyspark/streaming/utils.py new file mode 100644 index 0000000000000..71aa3376c6578 --- /dev/null +++ b/python/pyspark/streaming/utils.py @@ -0,0 +1,18 @@ +__author__ = 'ktakagiw' + +def msDurationToString(ms): + """ + Returns a human-readable string representing a duration such as "35ms" + """ + second = 1000 + minute = 60 * second + hour = 60 * minute + + if ms < second: + return "%d ms" % ms + elif ms < minute: + return "%.1f s" % (float(ms) / second) + elif ms < hout: + return "%.1f m" % (float(ms) / minute) + else: + return "%.2f h" % (float(ms) / hour) diff --git a/streaming/pom.xml b/streaming/pom.xml index f506d6ce34a6f..88df63592efee 100644 --- a/streaming/pom.xml +++ b/streaming/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent - 1.1.0-SNAPSHOT + 1.0.0 ../pom.xml @@ -69,14 +69,14 @@ org.scalatest scalatest-maven-plugin - - diff --git a/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaDStreamLike.scala b/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaDStreamLike.scala index a6184de4e83c1..cfa336df8674f 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaDStreamLike.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaDStreamLike.scala @@ -54,6 +54,14 @@ trait JavaDStreamLike[T, This <: JavaDStreamLike[T, This, R], R <: JavaRDDLike[T dstream.print() } + /** + * Print the first ten elements of each PythonRDD generated in the PythonDStream. This is an output + * operator, so this PythonDStream will be registered as an output stream and there materialized. + * This function is for PythonAPI. + */ + + def pyprint() = dstream.pyprint() + /** * Return a new DStream in which each RDD has a single element generated by counting each RDD * of this DStream. diff --git a/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala new file mode 100644 index 0000000000000..2d8b1e468dc4c --- /dev/null +++ b/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala @@ -0,0 +1,152 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.streaming.api.python + +import java.util.{List => JList, ArrayList => JArrayList, Map => JMap, Collections} + +import org.apache.spark.api.java.{JavaSparkContext, JavaPairRDD, JavaRDD} +import org.apache.spark.broadcast.Broadcast +import org.apache.spark._ +import org.apache.spark.util.Utils +import java.io._ +import scala.Some +import org.apache.spark.streaming.Duration +import scala.util.control.Breaks._ +import org.apache.spark.broadcast.Broadcast +import scala.Some +import org.apache.spark.streaming.Duration +import org.apache.spark.rdd.RDD +import org.apache.spark.api.python.PythonRDD + + +import org.apache.spark.streaming.{Duration, Time} +import org.apache.spark.streaming.dstream._ +import org.apache.spark.streaming.api.java._ +import org.apache.spark.rdd.RDD +import org.apache.spark.api.python._ +import org.apache.spark.api.python.PairwiseRDD + + +import scala.reflect.ClassTag + + +class PythonDStream[T: ClassTag]( + parent: DStream[T], + command: Array[Byte], + envVars: JMap[String, String], + pythonIncludes: JList[String], + preservePartitoning: Boolean, + pythonExec: String, + broadcastVars: JList[Broadcast[Array[Byte]]], + accumulator: Accumulator[JList[Array[Byte]]] + ) extends DStream[Array[Byte]](parent.ssc) { + + override def dependencies = List(parent) + + override def slideDuration: Duration = parent.slideDuration + + //pythonDStream compute + override def compute(validTime: Time): Option[RDD[Array[Byte]]] = { + parent.getOrCompute(validTime) match{ + case Some(rdd) => + val pythonRDD = new PythonRDD(rdd, command, envVars, pythonIncludes, preservePartitoning, pythonExec, broadcastVars, accumulator) + Some(pythonRDD.asJavaRDD.rdd) + case None => None + } + } + val asJavaDStream = JavaDStream.fromDStream(this) + + /** + * Print the first ten elements of each PythonRDD generated in this PythonDStream. This is an output + * operator, so this PythonDStream will be registered as an output stream and there materialized. + * Since serialized Python object is readable by Python, pyprint writes out binary data to + * temporary file and run python script to deserialized and print the first ten elements + */ + private[streaming] def ppyprint() { + def foreachFunc = (rdd: RDD[Array[Byte]], time: Time) => { + val iter = rdd.take(11).iterator + + // make a temporary file + val prefix = "spark" + val suffix = ".tmp" + val tempFile = File.createTempFile(prefix, suffix) + val tempFileStream = new DataOutputStream(new FileOutputStream(tempFile.getAbsolutePath)) + //write out serialized python object + PythonRDD.writeIteratorToStream(iter, tempFileStream) + tempFileStream.close() + + // This value has to be passed from python + val pythonExec = new ProcessBuilder().environment().get("PYSPARK_PYTHON") + val sparkHome = new ProcessBuilder().environment().get("SPARK_HOME") + //val pb = new ProcessBuilder(Seq(pythonExec, sparkHome + "/python/pyspark/streaming/pyprint.py", tempFile.getAbsolutePath())) // why this fails to compile??? 
+ //absolute path to the python script is needed to change because we do not use pysparkstreaming + val pb = new ProcessBuilder(pythonExec, sparkHome + "/python/pysparkstreaming/streaming/pyprint.py", tempFile.getAbsolutePath) + val workerEnv = pb.environment() + + //envVars also need to be pass + //workerEnv.putAll(envVars) + val pythonPath = sparkHome + "/python/" + File.pathSeparator + workerEnv.get("PYTHONPATH") + workerEnv.put("PYTHONPATH", pythonPath) + val worker = pb.start() + val is = worker.getInputStream() + val isr = new InputStreamReader(is) + val br = new BufferedReader(isr) + + println ("-------------------------------------------") + println ("Time: " + time) + println ("-------------------------------------------") + + //print value from python std out + var line = "" + breakable { + while (true) { + line = br.readLine() + if (line == null) break() + println(line) + } + } + //delete temporary file + tempFile.delete() + println() + + } + new ForEachDStream(this, context.sparkContext.clean(foreachFunc)).register() + } +} + + +private class PairwiseDStream(prev:DStream[Array[Byte]]) extends +DStream[(Long, Array[Byte])](prev.ssc){ + override def dependencies = List(prev) + + override def slideDuration: Duration = prev.slideDuration + + override def compute(validTime:Time):Option[RDD[(Long, Array[Byte])]]={ + prev.getOrCompute(validTime) match{ + case Some(rdd)=>Some(rdd) + val pairwiseRDD = new PairwiseRDD(rdd) + Some(pairwiseRDD.asJavaPairRDD.rdd) + case None => None + } + } + val asJavaPairDStream : JavaPairDStream[Long, Array[Byte]] = JavaPairDStream(this) +} + + + + diff --git a/streaming/src/main/scala/org/apache/spark/streaming/dstream/DStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/dstream/DStream.scala index 4709a62381647..ffd7f88fd9dd1 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/dstream/DStream.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/dstream/DStream.scala @@ -18,11 +18,13 @@ package org.apache.spark.streaming.dstream -import java.io.{IOException, ObjectInputStream, ObjectOutputStream} +import java.io._ import scala.deprecated import scala.collection.mutable.HashMap import scala.reflect.ClassTag +import java.io.{IOException, ObjectInputStream, ObjectOutputStream} +import scala.util.control.Breaks._ import org.apache.spark.{Logging, SparkException} import org.apache.spark.rdd.{BlockRDD, RDD} @@ -31,6 +33,8 @@ import org.apache.spark.streaming._ import org.apache.spark.streaming.StreamingContext._ import org.apache.spark.streaming.scheduler.Job import org.apache.spark.util.MetadataCleaner +import org.apache.spark.streaming.Duration +import org.apache.spark.api.python.PythonRDD /** * A Discretized Stream (DStream), the basic abstraction in Spark Streaming, is a continuous @@ -601,6 +605,68 @@ abstract class DStream[T: ClassTag] ( new ForEachDStream(this, context.sparkContext.clean(foreachFunc)).register() } + + + + + /** + * Print the first ten elements of each PythonRDD generated in this PythonDStream. This is an output + * operator, so this PythonDStream will be registered as an output stream and there materialized. 
+ * Since serialized Python object is readable by Python, pyprint writes out binary data to + * temporary file and run python script to deserialized and print the first ten elements + */ + private[streaming] def pyprint() { + def foreachFunc = (rdd: RDD[T], time: Time) => { + val iter = rdd.take(11).iterator + + // make a temporary file + val prefix = "spark" + val suffix = ".tmp" + val tempFile = File.createTempFile(prefix, suffix) + val tempFileStream = new DataOutputStream(new FileOutputStream(tempFile.getAbsolutePath)) + //write out serialized python object + PythonRDD.writeIteratorToStream(iter, tempFileStream) + tempFileStream.close() + + // This value has to be passed from python + val pythonExec = new ProcessBuilder().environment().get("PYSPARK_PYTHON") + val sparkHome = new ProcessBuilder().environment().get("SPARK_HOME") + //val pb = new ProcessBuilder(Seq(pythonExec, sparkHome + "/python/pyspark/streaming/pyprint.py", tempFile.getAbsolutePath())) // why this fails to compile??? + //absolute path to the python script is needed to change because we do not use pysparkstreaming + val pb = new ProcessBuilder(pythonExec, sparkHome + "/python/pyspark/streaming/pyprint.py", tempFile.getAbsolutePath) + val workerEnv = pb.environment() + + //envVars also need to be pass + //workerEnv.putAll(envVars) + val pythonPath = sparkHome + "/python/" + File.pathSeparator + workerEnv.get("PYTHONPATH") + workerEnv.put("PYTHONPATH", pythonPath) + val worker = pb.start() + val is = worker.getInputStream() + val isr = new InputStreamReader(is) + val br = new BufferedReader(isr) + + println ("-------------------------------------------") + println ("Time: " + time) + println ("-------------------------------------------") + + //print value from python std out + var line = "" + breakable { + while (true) { + line = br.readLine() + if (line == null) break() + println(line) + } + } + //delete temporary file + tempFile.delete() + println() + + } + new ForEachDStream(this, context.sparkContext.clean(foreachFunc)).register() + } + + /** * Return a new DStream in which each RDD contains all the elements in seen in a * sliding window of time over this DStream. 
The new DStream generates RDDs with From 1367be52f80ee55a1b0cb1070b8fb02cf258c0be Mon Sep 17 00:00:00 2001 From: Ken Takagiwa Date: Tue, 15 Jul 2014 15:41:52 -0700 Subject: [PATCH 002/347] comment PythonDStream.PairwiseDStream --- .../apache/spark/streaming/api/python/PythonDStream.scala | 3 ++- .../scala/org/apache/spark/streaming/dstream/DStream.scala | 6 ++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala index 2d8b1e468dc4c..fe67250604d8e 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala @@ -129,7 +129,7 @@ class PythonDStream[T: ClassTag]( } } - +/* private class PairwiseDStream(prev:DStream[Array[Byte]]) extends DStream[(Long, Array[Byte])](prev.ssc){ override def dependencies = List(prev) @@ -146,6 +146,7 @@ DStream[(Long, Array[Byte])](prev.ssc){ } val asJavaPairDStream : JavaPairDStream[Long, Array[Byte]] = JavaPairDStream(this) } +*/ diff --git a/streaming/src/main/scala/org/apache/spark/streaming/dstream/DStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/dstream/DStream.scala index b24109074e816..d9d5446b62e9f 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/dstream/DStream.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/dstream/DStream.scala @@ -620,10 +620,7 @@ abstract class DStream[T: ClassTag] ( new ForEachDStream(this, context.sparkContext.clean(foreachFunc)).register() } - - - - +//TODO move pyprint to PythonDStream /** * Print the first ten elements of each PythonRDD generated in this PythonDStream. This is an output * operator, so this PythonDStream will be registered as an output stream and there materialized. @@ -644,6 +641,7 @@ abstract class DStream[T: ClassTag] ( tempFileStream.close() // This value has to be passed from python + // Python currently does not do cluster deployment. But what happened val pythonExec = new ProcessBuilder().environment().get("PYSPARK_PYTHON") val sparkHome = new ProcessBuilder().environment().get("SPARK_HOME") //val pb = new ProcessBuilder(Seq(pythonExec, sparkHome + "/python/pyspark/streaming/pyprint.py", tempFile.getAbsolutePath())) // why this fails to compile??? From 88068cf8439991b17c244d65af3192b49968583f Mon Sep 17 00:00:00 2001 From: Ken Takagiwa Date: Tue, 15 Jul 2014 17:19:20 -0700 Subject: [PATCH 003/347] modify dstream.py to fix indent error --- python/pyspark/streaming/dstream.py | 2 +- .../org/apache/spark/streaming/api/python/PythonDStream.scala | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/python/pyspark/streaming/dstream.py b/python/pyspark/streaming/dstream.py index b422b147d11e1..a512517f6e437 100644 --- a/python/pyspark/streaming/dstream.py +++ b/python/pyspark/streaming/dstream.py @@ -172,7 +172,7 @@ def _mergeCombiners(iterator): return shuffled.mapPartitions(_mergeCombiners) - def partitionBy(self, numPartitions, partitionFunc=None): + def partitionBy(self, numPartitions, partitionFunc=None): """ Return a copy of the DStream partitioned using the specified partitioner. 
diff --git a/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala index fe67250604d8e..389136f9e21a0 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala @@ -91,7 +91,7 @@ class PythonDStream[T: ClassTag]( tempFileStream.close() // This value has to be passed from python - val pythonExec = new ProcessBuilder().environment().get("PYSPARK_PYTHON") + //val pythonExec = new ProcessBuilder().environment().get("PYSPARK_PYTHON") val sparkHome = new ProcessBuilder().environment().get("SPARK_HOME") //val pb = new ProcessBuilder(Seq(pythonExec, sparkHome + "/python/pyspark/streaming/pyprint.py", tempFile.getAbsolutePath())) // why this fails to compile??? //absolute path to the python script is needed to change because we do not use pysparkstreaming From 94a07879007d6e6157b7f5b59a04284996f5623f Mon Sep 17 00:00:00 2001 From: Ken Takagiwa Date: Tue, 15 Jul 2014 21:08:43 -0700 Subject: [PATCH 004/347] added reducedByKey not working yet --- .../src/main/python/streaming/wordcount.py | 10 ++++++- python/pyspark/streaming/dstream.py | 27 +++++++++++++++++-- .../streaming/api/python/PythonDStream.scala | 6 ++--- 3 files changed, 37 insertions(+), 6 deletions(-) diff --git a/examples/src/main/python/streaming/wordcount.py b/examples/src/main/python/streaming/wordcount.py index f44cd696894ba..3996991109d60 100644 --- a/examples/src/main/python/streaming/wordcount.py +++ b/examples/src/main/python/streaming/wordcount.py @@ -1,6 +1,7 @@ import sys from operator import add +from pyspark.conf import SparkConf from pyspark.streaming.context import StreamingContext from pyspark.streaming.duration import * @@ -8,15 +9,22 @@ if len(sys.argv) != 2: print >> sys.stderr, "Usage: wordcount " exit(-1) - ssc = StreamingContext(appName="PythonStreamingWordCount", duration=Seconds(1)) + conf = SparkConf() + conf.setAppName("PythonStreamingWordCount") + conf.set("spark.default.parallelism", 1) + +# ssc = StreamingContext(appName="PythonStreamingWordCount", duration=Seconds(1)) + ssc = StreamingContext(conf=conf, duration=Seconds(1)) lines = ssc.textFileStream(sys.argv[1]) fm_lines = lines.flatMap(lambda x: x.split(" ")) filtered_lines = fm_lines.filter(lambda line: "Spark" in line) mapped_lines = fm_lines.map(lambda x: (x, 1)) + reduced_lines = mapped_lines.reduce(add) fm_lines.pyprint() filtered_lines.pyprint() mapped_lines.pyprint() + reduced_lines.pyprint() ssc.start() ssc.awaitTermination() diff --git a/python/pyspark/streaming/dstream.py b/python/pyspark/streaming/dstream.py index a512517f6e437..e144f8bc1cc09 100644 --- a/python/pyspark/streaming/dstream.py +++ b/python/pyspark/streaming/dstream.py @@ -29,6 +29,7 @@ __all__ = ["DStream"] + class DStream(object): def __init__(self, jdstream, ssc, jrdd_deserializer): self._jdstream = jdstream @@ -149,7 +150,7 @@ def _combineByKey(self, createCombiner, mergeValue, mergeCombiners, """ """ if numPartitions is None: - numPartitions = self.ctx._defaultParallelism() + numPartitions = self._defaultReducePartitions() def combineLocally(iterator): combiners = {} for x in iterator: @@ -211,7 +212,6 @@ def add_shuffle_key(split, iterator): return dstream - def reduceByWindow(self, reduceFunc, windowDuration, slideDuration, inReduceTunc): """ """ @@ -254,8 +254,31 @@ def wrapRDD(self, rdd): raise NotImplementedError def 
mapPartitionsWithIndex(self, f, preservesPartitioning=False): + """ + + """ return PipelinedDStream(self, f, preservesPartitioning) + def _defaultReducePartitions(self): + """ + + """ + # hard code to avoid the error + return 2 + if self.ctx._conf.contains("spark.default.parallelism"): + return self.ctx.defaultParallelism + else: + return self.getNumPartitions() + + def getNumPartitions(self): + """ + Returns the number of partitions in RDD + >>> rdd = sc.parallelize([1, 2, 3, 4], 2) + >>> rdd.getNumPartitions() + 2 + """ + return self._jdstream.partitions().size() + class PipelinedDStream(DStream): def __init__(self, prev, func, preservesPartitioning=False): diff --git a/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala index 389136f9e21a0..719dd0a6a53c2 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala @@ -129,7 +129,7 @@ class PythonDStream[T: ClassTag]( } } -/* + private class PairwiseDStream(prev:DStream[Array[Byte]]) extends DStream[(Long, Array[Byte])](prev.ssc){ override def dependencies = List(prev) @@ -144,9 +144,9 @@ DStream[(Long, Array[Byte])](prev.ssc){ case None => None } } - val asJavaPairDStream : JavaPairDStream[Long, Array[Byte]] = JavaPairDStream(this) + val asJavaPairDStream : JavaPairDStream[Long, Array[Byte]] = JavaPairDStream.fromJavaDStream(this) } -*/ + From 69e9cd33a58b880f96cc9c3e5e62eaa415c49843 Mon Sep 17 00:00:00 2001 From: Ken Takagiwa Date: Wed, 16 Jul 2014 11:07:42 -0700 Subject: [PATCH 005/347] implementing transform function in Python --- python/pyspark/mllib/_common.py | 2 +- python/pyspark/streaming/dstream.py | 3 +- .../api/python/PythonTransformedDStream.scala | 37 +++++++++++++++++++ .../spark/streaming/dstream/DStream.scala | 3 ++ 4 files changed, 42 insertions(+), 3 deletions(-) create mode 100644 streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonTransformedDStream.scala diff --git a/python/pyspark/mllib/_common.py b/python/pyspark/mllib/_common.py index e609b60a0f968..4b723693f43e3 100644 --- a/python/pyspark/mllib/_common.py +++ b/python/pyspark/mllib/_common.py @@ -164,7 +164,7 @@ def _deserialize_double_vector(ba, offset=0): nb = len(ba) - offset if nb < 5: raise TypeError("_deserialize_double_vector called on a %d-byte array, " - "which is too short" % nb) + "which is too short" % nb) if ba[offset] == DENSE_VECTOR_MAGIC: return _deserialize_dense_vector(ba, offset) elif ba[offset] == SPARSE_VECTOR_MAGIC: diff --git a/python/pyspark/streaming/dstream.py b/python/pyspark/streaming/dstream.py index e144f8bc1cc09..3365c6d69c1a2 100644 --- a/python/pyspark/streaming/dstream.py +++ b/python/pyspark/streaming/dstream.py @@ -172,7 +172,6 @@ def _mergeCombiners(iterator): return combiners.iteritems() return shuffled.mapPartitions(_mergeCombiners) - def partitionBy(self, numPartitions, partitionFunc=None): """ Return a copy of the DStream partitioned using the specified partitioner. 
@@ -231,6 +230,7 @@ def slice(self, fromTime, toTime): def transform(self, transformFunc): """ """ + self._jdstream.transform(transformFunc) raise NotImplementedError def transformWith(self, other, transformFunc): @@ -264,7 +264,6 @@ def _defaultReducePartitions(self): """ # hard code to avoid the error - return 2 if self.ctx._conf.contains("spark.default.parallelism"): return self.ctx.defaultParallelism else: diff --git a/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonTransformedDStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonTransformedDStream.scala new file mode 100644 index 0000000000000..ff70483b771a4 --- /dev/null +++ b/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonTransformedDStream.scala @@ -0,0 +1,37 @@ +package org.apache.spark.streaming.api.python + +import org.apache.spark.Accumulator +import org.apache.spark.api.python.PythonRDD +import org.apache.spark.broadcast.Broadcast +import org.apache.spark.rdd.RDD +import org.apache.spark.streaming.api.java.JavaDStream +import org.apache.spark.streaming.{Time, Duration} +import org.apache.spark.streaming.dstream.DStream + +import scala.reflect.ClassTag + +/** + * Created by ken on 7/15/14. + */ +class PythonTransformedDStream[T: ClassTag]( + parents: Seq[DStream[T]], + command: Array[Byte], + envVars: JMap[String, String], + pythonIncludes: JList[String], + preservePartitoning: Boolean, + pythonExec: String, + broadcastVars: JList[Broadcast[Array[Byte]]], + accumulator: Accumulator[JList[Array[Byte]]] + ) extends DStream[Array[Byte]](parent.ssc) { + + override def dependencies = List(parent) + + override def slideDuration: Duration = parent.slideDuration + + //pythonDStream compute + override def compute(validTime: Time): Option[RDD[Array[Byte]]] = { + val parentRDDs = parents.map(_.getOrCompute(validTime).orNull).toSeq + Some() + } + val asJavaDStream = JavaDStream.fromDStream(this) +} diff --git a/streaming/src/main/scala/org/apache/spark/streaming/dstream/DStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/dstream/DStream.scala index d9d5446b62e9f..67977244ef420 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/dstream/DStream.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/dstream/DStream.scala @@ -561,9 +561,12 @@ abstract class DStream[T: ClassTag] ( // because the DStream is reachable from the outer object here, and because // DStreams can't be serialized with closures, we can't proactively check // it for serializability and so we pass the optional false to SparkContext.clean + + // serialized python val cleanedF = context.sparkContext.clean(transformFunc, false) val realTransformFunc = (rdds: Seq[RDD[_]], time: Time) => { assert(rdds.length == 1) + // if transformfunc is fine, it is okay cleanedF(rdds.head.asInstanceOf[RDD[T]], time) } new TransformedDStream[U](Seq(this), realTransformFunc) From 72bfc66074b2f35224f116759e0a47204a138f24 Mon Sep 17 00:00:00 2001 From: Ken Takagiwa Date: Wed, 16 Jul 2014 11:12:53 -0700 Subject: [PATCH 006/347] modified the code base on comment in https://github.com/tdas/spark/pull/10 --- core/pom.xml | 2 +- python/pyspark/streaming/__init__.py | 1 - python/pyspark/streaming/context.py | 5 +---- 3 files changed, 2 insertions(+), 6 deletions(-) diff --git a/core/pom.xml b/core/pom.xml index a59fc9fc035d7..6abf8480d5da0 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent - 1.0.0 + 1.1.0-SNAPSHOT ../pom.xml diff --git 
a/python/pyspark/streaming/__init__.py b/python/pyspark/streaming/__init__.py index 719592912e80c..e69de29bb2d1d 100644 --- a/python/pyspark/streaming/__init__.py +++ b/python/pyspark/streaming/__init__.py @@ -1 +0,0 @@ -__author__ = 'ktakagiw' diff --git a/python/pyspark/streaming/context.py b/python/pyspark/streaming/context.py index c8ae9c4af85c9..40e9d98942e2e 100644 --- a/python/pyspark/streaming/context.py +++ b/python/pyspark/streaming/context.py @@ -1,6 +1,3 @@ -__author__ = 'ktakagiw' - - # # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with @@ -41,7 +38,7 @@ class StreamingContext(object): """ - Main entry point for Spark functionality. A StreamingContext represents the + Main entry point for Spark Streaming functionality. A StreamingContext represents the connection to a Spark cluster, and can be used to create L{RDD}s and broadcast variables on that cluster. """ From a7a0b5ce72e9bad14880f2285544d11d725f0f14 Mon Sep 17 00:00:00 2001 From: Ken Takagiwa Date: Wed, 16 Jul 2014 11:17:02 -0700 Subject: [PATCH 007/347] add coment for hack why PYSPARK_PYTHON is needed in spark-submit --- bin/spark-submit | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/bin/spark-submit b/bin/spark-submit index ac275b7696d5c..fa022f707e572 100755 --- a/bin/spark-submit +++ b/bin/spark-submit @@ -37,6 +37,16 @@ done DEPLOY_MODE=${DEPLOY_MODE:-"client"} + +# This is a hack to make DStream.pyprint work. +# This will be removed after pyprint is moved to PythonDStream. +# Problem is that print function is in (Scala)DStream. +# Whenever python code is executed, we call PythonDStream which passes +# pythonExec(which python Spark should execute). +# Since pyprint is located in DStream, Spark does not know which python should use. +# In that case, get python path from PYSPARK_PYTHON, environmental variable. +# This fix is ongoing in print branch in my repo. + # Figure out which Python executable to use if [[ -z "$PYSPARK_PYTHON" ]]; then PYSPARK_PYTHON="python" From 0a516f5a31bfb5f5d3ac58139af820ad8bb50a5a Mon Sep 17 00:00:00 2001 From: Ken Takagiwa Date: Wed, 16 Jul 2014 11:19:13 -0700 Subject: [PATCH 008/347] add coment for hack why PYSPARK_PYTHON is needed in spark-submit --- bin/spark-submit | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/spark-submit b/bin/spark-submit index fa022f707e572..ec4e10787cff0 100755 --- a/bin/spark-submit +++ b/bin/spark-submit @@ -45,7 +45,7 @@ DEPLOY_MODE=${DEPLOY_MODE:-"client"} # pythonExec(which python Spark should execute). # Since pyprint is located in DStream, Spark does not know which python should use. # In that case, get python path from PYSPARK_PYTHON, environmental variable. -# This fix is ongoing in print branch in my repo. +# This fix is ongoing in print branch in https://github.com/giwa/spark/tree/print. 
# Figure out which Python executable to use if [[ -z "$PYSPARK_PYTHON" ]]; then From 57e3e52191464f6b8f8ec53a6452dcf86d4704a6 Mon Sep 17 00:00:00 2001 From: Ken Takagiwa Date: Wed, 16 Jul 2014 11:24:08 -0700 Subject: [PATCH 009/347] remove not implemented DStream functions in python --- python/pyspark/streaming/dstream.py | 102 ---------------------------- 1 file changed, 102 deletions(-) diff --git a/python/pyspark/streaming/dstream.py b/python/pyspark/streaming/dstream.py index a512517f6e437..6ab9c500450aa 100644 --- a/python/pyspark/streaming/dstream.py +++ b/python/pyspark/streaming/dstream.py @@ -54,50 +54,6 @@ def pyprint(self): """ self._jdstream.pyprint() - def cache(self): - """ - """ - raise NotImplementedError - - def checkpoint(self): - """ - """ - raise NotImplementedError - - def compute(self, time): - """ - """ - raise NotImplementedError - - def context(self): - """ - """ - raise NotImplementedError - - def count(self): - """ - """ - raise NotImplementedError - - def countByValue(self, numPartitions=None): - """ - """ - raise NotImplementedError - - def countByValueAndWindow(self, duration, slideDuration=None): - """ - """ - raise NotImplementedError - - def countByWindow(self, duration, slideDuration=None): - """ - """ - raise NotImplementedError - - def dstream(self): - """ - """ - raise NotImplementedError def filter(self, f): """ @@ -111,16 +67,6 @@ def flatMap(self, f, preservesPartitioning=False): def func(s, iterator): return chain.from_iterable(imap(f, iterator)) return self.mapPartitionsWithIndex(func, preservesPartitioning) - def foreachRDD(self, f, time): - """ - """ - raise NotImplementedError - - def glom(self): - """ - """ - raise NotImplementedError - def map(self, f, preservesPartitioning=False): """ """ @@ -133,11 +79,6 @@ def mapPartitions(self, f): def func(s, iterator): return f(iterator) return self.mapPartitionsWithIndex(func) - def perist(self, storageLevel): - """ - """ - raise NotImplementedError - def reduce(self, func, numPartitions=None): """ @@ -210,49 +151,6 @@ def add_shuffle_key(split, iterator): dstream._partitionFunc = partitionFunc return dstream - - - def reduceByWindow(self, reduceFunc, windowDuration, slideDuration, inReduceTunc): - """ - """ - - raise NotImplementedError - - def repartition(self, numPartitions): - """ - """ - raise NotImplementedError - - def slice(self, fromTime, toTime): - """ - """ - raise NotImplementedError - - def transform(self, transformFunc): - """ - """ - raise NotImplementedError - - def transformWith(self, other, transformFunc): - """ - """ - raise NotImplementedError - - def union(self, that): - """ - """ - raise NotImplementedError - - def window(self, windowDuration, slideDuration=None): - """ - """ - raise NotImplementedError - - def wrapRDD(self, rdd): - """ - """ - raise NotImplementedError - def mapPartitionsWithIndex(self, f, preservesPartitioning=False): return PipelinedDStream(self, f, preservesPartitioning) From c9d79dd381ee001eb5920ca865b5dc72f8b46a7f Mon Sep 17 00:00:00 2001 From: Ken Takagiwa Date: Wed, 16 Jul 2014 11:35:59 -0700 Subject: [PATCH 010/347] revert pom.xml --- python/pyspark/streaming/pyprint.py | 2 +- streaming/pom.xml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/python/pyspark/streaming/pyprint.py b/python/pyspark/streaming/pyprint.py index fcdaca510812c..6e87c985a57e3 100644 --- a/python/pyspark/streaming/pyprint.py +++ b/python/pyspark/streaming/pyprint.py @@ -1,6 +1,6 @@ import sys from itertools import chain -from pyspark.serializers import 
PickleSerializer, BatchedSerializer, UTF8Deserializer +from pyspark.serializers import PickleSerializer def collect(binary_file_path): dse = PickleSerializer() diff --git a/streaming/pom.xml b/streaming/pom.xml index 88df63592efee..2239ad9c8579c 100644 --- a/streaming/pom.xml +++ b/streaming/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent - 1.0.0 + 1.1.0-SNAPSHOT ../pom.xml From 8f8202b5c9bfccfb42f7027e7e8079b4b5807f02 Mon Sep 17 00:00:00 2001 From: Ken Takagiwa Date: Wed, 16 Jul 2014 11:38:26 -0700 Subject: [PATCH 011/347] revert streaming pom.xml --- streaming/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/streaming/pom.xml b/streaming/pom.xml index 2239ad9c8579c..03102c5e836bf 100644 --- a/streaming/pom.xml +++ b/streaming/pom.xml @@ -76,7 +76,7 @@ are necessary - first one for 'mvn package', second one for 'mvn compile'. Ideally, 'mvn compile' should not compile test classes and therefore should not need this. However, an open Maven bug (http://jira.codehaus.org/browse/MNG-3559) - causes the compilation to fail if streaming test-jar is not generated. Hence, the + causes the compilation to fail if streaming test-jar is not generated. Hence, the second execution profile for 'mvn compile'. --> From fa4a7fc1b0643bfbe48b24e3897d65bce3332e64 Mon Sep 17 00:00:00 2001 From: Ken Takagiwa Date: Wed, 16 Jul 2014 11:44:14 -0700 Subject: [PATCH 012/347] revert streaming/pom.xml --- streaming/pom.xml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/streaming/pom.xml b/streaming/pom.xml index 03102c5e836bf..f506d6ce34a6f 100644 --- a/streaming/pom.xml +++ b/streaming/pom.xml @@ -69,12 +69,12 @@ org.scalatest scalatest-maven-plugin - - Time + // |_____________________________| + // + // |________ _________| |________ _________| + // | | + // V V + // old RDDs new RDDs + // + + + // Get the RDD of the reduced value of the previous window + val previousWindowRDD = + getOrCompute(previousWindow.endTime) + + if (windowDuration > slideDuration * 5 && previousWindowRDD.isDefined) { + // subtle the values from old RDDs + val oldRDDs = + parent.slice(previousWindow.beginTime, currentWindow.beginTime - parent.slideDuration) + val subbed = if (oldRDDs.size > 0) { + invReduceFunc.call(JavaRDD.fromRDD(previousWindowRDD.get), + JavaRDD.fromRDD(ssc.sc.union(oldRDDs)), validTime.milliseconds).rdd + } else { + previousWindowRDD.get + } + + // add the RDDs of the reduced values in "new time steps" + val newRDDs = + parent.slice(previousWindow.endTime, currentWindow.endTime - parent.slideDuration) + + if (newRDDs.size > 0) { + Some(reduceFunc.call(JavaRDD.fromRDD(ssc.sc.union(newRDDs).union(subbed)), validTime.milliseconds)) + } else { + Some(subbed) + } + } else { + // Get the RDDs of the reduced values in current window + val currentRDDs = + parent.slice(currentWindow.beginTime, currentWindow.endTime - parent.slideDuration) + if (currentRDDs.size > 0) { + Some(reduceFunc.call(JavaRDD.fromRDD(ssc.sc.union(currentRDDs)), validTime.milliseconds)) + } else { + None + } + } + } + + val asJavaDStream = JavaDStream.fromDStream(this) +} + + /** * This is used for foreachRDD() in Python */ From c28f520ec2e77c6a5f7139b5131182024eddd1be Mon Sep 17 00:00:00 2001 From: Davies Liu Date: Fri, 26 Sep 2014 13:56:50 -0700 Subject: [PATCH 304/347] support updateStateByKey --- python/pyspark/streaming/dstream.py | 30 +++++++++---- python/pyspark/streaming/tests.py | 19 ++++++++ python/pyspark/streaming/util.py | 11 ++--- .../streaming/api/python/PythonDStream.scala | 44 
++++++++++++++++--- 4 files changed, 83 insertions(+), 21 deletions(-) diff --git a/python/pyspark/streaming/dstream.py b/python/pyspark/streaming/dstream.py index 38bb54f25eaa2..27e1400b8ba0b 100644 --- a/python/pyspark/streaming/dstream.py +++ b/python/pyspark/streaming/dstream.py @@ -366,8 +366,9 @@ def reduceByKeyAndWindow(self, func, invFunc, windowDuration, slideDuration, numPartitions=None): reduced = self.reduceByKey(func) - def reduceFunc(a, t): - return a.reduceByKey(func, numPartitions) + def reduceFunc(a, b, t): + b = b.reduceByKey(func, numPartitions) + return a.union(b).reduceByKey(func, numPartitions) if a else b def invReduceFunc(a, b, t): b = b.reduceByKey(func, numPartitions) @@ -378,19 +379,30 @@ def invReduceFunc(a, b, t): windowDuration = Seconds(windowDuration) if not isinstance(slideDuration, Duration): slideDuration = Seconds(slideDuration) - serializer = reduced._jrdd_deserializer - jreduceFunc = RDDFunction(self.ctx, reduceFunc, reduced._jrdd_deserializer) + jreduceFunc = RDDFunction2(self.ctx, reduceFunc, reduced._jrdd_deserializer) jinvReduceFunc = RDDFunction2(self.ctx, invReduceFunc, reduced._jrdd_deserializer) dstream = self.ctx._jvm.PythonReducedWindowedDStream(reduced._jdstream.dstream(), jreduceFunc, jinvReduceFunc, windowDuration._jduration, slideDuration._jduration) - return DStream(dstream.asJavaDStream(), self._ssc, serializer) + return DStream(dstream.asJavaDStream(), self._ssc, self.ctx.serializer) + + def updateStateByKey(self, updateFunc, numPartitions=None): + """ + :param updateFunc: [(k, vs, s)] -> [(k, s)] + """ + def reduceFunc(a, b, t): + if a is None: + g = b.groupByKey(numPartitions).map(lambda (k, vs): (k, list(vs), None)) + else: + g = a.cogroup(b).map(lambda (k, (va, vb)): + (k, list(vb), list(va)[0] if len(va) else None)) + return g.mapPartitions(lambda x: updateFunc(x) or []) - def updateStateByKey(self, updateFunc): - # FIXME: convert updateFunc to java JFunction2 - jFunc = updateFunc - return self._jdstream.updateStateByKey(jFunc) + jreduceFunc = RDDFunction2(self.ctx, reduceFunc, + self.ctx.serializer, self._jrdd_deserializer) + dstream = self.ctx._jvm.PythonStateDStream(self._jdstream.dstream(), jreduceFunc) + return DStream(dstream.asJavaDStream(), self._ssc, self.ctx.serializer) class TransformedDStream(DStream): diff --git a/python/pyspark/streaming/tests.py b/python/pyspark/streaming/tests.py index aa20b7efbee46..755ea224e56da 100644 --- a/python/pyspark/streaming/tests.py +++ b/python/pyspark/streaming/tests.py @@ -294,6 +294,25 @@ def func(dstream): [('a', [2, 3, 4])], [('a', [3, 4])], [('a', [4])]] self._test_func(input, func, expected) + def update_state_by_key(self): + + def updater(it): + for k, vs, s in it: + if not s: + s = vs + else: + s.extend(vs) + yield (k, s) + + input = [[('k', i)] for i in range(5)] + + def func(dstream): + return dstream.updateStateByKey(updater) + + expected = [[0], [0, 1], [0, 1, 2], [0, 1, 2, 3], [0, 1, 2, 3, 4]] + expected = [[('k', v)] for v in expected] + self._test_func(input, func, expected) + class TestStreamingContext(unittest.TestCase): def setUp(self): diff --git a/python/pyspark/streaming/util.py b/python/pyspark/streaming/util.py index 4051732f25302..fdbd01ec1766d 100644 --- a/python/pyspark/streaming/util.py +++ b/python/pyspark/streaming/util.py @@ -50,15 +50,16 @@ class RDDFunction2(object): This class is for py4j callback. This class is related with org.apache.spark.streaming.api.python.PythonRDDFunction2. 
""" - def __init__(self, ctx, func, jrdd_deserializer): + def __init__(self, ctx, func, jrdd_deserializer, jrdd_deserializer2=None): self.ctx = ctx self.func = func - self.deserializer = jrdd_deserializer + self.jrdd_deserializer = jrdd_deserializer + self.jrdd_deserializer2 = jrdd_deserializer2 or jrdd_deserializer def call(self, jrdd, jrdd2, milliseconds): try: - rdd = RDD(jrdd, self.ctx, self.deserializer) if jrdd else None - other = RDD(jrdd2, self.ctx, self.deserializer) if jrdd2 else None + rdd = RDD(jrdd, self.ctx, self.jrdd_deserializer) if jrdd else None + other = RDD(jrdd2, self.ctx, self.jrdd_deserializer2) if jrdd2 else None r = self.func(rdd, other, milliseconds) if r: return r._jrdd @@ -67,7 +68,7 @@ def call(self, jrdd, jrdd2, milliseconds): traceback.print_exc() def __repr__(self): - return "RDDFunction(%s, %s)" % (str(self.deserializer), str(self.func)) + return "RDDFunction2(%s)" % (str(self.func)) class Java: implements = ['org.apache.spark.streaming.api.python.PythonRDDFunction2'] diff --git a/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala index 689c04fa49135..b904e273eb438 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala @@ -118,7 +118,7 @@ private[spark] class PythonTransformed2DStream (parent: DStream[_], parent2: DSt private[spark] class PythonReducedWindowedDStream( parent: DStream[Array[Byte]], - reduceFunc: PythonRDDFunction, + reduceFunc: PythonRDDFunction2, invReduceFunc: PythonRDDFunction2, _windowDuration: Duration, _slideDuration: Duration @@ -149,10 +149,6 @@ class PythonReducedWindowedDStream( override def parentRememberDuration: Duration = rememberDuration + windowDuration override def compute(validTime: Time): Option[RDD[Array[Byte]]] = { - None - val reduceF = reduceFunc - val invReduceF = invReduceFunc - val currentTime = validTime val currentWindow = new Interval(currentTime - windowDuration + parent.slideDuration, currentTime) @@ -196,7 +192,7 @@ class PythonReducedWindowedDStream( parent.slice(previousWindow.endTime, currentWindow.endTime - parent.slideDuration) if (newRDDs.size > 0) { - Some(reduceFunc.call(JavaRDD.fromRDD(ssc.sc.union(newRDDs).union(subbed)), validTime.milliseconds)) + Some(reduceFunc.call(JavaRDD.fromRDD(subbed), JavaRDD.fromRDD(ssc.sc.union(newRDDs)), validTime.milliseconds)) } else { Some(subbed) } @@ -205,7 +201,7 @@ class PythonReducedWindowedDStream( val currentRDDs = parent.slice(currentWindow.beginTime, currentWindow.endTime - parent.slideDuration) if (currentRDDs.size > 0) { - Some(reduceFunc.call(JavaRDD.fromRDD(ssc.sc.union(currentRDDs)), validTime.milliseconds)) + Some(reduceFunc.call(null, JavaRDD.fromRDD(ssc.sc.union(currentRDDs)), validTime.milliseconds)) } else { None } @@ -216,6 +212,40 @@ class PythonReducedWindowedDStream( } +/** + * Copied from ReducedWindowedDStream + */ +private[spark] +class PythonStateDStream( + parent: DStream[Array[Byte]], + reduceFunc: PythonRDDFunction2 + ) extends DStream[Array[Byte]](parent.ssc) { + + super.persist(StorageLevel.MEMORY_ONLY) + + override def dependencies = List(parent) + + override def slideDuration: Duration = parent.slideDuration + + override val mustCheckpoint = true + + override def compute(validTime: Time): Option[RDD[Array[Byte]]] = { + val lastState = getOrCompute(validTime - slideDuration) + val newRDD = 
parent.getOrCompute(validTime) + if (newRDD.isDefined) { + if (lastState.isDefined) { + Some(reduceFunc.call(JavaRDD.fromRDD(lastState.get), JavaRDD.fromRDD(newRDD.get), validTime.milliseconds)) + } else { + Some(reduceFunc.call(null, JavaRDD.fromRDD(newRDD.get), validTime.milliseconds)) + } + } else { + lastState + } + } + + val asJavaDStream = JavaDStream.fromDStream(this) +} + /** * This is used for foreachRDD() in Python */ From 3f0fb4b7e8265c9076077bc8290aeac3b9aeb18b Mon Sep 17 00:00:00 2001 From: Davies Liu Date: Sat, 27 Sep 2014 00:15:52 -0700 Subject: [PATCH 305/347] refactor fix tests --- python/pyspark/serializers.py | 3 + python/pyspark/streaming/context.py | 129 +++++++++-- python/pyspark/streaming/dstream.py | 8 +- python/pyspark/streaming/tests.py | 62 +++-- python/pyspark/streaming/util.py | 13 +- .../streaming/api/python/PythonDStream.scala | 219 ++++++++++-------- 6 files changed, 288 insertions(+), 146 deletions(-) diff --git a/python/pyspark/serializers.py b/python/pyspark/serializers.py index 2672da36c1f50..94bebc310bad6 100644 --- a/python/pyspark/serializers.py +++ b/python/pyspark/serializers.py @@ -114,6 +114,9 @@ def __ne__(self, other): def __repr__(self): return "<%s object>" % self.__class__.__name__ + def __hash__(self): + return hash(str(self)) + class FramedSerializer(Serializer): diff --git a/python/pyspark/streaming/context.py b/python/pyspark/streaming/context.py index 1c7cb5604e5cc..c4a1014ab9ab0 100644 --- a/python/pyspark/streaming/context.py +++ b/python/pyspark/streaming/context.py @@ -15,16 +15,51 @@ # limitations under the License. # -from pyspark.serializers import UTF8Deserializer +from pyspark import RDD +from pyspark.serializers import UTF8Deserializer, BatchedSerializer from pyspark.context import SparkContext +from pyspark.storagelevel import StorageLevel from pyspark.streaming.dstream import DStream -from pyspark.streaming.duration import Duration, Seconds +from pyspark.streaming.duration import Seconds from py4j.java_collections import ListConverter __all__ = ["StreamingContext"] +def _daemonize_callback_server(): + """ + Hack Py4J to daemonize callback server + """ + # TODO: create a patch for Py4J + import socket + import py4j.java_gateway + logger = py4j.java_gateway.logger + from py4j.java_gateway import Py4JNetworkError + from threading import Thread + + def start(self): + """Starts the CallbackServer. This method should be called by the + client instead of run().""" + self.server_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + self.server_socket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, + 1) + try: + self.server_socket.bind((self.address, self.port)) + # self.port = self.server_socket.getsockname()[1] + except Exception: + msg = 'An error occurred while trying to start the callback server' + logger.exception(msg) + raise Py4JNetworkError(msg) + + # Maybe thread needs to be cleanup up? + self.thread = Thread(target=self.run) + self.thread.daemon = True + self.thread.start() + + py4j.java_gateway.CallbackServer.start = start + + class StreamingContext(object): """ Main entry point for Spark Streaming functionality. 
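To make the updateStateByKey contract introduced above concrete, a minimal sketch (illustrative names only; pairs stands for any DStream of key/value pairs built elsewhere, and a checkpoint directory is assumed to be set, as the tests do with setCheckpointDir). The update function receives an iterator of (key, new_values, old_state) tuples, with old_state being None the first time a key appears, and yields (key, new_state) pairs:

def running_total(partition):
    # one (key, new_values, old_state) tuple per key in this partition
    for key, new_values, old_total in partition:
        yield (key, (old_total or 0) + sum(new_values))

totals = pairs.updateStateByKey(running_total)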
A StreamingContext represents the @@ -53,7 +88,9 @@ def _start_callback_server(self): gw = self._sc._gateway # getattr will fallback to JVM if "_callback_server" not in gw.__dict__: + _daemonize_callback_server() gw._start_callback_server(gw._python_proxy_port) + gw._python_proxy_port = gw._callback_server.port # update port with real port def _initialize_context(self, sc, duration): return self._jvm.JavaStreamingContext(sc._jsc, duration._jduration) @@ -92,26 +129,44 @@ def stop(self, stopSparkContext=True, stopGraceFully=False): def remember(self, duration): """ - Set each DStreams in this context to remember RDDs it generated in the last given duration. - DStreams remember RDDs only for a limited duration of time and releases them for garbage - collection. This method allows the developer to specify how to long to remember the RDDs ( - if the developer wishes to query old data outside the DStream computation). - @param duration pyspark.streaming.duration.Duration object or seconds. - Minimum duration that each DStream should remember its RDDs + Set each DStreams in this context to remember RDDs it generated + in the last given duration. DStreams remember RDDs only for a + limited duration of time and releases them for garbage collection. + This method allows the developer to specify how to long to remember + the RDDs ( if the developer wishes to query old data outside the + DStream computation). + + @param duration Minimum duration (in seconds) that each DStream + should remember its RDDs """ if isinstance(duration, (int, long, float)): duration = Seconds(duration) self._jssc.remember(duration._jduration) - # TODO: add storageLevel - def socketTextStream(self, hostname, port): + def checkpoint(self, directory): + """ + Sets the context to periodically checkpoint the DStream operations for master + fault-tolerance. The graph will be checkpointed every batch interval. + + @param directory HDFS-compatible directory where the checkpoint data + will be reliably stored + """ + self._jssc.checkpoint(directory) + + def socketTextStream(self, hostname, port, storageLevel=StorageLevel.MEMORY_AND_DISK_SER_2): """ Create an input from TCP source hostname:port. Data is received using a TCP socket and receive byte is interpreted as UTF8 encoded '\n' delimited lines. + + @param hostname Hostname to connect to for receiving data + @param port Port to connect to for receiving data + @param storageLevel Storage level to use for storing the received objects """ - return DStream(self._jssc.socketTextStream(hostname, port), self, UTF8Deserializer()) + jlevel = self._sc._getJavaStorageLevel(storageLevel) + return DStream(self._jssc.socketTextStream(hostname, port, jlevel), self, + UTF8Deserializer()) def textFileStream(self, directory): """ @@ -122,14 +177,52 @@ def textFileStream(self, directory): """ return DStream(self._jssc.textFileStream(directory), self, UTF8Deserializer()) - def _makeStream(self, inputs, numSlices=None): + def _check_serialzers(self, rdds): + # make sure they have same serializer + if len(set(rdd._jrdd_deserializer for rdd in rdds)): + for i in range(len(rdds)): + # reset them to sc.serializer + rdds[i] = rdds[i].map(lambda x: x, preservesPartitioning=True) + + def queueStream(self, queue, oneAtATime=False, default=None): """ - This function is only for unittest. - It requires a list as input, and returns the i_th element at the i_th batch - under manual clock. + Create an input stream from an queue of RDDs or list. 
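A usage sketch of this input source, mirroring how the unit tests below drive deterministic batches (ssc is an already constructed StreamingContext and the timing is illustrative):

import time

batches = [[1, 2, 3], [4, 5], [6]]     # one list per batch interval
stream = ssc.queueStream(batches)      # each list is turned into an RDD
result = stream.collect()              # collect() appends each batch's output
ssc.start()
time.sleep(2)                          # let a few batch intervals pass
# result should now hold roughly [[1, 2, 3], [4, 5], [6]]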
In each batch, + it will process either one or all of the RDDs returned by the queue. + + NOTE: changes to the queue after the stream is created will not be recognized. + @param queue Queue of RDDs + @tparam T Type of objects in the RDD """ - rdds = [self._sc.parallelize(input, numSlices) for input in inputs] + if queue and not isinstance(queue[0], RDD): + rdds = [self._sc.parallelize(input) for input in queue] + else: + rdds = queue + self._check_serialzers(rdds) jrdds = ListConverter().convert([r._jrdd for r in rdds], SparkContext._gateway._gateway_client) - jdstream = self._jvm.PythonDataInputStream(self._jssc, jrdds).asJavaDStream() - return DStream(jdstream, self, rdds[0]._jrdd_deserializer) + jdstream = self._jvm.PythonDataInputStream(self._jssc, jrdds, oneAtATime, + default and default._jrdd) + return DStream(jdstream.asJavaDStream(), self, rdds[0]._jrdd_deserializer) + + def transform(self, dstreams, transformFunc): + """ + Create a new DStream in which each RDD is generated by applying a function on RDDs of + the DStreams. The order of the JavaRDDs in the transform function parameter will be the + same as the order of corresponding DStreams in the list. + """ + # TODO + + def union(self, *dstreams): + """ + Create a unified DStream from multiple DStreams of the same + type and same slide duration. + """ + if not dstreams: + raise ValueError("should have at least one DStream to union") + if len(dstreams) == 1: + return dstreams[0] + self._check_serialzers(dstreams) + first = dstreams[0] + jrest = ListConverter().convert([d._jdstream for d in dstreams[1:]], + SparkContext._gateway._gateway_client) + return DStream(self._jssc.union(first._jdstream, jrest), self, first._jrdd_deserializer) diff --git a/python/pyspark/streaming/dstream.py b/python/pyspark/streaming/dstream.py index 27e1400b8ba0b..9dd3556327477 100644 --- a/python/pyspark/streaming/dstream.py +++ b/python/pyspark/streaming/dstream.py @@ -315,16 +315,16 @@ def repartitions(self, numPartitions): return self.transform(lambda rdd: rdd.repartition(numPartitions)) def union(self, other): - return self.transformWith(lambda a, b: a.union(b), other, True) + return self.transformWith(lambda a, b, t: a.union(b), other, True) def cogroup(self, other): - return self.transformWith(lambda a, b: a.cogroup(b), other) + return self.transformWith(lambda a, b, t: a.cogroup(b), other) def leftOuterJoin(self, other): - return self.transformWith(lambda a, b: a.leftOuterJion(b), other) + return self.transformWith(lambda a, b, t: a.leftOuterJion(b), other) def rightOuterJoin(self, other): - return self.transformWith(lambda a, b: a.rightOuterJoin(b), other) + return self.transformWith(lambda a, b, t: a.rightOuterJoin(b), other) def _jtime(self, milliseconds): return self.ctx._jvm.Time(milliseconds) diff --git a/python/pyspark/streaming/tests.py b/python/pyspark/streaming/tests.py index 755ea224e56da..a585bbfa06f5b 100644 --- a/python/pyspark/streaming/tests.py +++ b/python/pyspark/streaming/tests.py @@ -40,27 +40,25 @@ def setUp(self): class_name = self.__class__.__name__ self.sc = SparkContext(appName=class_name) self.sc.setCheckpointDir("/tmp") + # TODO: decrease duration to speed up tests self.ssc = StreamingContext(self.sc, duration=Seconds(1)) def tearDown(self): self.ssc.stop() - self.sc.stop() @classmethod def tearDownClass(cls): # Make sure tp shutdown the callback server SparkContext._gateway._shutdown_callback_server() - def _test_func(self, input, func, expected, numSlices=None, sort=False): + def _test_func(self, input, func, 
expected, sort=False): """ - Start stream and return the result. @param input: dataset for the test. This should be list of lists. @param func: wrapped function. This function should return PythonDStream object. @param expected: expected output for this testcase. - @param numSlices: the number of slices in the rdd in the dstream. """ # Generate input stream with user-defined input. - input_stream = self.ssc._makeStream(input, numSlices) + input_stream = self.ssc.queueStream(input) # Apply test function to stream. stream = func(input_stream) result = stream.collect() @@ -121,7 +119,7 @@ def func(dstream): def test_count(self): """Basic operation test for DStream.count.""" - input = [range(1, 5), range(1, 10), range(1, 20)] + input = [range(5), range(10), range(20)] def func(dstream): return dstream.count() @@ -178,24 +176,24 @@ def func(dstream): def test_glom(self): """Basic operation test for DStream.glom.""" input = [range(1, 5), range(5, 9), range(9, 13)] - numSlices = 2 + rdds = [self.sc.parallelize(r, 2) for r in input] def func(dstream): return dstream.glom() expected = [[[1, 2], [3, 4]], [[5, 6], [7, 8]], [[9, 10], [11, 12]]] - self._test_func(input, func, expected, numSlices) + self._test_func(rdds, func, expected) def test_mapPartitions(self): """Basic operation test for DStream.mapPartitions.""" input = [range(1, 5), range(5, 9), range(9, 13)] - numSlices = 2 + rdds = [self.sc.parallelize(r, 2) for r in input] def func(dstream): def f(iterator): yield sum(iterator) return dstream.mapPartitions(f) expected = [[3, 7], [11, 15], [19, 23]] - self._test_func(input, func, expected, numSlices) + self._test_func(rdds, func, expected) def test_countByValue(self): """Basic operation test for DStream.countByValue.""" @@ -236,14 +234,14 @@ def add(a, b): self._test_func(input, func, expected, sort=True) def test_union(self): - input1 = [range(3), range(5), range(1)] + input1 = [range(3), range(5), range(1), range(6)] input2 = [range(3, 6), range(5, 6), range(1, 6)] - d1 = self.ssc._makeStream(input1) - d2 = self.ssc._makeStream(input2) + d1 = self.ssc.queueStream(input1) + d2 = self.ssc.queueStream(input2) d = d1.union(d2) result = d.collect() - expected = [range(6), range(6), range(6)] + expected = [range(6), range(6), range(6), range(6)] self.ssc.start() start_time = time.time() @@ -317,33 +315,49 @@ def func(dstream): class TestStreamingContext(unittest.TestCase): def setUp(self): self.sc = SparkContext(master="local[2]", appName=self.__class__.__name__) - self.batachDuration = Seconds(1) - self.ssc = None + self.batachDuration = Seconds(0.1) + self.ssc = StreamingContext(self.sc, self.batachDuration) def tearDown(self): - if self.ssc is not None: - self.ssc.stop() + self.ssc.stop() self.sc.stop() def test_stop_only_streaming_context(self): - self.ssc = StreamingContext(self.sc, self.batachDuration) - self._addInputStream(self.ssc) + self._addInputStream() self.ssc.start() self.ssc.stop(False) self.assertEqual(len(self.sc.parallelize(range(5), 5).glom().collect()), 5) def test_stop_multiple_times(self): - self.ssc = StreamingContext(self.sc, self.batachDuration) - self._addInputStream(self.ssc) + self._addInputStream() self.ssc.start() self.ssc.stop() self.ssc.stop() - def _addInputStream(self, s): + def _addInputStream(self): # Make sure each length of input is over 3 inputs = map(lambda x: range(1, x), range(5, 101)) - stream = s._makeStream(inputs) + stream = self.ssc.queueStream(inputs) stream.collect() + def test_queueStream(self): + input = [range(i) for i in range(3)] + dstream = 
self.ssc.queueStream(input) + result = dstream.collect() + self.ssc.start() + time.sleep(1) + self.assertEqual(input, result) + + def test_union(self): + input = [range(i) for i in range(3)] + dstream = self.ssc.queueStream(input) + dstream2 = self.ssc.union(dstream, dstream) + result = dstream.collect() + self.ssc.start() + time.sleep(1) + expected = [i * 2 for i in input] + self.assertEqual(input, result) + + if __name__ == "__main__": unittest.main() diff --git a/python/pyspark/streaming/util.py b/python/pyspark/streaming/util.py index fdbd01ec1766d..feff1b3889c49 100644 --- a/python/pyspark/streaming/util.py +++ b/python/pyspark/streaming/util.py @@ -30,7 +30,10 @@ def __init__(self, ctx, func, jrdd_deserializer): def call(self, jrdd, milliseconds): try: - rdd = RDD(jrdd, self.ctx, self.deserializer) + emptyRDD = getattr(self.ctx, "_emptyRDD", None) + if emptyRDD is None: + self.ctx._emptyRDD = emptyRDD = self.ctx.parallelize([]).cache() + rdd = RDD(jrdd, self.ctx, self.deserializer) if jrdd else emptyRDD r = self.func(rdd, milliseconds) if r: return r._jrdd @@ -58,8 +61,12 @@ def __init__(self, ctx, func, jrdd_deserializer, jrdd_deserializer2=None): def call(self, jrdd, jrdd2, milliseconds): try: - rdd = RDD(jrdd, self.ctx, self.jrdd_deserializer) if jrdd else None - other = RDD(jrdd2, self.ctx, self.jrdd_deserializer2) if jrdd2 else None + emptyRDD = getattr(self.ctx, "_emptyRDD", None) + if emptyRDD is None: + self.ctx._emptyRDD = emptyRDD = self.ctx.parallelize([]).cache() + + rdd = RDD(jrdd, self.ctx, self.jrdd_deserializer) if jrdd else emptyRDD + other = RDD(jrdd2, self.ctx, self.jrdd_deserializer2) if jrdd2 else emptyRDD r = self.func(rdd, other, milliseconds) if r: return r._jrdd diff --git a/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala index b904e273eb438..828a620e4c08f 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala @@ -39,6 +39,22 @@ trait PythonRDDFunction { def call(rdd: JavaRDD[_], time: Long): JavaRDD[Array[Byte]] } +class RDDFunction(pfunc: PythonRDDFunction) { + def apply(rdd: Option[RDD[_]], time: Time): Option[RDD[Array[Byte]]] = { + val jrdd = if (rdd.isDefined) { + JavaRDD.fromRDD(rdd.get) + } else { + null + } + val r = pfunc.call(jrdd, time.milliseconds) + if (r != null) { + Some(r.rdd) + } else { + None + } + } +} + /** * Interface for Python callback function with three arguments */ @@ -46,33 +62,61 @@ trait PythonRDDFunction2 { def call(rdd: JavaRDD[_], rdd2: JavaRDD[_], time: Long): JavaRDD[Array[Byte]] } +class RDDFunction2(pfunc: PythonRDDFunction2) { + def apply(rdd: Option[RDD[_]], rdd2: Option[RDD[_]], time: Time): Option[RDD[Array[Byte]]] = { + val jrdd = if (rdd.isDefined) { + JavaRDD.fromRDD(rdd.get) + } else { + null + } + val jrdd2 = if (rdd2.isDefined) { + JavaRDD.fromRDD(rdd2.get) + } else { + null + } + val r = pfunc.call(jrdd, jrdd2, time.milliseconds) + if (r != null) { + Some(r.rdd) + } else { + None + } + } +} + +private[python] +abstract class PythonDStream(parent: DStream[_]) extends DStream[Array[Byte]] (parent.ssc) { + + override def dependencies = List(parent) + + override def slideDuration: Duration = parent.slideDuration + + val asJavaDStream = JavaDStream.fromDStream(this) +} + /** * Transformed DStream in Python. 
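On the Python side these wrappers back DStream.transform and the two-stream transformWith used by union, cogroup and the joins above; a sketch of the call pattern (words, pairs1 and pairs2 are placeholder DStreams):

# arbitrary RDD-to-RDD code, run once per batch
distinct_words = words.transform(lambda rdd: rdd.distinct())
# the two-stream variant's function also receives the batch time
joined = pairs1.transformWith(lambda a, b, t: a.join(b), pairs2)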
* * If the result RDD is PythonRDD, then it will cache it as an template for future use, * this can reduce the Python callbacks. */ -private[spark] class PythonTransformedDStream (parent: DStream[_], func: PythonRDDFunction, +private[spark] class PythonTransformedDStream (parent: DStream[_], pfunc: PythonRDDFunction, var reuse: Boolean = false) - extends DStream[Array[Byte]] (parent.ssc) { + extends PythonDStream(parent) { + val func = new RDDFunction(pfunc) var lastResult: PythonRDD = _ - override def dependencies = List(parent) - - override def slideDuration: Duration = parent.slideDuration - override def compute(validTime: Time): Option[RDD[Array[Byte]]] = { - val rdd1 = parent.getOrCompute(validTime).getOrElse(null) - if (rdd1 == null) { + val rdd1 = parent.getOrCompute(validTime) + if (rdd1.isEmpty) { return None } if (reuse && lastResult != null) { - Some(lastResult.copyTo(rdd1)) + Some(lastResult.copyTo(rdd1.get)) } else { - val r = func.call(JavaRDD.fromRDD(rdd1), validTime.milliseconds).rdd - if (reuse && lastResult == null) { - r match { + val r = func(rdd1, validTime) + if (reuse && r.isDefined && lastResult == null) { + r.get match { case rdd: PythonRDD => if (rdd.parent(0) == rdd1) { // only one PythonRDD @@ -83,46 +127,65 @@ private[spark] class PythonTransformedDStream (parent: DStream[_], func: PythonR } } } - Some(r) + r } } - - val asJavaDStream = JavaDStream.fromDStream(this) } /** * Transformed from two DStreams in Python. */ -private[spark] class PythonTransformed2DStream (parent: DStream[_], parent2: DStream[_], func: PythonRDDFunction2) +private[spark] +class PythonTransformed2DStream(parent: DStream[_], parent2: DStream[_], + pfunc: PythonRDDFunction2) extends DStream[Array[Byte]] (parent.ssc) { - override def dependencies = List(parent, parent2) + val func = new RDDFunction2(pfunc) override def slideDuration: Duration = parent.slideDuration + override def dependencies = List(parent, parent2) + override def compute(validTime: Time): Option[RDD[Array[Byte]]] = { - def resultRdd(stream: DStream[_]): JavaRDD[_] = stream.getOrCompute(validTime) match { - case Some(rdd) => JavaRDD.fromRDD(rdd) - case None => null - } - Some(func.call(resultRdd(parent), resultRdd(parent2), validTime.milliseconds)) + func(parent.getOrCompute(validTime), parent2.getOrCompute(validTime), validTime) } val asJavaDStream = JavaDStream.fromDStream(this) } +/** + * similar to StateDStream + */ +private[spark] +class PythonStateDStream(parent: DStream[Array[Byte]], preduceFunc: PythonRDDFunction2) + extends PythonDStream(parent) { + + val reduceFunc = new RDDFunction2(preduceFunc) + + super.persist(StorageLevel.MEMORY_ONLY) + override val mustCheckpoint = true + + override def compute(validTime: Time): Option[RDD[Array[Byte]]] = { + val lastState = getOrCompute(validTime - slideDuration) + val rdd = parent.getOrCompute(validTime) + if (rdd.isDefined) { + reduceFunc(lastState, rdd, validTime) + } else { + lastState + } + } +} /** * Copied from ReducedWindowedDStream */ private[spark] -class PythonReducedWindowedDStream( - parent: DStream[Array[Byte]], - reduceFunc: PythonRDDFunction2, - invReduceFunc: PythonRDDFunction2, - _windowDuration: Duration, - _slideDuration: Duration - ) extends DStream[Array[Byte]](parent.ssc) { +class PythonReducedWindowedDStream(parent: DStream[Array[Byte]], + preduceFunc: PythonRDDFunction2, + pinvReduceFunc: PythonRDDFunction2, + _windowDuration: Duration, + _slideDuration: Duration + ) extends PythonStateDStream(parent, preduceFunc) { 
assert(_windowDuration.isMultipleOf(parent.slideDuration), "The window duration of ReducedWindowedDStream (" + _windowDuration + ") " + @@ -134,18 +197,10 @@ class PythonReducedWindowedDStream( "must be multiple of the slide duration of parent DStream (" + parent.slideDuration + ")" ) + val invReduceFunc = new RDDFunction2(pinvReduceFunc) - // Persist RDDs to memory by default as these RDDs are going to be reused. - super.persist(StorageLevel.MEMORY_ONLY) - - def windowDuration: Duration = _windowDuration - - override def dependencies = List(parent) - + def windowDuration: Duration = _windowDuration override def slideDuration: Duration = _slideDuration - - override val mustCheckpoint = true - override def parentRememberDuration: Duration = rememberDuration + windowDuration override def compute(validTime: Time): Option[RDD[Array[Byte]]] = { @@ -171,20 +226,17 @@ class PythonReducedWindowedDStream( // old RDDs new RDDs // - // Get the RDD of the reduced value of the previous window - val previousWindowRDD = - getOrCompute(previousWindow.endTime) + val previousWindowRDD = getOrCompute(previousWindow.endTime) if (windowDuration > slideDuration * 5 && previousWindowRDD.isDefined) { // subtle the values from old RDDs val oldRDDs = parent.slice(previousWindow.beginTime, currentWindow.beginTime - parent.slideDuration) val subbed = if (oldRDDs.size > 0) { - invReduceFunc.call(JavaRDD.fromRDD(previousWindowRDD.get), - JavaRDD.fromRDD(ssc.sc.union(oldRDDs)), validTime.milliseconds).rdd + invReduceFunc(previousWindowRDD, Some(ssc.sc.union(oldRDDs)), validTime) } else { - previousWindowRDD.get + previousWindowRDD } // add the RDDs of the reduced values in "new time steps" @@ -192,58 +244,21 @@ class PythonReducedWindowedDStream( parent.slice(previousWindow.endTime, currentWindow.endTime - parent.slideDuration) if (newRDDs.size > 0) { - Some(reduceFunc.call(JavaRDD.fromRDD(subbed), JavaRDD.fromRDD(ssc.sc.union(newRDDs)), validTime.milliseconds)) + reduceFunc(subbed, Some(ssc.sc.union(newRDDs)), validTime) } else { - Some(subbed) + subbed } } else { // Get the RDDs of the reduced values in current window val currentRDDs = parent.slice(currentWindow.beginTime, currentWindow.endTime - parent.slideDuration) if (currentRDDs.size > 0) { - Some(reduceFunc.call(null, JavaRDD.fromRDD(ssc.sc.union(currentRDDs)), validTime.milliseconds)) + reduceFunc(None, Some(ssc.sc.union(currentRDDs)), validTime) } else { None } } } - - val asJavaDStream = JavaDStream.fromDStream(this) -} - - -/** - * Copied from ReducedWindowedDStream - */ -private[spark] -class PythonStateDStream( - parent: DStream[Array[Byte]], - reduceFunc: PythonRDDFunction2 - ) extends DStream[Array[Byte]](parent.ssc) { - - super.persist(StorageLevel.MEMORY_ONLY) - - override def dependencies = List(parent) - - override def slideDuration: Duration = parent.slideDuration - - override val mustCheckpoint = true - - override def compute(validTime: Time): Option[RDD[Array[Byte]]] = { - val lastState = getOrCompute(validTime - slideDuration) - val newRDD = parent.getOrCompute(validTime) - if (newRDD.isDefined) { - if (lastState.isDefined) { - Some(reduceFunc.call(JavaRDD.fromRDD(lastState.get), JavaRDD.fromRDD(newRDD.get), validTime.milliseconds)) - } else { - Some(reduceFunc.call(null, JavaRDD.fromRDD(newRDD.get), validTime.milliseconds)) - } - } else { - lastState - } - } - - val asJavaDStream = JavaDStream.fromDStream(this) } /** @@ -255,7 +270,9 @@ class PythonForeachDStream( ) extends ForEachDStream[Array[Byte]]( prev, (rdd: RDD[Array[Byte]], time: Time) 
=> { - foreachFunction.call(rdd.toJavaRDD(), time.milliseconds) + if (rdd != null) { + foreachFunction.call(rdd, time.milliseconds) + } } ) { @@ -264,34 +281,42 @@ class PythonForeachDStream( /** - * This is a input stream just for the unitest. This is equivalent to a checkpointable, - * replayable, reliable message queue like Kafka. It requires a JArrayList of JavaRDD, - * and returns the i_th element at the i_th batch under manual clock. + * similar to QueueInputStream */ class PythonDataInputStream( ssc_ : JavaStreamingContext, - inputRDDs: JArrayList[JavaRDD[Array[Byte]]] + inputRDDs: JArrayList[JavaRDD[Array[Byte]]], + oneAtAtime: Boolean, + defaultRDD: JavaRDD[Array[Byte]] ) extends InputDStream[Array[Byte]](JavaStreamingContext.toStreamingContext(ssc_)) { + val emptyRDD = if (defaultRDD != null) { + Some(defaultRDD.rdd) + } else { + None // ssc.sparkContext.emptyRDD[Array[Byte]] + } + def start() {} def stop() {} def compute(validTime: Time): Option[RDD[Array[Byte]]] = { - val emptyRDD = ssc.sparkContext.emptyRDD[Array[Byte]] val index = ((validTime - zeroTime) / slideDuration - 1).toInt - val selectedRDD = { - if (inputRDDs.isEmpty) { + if (oneAtAtime) { + if (index == 0) { + val rdds = inputRDDs.toArray.map(_.asInstanceOf[JavaRDD[Array[Byte]]].rdd).toSeq + Some(ssc.sparkContext.union(rdds)) + } else { emptyRDD - } else if (index < inputRDDs.size()) { - inputRDDs.get(index).rdd + } + } else { + if (index < inputRDDs.size()) { + Some(inputRDDs.get(index).rdd) } else { emptyRDD } } - - Some(selectedRDD) } val asJavaDStream = JavaDStream.fromDStream(this) From c499ba0e48c10b5aa587e81c179f02c1b88e2045 Mon Sep 17 00:00:00 2001 From: Davies Liu Date: Sat, 27 Sep 2014 00:26:12 -0700 Subject: [PATCH 306/347] remove Time and Duration --- python/pyspark/streaming/context.py | 20 +- python/pyspark/streaming/dstream.py | 13 +- python/pyspark/streaming/duration.py | 401 --------------------------- python/pyspark/streaming/jtime.py | 135 --------- python/pyspark/streaming/tests.py | 4 +- 5 files changed, 14 insertions(+), 559 deletions(-) delete mode 100644 python/pyspark/streaming/duration.py delete mode 100644 python/pyspark/streaming/jtime.py diff --git a/python/pyspark/streaming/context.py b/python/pyspark/streaming/context.py index c4a1014ab9ab0..88e0cbbede1be 100644 --- a/python/pyspark/streaming/context.py +++ b/python/pyspark/streaming/context.py @@ -16,11 +16,10 @@ # from pyspark import RDD -from pyspark.serializers import UTF8Deserializer, BatchedSerializer +from pyspark.serializers import UTF8Deserializer from pyspark.context import SparkContext from pyspark.storagelevel import StorageLevel from pyspark.streaming.dstream import DStream -from pyspark.streaming.duration import Seconds from py4j.java_collections import ListConverter @@ -76,9 +75,6 @@ def __init__(self, sparkContext, duration): @param duration: A L{Duration} object or seconds for SparkStreaming. 
""" - if isinstance(duration, (int, long, float)): - duration = Seconds(duration) - self._sc = sparkContext self._jvm = self._sc._jvm self._start_callback_server() @@ -93,7 +89,10 @@ def _start_callback_server(self): gw._python_proxy_port = gw._callback_server.port # update port with real port def _initialize_context(self, sc, duration): - return self._jvm.JavaStreamingContext(sc._jsc, duration._jduration) + return self._jvm.JavaStreamingContext(sc._jsc, self._jduration(duration)) + + def _jduration(self, seconds): + return self._jvm.Duration(int(seconds * 1000)) @property def sparkContext(self): @@ -111,12 +110,12 @@ def start(self): def awaitTermination(self, timeout=None): """ Wait for the execution to stop. - @param timeout: time to wait in milliseconds + @param timeout: time to wait in seconds """ if timeout is None: self._jssc.awaitTermination() else: - self._jssc.awaitTermination(timeout) + self._jssc.awaitTermination(int(timeout * 1000)) def stop(self, stopSparkContext=True, stopGraceFully=False): """ @@ -139,10 +138,7 @@ def remember(self, duration): @param duration Minimum duration (in seconds) that each DStream should remember its RDDs """ - if isinstance(duration, (int, long, float)): - duration = Seconds(duration) - - self._jssc.remember(duration._jduration) + self._jssc.remember(self._jduration(duration)) def checkpoint(self, directory): """ diff --git a/python/pyspark/streaming/dstream.py b/python/pyspark/streaming/dstream.py index 9dd3556327477..8c79eece773ce 100644 --- a/python/pyspark/streaming/dstream.py +++ b/python/pyspark/streaming/dstream.py @@ -22,7 +22,6 @@ from pyspark.storagelevel import StorageLevel from pyspark.streaming.util import rddToFileName, RDDFunction, RDDFunction2 from pyspark.rdd import portable_hash -from pyspark.streaming.duration import Duration, Seconds from pyspark.resultiterable import ResultIterable __all__ = ["DStream"] @@ -334,10 +333,10 @@ def slice(self, begin, end): return [RDD(jrdd, self.ctx, self._jrdd_deserializer) for jrdd in jrdds] def window(self, windowDuration, slideDuration=None): - d = Seconds(windowDuration) + d = self._ssc._jduration(windowDuration) if slideDuration is None: return DStream(self._jdstream.window(d), self._ssc, self._jrdd_deserializer) - s = Seconds(slideDuration) + s = self._ssc._jduration(slideDuration) return DStream(self._jdstream.window(d, s), self._ssc, self._jrdd_deserializer) def reduceByWindow(self, reduceFunc, invReduceFunc, windowDuration, slideDuration): @@ -375,16 +374,12 @@ def invReduceFunc(a, b, t): joined = a.leftOuterJoin(b, numPartitions) return joined.mapValues(lambda (v1, v2): invFunc(v1, v2) if v2 is not None else v1) - if not isinstance(windowDuration, Duration): - windowDuration = Seconds(windowDuration) - if not isinstance(slideDuration, Duration): - slideDuration = Seconds(slideDuration) jreduceFunc = RDDFunction2(self.ctx, reduceFunc, reduced._jrdd_deserializer) jinvReduceFunc = RDDFunction2(self.ctx, invReduceFunc, reduced._jrdd_deserializer) dstream = self.ctx._jvm.PythonReducedWindowedDStream(reduced._jdstream.dstream(), jreduceFunc, jinvReduceFunc, - windowDuration._jduration, - slideDuration._jduration) + self._ssc._jduration(windowDuration), + self._ssc._jduration(slideDuration)) return DStream(dstream.asJavaDStream(), self._ssc, self.ctx.serializer) def updateStateByKey(self, updateFunc, numPartitions=None): diff --git a/python/pyspark/streaming/duration.py b/python/pyspark/streaming/duration.py deleted file mode 100644 index 8660f332a48da..0000000000000 --- 
a/python/pyspark/streaming/duration.py +++ /dev/null @@ -1,401 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - - -def msDurationToString(ms): - """ - Returns a human-readable string representing a duration such as "35ms" - - >> msDurationToString(10) - '10 ms' - >>> msDurationToString(1000) - '1.0 s' - >>> msDurationToString(60000) - '1.0 m' - >>> msDurationToString(3600000) - '1.00 h' - """ - second = 1000 - minute = 60 * second - hour = 60 * minute - - if ms < second: - return "%d ms" % ms - elif ms < minute: - return "%.1f s" % (float(ms) / second) - elif ms < hour: - return "%.1f m" % (float(ms) / minute) - else: - return "%.2f h" % (float(ms) / hour) - - -class Duration(object): - """ - Duration for Spark Streaming application. Used to set duration - - Most of the time, you would create a Duration object with - C{Duration()}, which will load values from C{spark.streaming.*} Java system - properties as well. In this case, any parameters you set directly on - the C{Duration} object take priority over system properties. - - """ - def __init__(self, millis, _jvm=None): - """ - Create new Duration. 
- - @param millis: milisecond - - """ - self._millis = millis - - from pyspark.context import SparkContext - SparkContext._ensure_initialized() - _jvm = _jvm or SparkContext._jvm - self._jduration = _jvm.Duration(millis) - - def toString(self): - """ - Return duration as string - - >>> d_10 = Duration(10) - >>> d_10.toString() - '10 ms' - """ - return str(self._millis) + " ms" - - def isZero(self): - """ - Check if millis is zero - - >>> d_10 = Duration(10) - >>> d_10.isZero() - False - >>> d_0 = Duration(0) - >>> d_0.isZero() - True - """ - return self._millis == 0 - - def prettyPrint(self): - """ - Return a human-readable string representing a duration - - >>> d_10 = Duration(10) - >>> d_10.prettyPrint() - '10 ms' - >>> d_1sec = Duration(1000) - >>> d_1sec.prettyPrint() - '1.0 s' - >>> d_1min = Duration(60 * 1000) - >>> d_1min.prettyPrint() - '1.0 m' - >>> d_1hour = Duration(60 * 60 * 1000) - >>> d_1hour.prettyPrint() - '1.00 h' - """ - return msDurationToString(self._millis) - - def milliseconds(self): - """ - Return millisecond - - >>> d_10 = Duration(10) - >>> d_10.milliseconds() - 10 - - """ - return self._millis - - def toFormattedString(self): - """ - Return millisecond - - >>> d_10 = Duration(10) - >>> d_10.toFormattedString() - '10' - - """ - return str(self._millis) - - def max(self, other): - """ - Return higher Duration - - >>> d_10 = Duration(10) - >>> d_100 = Duration(100) - >>> d_max = d_10.max(d_100) - >>> print d_max - 100 ms - - """ - Duration._is_duration(other) - if self > other: - return self - else: - return other - - def min(self, other): - """ - Return lower Durattion - - >>> d_10 = Duration(10) - >>> d_100 = Duration(100) - >>> d_min = d_10.min(d_100) - >>> print d_min - 10 ms - - """ - Duration._is_duration(other) - if self < other: - return self - else: - return other - - def __str__(self): - """ - >>> d_10 = Duration(10) - >>> str(d_10) - '10 ms' - - """ - return self.toString() - - def __add__(self, other): - """ - Add Duration and Duration - - >>> d_10 = Duration(10) - >>> d_100 = Duration(100) - >>> d_110 = d_10 + d_100 - >>> print d_110 - 110 ms - """ - Duration._is_duration(other) - return Duration(self._millis + other._millis) - - def __sub__(self, other): - """ - Subtract Duration by Duration - - >>> d_10 = Duration(10) - >>> d_100 = Duration(100) - >>> d_90 = d_100 - d_10 - >>> print d_90 - 90 ms - - """ - Duration._is_duration(other) - return Duration(self._millis - other._millis) - - def __mul__(self, other): - """ - Multiple Duration by Duration - - >>> d_10 = Duration(10) - >>> d_100 = Duration(100) - >>> d_1000 = d_10 * d_100 - >>> print d_1000 - 1000 ms - - """ - Duration._is_duration(other) - return Duration(self._millis * other._millis) - - def __div__(self, other): - """ - Divide Duration by Duration - for Python 2.X - - >>> d_10 = Duration(10) - >>> d_20 = Duration(20) - >>> d_2 = d_20 / d_10 - >>> print d_2 - 2 ms - - """ - Duration._is_duration(other) - return Duration(self._millis / other._millis) - - def __truediv__(self, other): - """ - Divide Duration by Duration - for Python 3.0 - - >>> d_10 = Duration(10) - >>> d_20 = Duration(20) - >>> d_2 = d_20 / d_10 - >>> print d_2 - 2 ms - - """ - Duration._is_duration(other) - return Duration(self._millis / other._millis) - - def __floordiv__(self, other): - """ - Divide Duration by Duration - - >>> d_10 = Duration(10) - >>> d_3 = Duration(3) - >>> d_3 = d_10 // d_3 - >>> print d_3 - 3 ms - - """ - Duration._is_duration(other) - return Duration(self._millis // other._millis) - - def __lt__(self, 
other): - """ - Duration < Duration - - >>> d_10 = Duration(10) - >>> d_20 = Duration(20) - >>> d_10 < d_20 - True - >>> d_20 < d_10 - False - - """ - Duration._is_duration(other) - return self._millis < other._millis - - def __le__(self, other): - """ - Duration <= Duration - - >>> d_10 = Duration(10) - >>> d_20 = Duration(20) - >>> d_10 <= d_20 - True - >>> d_20 <= d_10 - False - - """ - Duration._is_duration(other) - return self._millis <= other._millis - - def __eq__(self, other): - """ - Duration == Duration - - >>> d_10 = Duration(10) - >>> d_20 = Duration(20) - >>> d_10 == d_20 - False - >>> other_d_10 = Duration(10) - >>> d_10 == other_d_10 - True - - """ - Duration._is_duration(other) - return self._millis == other._millis - - def __ne__(self, other): - """ - Duration != Duration - - >>> d_10 = Duration(10) - >>> d_20 = Duration(20) - >>> d_10 != d_20 - True - >>> other_d_10 = Duration(10) - >>> d_10 != other_d_10 - False - - """ - Duration._is_duration(other) - return self._millis != other._millis - - def __gt__(self, other): - """ - Duration > Duration - - >>> d_10 = Duration(10) - >>> d_20 = Duration(20) - >>> d_10 > d_20 - False - >>> d_20 > d_10 - True - - """ - Duration._is_duration(other) - return self._millis > other._millis - - def __ge__(self, other): - """ - Duration >= Duration - - >>> d_10 = Duration(10) - >>> d_20 = Duration(20) - >>> d_10 < d_20 - True - >>> d_20 < d_10 - False - - - """ - Duration._is_duration(other) - return self._millis >= other._millis - - @classmethod - def _is_duration(self, instance): - """ is instance Duration """ - if not isinstance(instance, Duration): - raise TypeError("This should be Duration") - - -def Milliseconds(milliseconds): - """ - Helper function that creates instance of [[pysparkstreaming.duration]] representing - a given number of milliseconds. - - >>> milliseconds = Milliseconds(1) - >>> d_1 = Duration(1) - >>> milliseconds == d_1 - True - - """ - return Duration(milliseconds) - - -def Seconds(seconds): - """ - Helper function that creates instance of [[pysparkstreaming.duration]] representing - a given number of seconds. - - >>> seconds = Seconds(1) - >>> d_1sec = Duration(1000) - >>> seconds == d_1sec - True - - """ - return Duration(seconds * 1000) - - -def Minutes(minutes): - """ - Helper function that creates instance of [[pysparkstreaming.duration]] representing - a given number of minutes. - - >>> minutes = Minutes(1) - >>> d_1min = Duration(60 * 1000) - >>> minutes == d_1min - True - - """ - return Duration(minutes * 60 * 1000) diff --git a/python/pyspark/streaming/jtime.py b/python/pyspark/streaming/jtime.py deleted file mode 100644 index e157640afa4df..0000000000000 --- a/python/pyspark/streaming/jtime.py +++ /dev/null @@ -1,135 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -from pyspark.streaming.duration import Duration - -""" -The name of this file, time is not a good naming for python -because if we do import time when we want to use native python time package, it does -not import python time package. -""" -# TODO: add doctest - - -class Time(object): - """ - Time for Spark Streaming application. Used to set Time - - Most of the time, you would create a Duration object with - C{Time()}, which will load values from C{spark.streaming.*} Java system - properties as well. In this case, any parameters you set directly on - the C{Time} object take priority over system properties. - - """ - def __init__(self, millis, _jvm=None): - """ - Create new Time. - - @param millis: milisecond - - @param _jvm: internal parameter used to pass a handle to the - Java VM; does not need to be set by users - - """ - self._millis = millis - - from pyspark.context import StreamingContext - StreamingContext._ensure_initialized() - _jvm = _jvm or StreamingContext._jvm - self._jtime = _jvm.Time(millis) - - def toString(self): - """ Return time as string """ - return str(self._millis) + " ms" - - def milliseconds(self): - """ Return millisecond """ - return self._millis - - def max(self, other): - """ Return higher Time """ - Time._is_time(other) - if self > other: - return self - else: - return other - - def min(self, other): - """ Return lower Time """ - Time._is_time(other) - if self < other: - return self - else: - return other - - def __add__(self, other): - """ Add Time and Time """ - Duration._is_duration(other) - return Time(self._millis + other._millis) - - def __sub__(self, other): - """ Subtract Time by Duration or Time """ - if isinstance(other, Duration): - return Time(self._millis - other._millis) - elif isinstance(other, Time): - return Duration(self._millis, other._millis) - else: - raise TypeError - - def __lt__(self, other): - """ Time < Time """ - Time._is_time(other) - return self._millis < other._millis - - def __le__(self, other): - """ Time <= Time """ - Time._is_time(other) - return self._millis <= other._millis - - def __eq__(self, other): - """ Time == Time """ - Time._is_time(other) - return self._millis == other._millis - - def __ne__(self, other): - """ Time != Time """ - Time._is_time(other) - return self._millis != other._millis - - def __gt__(self, other): - """ Time > Time """ - Time._is_time(other) - return self._millis > other._millis - - def __ge__(self, other): - """ Time >= Time """ - Time._is_time(other) - return self._millis >= other._millis - - def isMultipbleOf(self, duration): - """ is multiple by Duration """ - Duration._is_duration(duration) - return self._millis % duration._millis == 0 - - @classmethod - def _is_time(self, instance): - """ is instance Time """ - if not isinstance(instance, Time): - raise TypeError - -# TODO: implement until -# TODO: implement to diff --git a/python/pyspark/streaming/tests.py b/python/pyspark/streaming/tests.py index a585bbfa06f5b..1684da580f973 100644 --- a/python/pyspark/streaming/tests.py +++ b/python/pyspark/streaming/tests.py @@ -41,7 +41,7 @@ def setUp(self): self.sc = SparkContext(appName=class_name) self.sc.setCheckpointDir("/tmp") # TODO: decrease duration to speed up tests - self.ssc = StreamingContext(self.sc, duration=Seconds(1)) + self.ssc = StreamingContext(self.sc, duration=1) def tearDown(self): self.ssc.stop() @@ -315,7 +315,7 @@ def func(dstream): class TestStreamingContext(unittest.TestCase): def setUp(self): self.sc = SparkContext(master="local[2]", appName=self.__class__.__name__) 
- self.batachDuration = Seconds(0.1) + self.batachDuration = 0.1 self.ssc = StreamingContext(self.sc, self.batachDuration) def tearDown(self): From 604323fd39a3b3f0f39540bf71fc737630f2b110 Mon Sep 17 00:00:00 2001 From: Davies Liu Date: Sat, 27 Sep 2014 00:30:06 -0700 Subject: [PATCH 307/347] enable streaming tests --- python/run-tests | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/python/run-tests b/python/run-tests index 79d7602ccbc87..5aa9212c8adc1 100755 --- a/python/run-tests +++ b/python/run-tests @@ -70,8 +70,6 @@ export PYSPARK_DOC_TEST=1 run_test "pyspark/broadcast.py" run_test "pyspark/accumulators.py" run_test "pyspark/serializers.py" -run_test "pyspark/streaming/duration.py" -run_test "pyspark/streaming/util.py" unset PYSPARK_DOC_TEST run_test "pyspark/shuffle.py" run_test "pyspark/tests.py" @@ -85,9 +83,7 @@ run_test "pyspark/mllib/stat.py" run_test "pyspark/mllib/tests.py" run_test "pyspark/mllib/tree.py" run_test "pyspark/mllib/util.py" -if [ -n "$_RUN_STREAMING_TESTS" ]; then - run_test "pyspark/streaming/tests.py" -fi +run_test "pyspark/streaming/tests.py" # Try to test with PyPy if [ $(which pypy) ]; then @@ -108,6 +104,7 @@ if [ $(which pypy) ]; then unset PYSPARK_DOC_TEST run_test "pyspark/shuffle.py" run_test "pyspark/tests.py" + run_test "pyspark/streaming/tests.py" fi if [[ $FAILED == 0 ]]; then From b32774cc3cc7493b360bd9e5b8b01df28968d0c2 Mon Sep 17 00:00:00 2001 From: Davies Liu Date: Sat, 27 Sep 2014 00:36:43 -0700 Subject: [PATCH 308/347] move java_import into streaming --- python/pyspark/java_gateway.py | 4 +--- python/pyspark/streaming/context.py | 7 +++++-- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/python/pyspark/java_gateway.py b/python/pyspark/java_gateway.py index c3fef42d118bd..db5b97f8472d1 100644 --- a/python/pyspark/java_gateway.py +++ b/python/pyspark/java_gateway.py @@ -23,6 +23,7 @@ import platform from subprocess import Popen, PIPE from threading import Thread + from py4j.java_gateway import java_import, JavaGateway, GatewayClient @@ -108,9 +109,6 @@ def run(self): java_import(gateway.jvm, "org.apache.spark.SparkConf") java_import(gateway.jvm, "org.apache.spark.api.java.*") java_import(gateway.jvm, "org.apache.spark.api.python.*") - java_import(gateway.jvm, "org.apache.spark.streaming.*") - java_import(gateway.jvm, "org.apache.spark.streaming.api.java.*") - java_import(gateway.jvm, "org.apache.spark.streaming.api.python.*") java_import(gateway.jvm, "org.apache.spark.mllib.api.python.*") java_import(gateway.jvm, "org.apache.spark.sql.SQLContext") java_import(gateway.jvm, "org.apache.spark.sql.hive.HiveContext") diff --git a/python/pyspark/streaming/context.py b/python/pyspark/streaming/context.py index 88e0cbbede1be..a647c9ec734df 100644 --- a/python/pyspark/streaming/context.py +++ b/python/pyspark/streaming/context.py @@ -22,6 +22,7 @@ from pyspark.streaming.dstream import DStream from py4j.java_collections import ListConverter +from py4j.java_gateway import java_import __all__ = ["StreamingContext"] @@ -72,7 +73,7 @@ def __init__(self, sparkContext, duration): should be set, either through the named parameters here or through C{conf}. @param sparkContext: L{SparkContext} object. - @param duration: A L{Duration} object or seconds for SparkStreaming. + @param duration: seconds for SparkStreaming. 
""" self._sc = sparkContext @@ -89,6 +90,9 @@ def _start_callback_server(self): gw._python_proxy_port = gw._callback_server.port # update port with real port def _initialize_context(self, sc, duration): + java_import(self._jvm, "org.apache.spark.streaming.*") + java_import(self._jvm, "org.apache.spark.streaming.api.java.*") + java_import(self._jvm, "org.apache.spark.streaming.api.python.*") return self._jvm.JavaStreamingContext(sc._jsc, self._jduration(duration)) def _jduration(self, seconds): @@ -217,7 +221,6 @@ def union(self, *dstreams): raise ValueError("should have at least one DStream to union") if len(dstreams) == 1: return dstreams[0] - self._check_serialzers(dstreams) first = dstreams[0] jrest = ListConverter().convert([d._jdstream for d in dstreams[1:]], SparkContext._gateway._gateway_client) From 74df565e26e9bf7b107cc678e1668dfda7d534ef Mon Sep 17 00:00:00 2001 From: Davies Liu Date: Sat, 27 Sep 2014 00:48:03 -0700 Subject: [PATCH 309/347] fix print and docs --- python/pyspark/streaming/dstream.py | 56 ++++++++++++----------------- 1 file changed, 23 insertions(+), 33 deletions(-) diff --git a/python/pyspark/streaming/dstream.py b/python/pyspark/streaming/dstream.py index 8c79eece773ce..01ca56a7a0387 100644 --- a/python/pyspark/streaming/dstream.py +++ b/python/pyspark/streaming/dstream.py @@ -17,6 +17,7 @@ from itertools import chain, ifilter, imap import operator +from datetime import datetime from pyspark import RDD from pyspark.storagelevel import StorageLevel @@ -54,17 +55,6 @@ def sum(self): """ return self.mapPartitions(lambda x: [sum(x)]).reduce(operator.add) - def print_(self, label=None): - """ - Since print is reserved name for python, we cannot define a "print" method function. - This function prints serialized data in RDD in DStream because Scala and Java cannot - deserialized pickled python object. Please use DStream.pyprint() to print results. - - Call DStream.print() and this function will print byte array in the DStream - """ - # a hack to call print function in DStream - getattr(self._jdstream, "print")(label) - def filter(self, f): """ Return a new DStream containing only the elements that satisfy predicate. @@ -154,19 +144,15 @@ def foreachRDD(self, func): jfunc = RDDFunction(self.ctx, func, self._jrdd_deserializer) self.ctx._jvm.PythonForeachDStream(self._jdstream.dstream(), jfunc) - def pyprint(self): + def pprint(self): """ Print the first ten elements of each RDD generated in this DStream. This is an output operator, so this DStream will be registered as an output stream and there materialized. """ def takeAndPrint(rdd, time): - """ - Closure to take element from RDD and print first 10 elements. - This closure is called by py4j callback server. - """ taken = rdd.take(11) print "-------------------------------------------" - print "Time: %s" % (str(time)) + print "Time: %s" % datetime.fromtimestamp(time / 1000.0) print "-------------------------------------------" for record in taken[:10]: print record @@ -176,6 +162,20 @@ def takeAndPrint(rdd, time): self.foreachRDD(takeAndPrint) + def collect(self): + """ + Collect each RDDs into the returned list. + + :return: list, which will have the collected items. 
+ """ + result = [] + + def get_output(rdd, time): + r = rdd.collect() + result.append(r) + self.foreachRDD(get_output) + return result + def mapValues(self, f): """ Pass each value in the key-value pair RDD through a map function @@ -196,9 +196,9 @@ def flatMapValues(self, f): def glom(self): """ - Return a new DStream in which RDD is generated by applying glom() to RDD of - this DStream. Applying glom() to an RDD coalesces all elements within each partition into - an list. + Return a new DStream in which RDD is generated by applying glom() + to RDD of this DStream. Applying glom() to an RDD coalesces all + elements within each partition into an list. """ def func(iterator): yield list(iterator) @@ -228,11 +228,11 @@ def checkpoint(self, interval): Mark this DStream for checkpointing. It will be saved to a file inside the checkpoint directory set with L{SparkContext.setCheckpointDir()} - @param interval: Time interval after which generated RDD will be checkpointed - interval has to be pyspark.streaming.duration.Duration + @param interval: time in seconds, after which generated RDD will + be checkpointed """ self.is_checkpointed = True - self._jdstream.checkpoint(interval._jduration) + self._jdstream.checkpoint(self._ssc._jduration(interval)) return self def groupByKey(self, numPartitions=None): @@ -245,7 +245,6 @@ def groupByKey(self, numPartitions=None): Note: If you are grouping in order to perform an aggregation (such as a sum or average) over each key, using reduceByKey will provide much better performance. - """ return self.transform(lambda rdd: rdd.groupByKey(numPartitions)) @@ -288,15 +287,6 @@ def saveAsPickleFile(rdd, time): return self.foreachRDD(saveAsPickleFile) - def collect(self): - result = [] - - def get_output(rdd, time): - r = rdd.collect() - result.append(r) - self.foreachRDD(get_output) - return result - def transform(self, func): return TransformedDStream(self, lambda a, t: func(a), True) From 26ea39619c59f28b1ad18b8e44abef25d8d1dbae Mon Sep 17 00:00:00 2001 From: Davies Liu Date: Sat, 27 Sep 2014 21:17:23 -0700 Subject: [PATCH 310/347] refactor --- python/pyspark/streaming/dstream.py | 25 ++++---- python/pyspark/streaming/tests.py | 9 +-- python/pyspark/streaming/util.py | 47 +++----------- .../streaming/api/python/PythonDStream.scala | 63 +++++-------------- 4 files changed, 40 insertions(+), 104 deletions(-) diff --git a/python/pyspark/streaming/dstream.py b/python/pyspark/streaming/dstream.py index 01ca56a7a0387..d41eca020feb1 100644 --- a/python/pyspark/streaming/dstream.py +++ b/python/pyspark/streaming/dstream.py @@ -21,7 +21,7 @@ from pyspark import RDD from pyspark.storagelevel import StorageLevel -from pyspark.streaming.util import rddToFileName, RDDFunction, RDDFunction2 +from pyspark.streaming.util import rddToFileName, RDDFunction from pyspark.rdd import portable_hash from pyspark.resultiterable import ResultIterable @@ -141,7 +141,7 @@ def foreachRDD(self, func): This is an output operator, so this DStream will be registered as an output stream and there materialized. 
""" - jfunc = RDDFunction(self.ctx, func, self._jrdd_deserializer) + jfunc = RDDFunction(self.ctx, lambda a, _, t: func(a, t), self._jrdd_deserializer) self.ctx._jvm.PythonForeachDStream(self._jdstream.dstream(), jfunc) def pprint(self): @@ -294,7 +294,7 @@ def transformWithTime(self, func): return TransformedDStream(self, func, False) def transformWith(self, func, other, keepSerializer=False): - jfunc = RDDFunction2(self.ctx, func, self._jrdd_deserializer) + jfunc = RDDFunction(self.ctx, lambda a, b, t: func(a, b), self._jrdd_deserializer) dstream = self.ctx._jvm.PythonTransformed2DStream(self._jdstream.dstream(), other._jdstream.dstream(), jfunc) jrdd_serializer = self._jrdd_deserializer if keepSerializer else self.ctx.serializer @@ -304,16 +304,16 @@ def repartitions(self, numPartitions): return self.transform(lambda rdd: rdd.repartition(numPartitions)) def union(self, other): - return self.transformWith(lambda a, b, t: a.union(b), other, True) + return self.transformWith(lambda a, b: a.union(b), other, True) def cogroup(self, other): - return self.transformWith(lambda a, b, t: a.cogroup(b), other) + return self.transformWith(lambda a, b: a.cogroup(b), other) def leftOuterJoin(self, other): - return self.transformWith(lambda a, b, t: a.leftOuterJion(b), other) + return self.transformWith(lambda a, b: a.leftOuterJion(b), other) def rightOuterJoin(self, other): - return self.transformWith(lambda a, b, t: a.rightOuterJoin(b), other) + return self.transformWith(lambda a, b: a.rightOuterJoin(b), other) def _jtime(self, milliseconds): return self.ctx._jvm.Time(milliseconds) @@ -364,8 +364,8 @@ def invReduceFunc(a, b, t): joined = a.leftOuterJoin(b, numPartitions) return joined.mapValues(lambda (v1, v2): invFunc(v1, v2) if v2 is not None else v1) - jreduceFunc = RDDFunction2(self.ctx, reduceFunc, reduced._jrdd_deserializer) - jinvReduceFunc = RDDFunction2(self.ctx, invReduceFunc, reduced._jrdd_deserializer) + jreduceFunc = RDDFunction(self.ctx, reduceFunc, reduced._jrdd_deserializer) + jinvReduceFunc = RDDFunction(self.ctx, invReduceFunc, reduced._jrdd_deserializer) dstream = self.ctx._jvm.PythonReducedWindowedDStream(reduced._jdstream.dstream(), jreduceFunc, jinvReduceFunc, self._ssc._jduration(windowDuration), @@ -384,8 +384,8 @@ def reduceFunc(a, b, t): (k, list(vb), list(va)[0] if len(va) else None)) return g.mapPartitions(lambda x: updateFunc(x) or []) - jreduceFunc = RDDFunction2(self.ctx, reduceFunc, - self.ctx.serializer, self._jrdd_deserializer) + jreduceFunc = RDDFunction(self.ctx, reduceFunc, + self.ctx.serializer, self._jrdd_deserializer) dstream = self.ctx._jvm.PythonStateDStream(self._jdstream.dstream(), jreduceFunc) return DStream(dstream.asJavaDStream(), self._ssc, self.ctx.serializer) @@ -417,7 +417,8 @@ def _jdstream(self): if self._jdstream_val is not None: return self._jdstream_val - jfunc = RDDFunction(self.ctx, self.func, self.prev._jrdd_deserializer) + func = self.func + jfunc = RDDFunction(self.ctx, lambda a, _, t: func(a, t), self.prev._jrdd_deserializer) jdstream = self.ctx._jvm.PythonTransformedDStream(self.prev._jdstream.dstream(), jfunc, self.reuse).asJavaDStream() self._jdstream_val = jdstream diff --git a/python/pyspark/streaming/tests.py b/python/pyspark/streaming/tests.py index 1684da580f973..06fcc29850504 100644 --- a/python/pyspark/streaming/tests.py +++ b/python/pyspark/streaming/tests.py @@ -346,17 +346,18 @@ def test_queueStream(self): result = dstream.collect() self.ssc.start() time.sleep(1) - self.assertEqual(input, result) + self.assertEqual(input, 
result[:3]) def test_union(self): input = [range(i) for i in range(3)] dstream = self.ssc.queueStream(input) - dstream2 = self.ssc.union(dstream, dstream) - result = dstream.collect() + dstream2 = self.ssc.queueStream(input) + dstream3 = self.ssc.union(dstream, dstream2) + result = dstream3.collect() self.ssc.start() time.sleep(1) expected = [i * 2 for i in input] - self.assertEqual(input, result) + self.assertEqual(expected, result[:3]) if __name__ == "__main__": diff --git a/python/pyspark/streaming/util.py b/python/pyspark/streaming/util.py index feff1b3889c49..02b51dc472c51 100644 --- a/python/pyspark/streaming/util.py +++ b/python/pyspark/streaming/util.py @@ -20,44 +20,13 @@ class RDDFunction(object): """ - This class is for py4j callback. This class is related with - org.apache.spark.streaming.api.python.PythonRDDFunction. + This class is for py4j callback. """ - def __init__(self, ctx, func, jrdd_deserializer): + def __init__(self, ctx, func, deserializer, deserializer2=None): self.ctx = ctx self.func = func - self.deserializer = jrdd_deserializer - - def call(self, jrdd, milliseconds): - try: - emptyRDD = getattr(self.ctx, "_emptyRDD", None) - if emptyRDD is None: - self.ctx._emptyRDD = emptyRDD = self.ctx.parallelize([]).cache() - rdd = RDD(jrdd, self.ctx, self.deserializer) if jrdd else emptyRDD - r = self.func(rdd, milliseconds) - if r: - return r._jrdd - except: - import traceback - traceback.print_exc() - - def __repr__(self): - return "RDDFunction(%s, %s)" % (str(self.deserializer), str(self.func)) - - class Java: - implements = ['org.apache.spark.streaming.api.python.PythonRDDFunction'] - - -class RDDFunction2(object): - """ - This class is for py4j callback. This class is related with - org.apache.spark.streaming.api.python.PythonRDDFunction2. 
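The refactor here collapses RDDFunction and RDDFunction2 into a single callback class with one three-argument call(rdd, rdd2, milliseconds) entry point; unary operations simply ignore the second RDD, mirroring the lambda wrappers used in dstream.py. A rough sketch of that adapter idea (the helper names are illustrative, not part of the patch):

def as_unary(func):
    # adapt a (rdd, time) function to the unified (rdd, other, time) shape;
    # the unused second RDD is ignored
    return lambda rdd, _other, t: func(rdd, t)

def as_binary(func):
    # adapt an (rdd, other) function, dropping the batch time
    return lambda rdd, other, _t: func(rdd, other)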
- """ - def __init__(self, ctx, func, jrdd_deserializer, jrdd_deserializer2=None): - self.ctx = ctx - self.func = func - self.jrdd_deserializer = jrdd_deserializer - self.jrdd_deserializer2 = jrdd_deserializer2 or jrdd_deserializer + self.deserializer = deserializer + self.deserializer2 = deserializer2 or deserializer def call(self, jrdd, jrdd2, milliseconds): try: @@ -65,12 +34,12 @@ def call(self, jrdd, jrdd2, milliseconds): if emptyRDD is None: self.ctx._emptyRDD = emptyRDD = self.ctx.parallelize([]).cache() - rdd = RDD(jrdd, self.ctx, self.jrdd_deserializer) if jrdd else emptyRDD - other = RDD(jrdd2, self.ctx, self.jrdd_deserializer2) if jrdd2 else emptyRDD + rdd = RDD(jrdd, self.ctx, self.deserializer) if jrdd else emptyRDD + other = RDD(jrdd2, self.ctx, self.deserializer2) if jrdd2 else emptyRDD r = self.func(rdd, other, milliseconds) if r: return r._jrdd - except: + except Exception: import traceback traceback.print_exc() @@ -78,7 +47,7 @@ def __repr__(self): return "RDDFunction2(%s)" % (str(self.func)) class Java: - implements = ['org.apache.spark.streaming.api.python.PythonRDDFunction2'] + implements = ['org.apache.spark.streaming.api.python.PythonRDDFunction'] def rddToFileName(prefix, suffix, time): diff --git a/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala index 828a620e4c08f..c0a1aa71840a5 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala @@ -19,8 +19,7 @@ package org.apache.spark.streaming.api.python import java.util.{ArrayList => JArrayList} -import org.apache.spark.Partitioner -import org.apache.spark.rdd.{CoGroupedRDD, UnionRDD, PartitionerAwareUnionRDD, RDD} +import org.apache.spark.rdd.RDD import org.apache.spark.api.java._ import org.apache.spark.api.python._ import org.apache.spark.storage.StorageLevel @@ -28,41 +27,14 @@ import org.apache.spark.streaming.{Interval, Duration, Time} import org.apache.spark.streaming.dstream._ import org.apache.spark.streaming.api.java._ -import scala.collection.mutable.ArrayBuffer -import scala.reflect.ClassTag - - -/** - * Interface for Python callback function with two arguments - */ -trait PythonRDDFunction { - def call(rdd: JavaRDD[_], time: Long): JavaRDD[Array[Byte]] -} - -class RDDFunction(pfunc: PythonRDDFunction) { - def apply(rdd: Option[RDD[_]], time: Time): Option[RDD[Array[Byte]]] = { - val jrdd = if (rdd.isDefined) { - JavaRDD.fromRDD(rdd.get) - } else { - null - } - val r = pfunc.call(jrdd, time.milliseconds) - if (r != null) { - Some(r.rdd) - } else { - None - } - } -} - /** * Interface for Python callback function with three arguments */ -trait PythonRDDFunction2 { +trait PythonRDDFunction { def call(rdd: JavaRDD[_], rdd2: JavaRDD[_], time: Long): JavaRDD[Array[Byte]] } -class RDDFunction2(pfunc: PythonRDDFunction2) { +class RDDFunction(pfunc: PythonRDDFunction) { def apply(rdd: Option[RDD[_]], rdd2: Option[RDD[_]], time: Time): Option[RDD[Array[Byte]]] = { val jrdd = if (rdd.isDefined) { JavaRDD.fromRDD(rdd.get) @@ -114,7 +86,7 @@ private[spark] class PythonTransformedDStream (parent: DStream[_], pfunc: Python if (reuse && lastResult != null) { Some(lastResult.copyTo(rdd1.get)) } else { - val r = func(rdd1, validTime) + val r = func(rdd1, None, validTime) if (reuse && r.isDefined && lastResult == null) { r.get match { case rdd: PythonRDD => @@ -137,10 +109,10 @@ 
private[spark] class PythonTransformedDStream (parent: DStream[_], pfunc: Python */ private[spark] class PythonTransformed2DStream(parent: DStream[_], parent2: DStream[_], - pfunc: PythonRDDFunction2) + pfunc: PythonRDDFunction) extends DStream[Array[Byte]] (parent.ssc) { - val func = new RDDFunction2(pfunc) + val func = new RDDFunction(pfunc) override def slideDuration: Duration = parent.slideDuration @@ -157,10 +129,10 @@ class PythonTransformed2DStream(parent: DStream[_], parent2: DStream[_], * similar to StateDStream */ private[spark] -class PythonStateDStream(parent: DStream[Array[Byte]], preduceFunc: PythonRDDFunction2) +class PythonStateDStream(parent: DStream[Array[Byte]], preduceFunc: PythonRDDFunction) extends PythonDStream(parent) { - val reduceFunc = new RDDFunction2(preduceFunc) + val reduceFunc = new RDDFunction(preduceFunc) super.persist(StorageLevel.MEMORY_ONLY) override val mustCheckpoint = true @@ -177,12 +149,12 @@ class PythonStateDStream(parent: DStream[Array[Byte]], preduceFunc: PythonRDDFun } /** - * Copied from ReducedWindowedDStream + * similar to ReducedWindowedDStream */ private[spark] class PythonReducedWindowedDStream(parent: DStream[Array[Byte]], - preduceFunc: PythonRDDFunction2, - pinvReduceFunc: PythonRDDFunction2, + preduceFunc: PythonRDDFunction, + pinvReduceFunc: PythonRDDFunction, _windowDuration: Duration, _slideDuration: Duration ) extends PythonStateDStream(parent, preduceFunc) { @@ -197,7 +169,7 @@ class PythonReducedWindowedDStream(parent: DStream[Array[Byte]], "must be multiple of the slide duration of parent DStream (" + parent.slideDuration + ")" ) - val invReduceFunc = new RDDFunction2(pinvReduceFunc) + val invReduceFunc = new RDDFunction(pinvReduceFunc) def windowDuration: Duration = _windowDuration override def slideDuration: Duration = _slideDuration @@ -209,12 +181,6 @@ class PythonReducedWindowedDStream(parent: DStream[Array[Byte]], currentTime) val previousWindow = currentWindow - slideDuration - logDebug("Window time = " + windowDuration) - logDebug("Slide time = " + slideDuration) - logDebug("ZeroTime = " + zeroTime) - logDebug("Current window = " + currentWindow) - logDebug("Previous window = " + previousWindow) - // _____________________________ // | previous window _________|___________________ // |___________________| current window | --------------> Time @@ -271,7 +237,7 @@ class PythonForeachDStream( prev, (rdd: RDD[Array[Byte]], time: Time) => { if (rdd != null) { - foreachFunction.call(rdd, time.milliseconds) + foreachFunction.call(rdd, null, time.milliseconds) } } ) { @@ -283,7 +249,6 @@ class PythonForeachDStream( /** * similar to QueueInputStream */ - class PythonDataInputStream( ssc_ : JavaStreamingContext, inputRDDs: JArrayList[JavaRDD[Array[Byte]]], @@ -294,7 +259,7 @@ class PythonDataInputStream( val emptyRDD = if (defaultRDD != null) { Some(defaultRDD.rdd) } else { - None // ssc.sparkContext.emptyRDD[Array[Byte]] + Some(ssc.sparkContext.emptyRDD[Array[Byte]]) } def start() {} From 7001b5136fdd462af33b62a132e87bf302911082 Mon Sep 17 00:00:00 2001 From: Davies Liu Date: Sat, 27 Sep 2014 21:50:58 -0700 Subject: [PATCH 311/347] refactor of queueStream() --- python/pyspark/streaming/context.py | 11 ++-- .../streaming/api/python/PythonDStream.scala | 55 ++++--------------- 2 files changed, 19 insertions(+), 47 deletions(-) diff --git a/python/pyspark/streaming/context.py b/python/pyspark/streaming/context.py index a647c9ec734df..00a1ec6f31fec 100644 --- a/python/pyspark/streaming/context.py +++ 
b/python/pyspark/streaming/context.py @@ -184,7 +184,7 @@ def _check_serialzers(self, rdds): # reset them to sc.serializer rdds[i] = rdds[i].map(lambda x: x, preservesPartitioning=True) - def queueStream(self, queue, oneAtATime=False, default=None): + def queueStream(self, queue, oneAtATime=True, default=None): """ Create an input stream from an queue of RDDs or list. In each batch, it will process either one or all of the RDDs returned by the queue. @@ -200,9 +200,12 @@ def queueStream(self, queue, oneAtATime=False, default=None): self._check_serialzers(rdds) jrdds = ListConverter().convert([r._jrdd for r in rdds], SparkContext._gateway._gateway_client) - jdstream = self._jvm.PythonDataInputStream(self._jssc, jrdds, oneAtATime, - default and default._jrdd) - return DStream(jdstream.asJavaDStream(), self, rdds[0]._jrdd_deserializer) + queue = self._jvm.PythonDStream.toRDDQueue(jrdds) + if default: + jdstream = self._jssc.queueStream(queue, oneAtATime, default._jrdd) + else: + jdstream = self._jssc.queueStream(queue, oneAtATime) + return DStream(jdstream, self, rdds[0]._jrdd_deserializer) def transform(self, dstreams, transformFunc): """ diff --git a/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala index c0a1aa71840a5..d7dd0a0c5c88b 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala @@ -18,6 +18,7 @@ package org.apache.spark.streaming.api.python import java.util.{ArrayList => JArrayList} +import scala.collection.JavaConversions._ import org.apache.spark.rdd.RDD import org.apache.spark.api.java._ @@ -65,6 +66,16 @@ abstract class PythonDStream(parent: DStream[_]) extends DStream[Array[Byte]] (p val asJavaDStream = JavaDStream.fromDStream(this) } +object PythonDStream { + + // convert list of RDD into queue of RDDs, for ssc.queueStream() + def toRDDQueue(rdds: JArrayList[JavaRDD[Array[Byte]]]): java.util.Queue[JavaRDD[Array[Byte]]] = { + val queue = new java.util.LinkedList[JavaRDD[Array[Byte]]] + rdds.forall(queue.add(_)) + queue + } +} + /** * Transformed DStream in Python. 
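With the refactor above, queueStream() builds a java.util.Queue via PythonDStream.toRDDQueue and hands it to the stock JavaStreamingContext.queueStream instead of a custom input stream. A usage sketch (app name, data and sleep time are illustrative):

import time
from pyspark import SparkContext
from pyspark.streaming.context import StreamingContext

sc = SparkContext("local[2]", "QueueSketch")
ssc = StreamingContext(sc, 0.5)
# with oneAtATime=True (now the default) each batch consumes one queued
# dataset; with oneAtATime=False a batch processes all of the queued RDDs;
# `default` supplies the RDD to fall back on once the queue runs dry
stream = ssc.queueStream([[1, 2, 3], [4, 5], [6]], oneAtATime=True)
result = stream.collect()
ssc.start()
time.sleep(2)
ssc.stop()
print result[:3]        # [[1, 2, 3], [4, 5], [6]]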
* @@ -243,46 +254,4 @@ class PythonForeachDStream( ) { this.register() -} - - -/** - * similar to QueueInputStream - */ -class PythonDataInputStream( - ssc_ : JavaStreamingContext, - inputRDDs: JArrayList[JavaRDD[Array[Byte]]], - oneAtAtime: Boolean, - defaultRDD: JavaRDD[Array[Byte]] - ) extends InputDStream[Array[Byte]](JavaStreamingContext.toStreamingContext(ssc_)) { - - val emptyRDD = if (defaultRDD != null) { - Some(defaultRDD.rdd) - } else { - Some(ssc.sparkContext.emptyRDD[Array[Byte]]) - } - - def start() {} - - def stop() {} - - def compute(validTime: Time): Option[RDD[Array[Byte]]] = { - val index = ((validTime - zeroTime) / slideDuration - 1).toInt - if (oneAtAtime) { - if (index == 0) { - val rdds = inputRDDs.toArray.map(_.asInstanceOf[JavaRDD[Array[Byte]]].rdd).toSeq - Some(ssc.sparkContext.union(rdds)) - } else { - emptyRDD - } - } else { - if (index < inputRDDs.size()) { - Some(inputRDDs.get(index).rdd) - } else { - emptyRDD - } - } - } - - val asJavaDStream = JavaDStream.fromDStream(this) -} +} \ No newline at end of file From fce0ef5ffdf7d43052978a35b238bbc4ee434cc0 Mon Sep 17 00:00:00 2001 From: Davies Liu Date: Sat, 27 Sep 2014 22:41:04 -0700 Subject: [PATCH 312/347] rafactor of foreachRDD() --- python/pyspark/streaming/dstream.py | 3 +- .../streaming/api/python/PythonDStream.scala | 55 ++++++++----------- 2 files changed, 26 insertions(+), 32 deletions(-) diff --git a/python/pyspark/streaming/dstream.py b/python/pyspark/streaming/dstream.py index d41eca020feb1..8a9e2dab7fb07 100644 --- a/python/pyspark/streaming/dstream.py +++ b/python/pyspark/streaming/dstream.py @@ -142,7 +142,8 @@ def foreachRDD(self, func): stream and there materialized. """ jfunc = RDDFunction(self.ctx, lambda a, _, t: func(a, t), self._jrdd_deserializer) - self.ctx._jvm.PythonForeachDStream(self._jdstream.dstream(), jfunc) + api = self._ssc._jvm.PythonDStream + api.callForeachRDD(self._jdstream, jfunc) def pprint(self): """ diff --git a/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala index d7dd0a0c5c88b..66cf0c968478c 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala @@ -20,9 +20,10 @@ package org.apache.spark.streaming.api.python import java.util.{ArrayList => JArrayList} import scala.collection.JavaConversions._ -import org.apache.spark.rdd.RDD import org.apache.spark.api.java._ +import org.apache.spark.api.java.function.{Function2 => JFunction2} import org.apache.spark.api.python._ +import org.apache.spark.rdd.RDD import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.{Interval, Duration, Time} import org.apache.spark.streaming.dstream._ @@ -35,19 +36,22 @@ trait PythonRDDFunction { def call(rdd: JavaRDD[_], rdd2: JavaRDD[_], time: Long): JavaRDD[Array[Byte]] } -class RDDFunction(pfunc: PythonRDDFunction) { - def apply(rdd: Option[RDD[_]], rdd2: Option[RDD[_]], time: Time): Option[RDD[Array[Byte]]] = { - val jrdd = if (rdd.isDefined) { +class RDDFunction(pfunc: PythonRDDFunction) extends Serializable { + + def apply(rdd: Option[RDD[_]], time: Time): Option[RDD[Array[Byte]]] = { + apply(rdd, None, time) + } + + def wrapRDD(rdd: Option[RDD[_]]): JavaRDD[_] = { + if (rdd.isDefined) { JavaRDD.fromRDD(rdd.get) } else { null } - val jrdd2 = if (rdd2.isDefined) { - JavaRDD.fromRDD(rdd2.get) - } else { - null - } - val r = 
pfunc.call(jrdd, jrdd2, time.milliseconds) + } + + def apply(rdd: Option[RDD[_]], rdd2: Option[RDD[_]], time: Time): Option[RDD[Array[Byte]]] = { + val r = pfunc.call(wrapRDD(rdd), wrapRDD(rdd2), time.milliseconds) if (r != null) { Some(r.rdd) } else { @@ -66,7 +70,13 @@ abstract class PythonDStream(parent: DStream[_]) extends DStream[Array[Byte]] (p val asJavaDStream = JavaDStream.fromDStream(this) } -object PythonDStream { +private[spark] object PythonDStream { + + // helper function for DStream.foreachRDD(), + // cannot be `foreachRDD`, it will confusing py4j + def callForeachRDD(jdstream: JavaDStream[Array[Byte]], pyfunc: PythonRDDFunction): Unit = { + jdstream.dstream.foreachRDD((rdd, time) => pyfunc.call(rdd, null, time.milliseconds)) + } // convert list of RDD into queue of RDDs, for ssc.queueStream() def toRDDQueue(rdds: JArrayList[JavaRDD[Array[Byte]]]): java.util.Queue[JavaRDD[Array[Byte]]] = { @@ -97,7 +107,7 @@ private[spark] class PythonTransformedDStream (parent: DStream[_], pfunc: Python if (reuse && lastResult != null) { Some(lastResult.copyTo(rdd1.get)) } else { - val r = func(rdd1, None, validTime) + val r = func(rdd1, validTime) if (reuse && r.isDefined && lastResult == null) { r.get match { case rdd: PythonRDD => @@ -206,8 +216,9 @@ class PythonReducedWindowedDStream(parent: DStream[Array[Byte]], // Get the RDD of the reduced value of the previous window val previousWindowRDD = getOrCompute(previousWindow.endTime) + // for small window, reduce once will be better than twice if (windowDuration > slideDuration * 5 && previousWindowRDD.isDefined) { - // subtle the values from old RDDs + // subtract the values from old RDDs val oldRDDs = parent.slice(previousWindow.beginTime, currentWindow.beginTime - parent.slideDuration) val subbed = if (oldRDDs.size > 0) { @@ -236,22 +247,4 @@ class PythonReducedWindowedDStream(parent: DStream[Array[Byte]], } } } -} - -/** - * This is used for foreachRDD() in Python - */ -class PythonForeachDStream( - prev: DStream[Array[Byte]], - foreachFunction: PythonRDDFunction - ) extends ForEachDStream[Array[Byte]]( - prev, - (rdd: RDD[Array[Byte]], time: Time) => { - if (rdd != null) { - foreachFunction.call(rdd, null, time.milliseconds) - } - } - ) { - - this.register() } \ No newline at end of file From e059ca224d99b017355f62c157f7a71d9f3ec260 Mon Sep 17 00:00:00 2001 From: Davies Liu Date: Sat, 27 Sep 2014 22:58:29 -0700 Subject: [PATCH 313/347] move check of window into Python --- python/pyspark/streaming/dstream.py | 9 +++++++++ python/pyspark/streaming/tests.py | 6 ++++++ .../spark/streaming/api/python/PythonDStream.scala | 13 ++----------- 3 files changed, 17 insertions(+), 11 deletions(-) diff --git a/python/pyspark/streaming/dstream.py b/python/pyspark/streaming/dstream.py index 8a9e2dab7fb07..ffcf70cc854ab 100644 --- a/python/pyspark/streaming/dstream.py +++ b/python/pyspark/streaming/dstream.py @@ -354,6 +354,15 @@ def groupByKeyAndWindow(self, windowDuration, slideDuration, numPartitions=None) def reduceByKeyAndWindow(self, func, invFunc, windowDuration, slideDuration, numPartitions=None): + + duration = self._jdstream.dstream().slideDuration().milliseconds() + if int(windowDuration * 1000) % duration != 0: + raise ValueError("windowDuration must be multiple of the slide duration (%d ms)" + % duration) + if int(slideDuration * 1000) % duration != 0: + raise ValueError("slideDuration must be multiple of the slide duration (%d ms)" + % duration) + reduced = self.reduceByKey(func) def reduceFunc(a, b, t): diff --git 
a/python/pyspark/streaming/tests.py b/python/pyspark/streaming/tests.py index 06fcc29850504..843d6ee04ca33 100644 --- a/python/pyspark/streaming/tests.py +++ b/python/pyspark/streaming/tests.py @@ -292,6 +292,12 @@ def func(dstream): [('a', [2, 3, 4])], [('a', [3, 4])], [('a', [4])]] self._test_func(input, func, expected) + def test_reduce_by_invalid_window(self): + input1 = [range(3), range(5), range(1), range(6)] + d1 = self.ssc.queueStream(input1) + self.assertRaises(ValueError, lambda: d1.reduceByKeyAndWindow(None, None, 0.1, 0.1)) + self.assertRaises(ValueError, lambda: d1.reduceByKeyAndWindow(None, None, 1, 0.1)) + def update_state_by_key(self): def updater(it): diff --git a/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala index 66cf0c968478c..47c3974b61699 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala @@ -92,7 +92,8 @@ private[spark] object PythonDStream { * If the result RDD is PythonRDD, then it will cache it as an template for future use, * this can reduce the Python callbacks. */ -private[spark] class PythonTransformedDStream (parent: DStream[_], pfunc: PythonRDDFunction, +private[spark] +class PythonTransformedDStream (parent: DStream[_], pfunc: PythonRDDFunction, var reuse: Boolean = false) extends PythonDStream(parent) { @@ -180,16 +181,6 @@ class PythonReducedWindowedDStream(parent: DStream[Array[Byte]], _slideDuration: Duration ) extends PythonStateDStream(parent, preduceFunc) { - assert(_windowDuration.isMultipleOf(parent.slideDuration), - "The window duration of ReducedWindowedDStream (" + _windowDuration + ") " + - "must be multiple of the slide duration of parent DStream (" + parent.slideDuration + ")" - ) - - assert(_slideDuration.isMultipleOf(parent.slideDuration), - "The slide duration of ReducedWindowedDStream (" + _slideDuration + ") " + - "must be multiple of the slide duration of parent DStream (" + parent.slideDuration + ")" - ) - val invReduceFunc = new RDDFunction(pinvReduceFunc) def windowDuration: Duration = _windowDuration From 847f9b9faba9f9e6af20c9f5e72e68bc9eb52f4d Mon Sep 17 00:00:00 2001 From: Davies Liu Date: Sun, 28 Sep 2014 00:20:34 -0700 Subject: [PATCH 314/347] add more docs, add first(), take() --- python/pyspark/streaming/context.py | 3 + python/pyspark/streaming/dstream.py | 243 ++++++++++++++++-- python/pyspark/streaming/tests.py | 15 ++ .../streaming/api/python/PythonDStream.scala | 8 +- 4 files changed, 243 insertions(+), 26 deletions(-) diff --git a/python/pyspark/streaming/context.py b/python/pyspark/streaming/context.py index 00a1ec6f31fec..7879d1b7679d9 100644 --- a/python/pyspark/streaming/context.py +++ b/python/pyspark/streaming/context.py @@ -96,6 +96,9 @@ def _initialize_context(self, sc, duration): return self._jvm.JavaStreamingContext(sc._jsc, self._jduration(duration)) def _jduration(self, seconds): + """ + Create Duration object given number of seconds + """ return self._jvm.Duration(int(seconds * 1000)) @property diff --git a/python/pyspark/streaming/dstream.py b/python/pyspark/streaming/dstream.py index ffcf70cc854ab..acd9f27c46cbe 100644 --- a/python/pyspark/streaming/dstream.py +++ b/python/pyspark/streaming/dstream.py @@ -17,6 +17,7 @@ from itertools import chain, ifilter, imap import operator +import time from datetime import datetime from pyspark import RDD @@ -163,6 
+164,29 @@ def takeAndPrint(rdd, time): self.foreachRDD(takeAndPrint) + def first(self): + """ + Return the first RDD in the stream. + """ + return self.take(1)[0] + + def take(self, n): + """ + Return the first `n` RDDs in the stream (will start and stop). + """ + rdds = [] + + def take(rdd, _): + if rdd: + rdds.append(rdd) + if len(rdds) == n: + # FIXME: NPE in JVM + self._ssc.stop(False) + self.foreachRDD(take) + self._ssc.start() + self._ssc.awaitTermination() + return rdds + def collect(self): """ Collect each RDDs into the returned list. @@ -289,12 +313,24 @@ def saveAsPickleFile(rdd, time): return self.foreachRDD(saveAsPickleFile) def transform(self, func): + """ + Return a new DStream in which each RDD is generated by applying a function + on each RDD of 'this' DStream. + """ return TransformedDStream(self, lambda a, t: func(a), True) def transformWithTime(self, func): + """ + Return a new DStream in which each RDD is generated by applying a function + on each RDD of 'this' DStream. + """ return TransformedDStream(self, func, False) def transformWith(self, func, other, keepSerializer=False): + """ + Return a new DStream in which each RDD is generated by applying a function + on each RDD of 'this' DStream and 'other' DStream. + """ jfunc = RDDFunction(self.ctx, lambda a, b, t: func(a, b), self._jrdd_deserializer) dstream = self.ctx._jvm.PythonTransformed2DStream(self._jdstream.dstream(), other._jdstream.dstream(), jfunc) @@ -302,28 +338,114 @@ def transformWith(self, func, other, keepSerializer=False): return DStream(dstream.asJavaDStream(), self._ssc, jrdd_serializer) def repartitions(self, numPartitions): + """ + Return a new DStream with an increased or decreased level of parallelism. Each RDD in the + returned DStream has exactly numPartitions partitions. + """ return self.transform(lambda rdd: rdd.repartition(numPartitions)) + @property + def _slideDuration(self): + """ + Return the slideDuration in seconds of this DStream + """ + return self._jdstream.dstream().slideDuration().milliseconds() / 1000.0 + def union(self, other): + """ + Return a new DStream by unifying data of another DStream with this DStream. + @param other Another DStream having the same interval (i.e., slideDuration) as this DStream. + """ + if self._slideDuration != other._slideDuration: + raise ValueError("the two DStream should have same slide duration") return self.transformWith(lambda a, b: a.union(b), other, True) - def cogroup(self, other): - return self.transformWith(lambda a, b: a.cogroup(b), other) + def cogroup(self, other, numPartitions=None): + """ + Return a new DStream by applying 'cogroup' between RDDs of `this` + DStream and `other` DStream. + + Hash partitioning is used to generate the RDDs with `numPartitions` partitions. + """ + return self.transformWith(lambda a, b: a.cogroup(b, numPartitions), other) + + def join(self, other, numPartitions=None): + """ + Return a new DStream by applying 'join' between RDDs of `this` DStream and + `other` DStream. + + Hash partitioning is used to generate the RDDs with `numPartitions` + partitions. + """ + return self.transformWith(lambda a, b: a.join(b, numPartitions), other) + + def leftOuterJoin(self, other, numPartitions=None): + """ + Return a new DStream by applying 'left outer join' between RDDs of `this` DStream and + `other` DStream. - def leftOuterJoin(self, other): - return self.transformWith(lambda a, b: a.leftOuterJion(b), other) + Hash partitioning is used to generate the RDDs with `numPartitions` + partitions. 
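The stream-stream operations added here (union, cogroup, join and the outer joins) all go through transformWith, which pairs up the two streams' RDDs batch by batch. A small sketch combining them with the new first()/take() helpers; note that first() starts and stops the streaming context itself, so there is no explicit ssc.start() (names and data are illustrative):

from pyspark import SparkContext
from pyspark.streaming.context import StreamingContext

sc = SparkContext("local[2]", "JoinSketch")
ssc = StreamingContext(sc, 0.5)
left = ssc.queueStream([[("a", 1), ("b", 2)]])
right = ssc.queueStream([[("a", "x"), ("c", "y")]])
# per-batch inner join on the key; both streams must share a slide duration
first_batch = left.join(right).first()
print first_batch.collect()     # [('a', (1, 'x'))]
sc.stop()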
+ """ + return self.transformWith(lambda a, b: a.leftOuterJion(b, numPartitions), other) + + def rightOuterJoin(self, other, numPartitions=None): + """ + Return a new DStream by applying 'right outer join' between RDDs of `this` DStream and + `other` DStream. + + Hash partitioning is used to generate the RDDs with `numPartitions` + partitions. + """ + return self.transformWith(lambda a, b: a.rightOuterJoin(b, numPartitions), other) + + def fullOuterJoin(self, other, numPartitions=None): + """ + Return a new DStream by applying 'full outer join' between RDDs of `this` DStream and + `other` DStream. - def rightOuterJoin(self, other): - return self.transformWith(lambda a, b: a.rightOuterJoin(b), other) + Hash partitioning is used to generate the RDDs with `numPartitions` + partitions. + """ + return self.transformWith(lambda a, b: a.fullOuterJoin(b, numPartitions), other) - def _jtime(self, milliseconds): - return self.ctx._jvm.Time(milliseconds) + def _jtime(self, timestamp): + """ convert datetime or unix_timestamp into Time + """ + if isinstance(timestamp, datetime): + timestamp = time.mktime(timestamp.timetuple()) + return self.ctx._jvm.Time(long(timestamp * 1000)) def slice(self, begin, end): + """ + Return all the RDDs between 'begin' to 'end' (both included) + + `begin`, `end` could be datetime.datetime() or unix_timestamp + """ jrdds = self._jdstream.slice(self._jtime(begin), self._jtime(end)) return [RDD(jrdd, self.ctx, self._jrdd_deserializer) for jrdd in jrdds] + def _check_window(self, window, slide): + duration = self._jdstream.dstream().slideDuration().milliseconds() + if int(window * 1000) % duration != 0: + raise ValueError("windowDuration must be multiple of the slide duration (%d ms)" + % duration) + if slide and int(slide * 1000) % duration != 0: + raise ValueError("slideDuration must be multiple of the slide duration (%d ms)" + % duration) + def window(self, windowDuration, slideDuration=None): + """ + Return a new DStream in which each RDD contains all the elements in seen in a + sliding window of time over this DStream. + + @param windowDuration width of the window; must be a multiple of this DStream's + batching interval + @param slideDuration sliding interval of the window (i.e., the interval after which + the new DStream will generate RDDs); must be a multiple of this + DStream's batching interval + """ + self._check_window(windowDuration, slideDuration) d = self._ssc._jduration(windowDuration) if slideDuration is None: return DStream(self._jdstream.window(d), self._ssc, self._jrdd_deserializer) @@ -331,43 +453,108 @@ def window(self, windowDuration, slideDuration=None): return DStream(self._jdstream.window(d, s), self._ssc, self._jrdd_deserializer) def reduceByWindow(self, reduceFunc, invReduceFunc, windowDuration, slideDuration): + """ + Return a new DStream in which each RDD has a single element generated by reducing all + elements in a sliding window over this DStream. + + if `invReduceFunc` is not None, the reduction is done incrementally + using the old window's reduced value : + 1. reduce the new values that entered the window (e.g., adding new counts) + 2. "inverse reduce" the old values that left the window (e.g., subtracting old counts) + This is more efficient than `invReduceFunc` is None. 
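A usage sketch of the incremental windowing described above: counts are added for records entering the window and subtracted for records leaving it, and both the window and slide widths must be multiples of the batch interval. The checkpoint directory, names and timings are illustrative and mirror the test setup:

import time
from pyspark import SparkContext
from pyspark.streaming.context import StreamingContext

sc = SparkContext("local[2]", "WindowSketch")
sc.setCheckpointDir("/tmp")        # the tests set a checkpoint dir for windowed state
ssc = StreamingContext(sc, 0.5)
words = ssc.queueStream([["a", "b"], ["a"], ["b", "b"]])
pairs = words.map(lambda w: (w, 1))
# 1-second window sliding every 0.5 seconds over 0.5-second batches
counts = pairs.reduceByKeyAndWindow(lambda a, b: a + b, lambda a, b: a - b, 1, 0.5)
result = counts.collect()
ssc.start()
time.sleep(3)
ssc.stop()
print result            # one list of (word, count) pairs per window slide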
+ + @param reduceFunc associative reduce function + @param invReduceFunc inverse reduce function of `reduceFunc` + @param windowDuration width of the window; must be a multiple of this DStream's + batching interval + @param slideDuration sliding interval of the window (i.e., the interval after which + the new DStream will generate RDDs); must be a multiple of this + DStream's batching interval + """ keyed = self.map(lambda x: (1, x)) reduced = keyed.reduceByKeyAndWindow(reduceFunc, invReduceFunc, windowDuration, slideDuration, 1) return reduced.map(lambda (k, v): v) def countByWindow(self, windowDuration, slideDuration): + """ + Return a new DStream in which each RDD has a single element generated + by counting the number of elements in a window over this DStream. + windowDuration and slideDuration are as defined in the window() operation. + + This is equivalent to window(windowDuration, slideDuration).count(), + but will be more efficient if window is large. + """ return self.map(lambda x: 1).reduceByWindow(operator.add, operator.sub, windowDuration, slideDuration) def countByValueAndWindow(self, windowDuration, slideDuration, numPartitions=None): + """ + Return a new DStream in which each RDD contains the count of distinct elements in + RDDs in a sliding window over this DStream. + + @param windowDuration width of the window; must be a multiple of this DStream's + batching interval + @param slideDuration sliding interval of the window (i.e., the interval after which + the new DStream will generate RDDs); must be a multiple of this + DStream's batching interval + @param numPartitions number of partitions of each RDD in the new DStream. + """ keyed = self.map(lambda x: (x, 1)) counted = keyed.reduceByKeyAndWindow(lambda a, b: a + b, lambda a, b: a - b, windowDuration, slideDuration, numPartitions) return counted.filter(lambda (k, v): v > 0).count() def groupByKeyAndWindow(self, windowDuration, slideDuration, numPartitions=None): + """ + Return a new DStream by applying `groupByKey` over a sliding window. + Similar to `DStream.groupByKey()`, but applies it over a sliding window. + + @param windowDuration width of the window; must be a multiple of this DStream's + batching interval + @param slideDuration sliding interval of the window (i.e., the interval after which + the new DStream will generate RDDs); must be a multiple of this + DStream's batching interval + @param numPartitions Number of partitions of each RDD in the new DStream. + """ ls = self.mapValues(lambda x: [x]) grouped = ls.reduceByKeyAndWindow(lambda a, b: a.extend(b) or a, lambda a, b: a[len(b):], windowDuration, slideDuration, numPartitions) return grouped.mapValues(ResultIterable) - def reduceByKeyAndWindow(self, func, invFunc, - windowDuration, slideDuration, numPartitions=None): + def reduceByKeyAndWindow(self, func, invFunc, windowDuration, slideDuration=None, + numPartitions=None, filterFunc=None): + """ + Return a new DStream by applying incremental `reduceByKey` over a sliding window. + + The reduced value of over a new window is calculated using the old window's reduce value : + 1. reduce the new values that entered the window (e.g., adding new counts) + 2. 
"inverse reduce" the old values that left the window (e.g., subtracting old counts) - duration = self._jdstream.dstream().slideDuration().milliseconds() - if int(windowDuration * 1000) % duration != 0: - raise ValueError("windowDuration must be multiple of the slide duration (%d ms)" - % duration) - if int(slideDuration * 1000) % duration != 0: - raise ValueError("slideDuration must be multiple of the slide duration (%d ms)" - % duration) + `invFunc` can be None, then it will reduce all the RDDs in window, could be slower + than having `invFunc`. + @param reduceFunc associative reduce function + @param invReduceFunc inverse function of `reduceFunc` + @param windowDuration width of the window; must be a multiple of this DStream's + batching interval + @param slideDuration sliding interval of the window (i.e., the interval after which + the new DStream will generate RDDs); must be a multiple of this + DStream's batching interval + @param numPartitions number of partitions of each RDD in the new DStream. + @param filterFunc function to filter expired key-value pairs; + only pairs that satisfy the function are retained + set this to null if you do not want to filter + """ + self._check_window(windowDuration, slideDuration) reduced = self.reduceByKey(func) def reduceFunc(a, b, t): b = b.reduceByKey(func, numPartitions) - return a.union(b).reduceByKey(func, numPartitions) if a else b + r = a.union(b).reduceByKey(func, numPartitions) if a else b + if filterFunc: + r = r.filter(filterFunc) + return r def invReduceFunc(a, b, t): b = b.reduceByKey(func, numPartitions) @@ -375,7 +562,12 @@ def invReduceFunc(a, b, t): return joined.mapValues(lambda (v1, v2): invFunc(v1, v2) if v2 is not None else v1) jreduceFunc = RDDFunction(self.ctx, reduceFunc, reduced._jrdd_deserializer) - jinvReduceFunc = RDDFunction(self.ctx, invReduceFunc, reduced._jrdd_deserializer) + if invReduceFunc: + jinvReduceFunc = RDDFunction(self.ctx, invReduceFunc, reduced._jrdd_deserializer) + else: + jinvReduceFunc = None + if slideDuration is None: + slideDuration = self._slideDuration dstream = self.ctx._jvm.PythonReducedWindowedDStream(reduced._jdstream.dstream(), jreduceFunc, jinvReduceFunc, self._ssc._jduration(windowDuration), @@ -384,15 +576,20 @@ def invReduceFunc(a, b, t): def updateStateByKey(self, updateFunc, numPartitions=None): """ - :param updateFunc: [(k, vs, s)] -> [(k, s)] + Return a new "state" DStream where the state for each key is updated by applying + the given function on the previous state of the key and the new values of the key. + + @param updateFunc State update function ([(k, vs, s)] -> [(k, s)]). + If `s` is None, then `k` will be eliminated. 
""" def reduceFunc(a, b, t): if a is None: g = b.groupByKey(numPartitions).map(lambda (k, vs): (k, list(vs), None)) else: - g = a.cogroup(b).map(lambda (k, (va, vb)): - (k, list(vb), list(va)[0] if len(va) else None)) - return g.mapPartitions(lambda x: updateFunc(x) or []) + g = a.cogroup(b, numPartitions) + g = g.map(lambda (k, (va, vb)): (k, list(vb), list(va)[0] if len(va) else None)) + state = g.mapPartitions(lambda x: updateFunc(x)) + return state.filter(lambda (k, v): v is not None) jreduceFunc = RDDFunction(self.ctx, reduceFunc, self.ctx.serializer, self._jrdd_deserializer) diff --git a/python/pyspark/streaming/tests.py b/python/pyspark/streaming/tests.py index 843d6ee04ca33..0ef205754bb58 100644 --- a/python/pyspark/streaming/tests.py +++ b/python/pyspark/streaming/tests.py @@ -89,6 +89,21 @@ def _sort_result_based_on_key(self, outputs): class TestBasicOperations(PySparkStreamingTestCase): + + def test_take(self): + input = [range(i) for i in range(3)] + dstream = self.ssc.queueStream(input) + rdds = dstream.take(3) + self.assertEqual(3, len(rdds)) + for d, rdd in zip(input, rdds): + self.assertEqual(d, rdd.collect()) + + def test_first(self): + input = [range(10)] + dstream = self.ssc.queueStream(input) + rdd = dstream.first() + self.assertEqual(range(10), rdd.collect()) + def test_map(self): """Basic operation test for DStream.map.""" input = [range(1, 5), range(5, 9), range(9, 13)] diff --git a/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala index 47c3974b61699..16ac1b93b5f22 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala @@ -207,8 +207,10 @@ class PythonReducedWindowedDStream(parent: DStream[Array[Byte]], // Get the RDD of the reduced value of the previous window val previousWindowRDD = getOrCompute(previousWindow.endTime) - // for small window, reduce once will be better than twice - if (windowDuration > slideDuration * 5 && previousWindowRDD.isDefined) { + if (pinvReduceFunc != null && previousWindowRDD.isDefined + // for small window, reduce once will be better than twice + && windowDuration > slideDuration * 5) { + // subtract the values from old RDDs val oldRDDs = parent.slice(previousWindow.beginTime, currentWindow.beginTime - parent.slideDuration) @@ -238,4 +240,4 @@ class PythonReducedWindowedDStream(parent: DStream[Array[Byte]], } } } -} \ No newline at end of file +} From b983f0fed06bcbd6e740fbf86af6eb8881e9f3fd Mon Sep 17 00:00:00 2001 From: Davies Liu Date: Sun, 28 Sep 2014 23:09:26 -0700 Subject: [PATCH 315/347] address comments --- bin/pyspark | 6 +- .../apache/spark/api/python/PythonRDD.scala | 2 +- .../python/streaming/network_wordcount.py | 2 +- .../src/main/python/streaming/wordcount.py | 2 +- python/pyspark/accumulators.py | 5 ++ python/pyspark/serializers.py | 5 ++ python/pyspark/streaming/dstream.py | 2 +- python/pyspark/streaming/tests.py | 6 -- python/pyspark/streaming/util.py | 5 ++ python/run-tests | 79 ++++++++++--------- 10 files changed, 61 insertions(+), 53 deletions(-) diff --git a/bin/pyspark b/bin/pyspark index 5142411e36974..118e6851af7a0 100755 --- a/bin/pyspark +++ b/bin/pyspark @@ -87,11 +87,7 @@ export PYSPARK_SUBMIT_ARGS if [[ -n "$SPARK_TESTING" ]]; then unset YARN_CONF_DIR unset HADOOP_CONF_DIR - if [[ -n "$PYSPARK_DOC_TEST" ]]; then - exec "$PYSPARK_PYTHON" -m doctest $1 - else - exec 
"$PYSPARK_PYTHON" $1 - fi + exec "$PYSPARK_PYTHON" $1 exit fi diff --git a/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala b/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala index 034a90110af76..19cdbe679fd35 100644 --- a/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala +++ b/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala @@ -293,7 +293,7 @@ private class PythonException(msg: String, cause: Exception) extends RuntimeExce * Form an RDD[(Array[Byte], Array[Byte])] from key-value pairs returned from Python. * This is used by PySpark's shuffle operations. */ -private[spark] class PairwiseRDD(prev: RDD[Array[Byte]]) extends +private class PairwiseRDD(prev: RDD[Array[Byte]]) extends RDD[(Long, Array[Byte])](prev) { override def getPartitions = prev.partitions override def compute(split: Partition, context: TaskContext) = diff --git a/examples/src/main/python/streaming/network_wordcount.py b/examples/src/main/python/streaming/network_wordcount.py index 633e63172bad6..e3b6248c82a12 100644 --- a/examples/src/main/python/streaming/network_wordcount.py +++ b/examples/src/main/python/streaming/network_wordcount.py @@ -14,7 +14,7 @@ counts = lines.flatMap(lambda line: line.split(" "))\ .map(lambda word: (word, 1))\ .reduceByKey(lambda a, b: a+b) - counts.pyprint() + counts.pprint() ssc.start() ssc.awaitTermination() diff --git a/examples/src/main/python/streaming/wordcount.py b/examples/src/main/python/streaming/wordcount.py index c794711845af0..8c08ff0c89850 100644 --- a/examples/src/main/python/streaming/wordcount.py +++ b/examples/src/main/python/streaming/wordcount.py @@ -15,7 +15,7 @@ counts = lines.flatMap(lambda line: line.split(" "))\ .map(lambda x: (x, 1))\ .reduceByKey(lambda a, b: a+b) - counts.pyprint() + counts.pprint() ssc.start() ssc.awaitTermination() diff --git a/python/pyspark/accumulators.py b/python/pyspark/accumulators.py index ccbca67656c8d..9aa3db7ccf1dd 100644 --- a/python/pyspark/accumulators.py +++ b/python/pyspark/accumulators.py @@ -256,3 +256,8 @@ def _start_update_server(): thread.daemon = True thread.start() return server + + +if __name__ == "__main__": + import doctest + doctest.testmod() diff --git a/python/pyspark/serializers.py b/python/pyspark/serializers.py index 94bebc310bad6..e666dd9800256 100644 --- a/python/pyspark/serializers.py +++ b/python/pyspark/serializers.py @@ -526,3 +526,8 @@ def write_int(value, stream): def write_with_length(obj, stream): write_int(len(obj), stream) stream.write(obj) + + +if __name__ == "__main__": + import doctest + doctest.testmod() diff --git a/python/pyspark/streaming/dstream.py b/python/pyspark/streaming/dstream.py index acd9f27c46cbe..2653e75ccbc54 100644 --- a/python/pyspark/streaming/dstream.py +++ b/python/pyspark/streaming/dstream.py @@ -410,7 +410,7 @@ def fullOuterJoin(self, other, numPartitions=None): return self.transformWith(lambda a, b: a.fullOuterJoin(b, numPartitions), other) def _jtime(self, timestamp): - """ convert datetime or unix_timestamp into Time + """ Convert datetime or unix_timestamp into Time """ if isinstance(timestamp, datetime): timestamp = time.mktime(timestamp.timetuple()) diff --git a/python/pyspark/streaming/tests.py b/python/pyspark/streaming/tests.py index 0ef205754bb58..c547971cd7741 100644 --- a/python/pyspark/streaming/tests.py +++ b/python/pyspark/streaming/tests.py @@ -29,7 +29,6 @@ from pyspark.context import SparkContext from pyspark.streaming.context import StreamingContext -from pyspark.streaming.duration import Seconds class 
PySparkStreamingTestCase(unittest.TestCase): @@ -46,11 +45,6 @@ def setUp(self): def tearDown(self): self.ssc.stop() - @classmethod - def tearDownClass(cls): - # Make sure tp shutdown the callback server - SparkContext._gateway._shutdown_callback_server() - def _test_func(self, input, func, expected, sort=False): """ @param input: dataset for the test. This should be list of lists. diff --git a/python/pyspark/streaming/util.py b/python/pyspark/streaming/util.py index 02b51dc472c51..885411ed63936 100644 --- a/python/pyspark/streaming/util.py +++ b/python/pyspark/streaming/util.py @@ -64,3 +64,8 @@ def rddToFileName(prefix, suffix, time): return prefix + "-" + str(time) else: return prefix + "-" + str(time) + "." + suffix + + +if __name__ == "__main__": + import doctest + doctest.testmod() diff --git a/python/run-tests b/python/run-tests index 5aa9212c8adc1..e8796838c22c1 100755 --- a/python/run-tests +++ b/python/run-tests @@ -48,6 +48,39 @@ function run_test() { fi } +function run_core_tests() { + run_test "pyspark/conf.py" + run_test "pyspark/context.py" + run_test "pyspark/broadcast.py" + run_test "pyspark/accumulators.py" + run_test "pyspark/serializers.py" + run_test "pyspark/shuffle.py" + run_test "pyspark/rdd.py" + run_test "pyspark/tests.py" +} + +function run_sql_tests() { + run_test "pyspark/sql.py" +} + +function run_mllib_tests() { + run_test "pyspark/mllib/util.py" + run_test "pyspark/mllib/linalg.py" + run_test "pyspark/mllib/classification.py" + run_test "pyspark/mllib/clustering.py" + run_test "pyspark/mllib/random.py" + run_test "pyspark/mllib/recommendation.py" + run_test "pyspark/mllib/regression.py" + run_test "pyspark/mllib/stat.py" + run_test "pyspark/mllib/tree.py" + run_test "pyspark/mllib/tests.py" +} + +function run_streaming_tests() { + run_test "pyspark/streaming/util.py" + run_test "pyspark/streaming/tests.py" +} + echo "Running PySpark tests. Output is in python/unit-tests.log." export PYSPARK_PYTHON="python" @@ -60,30 +93,10 @@ fi echo "Testing with Python version:" $PYSPARK_PYTHON --version -run_test "pyspark/rdd.py" -run_test "pyspark/context.py" -run_test "pyspark/conf.py" -run_test "pyspark/sql.py" -# These tests are included in the module-level docs, and so must -# be handled on a higher level rather than within the python file. -export PYSPARK_DOC_TEST=1 -run_test "pyspark/broadcast.py" -run_test "pyspark/accumulators.py" -run_test "pyspark/serializers.py" -unset PYSPARK_DOC_TEST -run_test "pyspark/shuffle.py" -run_test "pyspark/tests.py" -run_test "pyspark/mllib/classification.py" -run_test "pyspark/mllib/clustering.py" -run_test "pyspark/mllib/linalg.py" -run_test "pyspark/mllib/random.py" -run_test "pyspark/mllib/recommendation.py" -run_test "pyspark/mllib/regression.py" -run_test "pyspark/mllib/stat.py" -run_test "pyspark/mllib/tests.py" -run_test "pyspark/mllib/tree.py" -run_test "pyspark/mllib/util.py" -run_test "pyspark/streaming/tests.py" +#run_core_tests +#run_sql_tests +#run_mllib_tests +run_streaming_tests # Try to test with PyPy if [ $(which pypy) ]; then @@ -91,20 +104,10 @@ if [ $(which pypy) ]; then echo "Testing with PyPy version:" $PYSPARK_PYTHON --version - run_test "pyspark/rdd.py" - run_test "pyspark/context.py" - run_test "pyspark/conf.py" - run_test "pyspark/sql.py" - # These tests are included in the module-level docs, and so must - # be handled on a higher level rather than within the python file. 
- export PYSPARK_DOC_TEST=1 - run_test "pyspark/broadcast.py" - run_test "pyspark/accumulators.py" - run_test "pyspark/serializers.py" - unset PYSPARK_DOC_TEST - run_test "pyspark/shuffle.py" - run_test "pyspark/tests.py" - run_test "pyspark/streaming/tests.py" + run_core_tests + run_sql_tests + run_mllib_tests + run_streaming_tests fi if [[ $FAILED == 0 ]]; then From 98ac6c26d63dde9b6ca75177e082dbc421998ef7 Mon Sep 17 00:00:00 2001 From: Davies Liu Date: Mon, 29 Sep 2014 11:01:39 -0700 Subject: [PATCH 316/347] support ssc.transform() --- python/pyspark/streaming/context.py | 18 +++++-- python/pyspark/streaming/dstream.py | 36 +++++++------- python/pyspark/streaming/tests.py | 13 +++++ python/pyspark/streaming/util.py | 26 +++++----- .../spark/streaming/StreamingContext.scala | 2 +- .../streaming/api/python/PythonDStream.scala | 49 +++++++++++++------ 6 files changed, 96 insertions(+), 48 deletions(-) diff --git a/python/pyspark/streaming/context.py b/python/pyspark/streaming/context.py index 7879d1b7679d9..ce8aec613d08b 100644 --- a/python/pyspark/streaming/context.py +++ b/python/pyspark/streaming/context.py @@ -20,6 +20,7 @@ from pyspark.context import SparkContext from pyspark.storagelevel import StorageLevel from pyspark.streaming.dstream import DStream +from pyspark.streaming.util import RDDFunction from py4j.java_collections import ListConverter from py4j.java_gateway import java_import @@ -212,11 +213,20 @@ def queueStream(self, queue, oneAtATime=True, default=None): def transform(self, dstreams, transformFunc): """ - Create a new DStream in which each RDD is generated by applying a function on RDDs of - the DStreams. The order of the JavaRDDs in the transform function parameter will be the - same as the order of corresponding DStreams in the list. + Create a new DStream in which each RDD is generated by applying + a function on RDDs of the DStreams. The order of the JavaRDDs in + the transform function parameter will be the same as the order + of corresponding DStreams in the list. """ - # TODO + jdstreams = ListConverter().convert([d._jdstream for d in dstreams], + SparkContext._gateway._gateway_client) + # change the final serializer to sc.serializer + jfunc = RDDFunction(self._sc, + lambda t, *rdds: transformFunc(rdds).map(lambda x: x), + *[d._jrdd_deserializer for d in dstreams]) + + jdstream = self._jvm.PythonDStream.callTransform(self._jssc, jdstreams, jfunc) + return DStream(jdstream, self, self._sc.serializer) def union(self, *dstreams): """ diff --git a/python/pyspark/streaming/dstream.py b/python/pyspark/streaming/dstream.py index 2653e75ccbc54..ae5be72952c76 100644 --- a/python/pyspark/streaming/dstream.py +++ b/python/pyspark/streaming/dstream.py @@ -132,7 +132,7 @@ def partitionBy(self, numPartitions, partitionFunc=portable_hash): return self.transform(lambda rdd: rdd.partitionBy(numPartitions, partitionFunc)) def foreach(self, func): - return self.foreachRDD(lambda rdd, _: rdd.foreach(func)) + return self.foreachRDD(lambda _, rdd: rdd.foreach(func)) def foreachRDD(self, func): """ @@ -142,7 +142,7 @@ def foreachRDD(self, func): This is an output operator, so this DStream will be registered as an output stream and there materialized. """ - jfunc = RDDFunction(self.ctx, lambda a, _, t: func(a, t), self._jrdd_deserializer) + jfunc = RDDFunction(self.ctx, func, self._jrdd_deserializer) api = self._ssc._jvm.PythonDStream api.callForeachRDD(self._jdstream, jfunc) @@ -151,10 +151,10 @@ def pprint(self): Print the first ten elements of each RDD generated in this DStream. 
This is an output operator, so this DStream will be registered as an output stream and there materialized. """ - def takeAndPrint(rdd, time): + def takeAndPrint(timestamp, rdd): taken = rdd.take(11) print "-------------------------------------------" - print "Time: %s" % datetime.fromtimestamp(time / 1000.0) + print "Time: %s" % datetime.fromtimestamp(timestamp / 1000.0) print "-------------------------------------------" for record in taken[:10]: print record @@ -176,15 +176,15 @@ def take(self, n): """ rdds = [] - def take(rdd, _): - if rdd: + def take(_, rdd): + if rdd and len(rdds) < n: rdds.append(rdd) - if len(rdds) == n: - # FIXME: NPE in JVM - self._ssc.stop(False) self.foreachRDD(take) + self._ssc.start() - self._ssc.awaitTermination() + while len(rdds) < n: + time.sleep(0.01) + self._ssc.stop(False, True) return rdds def collect(self): @@ -195,7 +195,7 @@ def collect(self): """ result = [] - def get_output(rdd, time): + def get_output(_, rdd): r = rdd.collect() result.append(r) self.foreachRDD(get_output) @@ -317,7 +317,7 @@ def transform(self, func): Return a new DStream in which each RDD is generated by applying a function on each RDD of 'this' DStream. """ - return TransformedDStream(self, lambda a, t: func(a), True) + return TransformedDStream(self, lambda t, a: func(a), True) def transformWithTime(self, func): """ @@ -331,7 +331,7 @@ def transformWith(self, func, other, keepSerializer=False): Return a new DStream in which each RDD is generated by applying a function on each RDD of 'this' DStream and 'other' DStream. """ - jfunc = RDDFunction(self.ctx, lambda a, b, t: func(a, b), self._jrdd_deserializer) + jfunc = RDDFunction(self.ctx, lambda t, a, b: func(a, b), self._jrdd_deserializer) dstream = self.ctx._jvm.PythonTransformed2DStream(self._jdstream.dstream(), other._jdstream.dstream(), jfunc) jrdd_serializer = self._jrdd_deserializer if keepSerializer else self.ctx.serializer @@ -549,14 +549,14 @@ def reduceByKeyAndWindow(self, func, invFunc, windowDuration, slideDuration=None self._check_window(windowDuration, slideDuration) reduced = self.reduceByKey(func) - def reduceFunc(a, b, t): + def reduceFunc(t, a, b): b = b.reduceByKey(func, numPartitions) r = a.union(b).reduceByKey(func, numPartitions) if a else b if filterFunc: r = r.filter(filterFunc) return r - def invReduceFunc(a, b, t): + def invReduceFunc(t, a, b): b = b.reduceByKey(func, numPartitions) joined = a.leftOuterJoin(b, numPartitions) return joined.mapValues(lambda (v1, v2): invFunc(v1, v2) if v2 is not None else v1) @@ -582,7 +582,7 @@ def updateStateByKey(self, updateFunc, numPartitions=None): @param updateFunc State update function ([(k, vs, s)] -> [(k, s)]). If `s` is None, then `k` will be eliminated. 
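For illustration only, a minimal sketch of an update function matching the signature documented above ([(k, vs, s)] -> [(k, s)]); `word_pairs` is an assumed DStream of (word, 1) pairs and is not part of this patch.

def running_count(pairs):
    # pairs is an iterator of (key, new_values, old_state); old_state is None
    # the first time a key is seen, and yielding (key, None) drops the key
    for key, new_values, state in pairs:
        yield (key, sum(new_values) + (state or 0))

totals = word_pairs.updateStateByKey(running_count)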
""" - def reduceFunc(a, b, t): + def reduceFunc(t, a, b): if a is None: g = b.groupByKey(numPartitions).map(lambda (k, vs): (k, list(vs), None)) else: @@ -610,7 +610,7 @@ def __init__(self, prev, func, reuse=False): not prev.is_cached and not prev.is_checkpointed): prev_func = prev.func old_func = func - func = lambda rdd, t: old_func(prev_func(rdd, t), t) + func = lambda t, rdd: old_func(t, prev_func(t, rdd)) reuse = reuse and prev.reuse prev = prev.prev @@ -625,7 +625,7 @@ def _jdstream(self): return self._jdstream_val func = self.func - jfunc = RDDFunction(self.ctx, lambda a, _, t: func(a, t), self.prev._jrdd_deserializer) + jfunc = RDDFunction(self.ctx, func, self.prev._jrdd_deserializer) jdstream = self.ctx._jvm.PythonTransformedDStream(self.prev._jdstream.dstream(), jfunc, self.reuse).asJavaDStream() self._jdstream_val = jdstream diff --git a/python/pyspark/streaming/tests.py b/python/pyspark/streaming/tests.py index c547971cd7741..ecf88cce47beb 100644 --- a/python/pyspark/streaming/tests.py +++ b/python/pyspark/streaming/tests.py @@ -374,6 +374,19 @@ def test_union(self): expected = [i * 2 for i in input] self.assertEqual(expected, result[:3]) + def test_transform(self): + dstream1 = self.ssc.queueStream([[1]]) + dstream2 = self.ssc.queueStream([[2]]) + dstream3 = self.ssc.queueStream([[3]]) + + def func(rdds): + rdd1, rdd2, rdd3 = rdds + return rdd2.union(rdd3).union(rdd1) + + dstream = self.ssc.transform([dstream1, dstream2, dstream3], func) + + self.assertEqual([2, 3, 1], dstream.first().collect()) + if __name__ == "__main__": unittest.main() diff --git a/python/pyspark/streaming/util.py b/python/pyspark/streaming/util.py index 885411ed63936..57791805e8f9f 100644 --- a/python/pyspark/streaming/util.py +++ b/python/pyspark/streaming/util.py @@ -22,21 +22,25 @@ class RDDFunction(object): """ This class is for py4j callback. 
""" - def __init__(self, ctx, func, deserializer, deserializer2=None): + def __init__(self, ctx, func, *deserializers): self.ctx = ctx self.func = func - self.deserializer = deserializer - self.deserializer2 = deserializer2 or deserializer + self.deserializers = deserializers + emptyRDD = getattr(self.ctx, "_emptyRDD", None) + if emptyRDD is None: + self.ctx._emptyRDD = emptyRDD = self.ctx.parallelize([]).cache() + self.emptyRDD = emptyRDD - def call(self, jrdd, jrdd2, milliseconds): + def call(self, milliseconds, jrdds): try: - emptyRDD = getattr(self.ctx, "_emptyRDD", None) - if emptyRDD is None: - self.ctx._emptyRDD = emptyRDD = self.ctx.parallelize([]).cache() + # extend deserializers with the first one + sers = self.deserializers + if len(sers) < len(jrdds): + sers += (sers[0],) * (len(jrdds) - len(sers)) - rdd = RDD(jrdd, self.ctx, self.deserializer) if jrdd else emptyRDD - other = RDD(jrdd2, self.ctx, self.deserializer2) if jrdd2 else emptyRDD - r = self.func(rdd, other, milliseconds) + rdds = [RDD(jrdd, self.ctx, ser) if jrdd else self.emptyRDD + for jrdd, ser in zip(jrdds, sers)] + r = self.func(milliseconds, *rdds) if r: return r._jrdd except Exception: @@ -44,7 +48,7 @@ def call(self, jrdd, jrdd2, milliseconds): traceback.print_exc() def __repr__(self): - return "RDDFunction2(%s)" % (str(self.func)) + return "RDDFunction(%s)" % (str(self.func)) class Java: implements = ['org.apache.spark.streaming.api.python.PythonRDDFunction'] diff --git a/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala b/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala index 5a8eef1372e23..ab6a6de074a80 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala @@ -413,7 +413,7 @@ class StreamingContext private[streaming] ( dstreams: Seq[DStream[_]], transformFunc: (Seq[RDD[_]], Time) => RDD[T] ): DStream[T] = { - new TransformedDStream[T](dstreams, sparkContext.clean(transformFunc)) + new TransformedDStream[T](dstreams, (transformFunc)) } /** Add a [[org.apache.spark.streaming.scheduler.StreamingListener]] object for diff --git a/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala index 16ac1b93b5f22..8ba8c0441ef35 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala @@ -17,11 +17,12 @@ package org.apache.spark.streaming.api.python -import java.util.{ArrayList => JArrayList} +import java.util.{ArrayList => JArrayList, List => JList} import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ +import scala.collection.mutable import org.apache.spark.api.java._ -import org.apache.spark.api.java.function.{Function2 => JFunction2} import org.apache.spark.api.python._ import org.apache.spark.rdd.RDD import org.apache.spark.storage.StorageLevel @@ -29,18 +30,19 @@ import org.apache.spark.streaming.{Interval, Duration, Time} import org.apache.spark.streaming.dstream._ import org.apache.spark.streaming.api.java._ + /** * Interface for Python callback function with three arguments */ trait PythonRDDFunction { - def call(rdd: JavaRDD[_], rdd2: JavaRDD[_], time: Long): JavaRDD[Array[Byte]] + def call(time: Long, rdds: JList[_]): JavaRDD[Array[Byte]] } -class RDDFunction(pfunc: 
PythonRDDFunction) extends Serializable { - - def apply(rdd: Option[RDD[_]], time: Time): Option[RDD[Array[Byte]]] = { - apply(rdd, None, time) - } +/** + * Wrapper for PythonRDDFunction + */ +class RDDFunction(pfunc: PythonRDDFunction) + extends function.Function2[JList[JavaRDD[_]], Time, JavaRDD[Array[Byte]]] with Serializable { def wrapRDD(rdd: Option[RDD[_]]): JavaRDD[_] = { if (rdd.isDefined) { @@ -50,14 +52,25 @@ class RDDFunction(pfunc: PythonRDDFunction) extends Serializable { } } - def apply(rdd: Option[RDD[_]], rdd2: Option[RDD[_]], time: Time): Option[RDD[Array[Byte]]] = { - val r = pfunc.call(wrapRDD(rdd), wrapRDD(rdd2), time.milliseconds) - if (r != null) { - Some(r.rdd) + def some(jrdd: JavaRDD[Array[Byte]]): Option[RDD[Array[Byte]]] = { + if (jrdd != null) { + Some(jrdd.rdd) } else { None } } + + def apply(rdd: Option[RDD[_]], time: Time): Option[RDD[Array[Byte]]] = { + some(pfunc.call(time.milliseconds, List(wrapRDD(rdd)).asJava)) + } + + def apply(rdd: Option[RDD[_]], rdd2: Option[RDD[_]], time: Time): Option[RDD[Array[Byte]]] = { + some(pfunc.call(time.milliseconds, List(wrapRDD(rdd), wrapRDD(rdd2)).asJava)) + } + + def call(rdds: JList[JavaRDD[_]], time: Time): JavaRDD[Array[Byte]] = { + pfunc.call(time.milliseconds, rdds) + } } private[python] @@ -74,8 +87,16 @@ private[spark] object PythonDStream { // helper function for DStream.foreachRDD(), // cannot be `foreachRDD`, it will confusing py4j - def callForeachRDD(jdstream: JavaDStream[Array[Byte]], pyfunc: PythonRDDFunction): Unit = { - jdstream.dstream.foreachRDD((rdd, time) => pyfunc.call(rdd, null, time.milliseconds)) + def callForeachRDD(jdstream: JavaDStream[Array[Byte]], pyfunc: PythonRDDFunction){ + val func = new RDDFunction(pyfunc) + jdstream.dstream.foreachRDD((rdd, time) => func(Some(rdd), time)) + } + + // helper function for ssc.transform() + def callTransform(ssc: JavaStreamingContext, jdsteams: JList[JavaDStream[_]], pyfunc: PythonRDDFunction) + :JavaDStream[Array[Byte]] = { + val func = new RDDFunction(pyfunc) + ssc.transform(jdsteams, func) } // convert list of RDD into queue of RDDs, for ssc.queueStream() From c40c52df9fd8b6dc8fd44196a73d57bd97a43a06 Mon Sep 17 00:00:00 2001 From: Davies Liu Date: Mon, 29 Sep 2014 11:10:05 -0700 Subject: [PATCH 317/347] change first(), take(n) to has the same behavior as RDD --- python/pyspark/streaming/dstream.py | 11 ++++++----- python/pyspark/streaming/tests.py | 10 +++------- 2 files changed, 9 insertions(+), 12 deletions(-) diff --git a/python/pyspark/streaming/dstream.py b/python/pyspark/streaming/dstream.py index ae5be72952c76..8f02d95e03d43 100644 --- a/python/pyspark/streaming/dstream.py +++ b/python/pyspark/streaming/dstream.py @@ -174,18 +174,19 @@ def take(self, n): """ Return the first `n` RDDs in the stream (will start and stop). 
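A hedged usage sketch of the ssc.transform() support added in the previous patch, mirroring its test_transform test; `ssc` is an assumed, already created StreamingContext.

d1 = ssc.queueStream([[1]])
d2 = ssc.queueStream([[2]])

def reorder(rdds):
    # RDDs arrive in the same order as the DStreams passed to transform()
    rdd1, rdd2 = rdds
    return rdd2.union(rdd1)

merged = ssc.transform([d1, d2], reorder)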
""" - rdds = [] + results = [] def take(_, rdd): - if rdd and len(rdds) < n: - rdds.append(rdd) + if rdd and len(results) < n: + results.extend(rdd.take(n - len(results))) + self.foreachRDD(take) self._ssc.start() - while len(rdds) < n: + while len(results) < n: time.sleep(0.01) self._ssc.stop(False, True) - return rdds + return results def collect(self): """ diff --git a/python/pyspark/streaming/tests.py b/python/pyspark/streaming/tests.py index ecf88cce47beb..828c40f247629 100644 --- a/python/pyspark/streaming/tests.py +++ b/python/pyspark/streaming/tests.py @@ -87,16 +87,12 @@ class TestBasicOperations(PySparkStreamingTestCase): def test_take(self): input = [range(i) for i in range(3)] dstream = self.ssc.queueStream(input) - rdds = dstream.take(3) - self.assertEqual(3, len(rdds)) - for d, rdd in zip(input, rdds): - self.assertEqual(d, rdd.collect()) + self.assertEqual([0, 0, 1], dstream.take(3)) def test_first(self): input = [range(10)] dstream = self.ssc.queueStream(input) - rdd = dstream.first() - self.assertEqual(range(10), rdd.collect()) + self.assertEqual(0, dstream) def test_map(self): """Basic operation test for DStream.map.""" @@ -385,7 +381,7 @@ def func(rdds): dstream = self.ssc.transform([dstream1, dstream2, dstream3], func) - self.assertEqual([2, 3, 1], dstream.first().collect()) + self.assertEqual([2, 3, 1], dstream.take(3)) if __name__ == "__main__": From 6ebceca528dbd94dc23eba4412715e661ff6527e Mon Sep 17 00:00:00 2001 From: Davies Liu Date: Mon, 29 Sep 2014 13:26:06 -0700 Subject: [PATCH 318/347] add more tests --- python/pyspark/streaming/dstream.py | 8 +- python/pyspark/streaming/tests.py | 156 +++++++++++++----- .../streaming/api/python/PythonDStream.scala | 34 ++-- 3 files changed, 137 insertions(+), 61 deletions(-) diff --git a/python/pyspark/streaming/dstream.py b/python/pyspark/streaming/dstream.py index 8f02d95e03d43..c18c68dfe5a32 100644 --- a/python/pyspark/streaming/dstream.py +++ b/python/pyspark/streaming/dstream.py @@ -286,7 +286,7 @@ def saveAsTextFiles(self, prefix, suffix=None): Save this DStream as a text file, using string representations of elements. """ - def saveAsTextFile(rdd, time): + def saveAsTextFile(time, rdd): """ Closure to save element in RDD in DStream as Pickled data in file. This closure is called by py4j callback server. @@ -303,7 +303,7 @@ def saveAsPickleFiles(self, prefix, suffix=None): is 10. """ - def saveAsPickleFile(rdd, time): + def saveAsPickleFile(time, rdd): """ Closure to save element in RDD in the DStream as Pickled data in file. This closure is called by py4j callback server. @@ -388,7 +388,7 @@ def leftOuterJoin(self, other, numPartitions=None): Hash partitioning is used to generate the RDDs with `numPartitions` partitions. """ - return self.transformWith(lambda a, b: a.leftOuterJion(b, numPartitions), other) + return self.transformWith(lambda a, b: a.leftOuterJoin(b, numPartitions), other) def rightOuterJoin(self, other, numPartitions=None): """ @@ -502,7 +502,7 @@ def countByValueAndWindow(self, windowDuration, slideDuration, numPartitions=Non @param numPartitions number of partitions of each RDD in the new DStream. 
""" keyed = self.map(lambda x: (x, 1)) - counted = keyed.reduceByKeyAndWindow(lambda a, b: a + b, lambda a, b: a - b, + counted = keyed.reduceByKeyAndWindow(operator.add, operator.sub, windowDuration, slideDuration, numPartitions) return counted.filter(lambda (k, v): v > 0).count() diff --git a/python/pyspark/streaming/tests.py b/python/pyspark/streaming/tests.py index 828c40f247629..54d4d9b1f7850 100644 --- a/python/pyspark/streaming/tests.py +++ b/python/pyspark/streaming/tests.py @@ -15,17 +15,12 @@ # limitations under the License. # -""" -Unit tests for Python SparkStreaming; additional tests are implemented as doctests in -individual modules. - -Callback server is sometimes unstable sometimes, which cause error in test case. -But this is very rare case. -""" +import os from itertools import chain import time import operator import unittest +import tempfile from pyspark.context import SparkContext from pyspark.streaming.context import StreamingContext @@ -45,16 +40,20 @@ def setUp(self): def tearDown(self): self.ssc.stop() - def _test_func(self, input, func, expected, sort=False): + def _test_func(self, input, func, expected, sort=False, input2=None): """ @param input: dataset for the test. This should be list of lists. @param func: wrapped function. This function should return PythonDStream object. @param expected: expected output for this testcase. """ - # Generate input stream with user-defined input. input_stream = self.ssc.queueStream(input) + input_stream2 = self.ssc.queueStream(input2) if input2 is not None else None # Apply test function to stream. - stream = func(input_stream) + if input2: + stream = func(input_stream, input_stream2) + else: + stream = func(input_stream) + result = stream.collect() self.ssc.start() @@ -92,7 +91,7 @@ def test_take(self): def test_first(self): input = [range(10)] dstream = self.ssc.queueStream(input) - self.assertEqual(0, dstream) + self.assertEqual(0, dstream.first()) def test_map(self): """Basic operation test for DStream.map.""" @@ -238,55 +237,122 @@ def add(a, b): [("a", "11"), ("b", "1"), ("", "111")]] self._test_func(input, func, expected, sort=True) + def test_repartition(self): + input = [range(1, 5), range(5, 9)] + rdds = [self.sc.parallelize(r, 2) for r in input] + + def func(dstream): + return dstream.repartitions(1).glom() + expected = [[[1, 2, 3, 4]], [[5, 6, 7, 8]]] + self._test_func(rdds, func, expected) + def test_union(self): - input1 = [range(3), range(5), range(1), range(6)] - input2 = [range(3, 6), range(5, 6), range(1, 6)] + input1 = [range(3), range(5), range(6)] + input2 = [range(3, 6), range(5, 6)] - d1 = self.ssc.queueStream(input1) - d2 = self.ssc.queueStream(input2) - d = d1.union(d2) - result = d.collect() - expected = [range(6), range(6), range(6), range(6)] + def func(d1, d2): + return d1.union(d2) - self.ssc.start() - start_time = time.time() - # Loop until get the expected the number of the result from the stream. - while True: - current_time = time.time() - # Check time out. - if (current_time - start_time) > self.timeout * 2: - break - # StreamingContext.awaitTermination is not used to wait because - # if py4j server is called every 50 milliseconds, it gets an error. - time.sleep(0.05) - # Check if the output is the same length of expected output. 
- if len(expected) == len(result): - break - self.assertEqual(expected, result) + expected = [range(6), range(6), range(6)] + self._test_func(input1, func, expected, input2=input2) + + def test_cogroup(self): + input = [[(1, 1), (2, 1), (3, 1)], + [(1, 1), (1, 1), (1, 1), (2, 1)], + [("a", 1), ("a", 1), ("b", 1), ("", 1), ("", 1)]] + input2 = [[(1, 2)], + [(4, 1)], + [("a", 1), ("a", 1), ("b", 1), ("", 1), ("", 2)]] + + def func(d1, d2): + return d1.cogroup(d2).mapValues(lambda vs: tuple(map(list, vs))) + + expected = [[(1, ([1], [2])), (2, ([1], [])), (3, ([1], []))], + [(1, ([1, 1, 1], [])), (2, ([1], [])), (4, ([], [1]))], + [("a", ([1, 1], [1, 1])), ("b", ([1], [1])), ("", ([1, 1], [1, 2]))]] + self._test_func(input, func, expected, sort=True, input2=input2) + + def test_join(self): + input = [[('a', 1), ('b', 2)]] + input2 = [[('b', 3), ('c', 4)]] + + def func(a, b): + return a.join(b) + + expected = [[('b', (2, 3))]] + self._test_func(input, func, expected, True, input2) + + def test_left_outer_join(self): + input = [[('a', 1), ('b', 2)]] + input2 = [[('b', 3), ('c', 4)]] + + def func(a, b): + return a.leftOuterJoin(b) + + expected = [[('a', (1, None)), ('b', (2, 3))]] + self._test_func(input, func, expected, True, input2) + + def test_right_outer_join(self): + input = [[('a', 1), ('b', 2)]] + input2 = [[('b', 3), ('c', 4)]] + + def func(a, b): + return a.rightOuterJoin(b) + + expected = [[('b', (2, 3)), ('c', (None, 4))]] + self._test_func(input, func, expected, True, input2) + + def test_full_outer_join(self): + input = [[('a', 1), ('b', 2)]] + input2 = [[('b', 3), ('c', 4)]] + + def func(a, b): + return a.fullOuterJoin(b) + + expected = [[('a', (1, None)), ('b', (2, 3)), ('c', (None, 4))]] + self._test_func(input, func, expected, True, input2) class TestWindowFunctions(PySparkStreamingTestCase): - timeout = 15 + timeout = 20 + + def test_window(self): + input = [range(1), range(2), range(3), range(4), range(5)] + + def func(dstream): + return dstream.window(3, 1).count() + + expected = [[1], [3], [6], [9], [12], [9], [5]] + self._test_func(input, func, expected) def test_count_by_window(self): - input = [range(1), range(2), range(3), range(4), range(5), range(6)] + input = [range(1), range(2), range(3), range(4), range(5)] def func(dstream): - return dstream.countByWindow(4, 1) + return dstream.countByWindow(3, 1) - expected = [[1], [3], [6], [9], [12], [15], [11], [6]] + expected = [[1], [3], [6], [9], [12], [9], [5]] self._test_func(input, func, expected) def test_count_by_window_large(self): input = [range(1), range(2), range(3), range(4), range(5), range(6)] def func(dstream): - return dstream.countByWindow(6, 1) + return dstream.countByWindow(5, 1) expected = [[1], [3], [6], [10], [15], [20], [18], [15], [11], [6]] self._test_func(input, func, expected) + def test_count_by_value_and_window(self): + input = [range(1), range(2), range(3), range(4), range(5), range(6)] + + def func(dstream): + return dstream.countByValueAndWindow(6, 1) + + expected = [[1], [2], [3], [4], [5], [6], [6], [6], [6], [6]] + self._test_func(input, func, expected) + def test_group_by_key_and_window(self): input = [[('a', i)] for i in range(5)] @@ -359,6 +425,20 @@ def test_queueStream(self): time.sleep(1) self.assertEqual(input, result[:3]) + # TODO: test textFileStream + # def test_textFileStream(self): + # input = [range(i) for i in range(3)] + # dstream = self.ssc.queueStream(input) + # d = os.path.join(tempfile.gettempdir(), str(id(self))) + # if not os.path.exists(d): + # os.makedirs(d) + # 
dstream.saveAsTextFiles(os.path.join(d, 'test')) + # dstream2 = self.ssc.textFileStream(d) + # result = dstream2.collect() + # self.ssc.start() + # time.sleep(2) + # self.assertEqual(input, result[:3]) + def test_union(self): input = [range(i) for i in range(3)] dstream = self.ssc.queueStream(input) diff --git a/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala index 8ba8c0441ef35..2f20b05991b8e 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala @@ -93,7 +93,8 @@ private[spark] object PythonDStream { } // helper function for ssc.transform() - def callTransform(ssc: JavaStreamingContext, jdsteams: JList[JavaDStream[_]], pyfunc: PythonRDDFunction) + def callTransform(ssc: JavaStreamingContext, jdsteams: JList[JavaDStream[_]], + pyfunc: PythonRDDFunction) :JavaDStream[Array[Byte]] = { val func = new RDDFunction(pyfunc) ssc.transform(jdsteams, func) @@ -210,9 +211,9 @@ class PythonReducedWindowedDStream(parent: DStream[Array[Byte]], override def compute(validTime: Time): Option[RDD[Array[Byte]]] = { val currentTime = validTime - val currentWindow = new Interval(currentTime - windowDuration + parent.slideDuration, + val current = new Interval(currentTime - windowDuration, currentTime) - val previousWindow = currentWindow - slideDuration + val previous = current - slideDuration // _____________________________ // | previous window _________|___________________ @@ -225,35 +226,30 @@ class PythonReducedWindowedDStream(parent: DStream[Array[Byte]], // old RDDs new RDDs // - // Get the RDD of the reduced value of the previous window - val previousWindowRDD = getOrCompute(previousWindow.endTime) + val previousRDD = getOrCompute(previous.endTime) - if (pinvReduceFunc != null && previousWindowRDD.isDefined + if (pinvReduceFunc != null && previousRDD.isDefined // for small window, reduce once will be better than twice - && windowDuration > slideDuration * 5) { + && windowDuration >= slideDuration * 5) { // subtract the values from old RDDs - val oldRDDs = - parent.slice(previousWindow.beginTime, currentWindow.beginTime - parent.slideDuration) - val subbed = if (oldRDDs.size > 0) { - invReduceFunc(previousWindowRDD, Some(ssc.sc.union(oldRDDs)), validTime) + val oldRDDs = parent.slice(previous.beginTime + parent.slideDuration, current.beginTime) + val subtracted = if (oldRDDs.size > 0) { + invReduceFunc(previousRDD, Some(ssc.sc.union(oldRDDs)), validTime) } else { - previousWindowRDD + previousRDD } // add the RDDs of the reduced values in "new time steps" - val newRDDs = - parent.slice(previousWindow.endTime, currentWindow.endTime - parent.slideDuration) - + val newRDDs = parent.slice(previous.endTime + parent.slideDuration, current.endTime) if (newRDDs.size > 0) { - reduceFunc(subbed, Some(ssc.sc.union(newRDDs)), validTime) + reduceFunc(subtracted, Some(ssc.sc.union(newRDDs)), validTime) } else { - subbed + subtracted } } else { // Get the RDDs of the reduced values in current window - val currentRDDs = - parent.slice(currentWindow.beginTime, currentWindow.endTime - parent.slideDuration) + val currentRDDs = parent.slice(current.beginTime + parent.slideDuration, current.endTime) if (currentRDDs.size > 0) { reduceFunc(None, Some(ssc.sc.union(currentRDDs)), validTime) } else { From 19797f9fc9b062ee30746c184ad432192ca5e19a Mon Sep 17 00:00:00 2001 From: Davies 
Liu Date: Mon, 29 Sep 2014 13:41:44 -0700 Subject: [PATCH 319/347] clean up --- python/pyspark/streaming/context.py | 6 +++--- python/pyspark/streaming/tests.py | 4 ++-- .../scala/org/apache/spark/streaming/StreamingContext.scala | 2 +- .../spark/streaming/api/java/JavaStreamingContext.scala | 4 ---- .../apache/spark/streaming/api/python/PythonDStream.scala | 3 ++- 5 files changed, 8 insertions(+), 11 deletions(-) diff --git a/python/pyspark/streaming/context.py b/python/pyspark/streaming/context.py index ce8aec613d08b..425b0a96aa832 100644 --- a/python/pyspark/streaming/context.py +++ b/python/pyspark/streaming/context.py @@ -15,6 +15,9 @@ # limitations under the License. # +from py4j.java_collections import ListConverter +from py4j.java_gateway import java_import + from pyspark import RDD from pyspark.serializers import UTF8Deserializer from pyspark.context import SparkContext @@ -22,9 +25,6 @@ from pyspark.streaming.dstream import DStream from pyspark.streaming.util import RDDFunction -from py4j.java_collections import ListConverter -from py4j.java_gateway import java_import - __all__ = ["StreamingContext"] diff --git a/python/pyspark/streaming/tests.py b/python/pyspark/streaming/tests.py index 54d4d9b1f7850..342afde3bffd2 100644 --- a/python/pyspark/streaming/tests.py +++ b/python/pyspark/streaming/tests.py @@ -348,7 +348,7 @@ def test_count_by_value_and_window(self): input = [range(1), range(2), range(3), range(4), range(5), range(6)] def func(dstream): - return dstream.countByValueAndWindow(6, 1) + return dstream.countByValueAndWindow(5, 1) expected = [[1], [2], [3], [4], [5], [6], [6], [6], [6], [6]] self._test_func(input, func, expected) @@ -357,7 +357,7 @@ def test_group_by_key_and_window(self): input = [[('a', i)] for i in range(5)] def func(dstream): - return dstream.groupByKeyAndWindow(4, 1).mapValues(list) + return dstream.groupByKeyAndWindow(3, 1).mapValues(list) expected = [[('a', [0])], [('a', [0, 1])], [('a', [0, 1, 2])], [('a', [1, 2, 3])], [('a', [2, 3, 4])], [('a', [3, 4])], [('a', [4])]] diff --git a/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala b/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala index ab6a6de074a80..ef7631788f26d 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala @@ -413,7 +413,7 @@ class StreamingContext private[streaming] ( dstreams: Seq[DStream[_]], transformFunc: (Seq[RDD[_]], Time) => RDD[T] ): DStream[T] = { - new TransformedDStream[T](dstreams, (transformFunc)) + new TransformedDStream[T](dstreams, transformFunc) } /** Add a [[org.apache.spark.streaming.scheduler.StreamingListener]] object for diff --git a/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaStreamingContext.scala b/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaStreamingContext.scala index 662cd8d22c6a5..9dc26dc6b32a1 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaStreamingContext.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaStreamingContext.scala @@ -549,10 +549,6 @@ class JavaStreamingContext(val ssc: StreamingContext) extends Closeable { * JavaStreamingContext object contains a number of utility functions. 
*/ object JavaStreamingContext { - implicit def fromStreamingContext(ssc: StreamingContext): - JavaStreamingContext = new JavaStreamingContext(ssc) - - implicit def toStreamingContext(jssc: JavaStreamingContext): StreamingContext = jssc.ssc /** * Either recreate a StreamingContext from checkpoint data or create a new StreamingContext. diff --git a/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala index 2f20b05991b8e..30c52c15e9e68 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala @@ -41,7 +41,7 @@ trait PythonRDDFunction { /** * Wrapper for PythonRDDFunction */ -class RDDFunction(pfunc: PythonRDDFunction) +private[python] class RDDFunction(pfunc: PythonRDDFunction) extends function.Function2[JList[JavaRDD[_]], Time, JavaRDD[Array[Byte]]] with Serializable { def wrapRDD(rdd: Option[RDD[_]]): JavaRDD[_] = { @@ -68,6 +68,7 @@ class RDDFunction(pfunc: PythonRDDFunction) some(pfunc.call(time.milliseconds, List(wrapRDD(rdd), wrapRDD(rdd2)).asJava)) } + // for JFunction2 def call(rdds: JList[JavaRDD[_]], time: Time): JavaRDD[Array[Byte]] = { pfunc.call(time.milliseconds, rdds) } From 338580a7aa39fcf8beedefdc7000b906a1028c84 Mon Sep 17 00:00:00 2001 From: Davies Liu Date: Mon, 29 Sep 2014 17:02:12 -0700 Subject: [PATCH 320/347] change _first(), _take(), _collect() as private API --- python/pyspark/streaming/dstream.py | 8 ++++---- python/pyspark/streaming/tests.py | 23 ++++++++++++++--------- 2 files changed, 18 insertions(+), 13 deletions(-) diff --git a/python/pyspark/streaming/dstream.py b/python/pyspark/streaming/dstream.py index c18c68dfe5a32..d98afc3e5a294 100644 --- a/python/pyspark/streaming/dstream.py +++ b/python/pyspark/streaming/dstream.py @@ -164,13 +164,13 @@ def takeAndPrint(timestamp, rdd): self.foreachRDD(takeAndPrint) - def first(self): + def _first(self): """ Return the first RDD in the stream. """ - return self.take(1)[0] + return self._take(1)[0] - def take(self, n): + def _take(self, n): """ Return the first `n` RDDs in the stream (will start and stop). """ @@ -188,7 +188,7 @@ def take(_, rdd): self._ssc.stop(False, True) return results - def collect(self): + def _collect(self): """ Collect each RDDs into the returned list. 
diff --git a/python/pyspark/streaming/tests.py b/python/pyspark/streaming/tests.py index 342afde3bffd2..7ffdb145c104e 100644 --- a/python/pyspark/streaming/tests.py +++ b/python/pyspark/streaming/tests.py @@ -54,7 +54,7 @@ def _test_func(self, input, func, expected, sort=False, input2=None): else: stream = func(input_stream) - result = stream.collect() + result = stream._collect() self.ssc.start() start_time = time.time() @@ -86,12 +86,12 @@ class TestBasicOperations(PySparkStreamingTestCase): def test_take(self): input = [range(i) for i in range(3)] dstream = self.ssc.queueStream(input) - self.assertEqual([0, 0, 1], dstream.take(3)) + self.assertEqual([0, 0, 1], dstream._take(3)) def test_first(self): input = [range(10)] dstream = self.ssc.queueStream(input) - self.assertEqual(0, dstream.first()) + self.assertEqual(0, dstream._first()) def test_map(self): """Basic operation test for DStream.map.""" @@ -415,17 +415,17 @@ def _addInputStream(self): # Make sure each length of input is over 3 inputs = map(lambda x: range(1, x), range(5, 101)) stream = self.ssc.queueStream(inputs) - stream.collect() + stream._collect() def test_queueStream(self): input = [range(i) for i in range(3)] dstream = self.ssc.queueStream(input) - result = dstream.collect() + result = dstream._collect() self.ssc.start() time.sleep(1) self.assertEqual(input, result[:3]) - # TODO: test textFileStream + # TODO: fix this test # def test_textFileStream(self): # input = [range(i) for i in range(3)] # dstream = self.ssc.queueStream(input) @@ -433,8 +433,13 @@ def test_queueStream(self): # if not os.path.exists(d): # os.makedirs(d) # dstream.saveAsTextFiles(os.path.join(d, 'test')) + # self.ssc.start() + # time.sleep(1) + # self.ssc.stop(False, True) + # + # self.ssc = StreamingContext(self.sc, self.batachDuration) # dstream2 = self.ssc.textFileStream(d) - # result = dstream2.collect() + # result = dstream2._collect() # self.ssc.start() # time.sleep(2) # self.assertEqual(input, result[:3]) @@ -444,7 +449,7 @@ def test_union(self): dstream = self.ssc.queueStream(input) dstream2 = self.ssc.queueStream(input) dstream3 = self.ssc.union(dstream, dstream2) - result = dstream3.collect() + result = dstream3._collect() self.ssc.start() time.sleep(1) expected = [i * 2 for i in input] @@ -461,7 +466,7 @@ def func(rdds): dstream = self.ssc.transform([dstream1, dstream2, dstream3], func) - self.assertEqual([2, 3, 1], dstream.take(3)) + self.assertEqual([2, 3, 1], dstream._take(3)) if __name__ == "__main__": From 069a94c2f12211560691177f465a74630531e81b Mon Sep 17 00:00:00 2001 From: Davies Liu Date: Mon, 29 Sep 2014 22:48:47 -0700 Subject: [PATCH 321/347] fix the number of partitions during window() --- python/pyspark/streaming/dstream.py | 12 +++++++++--- python/pyspark/streaming/tests.py | 8 +++++++- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/python/pyspark/streaming/dstream.py b/python/pyspark/streaming/dstream.py index d98afc3e5a294..d866f8c9687fb 100644 --- a/python/pyspark/streaming/dstream.py +++ b/python/pyspark/streaming/dstream.py @@ -552,14 +552,18 @@ def reduceByKeyAndWindow(self, func, invFunc, windowDuration, slideDuration=None def reduceFunc(t, a, b): b = b.reduceByKey(func, numPartitions) - r = a.union(b).reduceByKey(func, numPartitions) if a else b + # use the average of number of partitions, or it will keep increasing + partitions = numPartitions or (a.getNumPartitions() + b.getNumPartitions())/2 + r = a.union(b).reduceByKey(func, partitions) if a else b if filterFunc: r = r.filter(filterFunc) return r 
def invReduceFunc(t, a, b): b = b.reduceByKey(func, numPartitions) - joined = a.leftOuterJoin(b, numPartitions) + # use the average of number of partitions, or it will keep increasing + partitions = numPartitions or (a.getNumPartitions() + b.getNumPartitions())/2 + joined = a.leftOuterJoin(b, partitions) return joined.mapValues(lambda (v1, v2): invFunc(v1, v2) if v2 is not None else v1) jreduceFunc = RDDFunction(self.ctx, reduceFunc, reduced._jrdd_deserializer) @@ -587,7 +591,9 @@ def reduceFunc(t, a, b): if a is None: g = b.groupByKey(numPartitions).map(lambda (k, vs): (k, list(vs), None)) else: - g = a.cogroup(b, numPartitions) + # use the average of number of partitions, or it will keep increasing + partitions = numPartitions or (a.getNumPartitions() + b.getNumPartitions())/2 + g = a.cogroup(b, partitions) g = g.map(lambda (k, (va, vb)): (k, list(vb), list(va)[0] if len(va) else None)) state = g.mapPartitions(lambda x: updateFunc(x)) return state.filter(lambda (k, v): v is not None) diff --git a/python/pyspark/streaming/tests.py b/python/pyspark/streaming/tests.py index 7ffdb145c104e..0dc6b3d675397 100644 --- a/python/pyspark/streaming/tests.py +++ b/python/pyspark/streaming/tests.py @@ -22,7 +22,7 @@ import unittest import tempfile -from pyspark.context import SparkContext +from pyspark.context import SparkContext, RDD from pyspark.streaming.context import StreamingContext @@ -46,8 +46,13 @@ def _test_func(self, input, func, expected, sort=False, input2=None): @param func: wrapped function. This function should return PythonDStream object. @param expected: expected output for this testcase. """ + if not isinstance(input[0], RDD): + input = [self.sc.parallelize(d, 1) for d in input] input_stream = self.ssc.queueStream(input) + if input2 and not isinstance(input2[0], RDD): + input2 = [self.sc.parallelize(d, 1) for d in input2] input_stream2 = self.ssc.queueStream(input2) if input2 is not None else None + # Apply test function to stream. if input2: stream = func(input_stream, input_stream2) @@ -63,6 +68,7 @@ def _test_func(self, input, func, expected, sort=False, input2=None): current_time = time.time() # Check time out. if (current_time - start_time) > self.timeout: + print "timeout after", self.timeout break # StreamingContext.awaitTermination is not used to wait because # if py4j server is called every 50 milliseconds, it gets an error. 
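A hedged sketch of the incremental reduceByKeyAndWindow call whose partitioning this patch adjusts; `pairs` is an assumed DStream of (key, int) pairs and the durations are in seconds.

windowed = pairs.reduceByKeyAndWindow(
    lambda a, b: a + b,   # applied to values entering the window
    lambda a, b: a - b,   # inverse, applied to values leaving the window
    30, 10)               # windowDuration=30, slideDuration=10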
From e00136b3dfd330689d89e44006a49871b36a4825 Mon Sep 17 00:00:00 2001 From: Davies Liu Date: Tue, 30 Sep 2014 00:41:39 -0700 Subject: [PATCH 322/347] address comments --- .../apache/spark/api/python/PythonRDD.scala | 1 + .../{wordcount.py => hdfs_wordcount.py} | 0 python/pyspark/java_gateway.py | 1 - python/pyspark/streaming/context.py | 32 ++- python/pyspark/streaming/dstream.py | 254 ++++++++---------- python/pyspark/streaming/tests.py | 69 +++-- python/pyspark/streaming/util.py | 5 +- python/run-tests | 6 +- .../streaming/api/python/PythonDStream.scala | 98 ++++--- 9 files changed, 245 insertions(+), 221 deletions(-) rename examples/src/main/python/streaming/{wordcount.py => hdfs_wordcount.py} (100%) diff --git a/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala b/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala index 19cdbe679fd35..8051b221ac3d1 100644 --- a/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala +++ b/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala @@ -52,6 +52,7 @@ private[spark] class PythonRDD( accumulator: Accumulator[JList[Array[Byte]]]) extends RDD[Array[Byte]](parent) { + // create a new PythonRDD with same Python setting but different parent. def copyTo(rdd: RDD[_]): PythonRDD = { new PythonRDD(rdd, command, envVars, pythonIncludes, preservePartitoning, pythonExec, broadcastVars, accumulator) diff --git a/examples/src/main/python/streaming/wordcount.py b/examples/src/main/python/streaming/hdfs_wordcount.py similarity index 100% rename from examples/src/main/python/streaming/wordcount.py rename to examples/src/main/python/streaming/hdfs_wordcount.py diff --git a/python/pyspark/java_gateway.py b/python/pyspark/java_gateway.py index db5b97f8472d1..9c70fa5c16d0c 100644 --- a/python/pyspark/java_gateway.py +++ b/python/pyspark/java_gateway.py @@ -23,7 +23,6 @@ import platform from subprocess import Popen, PIPE from threading import Thread - from py4j.java_gateway import java_import, JavaGateway, GatewayClient diff --git a/python/pyspark/streaming/context.py b/python/pyspark/streaming/context.py index 425b0a96aa832..ae4a1d5b6b069 100644 --- a/python/pyspark/streaming/context.py +++ b/python/pyspark/streaming/context.py @@ -31,6 +31,11 @@ def _daemonize_callback_server(): """ Hack Py4J to daemonize callback server + + The thread of callback server has daemon=False, it will block the driver + from exiting if it's not shutdown. The following code replace `start()` + of CallbackServer with a new version, which set daemon=True for this + thread. """ # TODO: create a patch for Py4J import socket @@ -47,7 +52,6 @@ def start(self): 1) try: self.server_socket.bind((self.address, self.port)) - # self.port = self.server_socket.getsockname()[1] except Exception: msg = 'An error occurred while trying to start the callback server' logger.exception(msg) @@ -63,19 +67,21 @@ def start(self): class StreamingContext(object): """ - Main entry point for Spark Streaming functionality. A StreamingContext represents the - connection to a Spark cluster, and can be used to create L{DStream}s and - broadcast variables on that cluster. + Main entry point for Spark Streaming functionality. A StreamingContext + represents the connection to a Spark cluster, and can be used to create + L{DStream}s various input sources. It can be from an existing L{SparkContext}. + After creating and transforming DStreams, the streaming computation can + be started and stopped using `context.start()` and `context.stop()`, + respectively. 
`context.awaitTransformation()` allows the current thread + to wait for the termination of the context by `stop()` or by an exception. """ def __init__(self, sparkContext, duration): """ - Create a new StreamingContext. At least the master and app name and duration - should be set, either through the named parameters here or through C{conf}. + Create a new StreamingContext. @param sparkContext: L{SparkContext} object. - @param duration: seconds for SparkStreaming. - + @param duration: number of seconds. """ self._sc = sparkContext self._jvm = self._sc._jvm @@ -127,8 +133,12 @@ def awaitTermination(self, timeout=None): def stop(self, stopSparkContext=True, stopGraceFully=False): """ - Stop the execution of the streams immediately (does not wait for all received data - to be processed). + Stop the execution of the streams, with option of ensuring all + received data has been processed. + + @param stopSparkContext Stop the associated SparkContext or not + @param stopGracefully Stop gracefully by waiting for the processing + of all received data to be completed """ self._jssc.stop(stopSparkContext, stopGraceFully) if stopSparkContext: @@ -140,7 +150,7 @@ def remember(self, duration): in the last given duration. DStreams remember RDDs only for a limited duration of time and releases them for garbage collection. This method allows the developer to specify how to long to remember - the RDDs ( if the developer wishes to query old data outside the + the RDDs (if the developer wishes to query old data outside the DStream computation). @param duration Minimum duration (in seconds) that each DStream diff --git a/python/pyspark/streaming/dstream.py b/python/pyspark/streaming/dstream.py index d866f8c9687fb..4e3f07e26953b 100644 --- a/python/pyspark/streaming/dstream.py +++ b/python/pyspark/streaming/dstream.py @@ -30,6 +30,24 @@ class DStream(object): + """ + A Discretized Stream (DStream), the basic abstraction in Spark Streaming, + is a continuous sequence of RDDs (of the same type) representing a + continuous stream of data (see L{RDD} in the Spark core documentation + for more details on RDDs). + + DStreams can either be created from live data (such as, data from TCP + sockets, Kafka, Flume, etc.) using a L{StreamingContext} or it can be + generated by transforming existing DStreams using operations such as + `map`, `window` and `reduceByKeyAndWindow`. While a Spark Streaming + program is running, each DStream periodically generates a RDD, either + from live data or by transforming the RDD generated by a parent DStream. + + DStreams internally is characterized by a few basic properties: + - A list of other DStreams that the DStream depends on + - A time interval at which the DStream generates an RDD + - A function that is used to generate an RDD after each time interval + """ def __init__(self, jdstream, ssc, jrdd_deserializer): self._jdstream = jdstream self._ssc = ssc @@ -46,11 +64,12 @@ def context(self): def count(self): """ - Return a new DStream which contains the number of elements in this DStream. + Return a new DStream in which each RDD has a single element + generated by counting each RDD of this DStream. """ - return self.mapPartitions(lambda i: [sum(1 for _ in i)]).sum() + return self.mapPartitions(lambda i: [sum(1 for _ in i)])._sum() - def sum(self): + def _sum(self): """ Add up the elements in this DStream. 
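A hedged illustration of the count()/_sum() pair above, producing one value per batch; `ssc` is an assumed StreamingContext.

d = ssc.queueStream([range(3), range(5)])
d.count().pprint()   # expected to print roughly 3 for the first batch, then 5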
""" @@ -66,8 +85,8 @@ def func(iterator): def flatMap(self, f, preservesPartitioning=False): """ - Pass each value in the key-value pair DStream through flatMap function - without changing the keys: this also retains the original RDD's partition. + Return a new DStream by applying a function to all elements of + this DStream, and then flattening the results """ def func(s, iterator): return chain.from_iterable(imap(f, iterator)) @@ -83,7 +102,8 @@ def func(iterator): def mapPartitions(self, f, preservesPartitioning=False): """ - Return a new DStream by applying a function to each partition of this DStream. + Return a new DStream in which each RDD is generated by applying + mapPartitions() to each RDDs of this DStream. """ def func(s, iterator): return f(iterator) @@ -91,56 +111,51 @@ def func(s, iterator): def mapPartitionsWithIndex(self, f, preservesPartitioning=False): """ - Return a new DStream by applying a function to each partition of this DStream, - while tracking the index of the original partition. + Return a new DStream in which each RDD is generated by applying + mapPartitionsWithIndex() to each RDDs of this DStream. """ return self.transform(lambda rdd: rdd.mapPartitionsWithIndex(f, preservesPartitioning)) def reduce(self, func): """ - Return a new DStream by reduceing the elements of this RDD using the specified - commutative and associative binary operator. + Return a new DStream in which each RDD has a single element + generated by reducing each RDD of this DStream. """ return self.map(lambda x: (None, x)).reduceByKey(func, 1).map(lambda x: x[1]) def reduceByKey(self, func, numPartitions=None): """ - Merge the value for each key using an associative reduce function. - - This will also perform the merging locally on each mapper before - sending results to reducer, similarly to a "combiner" in MapReduce. - - Output will be hash-partitioned with C{numPartitions} partitions, or - the default parallelism level if C{numPartitions} is not specified. + Return a new DStream by applying reduceByKey to each RDD. """ + if numPartitions is None: + numPartitions = self.ctx.defaultParallelism return self.combineByKey(lambda x: x, func, func, numPartitions) def combineByKey(self, createCombiner, mergeValue, mergeCombiners, numPartitions=None): """ - Count the number of elements for each key, and return the result to the - master as a dictionary + Return a new DStream by applying combineByKey to each RDD. """ + if numPartitions is None: + numPartitions = self.ctx.defaultParallelism + def func(rdd): return rdd.combineByKey(createCombiner, mergeValue, mergeCombiners, numPartitions) return self.transform(func) def partitionBy(self, numPartitions, partitionFunc=portable_hash): """ - Return a copy of the DStream partitioned using the specified partitioner. + Return a copy of the DStream in which each RDD are partitioned + using the specified partitioner. """ return self.transform(lambda rdd: rdd.partitionBy(numPartitions, partitionFunc)) - def foreach(self, func): - return self.foreachRDD(lambda _, rdd: rdd.foreach(func)) + # def foreach(self, func): + # return self.foreachRDD(lambda _, rdd: rdd.foreach(func)) def foreachRDD(self, func): """ - Apply userdefined function to all RDD in a DStream. - This python implementation could be expensive because it uses callback server - in order to apply function to RDD in DStream. - This is an output operator, so this DStream will be registered as an output - stream and there materialized. + Apply a function to each RDD in this DStream. 
""" jfunc = RDDFunction(self.ctx, func, self._jrdd_deserializer) api = self._ssc._jvm.PythonDStream @@ -148,13 +163,12 @@ def foreachRDD(self, func): def pprint(self): """ - Print the first ten elements of each RDD generated in this DStream. This is an output - operator, so this DStream will be registered as an output stream and there materialized. + Print the first ten elements of each RDD generated in this DStream. """ - def takeAndPrint(timestamp, rdd): + def takeAndPrint(time, rdd): taken = rdd.take(11) print "-------------------------------------------" - print "Time: %s" % datetime.fromtimestamp(timestamp / 1000.0) + print "Time: %s" % time print "-------------------------------------------" for record in taken[:10]: print record @@ -164,58 +178,18 @@ def takeAndPrint(timestamp, rdd): self.foreachRDD(takeAndPrint) - def _first(self): - """ - Return the first RDD in the stream. - """ - return self._take(1)[0] - - def _take(self, n): - """ - Return the first `n` RDDs in the stream (will start and stop). - """ - results = [] - - def take(_, rdd): - if rdd and len(results) < n: - results.extend(rdd.take(n - len(results))) - - self.foreachRDD(take) - - self._ssc.start() - while len(results) < n: - time.sleep(0.01) - self._ssc.stop(False, True) - return results - - def _collect(self): - """ - Collect each RDDs into the returned list. - - :return: list, which will have the collected items. - """ - result = [] - - def get_output(_, rdd): - r = rdd.collect() - result.append(r) - self.foreachRDD(get_output) - return result - def mapValues(self, f): """ - Pass each value in the key-value pair RDD through a map function - without changing the keys; this also retains the original RDD's - partitioning. + Return a new DStream by applying a map function to the value of + each key-value pairs in 'this' DStream without changing the key. """ map_values_fn = lambda (k, v): (k, f(v)) return self.map(map_values_fn, preservesPartitioning=True) def flatMapValues(self, f): """ - Pass each value in the key-value pair RDD through a flatMap function - without changing the keys; this also retains the original RDD's - partitioning. + Return a new DStream by applying a flatmap function to the value + of each key-value pairs in 'this' DStream without changing the key. """ flat_map_fn = lambda (k, v): ((k, x) for x in f(v)) return self.flatMap(flat_map_fn, preservesPartitioning=True) @@ -223,8 +197,7 @@ def flatMapValues(self, f): def glom(self): """ Return a new DStream in which RDD is generated by applying glom() - to RDD of this DStream. Applying glom() to an RDD coalesces all - elements within each partition into an list. + to RDD of this DStream. """ def func(iterator): yield list(iterator) @@ -232,7 +205,8 @@ def func(iterator): def cache(self): """ - Persist this DStream with the default storage level (C{MEMORY_ONLY_SER}). + Persist the RDDs of this DStream with the default storage level + (C{MEMORY_ONLY_SER}). """ self.is_cached = True self.persist(StorageLevel.MEMORY_ONLY_SER) @@ -240,9 +214,7 @@ def cache(self): def persist(self, storageLevel): """ - Set this DStream's storage level to persist its values across operations - after the first time it is computed. This can only be used to assign - a new storage level if the DStream does not have a storage level set yet. 
+ Persist the RDDs of this DStream with the given storage level """ self.is_cached = True javaStorageLevel = self.ctx._getJavaStorageLevel(storageLevel) @@ -251,11 +223,10 @@ def persist(self, storageLevel): def checkpoint(self, interval): """ - Mark this DStream for checkpointing. It will be saved to a file inside the - checkpoint directory set with L{SparkContext.setCheckpointDir()} + Enable periodic checkpointing of RDDs of this DStream - @param interval: time in seconds, after which generated RDD will - be checkpointed + @param interval: time in seconds, after each period of that, generated + RDD will be checkpointed """ self.is_checkpointed = True self._jdstream.checkpoint(self._ssc._jduration(interval)) @@ -263,85 +234,76 @@ def checkpoint(self, interval): def groupByKey(self, numPartitions=None): """ - Return a new DStream which contains group the values for each key in the - DStream into a single sequence. - Hash-partitions the resulting RDD with into numPartitions partitions in - the DStream. - - Note: If you are grouping in order to perform an aggregation (such as a - sum or average) over each key, using reduceByKey will provide much - better performance. + Return a new DStream by applying groupByKey on each RDD. """ + if numPartitions is None: + numPartitions = self.ctx.defaultParallelism return self.transform(lambda rdd: rdd.groupByKey(numPartitions)) def countByValue(self): """ - Return new DStream which contains the count of each unique value in this - DStreeam as a (value, count) pairs. + Return a new DStream in which each RDD contains the counts of each + distinct value in each RDD of this DStream. """ return self.map(lambda x: (x, None)).reduceByKey(lambda x, y: None).count() def saveAsTextFiles(self, prefix, suffix=None): """ - Save this DStream as a text file, using string representations of elements. + Save each RDD in this DStream as at text file, using string + representation of elements. """ - def saveAsTextFile(time, rdd): - """ - Closure to save element in RDD in DStream as Pickled data in file. - This closure is called by py4j callback server. - """ path = rddToFileName(prefix, suffix, time) rdd.saveAsTextFile(path) - return self.foreachRDD(saveAsTextFile) - def saveAsPickleFiles(self, prefix, suffix=None): + def _saveAsPickleFiles(self, prefix, suffix=None): """ - Save this DStream as a SequenceFile of serialized objects. The serializer - used is L{pyspark.serializers.PickleSerializer}, default batch size - is 10. + Save each RDD in this DStream as at binary file, the elements are + serialized by pickle. """ - def saveAsPickleFile(time, rdd): - """ - Closure to save element in RDD in the DStream as Pickled data in file. - This closure is called by py4j callback server. - """ path = rddToFileName(prefix, suffix, time) rdd.saveAsPickleFile(path) - return self.foreachRDD(saveAsPickleFile) def transform(self, func): """ Return a new DStream in which each RDD is generated by applying a function on each RDD of 'this' DStream. - """ - return TransformedDStream(self, lambda t, a: func(a), True) - def transformWithTime(self, func): + `func` can have one argument of `rdd`, or have two arguments of + (`time`, `rdd`) """ - Return a new DStream in which each RDD is generated by applying a function - on each RDD of 'this' DStream. 
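A hedged sketch of the two call shapes transform() and transformWith() accept after this change; `d1` and `d2` are assumed DStreams.

# one-argument form: the function sees only the RDD of each batch
evens = d1.transform(lambda rdd: rdd.filter(lambda x: x % 2 == 0))

# two-stream form: combine the per-batch RDDs of two DStreams
common = d1.transformWith(lambda a, b: a.intersection(b), d2)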
- """ - return TransformedDStream(self, func, False) + resue = False + if func.func_code.co_argcount == 1: + reuse = True + oldfunc = func + func = lambda t, rdd: oldfunc(rdd) + assert func.func_code.co_argcount == 2, "func should take one or two arguments" + return TransformedDStream(self, func, reuse) def transformWith(self, func, other, keepSerializer=False): """ Return a new DStream in which each RDD is generated by applying a function on each RDD of 'this' DStream and 'other' DStream. + + `func` can have two arguments of (`rdd_a`, `rdd_b`) or have three + arguments of (`time`, `rdd_a`, `rdd_b`) """ - jfunc = RDDFunction(self.ctx, lambda t, a, b: func(a, b), self._jrdd_deserializer) + if func.func_code.co_argcount == 2: + oldfunc = func + func = lambda t, a, b: oldfunc(a, b) + assert func.func_code.co_argcount == 3, "func should take two or three arguments" + jfunc = RDDFunction(self.ctx, func, self._jrdd_deserializer) dstream = self.ctx._jvm.PythonTransformed2DStream(self._jdstream.dstream(), other._jdstream.dstream(), jfunc) jrdd_serializer = self._jrdd_deserializer if keepSerializer else self.ctx.serializer return DStream(dstream.asJavaDStream(), self._ssc, jrdd_serializer) - def repartitions(self, numPartitions): + def repartition(self, numPartitions): """ - Return a new DStream with an increased or decreased level of parallelism. Each RDD in the - returned DStream has exactly numPartitions partitions. + Return a new DStream with an increased or decreased level of parallelism. """ return self.transform(lambda rdd: rdd.repartition(numPartitions)) @@ -355,7 +317,8 @@ def _slideDuration(self): def union(self, other): """ Return a new DStream by unifying data of another DStream with this DStream. - @param other Another DStream having the same interval (i.e., slideDuration) as this DStream. + @param other Another DStream having the same interval (i.e., slideDuration) + as this DStream. """ if self._slideDuration != other._slideDuration: raise ValueError("the two DStream should have same slide duration") @@ -368,6 +331,8 @@ def cogroup(self, other, numPartitions=None): Hash partitioning is used to generate the RDDs with `numPartitions` partitions. """ + if numPartitions is None: + numPartitions = self.ctx.defaultParallelism return self.transformWith(lambda a, b: a.cogroup(b, numPartitions), other) def join(self, other, numPartitions=None): @@ -378,6 +343,8 @@ def join(self, other, numPartitions=None): Hash partitioning is used to generate the RDDs with `numPartitions` partitions. """ + if numPartitions is None: + numPartitions = self.ctx.defaultParallelism return self.transformWith(lambda a, b: a.join(b, numPartitions), other) def leftOuterJoin(self, other, numPartitions=None): @@ -388,6 +355,8 @@ def leftOuterJoin(self, other, numPartitions=None): Hash partitioning is used to generate the RDDs with `numPartitions` partitions. """ + if numPartitions is None: + numPartitions = self.ctx.defaultParallelism return self.transformWith(lambda a, b: a.leftOuterJoin(b, numPartitions), other) def rightOuterJoin(self, other, numPartitions=None): @@ -398,6 +367,8 @@ def rightOuterJoin(self, other, numPartitions=None): Hash partitioning is used to generate the RDDs with `numPartitions` partitions. 
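A hedged example of the per-batch join operators in this region; `reqs` and `resps` are assumed DStreams of (id, value) pairs.

joined = reqs.join(resps)         # (id, (req, resp)) for ids present in both batches
left = reqs.leftOuterJoin(resps)  # the resp side is None when the id is missing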
""" + if numPartitions is None: + numPartitions = self.ctx.defaultParallelism return self.transformWith(lambda a, b: a.rightOuterJoin(b, numPartitions), other) def fullOuterJoin(self, other, numPartitions=None): @@ -408,6 +379,8 @@ def fullOuterJoin(self, other, numPartitions=None): Hash partitioning is used to generate the RDDs with `numPartitions` partitions. """ + if numPartitions is None: + numPartitions = self.ctx.defaultParallelism return self.transformWith(lambda a, b: a.fullOuterJoin(b, numPartitions), other) def _jtime(self, timestamp): @@ -426,7 +399,7 @@ def slice(self, begin, end): jrdds = self._jdstream.slice(self._jtime(begin), self._jtime(end)) return [RDD(jrdd, self.ctx, self._jrdd_deserializer) for jrdd in jrdds] - def _check_window(self, window, slide): + def _validate_window_param(self, window, slide): duration = self._jdstream.dstream().slideDuration().milliseconds() if int(window * 1000) % duration != 0: raise ValueError("windowDuration must be multiple of the slide duration (%d ms)" @@ -446,7 +419,7 @@ def window(self, windowDuration, slideDuration=None): the new DStream will generate RDDs); must be a multiple of this DStream's batching interval """ - self._check_window(windowDuration, slideDuration) + self._validate_window_param(windowDuration, slideDuration) d = self._ssc._jduration(windowDuration) if slideDuration is None: return DStream(self._jdstream.window(d), self._ssc, self._jrdd_deserializer) @@ -547,23 +520,22 @@ def reduceByKeyAndWindow(self, func, invFunc, windowDuration, slideDuration=None only pairs that satisfy the function are retained set this to null if you do not want to filter """ - self._check_window(windowDuration, slideDuration) - reduced = self.reduceByKey(func) + self._validate_window_param(windowDuration, slideDuration) + if numPartitions is None: + numPartitions = self.ctx.defaultParallelism + + reduced = self.reduceByKey(func, numPartitions) def reduceFunc(t, a, b): b = b.reduceByKey(func, numPartitions) - # use the average of number of partitions, or it will keep increasing - partitions = numPartitions or (a.getNumPartitions() + b.getNumPartitions())/2 - r = a.union(b).reduceByKey(func, partitions) if a else b + r = a.union(b).reduceByKey(func, numPartitions) if a else b if filterFunc: r = r.filter(filterFunc) return r def invReduceFunc(t, a, b): b = b.reduceByKey(func, numPartitions) - # use the average of number of partitions, or it will keep increasing - partitions = numPartitions or (a.getNumPartitions() + b.getNumPartitions())/2 - joined = a.leftOuterJoin(b, partitions) + joined = a.leftOuterJoin(b, numPartitions) return joined.mapValues(lambda (v1, v2): invFunc(v1, v2) if v2 is not None else v1) jreduceFunc = RDDFunction(self.ctx, reduceFunc, reduced._jrdd_deserializer) @@ -587,13 +559,14 @@ def updateStateByKey(self, updateFunc, numPartitions=None): @param updateFunc State update function ([(k, vs, s)] -> [(k, s)]). If `s` is None, then `k` will be eliminated. 
""" + if numPartitions is None: + numPartitions = self.ctx.defaultParallelism + def reduceFunc(t, a, b): if a is None: g = b.groupByKey(numPartitions).map(lambda (k, vs): (k, list(vs), None)) else: - # use the average of number of partitions, or it will keep increasing - partitions = numPartitions or (a.getNumPartitions() + b.getNumPartitions())/2 - g = a.cogroup(b, partitions) + g = a.cogroup(b, numPartitions) g = g.map(lambda (k, (va, vb)): (k, list(vb), list(va)[0] if len(va) else None)) state = g.mapPartitions(lambda x: updateFunc(x)) return state.filter(lambda (k, v): v is not None) @@ -605,6 +578,13 @@ def reduceFunc(t, a, b): class TransformedDStream(DStream): + """ + TransformedDStream is an DStream generated by an Python function + transforming each RDD of an DStream to another RDDs. + + Multiple continuous transformations of DStream can be combined into + one transformation. + """ def __init__(self, prev, func, reuse=False): ssc = prev._ssc self._ssc = ssc diff --git a/python/pyspark/streaming/tests.py b/python/pyspark/streaming/tests.py index 0dc6b3d675397..698978e61ffad 100644 --- a/python/pyspark/streaming/tests.py +++ b/python/pyspark/streaming/tests.py @@ -29,17 +29,50 @@ class PySparkStreamingTestCase(unittest.TestCase): timeout = 10 # seconds + duration = 1 def setUp(self): class_name = self.__class__.__name__ self.sc = SparkContext(appName=class_name) self.sc.setCheckpointDir("/tmp") # TODO: decrease duration to speed up tests - self.ssc = StreamingContext(self.sc, duration=1) + self.ssc = StreamingContext(self.sc, self.duration) def tearDown(self): self.ssc.stop() + def _take(self, dstream, n): + """ + Return the first `n` elements in the stream (will start and stop). + """ + results = [] + + def take(_, rdd): + if rdd and len(results) < n: + results.extend(rdd.take(n - len(results))) + + dstream.foreachRDD(take) + + self.ssc.start() + while len(results) < n: + time.sleep(0.01) + self.ssc.stop(False, True) + return results + + def _collect(self, dstream): + """ + Collect each RDDs into the returned list. + + :return: list, which will have the collected items. + """ + result = [] + + def get_output(_, rdd): + r = rdd.collect() + result.append(r) + dstream.foreachRDD(get_output) + return result + def _test_func(self, input, func, expected, sort=False, input2=None): """ @param input: dataset for the test. This should be list of lists. 
@@ -59,7 +92,7 @@ def _test_func(self, input, func, expected, sort=False, input2=None): else: stream = func(input_stream) - result = stream._collect() + result = self._collect(stream) self.ssc.start() start_time = time.time() @@ -89,16 +122,6 @@ def _sort_result_based_on_key(self, outputs): class TestBasicOperations(PySparkStreamingTestCase): - def test_take(self): - input = [range(i) for i in range(3)] - dstream = self.ssc.queueStream(input) - self.assertEqual([0, 0, 1], dstream._take(3)) - - def test_first(self): - input = [range(10)] - dstream = self.ssc.queueStream(input) - self.assertEqual(0, dstream._first()) - def test_map(self): """Basic operation test for DStream.map.""" input = [range(1, 5), range(5, 9), range(9, 13)] @@ -248,7 +271,7 @@ def test_repartition(self): rdds = [self.sc.parallelize(r, 2) for r in input] def func(dstream): - return dstream.repartitions(1).glom() + return dstream.repartition(1).glom() expected = [[[1, 2, 3, 4]], [[5, 6, 7, 8]]] self._test_func(rdds, func, expected) @@ -395,15 +418,9 @@ def func(dstream): self._test_func(input, func, expected) -class TestStreamingContext(unittest.TestCase): - def setUp(self): - self.sc = SparkContext(master="local[2]", appName=self.__class__.__name__) - self.batachDuration = 0.1 - self.ssc = StreamingContext(self.sc, self.batachDuration) +class TestStreamingContext(PySparkStreamingTestCase): - def tearDown(self): - self.ssc.stop() - self.sc.stop() + duration = 0.1 def test_stop_only_streaming_context(self): self._addInputStream() @@ -421,12 +438,12 @@ def _addInputStream(self): # Make sure each length of input is over 3 inputs = map(lambda x: range(1, x), range(5, 101)) stream = self.ssc.queueStream(inputs) - stream._collect() + self._collect(stream) def test_queueStream(self): input = [range(i) for i in range(3)] dstream = self.ssc.queueStream(input) - result = dstream._collect() + result = self._collect(dstream) self.ssc.start() time.sleep(1) self.assertEqual(input, result[:3]) @@ -445,7 +462,7 @@ def test_queueStream(self): # # self.ssc = StreamingContext(self.sc, self.batachDuration) # dstream2 = self.ssc.textFileStream(d) - # result = dstream2._collect() + # result = self._collect(dstream2) # self.ssc.start() # time.sleep(2) # self.assertEqual(input, result[:3]) @@ -455,7 +472,7 @@ def test_union(self): dstream = self.ssc.queueStream(input) dstream2 = self.ssc.queueStream(input) dstream3 = self.ssc.union(dstream, dstream2) - result = dstream3._collect() + result = self._collect(dstream3) self.ssc.start() time.sleep(1) expected = [i * 2 for i in input] @@ -472,7 +489,7 @@ def func(rdds): dstream = self.ssc.transform([dstream1, dstream2, dstream3], func) - self.assertEqual([2, 3, 1], dstream._take(3)) + self.assertEqual([2, 3, 1], self._take(dstream, 3)) if __name__ == "__main__": diff --git a/python/pyspark/streaming/util.py b/python/pyspark/streaming/util.py index 57791805e8f9f..4838ec6c8c6e9 100644 --- a/python/pyspark/streaming/util.py +++ b/python/pyspark/streaming/util.py @@ -15,6 +15,8 @@ # limitations under the License. 
# +from datetime import datetime + from pyspark.rdd import RDD @@ -40,7 +42,8 @@ def call(self, milliseconds, jrdds): rdds = [RDD(jrdd, self.ctx, ser) if jrdd else self.emptyRDD for jrdd, ser in zip(jrdds, sers)] - r = self.func(milliseconds, *rdds) + t = datetime.fromtimestamp(milliseconds / 1000.0) + r = self.func(t, *rdds) if r: return r._jrdd except Exception: diff --git a/python/run-tests b/python/run-tests index e8796838c22c1..e86e0729cf65e 100755 --- a/python/run-tests +++ b/python/run-tests @@ -93,9 +93,9 @@ fi echo "Testing with Python version:" $PYSPARK_PYTHON --version -#run_core_tests -#run_sql_tests -#run_mllib_tests +run_core_tests +run_sql_tests +run_mllib_tests run_streaming_tests # Try to test with PyPy diff --git a/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala index 30c52c15e9e68..658715eb456dd 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala @@ -34,7 +34,8 @@ import org.apache.spark.streaming.api.java._ /** * Interface for Python callback function with three arguments */ -trait PythonRDDFunction { +private[spark] trait PythonRDDFunction { + // callback in Python def call(time: Long, rdds: JList[_]): JavaRDD[Array[Byte]] } @@ -44,38 +45,30 @@ trait PythonRDDFunction { private[python] class RDDFunction(pfunc: PythonRDDFunction) extends function.Function2[JList[JavaRDD[_]], Time, JavaRDD[Array[Byte]]] with Serializable { - def wrapRDD(rdd: Option[RDD[_]]): JavaRDD[_] = { - if (rdd.isDefined) { - JavaRDD.fromRDD(rdd.get) - } else { - null - } - } - - def some(jrdd: JavaRDD[Array[Byte]]): Option[RDD[Array[Byte]]] = { - if (jrdd != null) { - Some(jrdd.rdd) - } else { - None - } - } - def apply(rdd: Option[RDD[_]], time: Time): Option[RDD[Array[Byte]]] = { - some(pfunc.call(time.milliseconds, List(wrapRDD(rdd)).asJava)) + PythonDStream.some(pfunc.call(time.milliseconds, List(PythonDStream.wrapRDD(rdd)).asJava)) } def apply(rdd: Option[RDD[_]], rdd2: Option[RDD[_]], time: Time): Option[RDD[Array[Byte]]] = { - some(pfunc.call(time.milliseconds, List(wrapRDD(rdd), wrapRDD(rdd2)).asJava)) + val rdds = List(PythonDStream.wrapRDD(rdd), PythonDStream.wrapRDD(rdd2)).asJava + PythonDStream.some(pfunc.call(time.milliseconds, rdds)) } - // for JFunction2 + // for function.Function2 def call(rdds: JList[JavaRDD[_]], time: Time): JavaRDD[Array[Byte]] = { pfunc.call(time.milliseconds, rdds) } } + +/** + * Base class for PythonDStream with some common methods + */ private[python] -abstract class PythonDStream(parent: DStream[_]) extends DStream[Array[Byte]] (parent.ssc) { +abstract class PythonDStream(parent: DStream[_], pfunc: PythonRDDFunction) + extends DStream[Array[Byte]] (parent.ssc) { + + val func = new RDDFunction(pfunc) override def dependencies = List(parent) @@ -84,12 +77,33 @@ abstract class PythonDStream(parent: DStream[_]) extends DStream[Array[Byte]] (p val asJavaDStream = JavaDStream.fromDStream(this) } +/** + * Helper functions + */ private[spark] object PythonDStream { + // convert Option[RDD[_]] to JavaRDD, handle null gracefully + def wrapRDD(rdd: Option[RDD[_]]): JavaRDD[_] = { + if (rdd.isDefined) { + JavaRDD.fromRDD(rdd.get) + } else { + null + } + } + + // convert JavaRDD to Option[RDD[Array[Byte]]] to , handle null gracefully + def some(jrdd: JavaRDD[Array[Byte]]): Option[RDD[Array[Byte]]] = { + if (jrdd != null) { 
+ Some(jrdd.rdd) + } else { + None + } + } + // helper function for DStream.foreachRDD(), // cannot be `foreachRDD`, it will confusing py4j - def callForeachRDD(jdstream: JavaDStream[Array[Byte]], pyfunc: PythonRDDFunction){ - val func = new RDDFunction(pyfunc) + def callForeachRDD(jdstream: JavaDStream[Array[Byte]], pfunc: PythonRDDFunction){ + val func = new RDDFunction((pfunc)) jdstream.dstream.foreachRDD((rdd, time) => func(Some(rdd), time)) } @@ -112,34 +126,36 @@ private[spark] object PythonDStream { /** * Transformed DStream in Python. * - * If the result RDD is PythonRDD, then it will cache it as an template for future use, - * this can reduce the Python callbacks. + * If `reuse` is true and the result of the `func` is an PythonRDD, then it will cache it + * as an template for future use, this can reduce the Python callbacks. */ private[spark] class PythonTransformedDStream (parent: DStream[_], pfunc: PythonRDDFunction, var reuse: Boolean = false) - extends PythonDStream(parent) { + extends PythonDStream(parent, pfunc) { - val func = new RDDFunction(pfunc) + // rdd returned by func var lastResult: PythonRDD = _ override def compute(validTime: Time): Option[RDD[Array[Byte]]] = { - val rdd1 = parent.getOrCompute(validTime) - if (rdd1.isEmpty) { + val rdd = parent.getOrCompute(validTime) + if (rdd.isEmpty) { return None } if (reuse && lastResult != null) { - Some(lastResult.copyTo(rdd1.get)) + // use the previous result as the template to generate new RDD + Some(lastResult.copyTo(rdd.get)) } else { - val r = func(rdd1, validTime) + val r = func(rdd, validTime) if (reuse && r.isDefined && lastResult == null) { + // try to use the result as a template r.get match { - case rdd: PythonRDD => - if (rdd.parent(0) == rdd1) { + case pyrdd: PythonRDD => + if (pyrdd.parent(0) == rdd) { // only one PythonRDD - lastResult = rdd + lastResult = pyrdd } else { - // may have multiple stages + // maybe have multiple stages, don't check it anymore reuse = false } } @@ -174,10 +190,8 @@ class PythonTransformed2DStream(parent: DStream[_], parent2: DStream[_], * similar to StateDStream */ private[spark] -class PythonStateDStream(parent: DStream[Array[Byte]], preduceFunc: PythonRDDFunction) - extends PythonDStream(parent) { - - val reduceFunc = new RDDFunction(preduceFunc) +class PythonStateDStream(parent: DStream[Array[Byte]], reduceFunc: PythonRDDFunction) + extends PythonDStream(parent, reduceFunc) { super.persist(StorageLevel.MEMORY_ONLY) override val mustCheckpoint = true @@ -186,7 +200,7 @@ class PythonStateDStream(parent: DStream[Array[Byte]], preduceFunc: PythonRDDFun val lastState = getOrCompute(validTime - slideDuration) val rdd = parent.getOrCompute(validTime) if (rdd.isDefined) { - reduceFunc(lastState, rdd, validTime) + func(lastState, rdd, validTime) } else { lastState } @@ -244,7 +258,7 @@ class PythonReducedWindowedDStream(parent: DStream[Array[Byte]], // add the RDDs of the reduced values in "new time steps" val newRDDs = parent.slice(previous.endTime + parent.slideDuration, current.endTime) if (newRDDs.size > 0) { - reduceFunc(subtracted, Some(ssc.sc.union(newRDDs)), validTime) + func(subtracted, Some(ssc.sc.union(newRDDs)), validTime) } else { subtracted } @@ -252,7 +266,7 @@ class PythonReducedWindowedDStream(parent: DStream[Array[Byte]], // Get the RDDs of the reduced values in current window val currentRDDs = parent.slice(current.beginTime + parent.slideDuration, current.endTime) if (currentRDDs.size > 0) { - reduceFunc(None, Some(ssc.sc.union(currentRDDs)), validTime) + func(None, 
Some(ssc.sc.union(currentRDDs)), validTime) } else { None } From eed6e2a034646d91ddddccb42aee6809e0faa93e Mon Sep 17 00:00:00 2001 From: Davies Liu Date: Tue, 30 Sep 2014 00:48:29 -0700 Subject: [PATCH 323/347] rollback not needed changes --- bin/pyspark | 6 ++- python/pyspark/accumulators.py | 5 -- python/pyspark/serializers.py | 5 -- python/pyspark/streaming/tests.py | 38 +++++++-------- python/run-tests | 81 +++++++++++++++---------------- 5 files changed, 64 insertions(+), 71 deletions(-) diff --git a/bin/pyspark b/bin/pyspark index 118e6851af7a0..5142411e36974 100755 --- a/bin/pyspark +++ b/bin/pyspark @@ -87,7 +87,11 @@ export PYSPARK_SUBMIT_ARGS if [[ -n "$SPARK_TESTING" ]]; then unset YARN_CONF_DIR unset HADOOP_CONF_DIR - exec "$PYSPARK_PYTHON" $1 + if [[ -n "$PYSPARK_DOC_TEST" ]]; then + exec "$PYSPARK_PYTHON" -m doctest $1 + else + exec "$PYSPARK_PYTHON" $1 + fi exit fi diff --git a/python/pyspark/accumulators.py b/python/pyspark/accumulators.py index 9aa3db7ccf1dd..ccbca67656c8d 100644 --- a/python/pyspark/accumulators.py +++ b/python/pyspark/accumulators.py @@ -256,8 +256,3 @@ def _start_update_server(): thread.daemon = True thread.start() return server - - -if __name__ == "__main__": - import doctest - doctest.testmod() diff --git a/python/pyspark/serializers.py b/python/pyspark/serializers.py index e666dd9800256..94bebc310bad6 100644 --- a/python/pyspark/serializers.py +++ b/python/pyspark/serializers.py @@ -526,8 +526,3 @@ def write_int(value, stream): def write_with_length(obj, stream): write_int(len(obj), stream) stream.write(obj) - - -if __name__ == "__main__": - import doctest - doctest.testmod() diff --git a/python/pyspark/streaming/tests.py b/python/pyspark/streaming/tests.py index 698978e61ffad..09d2670cc1962 100644 --- a/python/pyspark/streaming/tests.py +++ b/python/pyspark/streaming/tests.py @@ -341,6 +341,25 @@ def func(a, b): expected = [[('a', (1, None)), ('b', (2, 3)), ('c', (None, 4))]] self._test_func(input, func, expected, True, input2) + def update_state_by_key(self): + + def updater(it): + for k, vs, s in it: + if not s: + s = vs + else: + s.extend(vs) + yield (k, s) + + input = [[('k', i)] for i in range(5)] + + def func(dstream): + return dstream.updateStateByKey(updater) + + expected = [[0], [0, 1], [0, 1, 2], [0, 1, 2, 3], [0, 1, 2, 3, 4]] + expected = [[('k', v)] for v in expected] + self._test_func(input, func, expected) + class TestWindowFunctions(PySparkStreamingTestCase): @@ -398,25 +417,6 @@ def test_reduce_by_invalid_window(self): self.assertRaises(ValueError, lambda: d1.reduceByKeyAndWindow(None, None, 0.1, 0.1)) self.assertRaises(ValueError, lambda: d1.reduceByKeyAndWindow(None, None, 1, 0.1)) - def update_state_by_key(self): - - def updater(it): - for k, vs, s in it: - if not s: - s = vs - else: - s.extend(vs) - yield (k, s) - - input = [[('k', i)] for i in range(5)] - - def func(dstream): - return dstream.updateStateByKey(updater) - - expected = [[0], [0, 1], [0, 1, 2], [0, 1, 2, 3], [0, 1, 2, 3, 4]] - expected = [[('k', v)] for v in expected] - self._test_func(input, func, expected) - class TestStreamingContext(PySparkStreamingTestCase): diff --git a/python/run-tests b/python/run-tests index e86e0729cf65e..c5cb580f77fd2 100755 --- a/python/run-tests +++ b/python/run-tests @@ -48,39 +48,6 @@ function run_test() { fi } -function run_core_tests() { - run_test "pyspark/conf.py" - run_test "pyspark/context.py" - run_test "pyspark/broadcast.py" - run_test "pyspark/accumulators.py" - run_test "pyspark/serializers.py" - run_test 
"pyspark/shuffle.py" - run_test "pyspark/rdd.py" - run_test "pyspark/tests.py" -} - -function run_sql_tests() { - run_test "pyspark/sql.py" -} - -function run_mllib_tests() { - run_test "pyspark/mllib/util.py" - run_test "pyspark/mllib/linalg.py" - run_test "pyspark/mllib/classification.py" - run_test "pyspark/mllib/clustering.py" - run_test "pyspark/mllib/random.py" - run_test "pyspark/mllib/recommendation.py" - run_test "pyspark/mllib/regression.py" - run_test "pyspark/mllib/stat.py" - run_test "pyspark/mllib/tree.py" - run_test "pyspark/mllib/tests.py" -} - -function run_streaming_tests() { - run_test "pyspark/streaming/util.py" - run_test "pyspark/streaming/tests.py" -} - echo "Running PySpark tests. Output is in python/unit-tests.log." export PYSPARK_PYTHON="python" @@ -93,10 +60,31 @@ fi echo "Testing with Python version:" $PYSPARK_PYTHON --version -run_core_tests -run_sql_tests -run_mllib_tests -run_streaming_tests +run_test "pyspark/rdd.py" +run_test "pyspark/context.py" +run_test "pyspark/conf.py" +run_test "pyspark/sql.py" +# These tests are included in the module-level docs, and so must +# be handled on a higher level rather than within the python file. +export PYSPARK_DOC_TEST=1 +run_test "pyspark/broadcast.py" +run_test "pyspark/accumulators.py" +run_test "pyspark/serializers.py" +unset PYSPARK_DOC_TEST +run_test "pyspark/shuffle.py" +run_test "pyspark/tests.py" +run_test "pyspark/mllib/classification.py" +run_test "pyspark/mllib/clustering.py" +run_test "pyspark/mllib/linalg.py" +run_test "pyspark/mllib/random.py" +run_test "pyspark/mllib/recommendation.py" +run_test "pyspark/mllib/regression.py" +run_test "pyspark/mllib/stat.py" +run_test "pyspark/mllib/tests.py" +run_test "pyspark/mllib/tree.py" +run_test "pyspark/mllib/util.py" +run_test "pyspark/streaming/util.py" +run_test "pyspark/streaming/tests.py" # Try to test with PyPy if [ $(which pypy) ]; then @@ -104,10 +92,21 @@ if [ $(which pypy) ]; then echo "Testing with PyPy version:" $PYSPARK_PYTHON --version - run_core_tests - run_sql_tests - run_mllib_tests - run_streaming_tests + run_test "pyspark/rdd.py" + run_test "pyspark/context.py" + run_test "pyspark/conf.py" + run_test "pyspark/sql.py" + # These tests are included in the module-level docs, and so must + # be handled on a higher level rather than within the python file. 
+ export PYSPARK_DOC_TEST=1 + run_test "pyspark/broadcast.py" + run_test "pyspark/accumulators.py" + run_test "pyspark/serializers.py" + unset PYSPARK_DOC_TEST + run_test "pyspark/shuffle.py" + run_test "pyspark/tests.py" + run_test "pyspark/streaming/util.py" + run_test "pyspark/streaming/tests.py" fi if [[ $FAILED == 0 ]]; then From b98d63fbde10f20a42e1e6e0f34f45736b802772 Mon Sep 17 00:00:00 2001 From: Davies Liu Date: Tue, 30 Sep 2014 00:52:47 -0700 Subject: [PATCH 324/347] change private[spark] to private[python] --- .../spark/streaming/api/python/PythonDStream.scala | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala index 658715eb456dd..4a52ce1c4f43a 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala @@ -34,7 +34,7 @@ import org.apache.spark.streaming.api.java._ /** * Interface for Python callback function with three arguments */ -private[spark] trait PythonRDDFunction { +private[python] trait PythonRDDFunction { // callback in Python def call(time: Long, rdds: JList[_]): JavaRDD[Array[Byte]] } @@ -80,7 +80,7 @@ abstract class PythonDStream(parent: DStream[_], pfunc: PythonRDDFunction) /** * Helper functions */ -private[spark] object PythonDStream { +private[python] object PythonDStream { // convert Option[RDD[_]] to JavaRDD, handle null gracefully def wrapRDD(rdd: Option[RDD[_]]): JavaRDD[_] = { @@ -129,7 +129,7 @@ private[spark] object PythonDStream { * If `reuse` is true and the result of the `func` is an PythonRDD, then it will cache it * as an template for future use, this can reduce the Python callbacks. */ -private[spark] +private[python] class PythonTransformedDStream (parent: DStream[_], pfunc: PythonRDDFunction, var reuse: Boolean = false) extends PythonDStream(parent, pfunc) { @@ -168,7 +168,7 @@ class PythonTransformedDStream (parent: DStream[_], pfunc: PythonRDDFunction, /** * Transformed from two DStreams in Python. 
*/ -private[spark] +private[python] class PythonTransformed2DStream(parent: DStream[_], parent2: DStream[_], pfunc: PythonRDDFunction) extends DStream[Array[Byte]] (parent.ssc) { @@ -189,7 +189,7 @@ class PythonTransformed2DStream(parent: DStream[_], parent2: DStream[_], /** * similar to StateDStream */ -private[spark] +private[python] class PythonStateDStream(parent: DStream[Array[Byte]], reduceFunc: PythonRDDFunction) extends PythonDStream(parent, reduceFunc) { @@ -210,7 +210,7 @@ class PythonStateDStream(parent: DStream[Array[Byte]], reduceFunc: PythonRDDFunc /** * similar to ReducedWindowedDStream */ -private[spark] +private[python] class PythonReducedWindowedDStream(parent: DStream[Array[Byte]], preduceFunc: PythonRDDFunction, pinvReduceFunc: PythonRDDFunction, From 9a16bd1bdce5b66ff3701aeb94b77d94e8b0a521 Mon Sep 17 00:00:00 2001 From: Davies Liu Date: Tue, 30 Sep 2014 10:08:32 -0700 Subject: [PATCH 325/347] change number of partitions during tests --- python/pyspark/streaming/tests.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/python/pyspark/streaming/tests.py b/python/pyspark/streaming/tests.py index 09d2670cc1962..bd6d92255dbc6 100644 --- a/python/pyspark/streaming/tests.py +++ b/python/pyspark/streaming/tests.py @@ -22,7 +22,7 @@ import unittest import tempfile -from pyspark.context import SparkContext, RDD +from pyspark.context import SparkConf, SparkContext, RDD from pyspark.streaming.context import StreamingContext @@ -33,7 +33,8 @@ class PySparkStreamingTestCase(unittest.TestCase): def setUp(self): class_name = self.__class__.__name__ - self.sc = SparkContext(appName=class_name) + conf = SparkConf().set("spark.default.parallelism", 1) + self.sc = SparkContext(appName=class_name, conf=conf) self.sc.setCheckpointDir("/tmp") # TODO: decrease duration to speed up tests self.ssc = StreamingContext(self.sc, self.duration) From 8466916cec3ce6ebba8c3c2c35f7ad4c74f90e66 Mon Sep 17 00:00:00 2001 From: Davies Liu Date: Tue, 30 Sep 2014 11:51:54 -0700 Subject: [PATCH 326/347] support checkpoint --- python/pyspark/streaming/context.py | 7 +- python/pyspark/streaming/util.py | 28 +++++- .../spark/streaming/StreamingContext.scala | 2 +- .../streaming/api/python/PythonDStream.scala | 87 +++++++++++++++---- 4 files changed, 101 insertions(+), 23 deletions(-) diff --git a/python/pyspark/streaming/context.py b/python/pyspark/streaming/context.py index ae4a1d5b6b069..da645a6201503 100644 --- a/python/pyspark/streaming/context.py +++ b/python/pyspark/streaming/context.py @@ -19,11 +19,11 @@ from py4j.java_gateway import java_import from pyspark import RDD -from pyspark.serializers import UTF8Deserializer +from pyspark.serializers import UTF8Deserializer, CloudPickleSerializer from pyspark.context import SparkContext from pyspark.storagelevel import StorageLevel from pyspark.streaming.dstream import DStream -from pyspark.streaming.util import RDDFunction +from pyspark.streaming.util import RDDFunction, RDDFunctionSerializer __all__ = ["StreamingContext"] @@ -100,6 +100,9 @@ def _initialize_context(self, sc, duration): java_import(self._jvm, "org.apache.spark.streaming.*") java_import(self._jvm, "org.apache.spark.streaming.api.java.*") java_import(self._jvm, "org.apache.spark.streaming.api.python.*") + # register serializer for RDDFunction + ser = RDDFunctionSerializer(self._sc, CloudPickleSerializer()) + self._jvm.PythonDStream.registerSerializer(ser) return self._jvm.JavaStreamingContext(sc._jsc, self._jduration(duration)) def _jduration(self, seconds): diff --git 
a/python/pyspark/streaming/util.py b/python/pyspark/streaming/util.py index 4838ec6c8c6e9..c15f9d98c1866 100644 --- a/python/pyspark/streaming/util.py +++ b/python/pyspark/streaming/util.py @@ -16,6 +16,7 @@ # from datetime import datetime +import traceback from pyspark.rdd import RDD @@ -47,7 +48,6 @@ def call(self, milliseconds, jrdds): if r: return r._jrdd except Exception: - import traceback traceback.print_exc() def __repr__(self): @@ -57,6 +57,32 @@ class Java: implements = ['org.apache.spark.streaming.api.python.PythonRDDFunction'] +class RDDFunctionSerializer(object): + def __init__(self, ctx, serializer): + self.ctx = ctx + self.serializer = serializer + + def dumps(self, id): + try: + func = self.ctx._gateway.gateway_property.pool[id] + return bytearray(self.serializer.dumps((func.func, func.deserializers))) + except Exception: + traceback.print_exc() + + def loads(self, bytes): + try: + f, deserializers = self.serializer.loads(str(bytes)) + return RDDFunction(self.ctx, f, *deserializers) + except Exception: + traceback.print_exc() + + def __repr__(self): + return "RDDFunctionSerializer(%s)" % self.serializer + + class Java: + implements = ['org.apache.spark.streaming.api.python.PythonRDDFunctionSerializer'] + + def rddToFileName(prefix, suffix, time): """ Return string prefix-time(.suffix) diff --git a/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala b/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala index ef7631788f26d..5a8eef1372e23 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala @@ -413,7 +413,7 @@ class StreamingContext private[streaming] ( dstreams: Seq[DStream[_]], transformFunc: (Seq[RDD[_]], Time) => RDD[T] ): DStream[T] = { - new TransformedDStream[T](dstreams, transformFunc) + new TransformedDStream[T](dstreams, sparkContext.clean(transformFunc)) } /** Add a [[org.apache.spark.streaming.scheduler.StreamingListener]] object for diff --git a/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala index 4a52ce1c4f43a..ddbbf107abb3e 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala @@ -17,10 +17,11 @@ package org.apache.spark.streaming.api.python +import java.io.{ObjectInputStream, ObjectOutputStream} +import java.lang.reflect.Proxy import java.util.{ArrayList => JArrayList, List => JList} import scala.collection.JavaConversions._ import scala.collection.JavaConverters._ -import scala.collection.mutable import org.apache.spark.api.java._ import org.apache.spark.api.python._ @@ -35,14 +36,14 @@ import org.apache.spark.streaming.api.java._ * Interface for Python callback function with three arguments */ private[python] trait PythonRDDFunction { - // callback in Python def call(time: Long, rdds: JList[_]): JavaRDD[Array[Byte]] } /** * Wrapper for PythonRDDFunction + * TODO: support checkpoint */ -private[python] class RDDFunction(pfunc: PythonRDDFunction) +private[python] class RDDFunction(@transient var pfunc: PythonRDDFunction) extends function.Function2[JList[JavaRDD[_]], Time, JavaRDD[Array[Byte]]] with Serializable { def apply(rdd: Option[RDD[_]], time: Time): Option[RDD[Array[Byte]]] = { @@ -58,23 +59,47 @@ private[python] class RDDFunction(pfunc: 
PythonRDDFunction) def call(rdds: JList[JavaRDD[_]], time: Time): JavaRDD[Array[Byte]] = { pfunc.call(time.milliseconds, rdds) } -} + private def writeObject(out: ObjectOutputStream): Unit = { + assert(PythonDStream.serializer != null, "Serializer has not been registered!") + val bytes = PythonDStream.serializer.serialize(pfunc) + out.writeInt(bytes.length) + out.write(bytes) + } + + private def readObject(in: ObjectInputStream): Unit = { + assert(PythonDStream.serializer != null, "Serializer has not been registered!") + val length = in.readInt() + val bytes = new Array[Byte](length) + in.readFully(bytes) + pfunc = PythonDStream.serializer.deserialize(bytes) + } +} /** - * Base class for PythonDStream with some common methods + * Inferface for Python Serializer to serialize PythonRDDFunction */ -private[python] -abstract class PythonDStream(parent: DStream[_], pfunc: PythonRDDFunction) - extends DStream[Array[Byte]] (parent.ssc) { - - val func = new RDDFunction(pfunc) - - override def dependencies = List(parent) +private[python] trait PythonRDDFunctionSerializer { + def dumps(id: String): Array[Byte] // + def loads(bytes: Array[Byte]): PythonRDDFunction +} - override def slideDuration: Duration = parent.slideDuration +/** + * Wrapper for PythonRDDFunctionSerializer + */ +private[python] class RDDFunctionSerializer(pser: PythonRDDFunctionSerializer) { + def serialize(func: PythonRDDFunction): Array[Byte] = { + // get the id of PythonRDDFunction in py4j + val h = Proxy.getInvocationHandler(func.asInstanceOf[Proxy]) + val f = h.getClass().getDeclaredField("id"); + f.setAccessible(true); + val id = f.get(h).asInstanceOf[String]; + pser.dumps(id) + } - val asJavaDStream = JavaDStream.fromDStream(this) + def deserialize(bytes: Array[Byte]): PythonRDDFunction = { + pser.loads(bytes) + } } /** @@ -82,6 +107,14 @@ abstract class PythonDStream(parent: DStream[_], pfunc: PythonRDDFunction) */ private[python] object PythonDStream { + // A serializer in Python, used to serialize PythonRDDFunction + var serializer: RDDFunctionSerializer = _ + + // Register a serializer from Python, should be called during initialization + def registerSerializer(ser: PythonRDDFunctionSerializer) = { + serializer = new RDDFunctionSerializer(ser) + } + // convert Option[RDD[_]] to JavaRDD, handle null gracefully def wrapRDD(rdd: Option[RDD[_]]): JavaRDD[_] = { if (rdd.isDefined) { @@ -123,6 +156,22 @@ private[python] object PythonDStream { } } +/** + * Base class for PythonDStream with some common methods + */ +private[python] +abstract class PythonDStream(parent: DStream[_], @transient pfunc: PythonRDDFunction) + extends DStream[Array[Byte]] (parent.ssc) { + + val func = new RDDFunction(pfunc) + + override def dependencies = List(parent) + + override def slideDuration: Duration = parent.slideDuration + + val asJavaDStream = JavaDStream.fromDStream(this) +} + /** * Transformed DStream in Python. * @@ -130,7 +179,7 @@ private[python] object PythonDStream { * as an template for future use, this can reduce the Python callbacks. 
*/ private[python] -class PythonTransformedDStream (parent: DStream[_], pfunc: PythonRDDFunction, +class PythonTransformedDStream (parent: DStream[_], @transient pfunc: PythonRDDFunction, var reuse: Boolean = false) extends PythonDStream(parent, pfunc) { @@ -170,7 +219,7 @@ class PythonTransformedDStream (parent: DStream[_], pfunc: PythonRDDFunction, */ private[python] class PythonTransformed2DStream(parent: DStream[_], parent2: DStream[_], - pfunc: PythonRDDFunction) + @transient pfunc: PythonRDDFunction) extends DStream[Array[Byte]] (parent.ssc) { val func = new RDDFunction(pfunc) @@ -190,7 +239,7 @@ class PythonTransformed2DStream(parent: DStream[_], parent2: DStream[_], * similar to StateDStream */ private[python] -class PythonStateDStream(parent: DStream[Array[Byte]], reduceFunc: PythonRDDFunction) +class PythonStateDStream(parent: DStream[Array[Byte]], @transient reduceFunc: PythonRDDFunction) extends PythonDStream(parent, reduceFunc) { super.persist(StorageLevel.MEMORY_ONLY) @@ -212,8 +261,8 @@ class PythonStateDStream(parent: DStream[Array[Byte]], reduceFunc: PythonRDDFunc */ private[python] class PythonReducedWindowedDStream(parent: DStream[Array[Byte]], - preduceFunc: PythonRDDFunction, - pinvReduceFunc: PythonRDDFunction, + @transient preduceFunc: PythonRDDFunction, + @transient pinvReduceFunc: PythonRDDFunction, _windowDuration: Duration, _slideDuration: Duration ) extends PythonStateDStream(parent, preduceFunc) { From a13ff34d76c35f1a28bb09b8787715c767c9f515 Mon Sep 17 00:00:00 2001 From: Davies Liu Date: Tue, 30 Sep 2014 12:25:14 -0700 Subject: [PATCH 327/347] address comments --- .../main/python/streaming/hdfs_wordcount.py | 32 +++++++++++++++-- .../python/streaming/network_wordcount.py | 30 +++++++++++++++- python/pyspark/streaming/context.py | 10 +++--- python/pyspark/streaming/dstream.py | 3 -- .../streaming/api/python/PythonDStream.scala | 36 +++---------------- 5 files changed, 69 insertions(+), 42 deletions(-) diff --git a/examples/src/main/python/streaming/hdfs_wordcount.py b/examples/src/main/python/streaming/hdfs_wordcount.py index 8c08ff0c89850..40faff0ccc7db 100644 --- a/examples/src/main/python/streaming/hdfs_wordcount.py +++ b/examples/src/main/python/streaming/hdfs_wordcount.py @@ -1,3 +1,31 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +""" + Counts words in new text files created in the given directory + Usage: hdfs_wordcount.py + is the directory that Spark Streaming will use to find and read new text files. + + To run this on your local machine on directory `localdir`, run this example + $ bin/spark-submit examples/src/main/python/streaming/network_wordcount.py localdir + + Then create a text file in `localdir` and the words in the file will get counted. 
+""" + import sys from pyspark import SparkContext @@ -5,10 +33,10 @@ if __name__ == "__main__": if len(sys.argv) != 2: - print >> sys.stderr, "Usage: wordcount " + print >> sys.stderr, "Usage: hdfs_wordcount.py " exit(-1) - sc = SparkContext(appName="PythonStreamingWordCount") + sc = SparkContext(appName="PythonStreamingHDFSWordCount") ssc = StreamingContext(sc, 1) lines = ssc.textFileStream(sys.argv[1]) diff --git a/examples/src/main/python/streaming/network_wordcount.py b/examples/src/main/python/streaming/network_wordcount.py index e3b6248c82a12..cfa9c1ff5bfbc 100644 --- a/examples/src/main/python/streaming/network_wordcount.py +++ b/examples/src/main/python/streaming/network_wordcount.py @@ -1,3 +1,31 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +""" + Counts words in UTF8 encoded, '\n' delimited text received from the network every second. + Usage: network_wordcount.py + and describe the TCP server that Spark Streaming would connect to receive data. + + To run this on your local machine, you need to first run a Netcat server + `$ nc -lk 9999` + and then run the example + `$ bin/spark-submit examples/src/main/python/streaming/network_wordcount.py localhost 9999` +""" + import sys from pyspark import SparkContext @@ -5,7 +33,7 @@ if __name__ == "__main__": if len(sys.argv) != 3: - print >> sys.stderr, "Usage: wordcount " + print >> sys.stderr, "Usage: network_wordcount.py " exit(-1) sc = SparkContext(appName="PythonStreamingNetworkWordCount") ssc = StreamingContext(sc, 1) diff --git a/python/pyspark/streaming/context.py b/python/pyspark/streaming/context.py index da645a6201503..9808361eb664f 100644 --- a/python/pyspark/streaming/context.py +++ b/python/pyspark/streaming/context.py @@ -234,11 +234,11 @@ def transform(self, dstreams, transformFunc): jdstreams = ListConverter().convert([d._jdstream for d in dstreams], SparkContext._gateway._gateway_client) # change the final serializer to sc.serializer - jfunc = RDDFunction(self._sc, - lambda t, *rdds: transformFunc(rdds).map(lambda x: x), - *[d._jrdd_deserializer for d in dstreams]) - - jdstream = self._jvm.PythonDStream.callTransform(self._jssc, jdstreams, jfunc) + func = RDDFunction(self._sc, + lambda t, *rdds: transformFunc(rdds).map(lambda x: x), + *[d._jrdd_deserializer for d in dstreams]) + jfunc = self._jvm.RDDFunction(func) + jdstream = self._jssc.transform(jdstreams, jfunc) return DStream(jdstream, self, self._sc.serializer) def union(self, *dstreams): diff --git a/python/pyspark/streaming/dstream.py b/python/pyspark/streaming/dstream.py index 4e3f07e26953b..87d5bb4906bd5 100644 --- a/python/pyspark/streaming/dstream.py +++ b/python/pyspark/streaming/dstream.py @@ -150,9 +150,6 @@ def partitionBy(self, numPartitions, partitionFunc=portable_hash): """ return self.transform(lambda rdd: 
rdd.partitionBy(numPartitions, partitionFunc)) - # def foreach(self, func): - # return self.foreachRDD(lambda _, rdd: rdd.foreach(func)) - def foreachRDD(self, func): """ Apply a function to each RDD in this DStream. diff --git a/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala index ddbbf107abb3e..4a19f27fe9c7d 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala @@ -47,12 +47,12 @@ private[python] class RDDFunction(@transient var pfunc: PythonRDDFunction) extends function.Function2[JList[JavaRDD[_]], Time, JavaRDD[Array[Byte]]] with Serializable { def apply(rdd: Option[RDD[_]], time: Time): Option[RDD[Array[Byte]]] = { - PythonDStream.some(pfunc.call(time.milliseconds, List(PythonDStream.wrapRDD(rdd)).asJava)) + Option(pfunc.call(time.milliseconds, List(rdd.map(JavaRDD.fromRDD(_)).orNull).asJava)).map(_.rdd) } def apply(rdd: Option[RDD[_]], rdd2: Option[RDD[_]], time: Time): Option[RDD[Array[Byte]]] = { - val rdds = List(PythonDStream.wrapRDD(rdd), PythonDStream.wrapRDD(rdd2)).asJava - PythonDStream.some(pfunc.call(time.milliseconds, rdds)) + val rdds = List(rdd.map(JavaRDD.fromRDD(_)).orNull, rdd2.map(JavaRDD.fromRDD(_)).orNull).asJava + Option(pfunc.call(time.milliseconds, rdds)).map(_.rdd) } // for function.Function2 @@ -115,39 +115,13 @@ private[python] object PythonDStream { serializer = new RDDFunctionSerializer(ser) } - // convert Option[RDD[_]] to JavaRDD, handle null gracefully - def wrapRDD(rdd: Option[RDD[_]]): JavaRDD[_] = { - if (rdd.isDefined) { - JavaRDD.fromRDD(rdd.get) - } else { - null - } - } - - // convert JavaRDD to Option[RDD[Array[Byte]]] to , handle null gracefully - def some(jrdd: JavaRDD[Array[Byte]]): Option[RDD[Array[Byte]]] = { - if (jrdd != null) { - Some(jrdd.rdd) - } else { - None - } - } - // helper function for DStream.foreachRDD(), // cannot be `foreachRDD`, it will confusing py4j - def callForeachRDD(jdstream: JavaDStream[Array[Byte]], pfunc: PythonRDDFunction){ + def callForeachRDD(jdstream: JavaDStream[Array[Byte]], pfunc: PythonRDDFunction) { val func = new RDDFunction((pfunc)) jdstream.dstream.foreachRDD((rdd, time) => func(Some(rdd), time)) } - // helper function for ssc.transform() - def callTransform(ssc: JavaStreamingContext, jdsteams: JList[JavaDStream[_]], - pyfunc: PythonRDDFunction) - :JavaDStream[Array[Byte]] = { - val func = new RDDFunction(pyfunc) - ssc.transform(jdsteams, func) - } - // convert list of RDD into queue of RDDs, for ssc.queueStream() def toRDDQueue(rdds: JArrayList[JavaRDD[Array[Byte]]]): java.util.Queue[JavaRDD[Array[Byte]]] = { val queue = new java.util.LinkedList[JavaRDD[Array[Byte]]] @@ -232,7 +206,7 @@ class PythonTransformed2DStream(parent: DStream[_], parent2: DStream[_], func(parent.getOrCompute(validTime), parent2.getOrCompute(validTime), validTime) } - val asJavaDStream = JavaDStream.fromDStream(this) + val asJavaDStream = JavaDStream.fromDStream(this) } /** From fa7261b5610a02fe725f975fada995d37234f615 Mon Sep 17 00:00:00 2001 From: Davies Liu Date: Tue, 30 Sep 2014 13:36:00 -0700 Subject: [PATCH 328/347] refactor --- .../apache/spark/streaming/api/python/PythonDStream.scala | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala 
b/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala index 4a19f27fe9c7d..f2ed0c507c2b7 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala @@ -239,7 +239,10 @@ class PythonReducedWindowedDStream(parent: DStream[Array[Byte]], @transient pinvReduceFunc: PythonRDDFunction, _windowDuration: Duration, _slideDuration: Duration - ) extends PythonStateDStream(parent, preduceFunc) { + ) extends PythonDStream(parent, preduceFunc) { + + super.persist(StorageLevel.MEMORY_ONLY) + override val mustCheckpoint = true val invReduceFunc = new RDDFunction(pinvReduceFunc) From 6f0da2fa486c2a580045a2e9e3133b6617875363 Mon Sep 17 00:00:00 2001 From: Davies Liu Date: Wed, 1 Oct 2014 00:08:54 -0700 Subject: [PATCH 329/347] recover from checkpoint --- .../apache/spark/api/python/PythonRDD.scala | 8 +- .../spark/rdd/ParallelCollectionRDD.scala | 2 +- .../main/scala/org/apache/spark/rdd/RDD.scala | 8 ++ python/pyspark/context.py | 8 +- python/pyspark/streaming/context.py | 76 ++++++++++++++----- python/pyspark/streaming/tests.py | 33 ++++++++ python/pyspark/streaming/util.py | 24 ++++-- .../streaming/api/python/PythonDStream.scala | 8 +- .../streaming/dstream/QueueInputDStream.scala | 7 ++ 9 files changed, 136 insertions(+), 38 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala b/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala index 8051b221ac3d1..b093917430a59 100644 --- a/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala +++ b/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala @@ -42,7 +42,7 @@ import org.apache.spark.rdd.RDD import org.apache.spark.util.Utils private[spark] class PythonRDD( - parent: RDD[_], + @transient parent: RDD[_], command: Array[Byte], envVars: JMap[String, String], pythonIncludes: JList[String], @@ -61,9 +61,9 @@ private[spark] class PythonRDD( val bufferSize = conf.getInt("spark.buffer.size", 65536) val reuse_worker = conf.getBoolean("spark.python.worker.reuse", true) - override def getPartitions = parent.partitions + override def getPartitions = firstParent.partitions - override val partitioner = if (preservePartitoning) parent.partitioner else None + override val partitioner = if (preservePartitoning) firstParent.partitioner else None override def compute(split: Partition, context: TaskContext): Iterator[Array[Byte]] = { val startTime = System.currentTimeMillis @@ -241,7 +241,7 @@ private[spark] class PythonRDD( dataOut.writeInt(command.length) dataOut.write(command) // Data values - PythonRDD.writeIteratorToStream(parent.iterator(split, context), dataOut) + PythonRDD.writeIteratorToStream(firstParent.iterator(split, context), dataOut) dataOut.writeInt(SpecialLengths.END_OF_DATA_SECTION) dataOut.flush() } catch { diff --git a/core/src/main/scala/org/apache/spark/rdd/ParallelCollectionRDD.scala b/core/src/main/scala/org/apache/spark/rdd/ParallelCollectionRDD.scala index 66c71bf7e8bb5..1069e23241302 100644 --- a/core/src/main/scala/org/apache/spark/rdd/ParallelCollectionRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/ParallelCollectionRDD.scala @@ -84,7 +84,7 @@ private[spark] class ParallelCollectionPartition[T: ClassTag]( private[spark] class ParallelCollectionRDD[T: ClassTag]( @transient sc: SparkContext, - @transient data: Seq[T], + data: Seq[T], numSlices: Int, locationPrefs: Map[Int, Seq[String]]) extends RDD[T](sc, Nil) { diff 
--git a/core/src/main/scala/org/apache/spark/rdd/RDD.scala b/core/src/main/scala/org/apache/spark/rdd/RDD.scala index 0e90caa5c9ca7..352ce5e00d5ec 100644 --- a/core/src/main/scala/org/apache/spark/rdd/RDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/RDD.scala @@ -82,6 +82,14 @@ abstract class RDD[T: ClassTag]( def this(@transient oneParent: RDD[_]) = this(oneParent.context , List(new OneToOneDependency(oneParent))) + // setContext after loading from checkpointing + private[spark] def setContext(s: SparkContext) = { + if (sc != null && sc != s) { + throw new SparkException("Context is already set in " + this + ", cannot set it again") + } + sc = s + } + private[spark] def conf = sc.conf // ======================================================================= // Methods that should be implemented by subclasses of RDD diff --git a/python/pyspark/context.py b/python/pyspark/context.py index 8e7b00469e246..ba930d949101d 100644 --- a/python/pyspark/context.py +++ b/python/pyspark/context.py @@ -68,7 +68,7 @@ class SparkContext(object): def __init__(self, master=None, appName=None, sparkHome=None, pyFiles=None, environment=None, batchSize=1024, serializer=PickleSerializer(), conf=None, - gateway=None): + gateway=None, jsc=None): """ Create a new SparkContext. At least the master and app name should be set, either through the named parameters here or through C{conf}. @@ -103,14 +103,14 @@ def __init__(self, master=None, appName=None, sparkHome=None, pyFiles=None, SparkContext._ensure_initialized(self, gateway=gateway) try: self._do_init(master, appName, sparkHome, pyFiles, environment, batchSize, serializer, - conf) + conf, jsc) except: # If an error occurs, clean up in order to allow future SparkContext creation: self.stop() raise def _do_init(self, master, appName, sparkHome, pyFiles, environment, batchSize, serializer, - conf): + conf, jsc): self.environment = environment or {} self._conf = conf or SparkConf(_jvm=self._jvm) self._batchSize = batchSize # -1 represents an unlimited batch size @@ -151,7 +151,7 @@ def _do_init(self, master, appName, sparkHome, pyFiles, environment, batchSize, self.environment[varName] = v # Create the Java SparkContext through Py4J - self._jsc = self._initialize_context(self._conf._jconf) + self._jsc = jsc or self._initialize_context(self._conf._jconf) # Create a single Accumulator in Java that we'll send all our updates through; # they will be passed back to us through a TCP server diff --git a/python/pyspark/streaming/context.py b/python/pyspark/streaming/context.py index 9808361eb664f..759feda169cff 100644 --- a/python/pyspark/streaming/context.py +++ b/python/pyspark/streaming/context.py @@ -14,11 +14,13 @@ # See the License for the specific language governing permissions and # limitations under the License. # +import os +import sys from py4j.java_collections import ListConverter from py4j.java_gateway import java_import -from pyspark import RDD +from pyspark import RDD, SparkConf from pyspark.serializers import UTF8Deserializer, CloudPickleSerializer from pyspark.context import SparkContext from pyspark.storagelevel import StorageLevel @@ -75,41 +77,81 @@ class StreamingContext(object): respectively. `context.awaitTransformation()` allows the current thread to wait for the termination of the context by `stop()` or by an exception. """ + _transformerSerializer = None - def __init__(self, sparkContext, duration): + def __init__(self, sparkContext, duration=None, jssc=None): """ Create a new StreamingContext. @param sparkContext: L{SparkContext} object. 
@param duration: number of seconds. """ + self._sc = sparkContext self._jvm = self._sc._jvm - self._start_callback_server() - self._jssc = self._initialize_context(self._sc, duration) + self._jssc = jssc or self._initialize_context(self._sc, duration) + + def _initialize_context(self, sc, duration): + self._ensure_initialized() + return self._jvm.JavaStreamingContext(sc._jsc, self._jduration(duration)) + + def _jduration(self, seconds): + """ + Create Duration object given number of seconds + """ + return self._jvm.Duration(int(seconds * 1000)) - def _start_callback_server(self): - gw = self._sc._gateway + @classmethod + def _ensure_initialized(cls): + SparkContext._ensure_initialized() + gw = SparkContext._gateway + # start callback server # getattr will fallback to JVM if "_callback_server" not in gw.__dict__: _daemonize_callback_server() gw._start_callback_server(gw._python_proxy_port) - gw._python_proxy_port = gw._callback_server.port # update port with real port - def _initialize_context(self, sc, duration): - java_import(self._jvm, "org.apache.spark.streaming.*") - java_import(self._jvm, "org.apache.spark.streaming.api.java.*") - java_import(self._jvm, "org.apache.spark.streaming.api.python.*") + java_import(gw.jvm, "org.apache.spark.streaming.*") + java_import(gw.jvm, "org.apache.spark.streaming.api.java.*") + java_import(gw.jvm, "org.apache.spark.streaming.api.python.*") # register serializer for RDDFunction - ser = RDDFunctionSerializer(self._sc, CloudPickleSerializer()) - self._jvm.PythonDStream.registerSerializer(ser) - return self._jvm.JavaStreamingContext(sc._jsc, self._jduration(duration)) + # it happens before creating SparkContext when loading from checkpointing + cls._transformerSerializer = RDDFunctionSerializer(SparkContext._active_spark_context, + CloudPickleSerializer(), gw) + gw.jvm.PythonDStream.registerSerializer(cls._transformerSerializer) - def _jduration(self, seconds): + @classmethod + def getOrCreate(cls, path, setupFunc): """ - Create Duration object given number of seconds + Get the StreamingContext from checkpoint file at `path`, or setup + it by `setupFunc`. + + :param path: directory of checkpoint + :param setupFunc: a function used to create StreamingContext and + setup DStreams. 
+ :return: a StreamingContext """ - return self._jvm.Duration(int(seconds * 1000)) + if not os.path.exists(path) or not os.path.isdir(path) or not os.listdir(path): + ssc = setupFunc() + ssc.checkpoint(path) + return ssc + + cls._ensure_initialized() + gw = SparkContext._gateway + + try: + jssc = gw.jvm.JavaStreamingContext(path) + except Exception: + print >>sys.stderr, "failed to load StreamingContext from checkpoint" + raise + + jsc = jssc.sparkContext() + conf = SparkConf(_jconf=jsc.getConf()) + sc = SparkContext(conf=conf, gateway=gw, jsc=jsc) + # update ctx in serializer + SparkContext._active_spark_context = sc + cls._transformerSerializer.ctx = sc + return StreamingContext(sc, None, jssc) @property def sparkContext(self): diff --git a/python/pyspark/streaming/tests.py b/python/pyspark/streaming/tests.py index bd6d92255dbc6..00fea041d0be3 100644 --- a/python/pyspark/streaming/tests.py +++ b/python/pyspark/streaming/tests.py @@ -493,5 +493,38 @@ def func(rdds): self.assertEqual([2, 3, 1], self._take(dstream, 3)) +class TestCheckpoint(PySparkStreamingTestCase): + + def setUp(self): + pass + + def tearDown(self): + pass + + def test_get_or_create(self): + result = [0] + + def setup(): + conf = SparkConf().set("spark.default.parallelism", 1) + sc = SparkContext(conf=conf) + ssc = StreamingContext(sc, .2) + rdd = sc.parallelize(range(10), 1) + dstream = ssc.queueStream([rdd], default=rdd) + result[0] = self._collect(dstream.countByWindow(1, .2)) + return ssc + tmpd = tempfile.mkdtemp("test_streaming_cps") + ssc = StreamingContext.getOrCreate(tmpd, setup) + ssc.start() + ssc.awaitTermination(4) + ssc.stop() + expected = [[i * 10 + 10] for i in range(5)] + [[50]] * 5 + self.assertEqual(expected, result[0][:10]) + + ssc = StreamingContext.getOrCreate(tmpd, setup) + ssc.start() + ssc.awaitTermination(2) + ssc.stop() + + if __name__ == "__main__": unittest.main() diff --git a/python/pyspark/streaming/util.py b/python/pyspark/streaming/util.py index c15f9d98c1866..4cfaa3fc50e18 100644 --- a/python/pyspark/streaming/util.py +++ b/python/pyspark/streaming/util.py @@ -18,24 +18,31 @@ from datetime import datetime import traceback -from pyspark.rdd import RDD +from pyspark import SparkContext, RDD class RDDFunction(object): """ This class is for py4j callback. 
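
(The getOrCreate() added here is exercised by the checkpoint test later in this patch; a sketch of the intended driver-side usage, where the application name, input directory and checkpoint path are placeholders:)

    from pyspark import SparkContext
    from pyspark.streaming.context import StreamingContext

    def print_batch(batch_time, rdd):
        print batch_time, rdd.take(5)          # Python 2 print, matching the rest of the codebase

    def setup():
        sc = SparkContext(appName="MyStreamingApp")
        ssc = StreamingContext(sc, 1)
        lines = ssc.textFileStream("/data/incoming")     # placeholder input directory
        counts = (lines.flatMap(lambda x: x.split(" "))
                       .map(lambda x: (x, 1))
                       .reduceByKey(lambda a, b: a + b))
        counts.foreachRDD(print_batch)
        # no explicit ssc.checkpoint() needed here: getOrCreate() calls it on first creation
        return ssc

    ssc = StreamingContext.getOrCreate("/tmp/wordcount-checkpoint", setup)
    ssc.start()
    ssc.awaitTermination()
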
""" + _emptyRDD = None + def __init__(self, ctx, func, *deserializers): self.ctx = ctx self.func = func self.deserializers = deserializers - emptyRDD = getattr(self.ctx, "_emptyRDD", None) - if emptyRDD is None: - self.ctx._emptyRDD = emptyRDD = self.ctx.parallelize([]).cache() - self.emptyRDD = emptyRDD + + @property + def emptyRDD(self): + if self._emptyRDD is None and self.ctx: + self._emptyRDD = self.ctx.parallelize([]).cache() + return self._emptyRDD def call(self, milliseconds, jrdds): try: + if self.ctx is None: + self.ctx = SparkContext._active_spark_context + # extend deserializers with the first one sers = self.deserializers if len(sers) < len(jrdds): @@ -51,20 +58,21 @@ def call(self, milliseconds, jrdds): traceback.print_exc() def __repr__(self): - return "RDDFunction(%s)" % (str(self.func)) + return "RDDFunction(%s)" % self.func class Java: implements = ['org.apache.spark.streaming.api.python.PythonRDDFunction'] class RDDFunctionSerializer(object): - def __init__(self, ctx, serializer): + def __init__(self, ctx, serializer, gateway=None): self.ctx = ctx self.serializer = serializer + self.gateway = gateway or self.ctx._gateway def dumps(self, id): try: - func = self.ctx._gateway.gateway_property.pool[id] + func = self.gateway.gateway_property.pool[id] return bytearray(self.serializer.dumps((func.func, func.deserializers))) except Exception: traceback.print_exc() diff --git a/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala index f2ed0c507c2b7..48d1f2ae17e8c 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala @@ -77,7 +77,7 @@ private[python] class RDDFunction(@transient var pfunc: PythonRDDFunction) } /** - * Inferface for Python Serializer to serialize PythonRDDFunction + * Interface for Python Serializer to serialize PythonRDDFunction */ private[python] trait PythonRDDFunctionSerializer { def dumps(id: String): Array[Byte] // @@ -91,9 +91,9 @@ private[python] class RDDFunctionSerializer(pser: PythonRDDFunctionSerializer) { def serialize(func: PythonRDDFunction): Array[Byte] = { // get the id of PythonRDDFunction in py4j val h = Proxy.getInvocationHandler(func.asInstanceOf[Proxy]) - val f = h.getClass().getDeclaredField("id"); - f.setAccessible(true); - val id = f.get(h).asInstanceOf[String]; + val f = h.getClass().getDeclaredField("id") + f.setAccessible(true) + val id = f.get(h).asInstanceOf[String] pser.dumps(id) } diff --git a/streaming/src/main/scala/org/apache/spark/streaming/dstream/QueueInputDStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/dstream/QueueInputDStream.scala index ed7da6dc1315e..0557ac87b5a1e 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/dstream/QueueInputDStream.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/dstream/QueueInputDStream.scala @@ -17,6 +17,7 @@ package org.apache.spark.streaming.dstream +import org.apache.spark.SparkException import org.apache.spark.rdd.RDD import org.apache.spark.rdd.UnionRDD import scala.collection.mutable.Queue @@ -32,6 +33,12 @@ class QueueInputDStream[T: ClassTag]( defaultRDD: RDD[T] ) extends InputDStream[T](ssc) { + private[streaming] override def setContext(s: StreamingContext) { + super.setContext(s) + queue.map(_.setContext(s.sparkContext)) + defaultRDD.setContext(s.sparkContext) + } + override def start() { } 
override def stop() { } From d328aca2d5396ea75c7afffc4c45987c06fc43d9 Mon Sep 17 00:00:00 2001 From: Davies Liu Date: Wed, 1 Oct 2014 09:00:07 -0700 Subject: [PATCH 330/347] fix serializer in queueStream --- python/pyspark/streaming/context.py | 24 ++++++++++++++++-------- python/pyspark/streaming/dstream.py | 2 +- python/pyspark/streaming/tests.py | 6 +++--- 3 files changed, 20 insertions(+), 12 deletions(-) diff --git a/python/pyspark/streaming/context.py b/python/pyspark/streaming/context.py index 759feda169cff..e3a34db566016 100644 --- a/python/pyspark/streaming/context.py +++ b/python/pyspark/streaming/context.py @@ -238,29 +238,37 @@ def textFileStream(self, directory): def _check_serialzers(self, rdds): # make sure they have same serializer - if len(set(rdd._jrdd_deserializer for rdd in rdds)): + if len(set(rdd._jrdd_deserializer for rdd in rdds)) > 1: for i in range(len(rdds)): # reset them to sc.serializer rdds[i] = rdds[i].map(lambda x: x, preservesPartitioning=True) - def queueStream(self, queue, oneAtATime=True, default=None): + def queueStream(self, rdds, oneAtATime=True, default=None): """ Create an input stream from an queue of RDDs or list. In each batch, it will process either one or all of the RDDs returned by the queue. NOTE: changes to the queue after the stream is created will not be recognized. - @param queue Queue of RDDs - @tparam T Type of objects in the RDD + + @param rdds Queue of RDDs + @param oneAtATime pick one rdd each time or pick all of them once. + @param default The default rdd if no more in rdds """ - if queue and not isinstance(queue[0], RDD): - rdds = [self._sc.parallelize(input) for input in queue] - else: - rdds = queue + if default and not isinstance(default, RDD): + default = self._sc.parallelize(default) + + if not rdds and default: + rdds = [rdds] + + if rdds and not isinstance(rdds[0], RDD): + rdds = [self._sc.parallelize(input) for input in rdds] self._check_serialzers(rdds) + jrdds = ListConverter().convert([r._jrdd for r in rdds], SparkContext._gateway._gateway_client) queue = self._jvm.PythonDStream.toRDDQueue(jrdds) if default: + default = default._reserialize(rdds[0]._jrdd_deserializer) jdstream = self._jssc.queueStream(queue, oneAtATime, default._jrdd) else: jdstream = self._jssc.queueStream(queue, oneAtATime) diff --git a/python/pyspark/streaming/dstream.py b/python/pyspark/streaming/dstream.py index 87d5bb4906bd5..8fd6c68340381 100644 --- a/python/pyspark/streaming/dstream.py +++ b/python/pyspark/streaming/dstream.py @@ -292,7 +292,7 @@ def transformWith(self, func, other, keepSerializer=False): oldfunc = func func = lambda t, a, b: oldfunc(a, b) assert func.func_code.co_argcount == 3, "func should take two or three arguments" - jfunc = RDDFunction(self.ctx, func, self._jrdd_deserializer) + jfunc = RDDFunction(self.ctx, func, self._jrdd_deserializer, other._jrdd_deserializer) dstream = self.ctx._jvm.PythonTransformed2DStream(self._jdstream.dstream(), other._jdstream.dstream(), jfunc) jrdd_serializer = self._jrdd_deserializer if keepSerializer else self.ctx.serializer diff --git a/python/pyspark/streaming/tests.py b/python/pyspark/streaming/tests.py index 00fea041d0be3..9e9a0847e7146 100644 --- a/python/pyspark/streaming/tests.py +++ b/python/pyspark/streaming/tests.py @@ -508,16 +508,16 @@ def setup(): conf = SparkConf().set("spark.default.parallelism", 1) sc = SparkContext(conf=conf) ssc = StreamingContext(sc, .2) - rdd = sc.parallelize(range(10), 1) + rdd = sc.parallelize(range(1), 1) dstream = ssc.queueStream([rdd], default=rdd) - 
result[0] = self._collect(dstream.countByWindow(1, .2)) + result[0] = self._collect(dstream.countByWindow(1, 0.2)) return ssc tmpd = tempfile.mkdtemp("test_streaming_cps") ssc = StreamingContext.getOrCreate(tmpd, setup) ssc.start() ssc.awaitTermination(4) ssc.stop() - expected = [[i * 10 + 10] for i in range(5)] + [[50]] * 5 + expected = [[i * 1 + 1] for i in range(5)] + [[5]] * 5 self.assertEqual(expected, result[0][:10]) ssc = StreamingContext.getOrCreate(tmpd, setup) From ff88bec11c497ab62225b945546949508a5b8347 Mon Sep 17 00:00:00 2001 From: Davies Liu Date: Wed, 1 Oct 2014 09:06:52 -0700 Subject: [PATCH 331/347] rename RDDFunction to TransformFunction --- python/pyspark/streaming/context.py | 16 +++--- python/pyspark/streaming/dstream.py | 16 +++--- python/pyspark/streaming/util.py | 14 ++--- .../streaming/api/python/PythonDStream.scala | 52 +++++++++---------- 4 files changed, 49 insertions(+), 49 deletions(-) diff --git a/python/pyspark/streaming/context.py b/python/pyspark/streaming/context.py index e3a34db566016..0f3662b9a54a6 100644 --- a/python/pyspark/streaming/context.py +++ b/python/pyspark/streaming/context.py @@ -25,7 +25,7 @@ from pyspark.context import SparkContext from pyspark.storagelevel import StorageLevel from pyspark.streaming.dstream import DStream -from pyspark.streaming.util import RDDFunction, RDDFunctionSerializer +from pyspark.streaming.util import TransformFunction, TransformFunctionSerializer __all__ = ["StreamingContext"] @@ -114,10 +114,10 @@ def _ensure_initialized(cls): java_import(gw.jvm, "org.apache.spark.streaming.*") java_import(gw.jvm, "org.apache.spark.streaming.api.java.*") java_import(gw.jvm, "org.apache.spark.streaming.api.python.*") - # register serializer for RDDFunction + # register serializer for TransformFunction # it happens before creating SparkContext when loading from checkpointing - cls._transformerSerializer = RDDFunctionSerializer(SparkContext._active_spark_context, - CloudPickleSerializer(), gw) + cls._transformerSerializer = TransformFunctionSerializer( + SparkContext._active_spark_context, CloudPickleSerializer(), gw) gw.jvm.PythonDStream.registerSerializer(cls._transformerSerializer) @classmethod @@ -284,10 +284,10 @@ def transform(self, dstreams, transformFunc): jdstreams = ListConverter().convert([d._jdstream for d in dstreams], SparkContext._gateway._gateway_client) # change the final serializer to sc.serializer - func = RDDFunction(self._sc, - lambda t, *rdds: transformFunc(rdds).map(lambda x: x), - *[d._jrdd_deserializer for d in dstreams]) - jfunc = self._jvm.RDDFunction(func) + func = TransformFunction(self._sc, + lambda t, *rdds: transformFunc(rdds).map(lambda x: x), + *[d._jrdd_deserializer for d in dstreams]) + jfunc = self._jvm.TransformFunction(func) jdstream = self._jssc.transform(jdstreams, jfunc) return DStream(jdstream, self, self._sc.serializer) diff --git a/python/pyspark/streaming/dstream.py b/python/pyspark/streaming/dstream.py index 8fd6c68340381..1b4a4421da0e0 100644 --- a/python/pyspark/streaming/dstream.py +++ b/python/pyspark/streaming/dstream.py @@ -22,7 +22,7 @@ from pyspark import RDD from pyspark.storagelevel import StorageLevel -from pyspark.streaming.util import rddToFileName, RDDFunction +from pyspark.streaming.util import rddToFileName, TransformFunction from pyspark.rdd import portable_hash from pyspark.resultiterable import ResultIterable @@ -154,7 +154,7 @@ def foreachRDD(self, func): """ Apply a function to each RDD in this DStream. 
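For example (a sketch only, assuming `ssc` is an already-created StreamingContext; the host and port are illustrative):

    lines = ssc.socketTextStream("localhost", 9999)

    def show_batch(time, rdd):
        # `time` is the batch time, `rdd` holds that batch's data
        print "batch at", time, ":", rdd.take(10)

    lines.foreachRDD(show_batch)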
""" - jfunc = RDDFunction(self.ctx, func, self._jrdd_deserializer) + jfunc = TransformFunction(self.ctx, func, self._jrdd_deserializer) api = self._ssc._jvm.PythonDStream api.callForeachRDD(self._jdstream, jfunc) @@ -292,7 +292,7 @@ def transformWith(self, func, other, keepSerializer=False): oldfunc = func func = lambda t, a, b: oldfunc(a, b) assert func.func_code.co_argcount == 3, "func should take two or three arguments" - jfunc = RDDFunction(self.ctx, func, self._jrdd_deserializer, other._jrdd_deserializer) + jfunc = TransformFunction(self.ctx, func, self._jrdd_deserializer, other._jrdd_deserializer) dstream = self.ctx._jvm.PythonTransformed2DStream(self._jdstream.dstream(), other._jdstream.dstream(), jfunc) jrdd_serializer = self._jrdd_deserializer if keepSerializer else self.ctx.serializer @@ -535,9 +535,9 @@ def invReduceFunc(t, a, b): joined = a.leftOuterJoin(b, numPartitions) return joined.mapValues(lambda (v1, v2): invFunc(v1, v2) if v2 is not None else v1) - jreduceFunc = RDDFunction(self.ctx, reduceFunc, reduced._jrdd_deserializer) + jreduceFunc = TransformFunction(self.ctx, reduceFunc, reduced._jrdd_deserializer) if invReduceFunc: - jinvReduceFunc = RDDFunction(self.ctx, invReduceFunc, reduced._jrdd_deserializer) + jinvReduceFunc = TransformFunction(self.ctx, invReduceFunc, reduced._jrdd_deserializer) else: jinvReduceFunc = None if slideDuration is None: @@ -568,8 +568,8 @@ def reduceFunc(t, a, b): state = g.mapPartitions(lambda x: updateFunc(x)) return state.filter(lambda (k, v): v is not None) - jreduceFunc = RDDFunction(self.ctx, reduceFunc, - self.ctx.serializer, self._jrdd_deserializer) + jreduceFunc = TransformFunction(self.ctx, reduceFunc, + self.ctx.serializer, self._jrdd_deserializer) dstream = self.ctx._jvm.PythonStateDStream(self._jdstream.dstream(), jreduceFunc) return DStream(dstream.asJavaDStream(), self._ssc, self.ctx.serializer) @@ -609,7 +609,7 @@ def _jdstream(self): return self._jdstream_val func = self.func - jfunc = RDDFunction(self.ctx, func, self.prev._jrdd_deserializer) + jfunc = TransformFunction(self.ctx, func, self.prev._jrdd_deserializer) jdstream = self.ctx._jvm.PythonTransformedDStream(self.prev._jdstream.dstream(), jfunc, self.reuse).asJavaDStream() self._jdstream_val = jdstream diff --git a/python/pyspark/streaming/util.py b/python/pyspark/streaming/util.py index 4cfaa3fc50e18..4f07e44aa2d43 100644 --- a/python/pyspark/streaming/util.py +++ b/python/pyspark/streaming/util.py @@ -21,7 +21,7 @@ from pyspark import SparkContext, RDD -class RDDFunction(object): +class TransformFunction(object): """ This class is for py4j callback. 
""" @@ -58,13 +58,13 @@ def call(self, milliseconds, jrdds): traceback.print_exc() def __repr__(self): - return "RDDFunction(%s)" % self.func + return "TransformFunction(%s)" % self.func class Java: - implements = ['org.apache.spark.streaming.api.python.PythonRDDFunction'] + implements = ['org.apache.spark.streaming.api.python.PythonTransformFunction'] -class RDDFunctionSerializer(object): +class TransformFunctionSerializer(object): def __init__(self, ctx, serializer, gateway=None): self.ctx = ctx self.serializer = serializer @@ -80,15 +80,15 @@ def dumps(self, id): def loads(self, bytes): try: f, deserializers = self.serializer.loads(str(bytes)) - return RDDFunction(self.ctx, f, *deserializers) + return TransformFunction(self.ctx, f, *deserializers) except Exception: traceback.print_exc() def __repr__(self): - return "RDDFunctionSerializer(%s)" % self.serializer + return "TransformFunctionSerializer(%s)" % self.serializer class Java: - implements = ['org.apache.spark.streaming.api.python.PythonRDDFunctionSerializer'] + implements = ['org.apache.spark.streaming.api.python.PythonTransformFunctionSerializer'] def rddToFileName(prefix, suffix, time): diff --git a/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala index 48d1f2ae17e8c..59bb2ed5fa042 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala @@ -35,15 +35,15 @@ import org.apache.spark.streaming.api.java._ /** * Interface for Python callback function with three arguments */ -private[python] trait PythonRDDFunction { +private[python] trait PythonTransformFunction { def call(time: Long, rdds: JList[_]): JavaRDD[Array[Byte]] } /** - * Wrapper for PythonRDDFunction + * Wrapper for PythonTransformFunction * TODO: support checkpoint */ -private[python] class RDDFunction(@transient var pfunc: PythonRDDFunction) +private[python] class TransformFunction(@transient var pfunc: PythonTransformFunction) extends function.Function2[JList[JavaRDD[_]], Time, JavaRDD[Array[Byte]]] with Serializable { def apply(rdd: Option[RDD[_]], time: Time): Option[RDD[Array[Byte]]] = { @@ -77,19 +77,19 @@ private[python] class RDDFunction(@transient var pfunc: PythonRDDFunction) } /** - * Interface for Python Serializer to serialize PythonRDDFunction + * Interface for Python Serializer to serialize PythonTransformFunction */ -private[python] trait PythonRDDFunctionSerializer { +private[python] trait PythonTransformFunctionSerializer { def dumps(id: String): Array[Byte] // - def loads(bytes: Array[Byte]): PythonRDDFunction + def loads(bytes: Array[Byte]): PythonTransformFunction } /** - * Wrapper for PythonRDDFunctionSerializer + * Wrapper for PythonTransformFunctionSerializer */ -private[python] class RDDFunctionSerializer(pser: PythonRDDFunctionSerializer) { - def serialize(func: PythonRDDFunction): Array[Byte] = { - // get the id of PythonRDDFunction in py4j +private[python] class TransformFunctionSerializer(pser: PythonTransformFunctionSerializer) { + def serialize(func: PythonTransformFunction): Array[Byte] = { + // get the id of PythonTransformFunction in py4j val h = Proxy.getInvocationHandler(func.asInstanceOf[Proxy]) val f = h.getClass().getDeclaredField("id") f.setAccessible(true) @@ -97,7 +97,7 @@ private[python] class RDDFunctionSerializer(pser: PythonRDDFunctionSerializer) { pser.dumps(id) } - def 
deserialize(bytes: Array[Byte]): PythonRDDFunction = { + def deserialize(bytes: Array[Byte]): PythonTransformFunction = { pser.loads(bytes) } } @@ -107,18 +107,18 @@ private[python] class RDDFunctionSerializer(pser: PythonRDDFunctionSerializer) { */ private[python] object PythonDStream { - // A serializer in Python, used to serialize PythonRDDFunction - var serializer: RDDFunctionSerializer = _ + // A serializer in Python, used to serialize PythonTransformFunction + var serializer: TransformFunctionSerializer = _ // Register a serializer from Python, should be called during initialization - def registerSerializer(ser: PythonRDDFunctionSerializer) = { - serializer = new RDDFunctionSerializer(ser) + def registerSerializer(ser: PythonTransformFunctionSerializer) = { + serializer = new TransformFunctionSerializer(ser) } // helper function for DStream.foreachRDD(), // cannot be `foreachRDD`, it will confusing py4j - def callForeachRDD(jdstream: JavaDStream[Array[Byte]], pfunc: PythonRDDFunction) { - val func = new RDDFunction((pfunc)) + def callForeachRDD(jdstream: JavaDStream[Array[Byte]], pfunc: PythonTransformFunction) { + val func = new TransformFunction((pfunc)) jdstream.dstream.foreachRDD((rdd, time) => func(Some(rdd), time)) } @@ -134,10 +134,10 @@ private[python] object PythonDStream { * Base class for PythonDStream with some common methods */ private[python] -abstract class PythonDStream(parent: DStream[_], @transient pfunc: PythonRDDFunction) +abstract class PythonDStream(parent: DStream[_], @transient pfunc: PythonTransformFunction) extends DStream[Array[Byte]] (parent.ssc) { - val func = new RDDFunction(pfunc) + val func = new TransformFunction(pfunc) override def dependencies = List(parent) @@ -153,7 +153,7 @@ abstract class PythonDStream(parent: DStream[_], @transient pfunc: PythonRDDFunc * as an template for future use, this can reduce the Python callbacks. 
*/ private[python] -class PythonTransformedDStream (parent: DStream[_], @transient pfunc: PythonRDDFunction, +class PythonTransformedDStream (parent: DStream[_], @transient pfunc: PythonTransformFunction, var reuse: Boolean = false) extends PythonDStream(parent, pfunc) { @@ -193,10 +193,10 @@ class PythonTransformedDStream (parent: DStream[_], @transient pfunc: PythonRDDF */ private[python] class PythonTransformed2DStream(parent: DStream[_], parent2: DStream[_], - @transient pfunc: PythonRDDFunction) + @transient pfunc: PythonTransformFunction) extends DStream[Array[Byte]] (parent.ssc) { - val func = new RDDFunction(pfunc) + val func = new TransformFunction(pfunc) override def slideDuration: Duration = parent.slideDuration @@ -213,7 +213,7 @@ class PythonTransformed2DStream(parent: DStream[_], parent2: DStream[_], * similar to StateDStream */ private[python] -class PythonStateDStream(parent: DStream[Array[Byte]], @transient reduceFunc: PythonRDDFunction) +class PythonStateDStream(parent: DStream[Array[Byte]], @transient reduceFunc: PythonTransformFunction) extends PythonDStream(parent, reduceFunc) { super.persist(StorageLevel.MEMORY_ONLY) @@ -235,8 +235,8 @@ class PythonStateDStream(parent: DStream[Array[Byte]], @transient reduceFunc: Py */ private[python] class PythonReducedWindowedDStream(parent: DStream[Array[Byte]], - @transient preduceFunc: PythonRDDFunction, - @transient pinvReduceFunc: PythonRDDFunction, + @transient preduceFunc: PythonTransformFunction, + @transient pinvReduceFunc: PythonTransformFunction, _windowDuration: Duration, _slideDuration: Duration ) extends PythonDStream(parent, preduceFunc) { @@ -244,7 +244,7 @@ class PythonReducedWindowedDStream(parent: DStream[Array[Byte]], super.persist(StorageLevel.MEMORY_ONLY) override val mustCheckpoint = true - val invReduceFunc = new RDDFunction(pinvReduceFunc) + val invReduceFunc = new TransformFunction(pinvReduceFunc) def windowDuration: Duration = _windowDuration override def slideDuration: Duration = _slideDuration From 7797c70f321b9ba5a66ad6a2744cf8e829dde011 Mon Sep 17 00:00:00 2001 From: Davies Liu Date: Wed, 1 Oct 2014 09:09:25 -0700 Subject: [PATCH 332/347] refactor --- .../org/apache/spark/streaming/api/python/PythonDStream.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala index 59bb2ed5fa042..5ab15f717903e 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala @@ -174,7 +174,7 @@ class PythonTransformedDStream (parent: DStream[_], @transient pfunc: PythonTran // try to use the result as a template r.get match { case pyrdd: PythonRDD => - if (pyrdd.parent(0) == rdd) { + if (pyrdd.firstParent == rdd) { // only one PythonRDD lastResult = pyrdd } else { From bd8a4c2516147f1e99cf1f6e721346c18db23a20 Mon Sep 17 00:00:00 2001 From: Davies Liu Date: Wed, 1 Oct 2014 09:26:26 -0700 Subject: [PATCH 333/347] fix scala style --- .../streaming/api/python/PythonDStream.scala | 47 ++++++++++--------- 1 file changed, 25 insertions(+), 22 deletions(-) diff --git a/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala index 5ab15f717903e..5afcb84857350 100644 --- 
a/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala @@ -47,7 +47,8 @@ private[python] class TransformFunction(@transient var pfunc: PythonTransformFun extends function.Function2[JList[JavaRDD[_]], Time, JavaRDD[Array[Byte]]] with Serializable { def apply(rdd: Option[RDD[_]], time: Time): Option[RDD[Array[Byte]]] = { - Option(pfunc.call(time.milliseconds, List(rdd.map(JavaRDD.fromRDD(_)).orNull).asJava)).map(_.rdd) + Option(pfunc.call(time.milliseconds, List(rdd.map(JavaRDD.fromRDD(_)).orNull).asJava)) + .map(_.rdd) } def apply(rdd: Option[RDD[_]], rdd2: Option[RDD[_]], time: Time): Option[RDD[Array[Byte]]] = { @@ -133,8 +134,9 @@ private[python] object PythonDStream { /** * Base class for PythonDStream with some common methods */ -private[python] -abstract class PythonDStream(parent: DStream[_], @transient pfunc: PythonTransformFunction) +private[python] abstract class PythonDStream( + parent: DStream[_], + @transient pfunc: PythonTransformFunction) extends DStream[Array[Byte]] (parent.ssc) { val func = new TransformFunction(pfunc) @@ -152,9 +154,10 @@ abstract class PythonDStream(parent: DStream[_], @transient pfunc: PythonTransfo * If `reuse` is true and the result of the `func` is an PythonRDD, then it will cache it * as an template for future use, this can reduce the Python callbacks. */ -private[python] -class PythonTransformedDStream (parent: DStream[_], @transient pfunc: PythonTransformFunction, - var reuse: Boolean = false) +private[python] class PythonTransformedDStream ( + parent: DStream[_], + @transient pfunc: PythonTransformFunction, + var reuse: Boolean = false) extends PythonDStream(parent, pfunc) { // rdd returned by func @@ -191,9 +194,10 @@ class PythonTransformedDStream (parent: DStream[_], @transient pfunc: PythonTran /** * Transformed from two DStreams in Python. 
*/ -private[python] -class PythonTransformed2DStream(parent: DStream[_], parent2: DStream[_], - @transient pfunc: PythonTransformFunction) +private[python] class PythonTransformed2DStream( + parent: DStream[_], + parent2: DStream[_], + @transient pfunc: PythonTransformFunction) extends DStream[Array[Byte]] (parent.ssc) { val func = new TransformFunction(pfunc) @@ -212,8 +216,9 @@ class PythonTransformed2DStream(parent: DStream[_], parent2: DStream[_], /** * similar to StateDStream */ -private[python] -class PythonStateDStream(parent: DStream[Array[Byte]], @transient reduceFunc: PythonTransformFunction) +private[python] class PythonStateDStream( + parent: DStream[Array[Byte]], + @transient reduceFunc: PythonTransformFunction) extends PythonDStream(parent, reduceFunc) { super.persist(StorageLevel.MEMORY_ONLY) @@ -233,13 +238,13 @@ class PythonStateDStream(parent: DStream[Array[Byte]], @transient reduceFunc: Py /** * similar to ReducedWindowedDStream */ -private[python] -class PythonReducedWindowedDStream(parent: DStream[Array[Byte]], - @transient preduceFunc: PythonTransformFunction, - @transient pinvReduceFunc: PythonTransformFunction, - _windowDuration: Duration, - _slideDuration: Duration - ) extends PythonDStream(parent, preduceFunc) { +private[python] class PythonReducedWindowedDStream( + parent: DStream[Array[Byte]], + @transient preduceFunc: PythonTransformFunction, + @transient pinvReduceFunc: PythonTransformFunction, + _windowDuration: Duration, + _slideDuration: Duration) + extends PythonDStream(parent, preduceFunc) { super.persist(StorageLevel.MEMORY_ONLY) override val mustCheckpoint = true @@ -252,8 +257,7 @@ class PythonReducedWindowedDStream(parent: DStream[Array[Byte]], override def compute(validTime: Time): Option[RDD[Array[Byte]]] = { val currentTime = validTime - val current = new Interval(currentTime - windowDuration, - currentTime) + val current = new Interval(currentTime - windowDuration, currentTime) val previous = current - slideDuration // _____________________________ @@ -266,11 +270,10 @@ class PythonReducedWindowedDStream(parent: DStream[Array[Byte]], // V V // old RDDs new RDDs // - val previousRDD = getOrCompute(previous.endTime) + // for small window, reduce once will be better than twice if (pinvReduceFunc != null && previousRDD.isDefined - // for small window, reduce once will be better than twice && windowDuration >= slideDuration * 5) { // subtract the values from old RDDs From 7a88f9f1b054468b40e3134d7f4e0be8aacb03fa Mon Sep 17 00:00:00 2001 From: Davies Liu Date: Wed, 1 Oct 2014 12:11:40 -0700 Subject: [PATCH 334/347] rollback RDD.setContext(), use textFileStream() to test checkpointing --- .../spark/rdd/ParallelCollectionRDD.scala | 2 +- .../main/scala/org/apache/spark/rdd/RDD.scala | 8 --- python/pyspark/streaming/tests.py | 52 ++++++++++--------- .../streaming/dstream/QueueInputDStream.scala | 7 --- 4 files changed, 28 insertions(+), 41 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/rdd/ParallelCollectionRDD.scala b/core/src/main/scala/org/apache/spark/rdd/ParallelCollectionRDD.scala index 1069e23241302..66c71bf7e8bb5 100644 --- a/core/src/main/scala/org/apache/spark/rdd/ParallelCollectionRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/ParallelCollectionRDD.scala @@ -84,7 +84,7 @@ private[spark] class ParallelCollectionPartition[T: ClassTag]( private[spark] class ParallelCollectionRDD[T: ClassTag]( @transient sc: SparkContext, - data: Seq[T], + @transient data: Seq[T], numSlices: Int, locationPrefs: Map[Int, Seq[String]]) 
extends RDD[T](sc, Nil) { diff --git a/core/src/main/scala/org/apache/spark/rdd/RDD.scala b/core/src/main/scala/org/apache/spark/rdd/RDD.scala index 352ce5e00d5ec..0e90caa5c9ca7 100644 --- a/core/src/main/scala/org/apache/spark/rdd/RDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/RDD.scala @@ -82,14 +82,6 @@ abstract class RDD[T: ClassTag]( def this(@transient oneParent: RDD[_]) = this(oneParent.context , List(new OneToOneDependency(oneParent))) - // setContext after loading from checkpointing - private[spark] def setContext(s: SparkContext) = { - if (sc != null && sc != s) { - throw new SparkException("Context is already set in " + this + ", cannot set it again") - } - sc = s - } - private[spark] def conf = sc.conf // ======================================================================= // Methods that should be implemented by subclasses of RDD diff --git a/python/pyspark/streaming/tests.py b/python/pyspark/streaming/tests.py index 9e9a0847e7146..b489c8b3f46f3 100644 --- a/python/pyspark/streaming/tests.py +++ b/python/pyspark/streaming/tests.py @@ -70,7 +70,8 @@ def _collect(self, dstream): def get_output(_, rdd): r = rdd.collect() - result.append(r) + if r: + result.append(r) dstream.foreachRDD(get_output) return result @@ -449,24 +450,18 @@ def test_queueStream(self): time.sleep(1) self.assertEqual(input, result[:3]) - # TODO: fix this test - # def test_textFileStream(self): - # input = [range(i) for i in range(3)] - # dstream = self.ssc.queueStream(input) - # d = os.path.join(tempfile.gettempdir(), str(id(self))) - # if not os.path.exists(d): - # os.makedirs(d) - # dstream.saveAsTextFiles(os.path.join(d, 'test')) - # self.ssc.start() - # time.sleep(1) - # self.ssc.stop(False, True) - # - # self.ssc = StreamingContext(self.sc, self.batachDuration) - # dstream2 = self.ssc.textFileStream(d) - # result = self._collect(dstream2) - # self.ssc.start() - # time.sleep(2) - # self.assertEqual(input, result[:3]) + def test_textFileStream(self): + d = tempfile.mkdtemp() + self.ssc = StreamingContext(self.sc, self.duration) + dstream2 = self.ssc.textFileStream(d).map(int) + result = self._collect(dstream2) + self.ssc.start() + time.sleep(1) + for name in ('a', 'b'): + with open(os.path.join(d, name), "w") as f: + f.writelines(["%d\n" % i for i in range(10)]) + time.sleep(2) + self.assertEqual([range(10) * 2], result[:3]) def test_union(self): input = [range(i) for i in range(3)] @@ -503,27 +498,34 @@ def tearDown(self): def test_get_or_create(self): result = [0] + inputd = tempfile.mkdtemp() def setup(): conf = SparkConf().set("spark.default.parallelism", 1) sc = SparkContext(conf=conf) ssc = StreamingContext(sc, .2) - rdd = sc.parallelize(range(1), 1) - dstream = ssc.queueStream([rdd], default=rdd) - result[0] = self._collect(dstream.countByWindow(1, 0.2)) + dstream = ssc.textFileStream(inputd) + result[0] = self._collect(dstream.count()) return ssc + tmpd = tempfile.mkdtemp("test_streaming_cps") ssc = StreamingContext.getOrCreate(tmpd, setup) ssc.start() + time.sleep(1) + with open(os.path.join(inputd, "1"), 'w') as f: + f.writelines(["%d\n" % i for i in range(10)]) ssc.awaitTermination(4) - ssc.stop() + ssc.stop(True, True) expected = [[i * 1 + 1] for i in range(5)] + [[5]] * 5 - self.assertEqual(expected, result[0][:10]) + self.assertEqual([[10]], result[0][:1]) ssc = StreamingContext.getOrCreate(tmpd, setup) ssc.start() + time.sleep(1) + with open(os.path.join(inputd, "1"), 'w') as f: + f.writelines(["%d\n" % i for i in range(10)]) ssc.awaitTermination(2) - ssc.stop() + 
ssc.stop(True, True) if __name__ == "__main__": diff --git a/streaming/src/main/scala/org/apache/spark/streaming/dstream/QueueInputDStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/dstream/QueueInputDStream.scala index 0557ac87b5a1e..ed7da6dc1315e 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/dstream/QueueInputDStream.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/dstream/QueueInputDStream.scala @@ -17,7 +17,6 @@ package org.apache.spark.streaming.dstream -import org.apache.spark.SparkException import org.apache.spark.rdd.RDD import org.apache.spark.rdd.UnionRDD import scala.collection.mutable.Queue @@ -33,12 +32,6 @@ class QueueInputDStream[T: ClassTag]( defaultRDD: RDD[T] ) extends InputDStream[T](ssc) { - private[streaming] override def setContext(s: StreamingContext) { - super.setContext(s) - queue.map(_.setContext(s.sparkContext)) - defaultRDD.setContext(s.sparkContext) - } - override def start() { } override def stop() { } From 54bd92b5800ce9165e53289c44603e6a89c5ed75 Mon Sep 17 00:00:00 2001 From: Davies Liu Date: Wed, 1 Oct 2014 23:34:41 -0700 Subject: [PATCH 335/347] improve tests --- python/pyspark/streaming/context.py | 1 - python/pyspark/streaming/dstream.py | 29 +++- python/pyspark/streaming/tests.py | 150 ++++++++++-------- python/pyspark/streaming/util.py | 23 +-- .../streaming/api/python/PythonDStream.scala | 56 ++++--- 5 files changed, 151 insertions(+), 108 deletions(-) diff --git a/python/pyspark/streaming/context.py b/python/pyspark/streaming/context.py index 0f3662b9a54a6..b84e12ebac1dc 100644 --- a/python/pyspark/streaming/context.py +++ b/python/pyspark/streaming/context.py @@ -118,7 +118,6 @@ def _ensure_initialized(cls): # it happens before creating SparkContext when loading from checkpointing cls._transformerSerializer = TransformFunctionSerializer( SparkContext._active_spark_context, CloudPickleSerializer(), gw) - gw.jvm.PythonDStream.registerSerializer(cls._transformerSerializer) @classmethod def getOrCreate(cls, path, setupFunc): diff --git a/python/pyspark/streaming/dstream.py b/python/pyspark/streaming/dstream.py index 1b4a4421da0e0..f8ebb7e68d8d7 100644 --- a/python/pyspark/streaming/dstream.py +++ b/python/pyspark/streaming/dstream.py @@ -20,6 +20,8 @@ import time from datetime import datetime +from py4j.protocol import Py4JJavaError + from pyspark import RDD from pyspark.storagelevel import StorageLevel from pyspark.streaming.util import rddToFileName, TransformFunction @@ -249,9 +251,15 @@ def saveAsTextFiles(self, prefix, suffix=None): Save each RDD in this DStream as at text file, using string representation of elements. """ - def saveAsTextFile(time, rdd): - path = rddToFileName(prefix, suffix, time) - rdd.saveAsTextFile(path) + def saveAsTextFile(t, rdd): + path = rddToFileName(prefix, suffix, t) + try: + rdd.saveAsTextFile(path) + except Py4JJavaError as e: + # after recovered from checkpointing, the foreachRDD may + # be called twice + if 'FileAlreadyExistsException' not in str(e): + raise return self.foreachRDD(saveAsTextFile) def _saveAsPickleFiles(self, prefix, suffix=None): @@ -259,9 +267,15 @@ def _saveAsPickleFiles(self, prefix, suffix=None): Save each RDD in this DStream as at binary file, the elements are serialized by pickle. 
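As a usage sketch (not part of the patch; paths are illustrative and `ssc` is assumed to exist as in the earlier sketches), each micro-batch is written to its own directory named prefix-<batch time>(.suffix), matching the rddToFileName helper shown later in this series:

    lines = ssc.textFileStream("/tmp/streaming-input")
    pairs = lines.map(lambda x: (x, 1))
    pairs.saveAsTextFiles("/tmp/streaming-output/wordcount", "txt")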
""" - def saveAsPickleFile(time, rdd): - path = rddToFileName(prefix, suffix, time) - rdd.saveAsPickleFile(path) + def saveAsPickleFile(t, rdd): + path = rddToFileName(prefix, suffix, t) + try: + rdd.saveAsPickleFile(path) + except Py4JJavaError as e: + # after recovered from checkpointing, the foreachRDD may + # be called twice + if 'FileAlreadyExistsException' not in str(e): + raise return self.foreachRDD(saveAsPickleFile) def transform(self, func): @@ -608,8 +622,7 @@ def _jdstream(self): if self._jdstream_val is not None: return self._jdstream_val - func = self.func - jfunc = TransformFunction(self.ctx, func, self.prev._jrdd_deserializer) + jfunc = TransformFunction(self.ctx, self.func, self.prev._jrdd_deserializer) jdstream = self.ctx._jvm.PythonTransformedDStream(self.prev._jdstream.dstream(), jfunc, self.reuse).asJavaDStream() self._jdstream_val = jdstream diff --git a/python/pyspark/streaming/tests.py b/python/pyspark/streaming/tests.py index b489c8b3f46f3..ff5986776a94e 100644 --- a/python/pyspark/streaming/tests.py +++ b/python/pyspark/streaming/tests.py @@ -42,6 +42,13 @@ def setUp(self): def tearDown(self): self.ssc.stop() + def wait_for(self, result, n): + start_time = time.time() + while len(result) < n and time.time() - start_time < self.timeout: + time.sleep(0.01) + if len(result) < n: + print "timeout after", self.timeout + def _take(self, dstream, n): """ Return the first `n` elements in the stream (will start and stop). @@ -55,12 +62,10 @@ def take(_, rdd): dstream.foreachRDD(take) self.ssc.start() - while len(results) < n: - time.sleep(0.01) - self.ssc.stop(False, True) + self.wait_for(results, n) return results - def _collect(self, dstream): + def _collect(self, dstream, n, block=True): """ Collect each RDDs into the returned list. @@ -69,10 +74,18 @@ def _collect(self, dstream): result = [] def get_output(_, rdd): - r = rdd.collect() - if r: - result.append(r) + if rdd and len(result) < n: + r = rdd.collect() + if r: + result.append(r) + dstream.foreachRDD(get_output) + + if not block: + return result + + self.ssc.start() + self.wait_for(result, n) return result def _test_func(self, input, func, expected, sort=False, input2=None): @@ -94,23 +107,7 @@ def _test_func(self, input, func, expected, sort=False, input2=None): else: stream = func(input_stream) - result = self._collect(stream) - self.ssc.start() - - start_time = time.time() - # Loop until get the expected the number of the result from the stream. - while True: - current_time = time.time() - # Check time out. - if (current_time - start_time) > self.timeout: - print "timeout after", self.timeout - break - # StreamingContext.awaitTermination is not used to wait because - # if py4j server is called every 50 milliseconds, it gets an error. - time.sleep(0.05) - # Check if the output is the same length of expected output. 
- if len(expected) == len(result): - break + result = self._collect(stream, len(expected)) if sort: self._sort_result_based_on_key(result) self._sort_result_based_on_key(expected) @@ -424,55 +421,50 @@ class TestStreamingContext(PySparkStreamingTestCase): duration = 0.1 + def _add_input_stream(self): + inputs = map(lambda x: range(1, x), range(101)) + stream = self.ssc.queueStream(inputs) + self._collect(stream, 1, block=False) + def test_stop_only_streaming_context(self): - self._addInputStream() + self._add_input_stream() self.ssc.start() self.ssc.stop(False) self.assertEqual(len(self.sc.parallelize(range(5), 5).glom().collect()), 5) def test_stop_multiple_times(self): - self._addInputStream() + self._add_input_stream() self.ssc.start() self.ssc.stop() self.ssc.stop() - def _addInputStream(self): - # Make sure each length of input is over 3 - inputs = map(lambda x: range(1, x), range(5, 101)) - stream = self.ssc.queueStream(inputs) - self._collect(stream) - - def test_queueStream(self): - input = [range(i) for i in range(3)] + def test_queue_stream(self): + input = [range(i + 1) for i in range(3)] dstream = self.ssc.queueStream(input) - result = self._collect(dstream) - self.ssc.start() - time.sleep(1) - self.assertEqual(input, result[:3]) + result = self._collect(dstream, 3) + self.assertEqual(input, result) - def test_textFileStream(self): + def test_text_file_stream(self): d = tempfile.mkdtemp() self.ssc = StreamingContext(self.sc, self.duration) dstream2 = self.ssc.textFileStream(d).map(int) - result = self._collect(dstream2) + result = self._collect(dstream2, 2, block=False) self.ssc.start() - time.sleep(1) for name in ('a', 'b'): + time.sleep(1) with open(os.path.join(d, name), "w") as f: f.writelines(["%d\n" % i for i in range(10)]) - time.sleep(2) - self.assertEqual([range(10) * 2], result[:3]) + self.wait_for(result, 2) + self.assertEqual([range(10), range(10)], result) def test_union(self): - input = [range(i) for i in range(3)] + input = [range(i + 1) for i in range(3)] dstream = self.ssc.queueStream(input) dstream2 = self.ssc.queueStream(input) dstream3 = self.ssc.union(dstream, dstream2) - result = self._collect(dstream3) - self.ssc.start() - time.sleep(1) + result = self._collect(dstream3, 3) expected = [i * 2 for i in input] - self.assertEqual(expected, result[:3]) + self.assertEqual(expected, result) def test_transform(self): dstream1 = self.ssc.queueStream([[1]]) @@ -497,34 +489,62 @@ def tearDown(self): pass def test_get_or_create(self): - result = [0] inputd = tempfile.mkdtemp() + outputd = tempfile.mkdtemp() + "/" + + def updater(it): + for k, vs, s in it: + yield (k, sum(vs, s or 0)) def setup(): conf = SparkConf().set("spark.default.parallelism", 1) sc = SparkContext(conf=conf) - ssc = StreamingContext(sc, .2) - dstream = ssc.textFileStream(inputd) - result[0] = self._collect(dstream.count()) + ssc = StreamingContext(sc, 0.2) + dstream = ssc.textFileStream(inputd).map(lambda x: (x, 1)) + wc = dstream.updateStateByKey(updater) + wc.map(lambda x: "%s,%d" % x).saveAsTextFiles(outputd + "test") + wc.checkpoint(.2) return ssc - tmpd = tempfile.mkdtemp("test_streaming_cps") - ssc = StreamingContext.getOrCreate(tmpd, setup) + cpd = tempfile.mkdtemp("test_streaming_cps") + ssc = StreamingContext.getOrCreate(cpd, setup) ssc.start() - time.sleep(1) - with open(os.path.join(inputd, "1"), 'w') as f: - f.writelines(["%d\n" % i for i in range(10)]) - ssc.awaitTermination(4) + + def check_output(n): + while not os.listdir(outputd): + time.sleep(0.1) + time.sleep(1) # make sure 
mtime is larger than the previous one + with open(os.path.join(inputd, str(n)), 'w') as f: + f.writelines(["%d\n" % i for i in range(10)]) + + while True: + p = os.path.join(outputd, max(os.listdir(outputd))) + if '_SUCCESS' not in os.listdir(p): + # not finished + time.sleep(0.01) + continue + ordd = ssc.sparkContext.textFile(p).map(lambda line: line.split(",")) + d = ordd.values().map(int).collect() + if not d: + time.sleep(0.01) + continue + self.assertEqual(10, len(d)) + s = set(d) + self.assertEqual(1, len(s)) + m = s.pop() + if n > m: + continue + self.assertEqual(n, m) + break + + check_output(1) + check_output(2) ssc.stop(True, True) - expected = [[i * 1 + 1] for i in range(5)] + [[5]] * 5 - self.assertEqual([[10]], result[0][:1]) - ssc = StreamingContext.getOrCreate(tmpd, setup) - ssc.start() time.sleep(1) - with open(os.path.join(inputd, "1"), 'w') as f: - f.writelines(["%d\n" % i for i in range(10)]) - ssc.awaitTermination(2) + ssc = StreamingContext.getOrCreate(cpd, setup) + ssc.start() + check_output(3) ssc.stop(True, True) diff --git a/python/pyspark/streaming/util.py b/python/pyspark/streaming/util.py index 4f07e44aa2d43..aecf7f71fdbc7 100644 --- a/python/pyspark/streaming/util.py +++ b/python/pyspark/streaming/util.py @@ -15,6 +15,7 @@ # limitations under the License. # +import time from datetime import datetime import traceback @@ -32,23 +33,20 @@ def __init__(self, ctx, func, *deserializers): self.func = func self.deserializers = deserializers - @property - def emptyRDD(self): - if self._emptyRDD is None and self.ctx: - self._emptyRDD = self.ctx.parallelize([]).cache() - return self._emptyRDD - def call(self, milliseconds, jrdds): try: if self.ctx is None: self.ctx = SparkContext._active_spark_context + if not self.ctx or not self.ctx._jsc: + # stopped + return # extend deserializers with the first one sers = self.deserializers if len(sers) < len(jrdds): sers += (sers[0],) * (len(jrdds) - len(sers)) - rdds = [RDD(jrdd, self.ctx, ser) if jrdd else self.emptyRDD + rdds = [RDD(jrdd, self.ctx, ser) if jrdd else None for jrdd, ser in zip(jrdds, sers)] t = datetime.fromtimestamp(milliseconds / 1000.0) r = self.func(t, *rdds) @@ -69,6 +67,7 @@ def __init__(self, ctx, serializer, gateway=None): self.ctx = ctx self.serializer = serializer self.gateway = gateway or self.ctx._gateway + self.gateway.jvm.PythonDStream.registerSerializer(self) def dumps(self, id): try: @@ -91,7 +90,7 @@ class Java: implements = ['org.apache.spark.streaming.api.python.PythonTransformFunctionSerializer'] -def rddToFileName(prefix, suffix, time): +def rddToFileName(prefix, suffix, timestamp): """ Return string prefix-time(.suffix) @@ -99,12 +98,14 @@ def rddToFileName(prefix, suffix, time): 'spark-12345678910' >>> rddToFileName("spark", "tmp", 12345678910) 'spark-12345678910.tmp' - """ + if isinstance(timestamp, datetime): + seconds = time.mktime(timestamp.timetuple()) + timestamp = long(seconds * 1000) + timestamp.microsecond / 1000 if suffix is None: - return prefix + "-" + str(time) + return prefix + "-" + str(timestamp) else: - return prefix + "-" + str(time) + "." + suffix + return prefix + "-" + str(timestamp) + "." 
+ suffix if __name__ == "__main__": diff --git a/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala index 5afcb84857350..59552bb0a2205 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala @@ -22,6 +22,7 @@ import java.lang.reflect.Proxy import java.util.{ArrayList => JArrayList, List => JList} import scala.collection.JavaConversions._ import scala.collection.JavaConverters._ +import scala.language.existentials import org.apache.spark.api.java._ import org.apache.spark.api.python._ @@ -39,9 +40,16 @@ private[python] trait PythonTransformFunction { def call(time: Long, rdds: JList[_]): JavaRDD[Array[Byte]] } +/** + * Interface for Python Serializer to serialize PythonTransformFunction + */ +private[python] trait PythonTransformFunctionSerializer { + def dumps(id: String): Array[Byte] + def loads(bytes: Array[Byte]): PythonTransformFunction +} + /** * Wrapper for PythonTransformFunction - * TODO: support checkpoint */ private[python] class TransformFunction(@transient var pfunc: PythonTransformFunction) extends function.Function2[JList[JavaRDD[_]], Time, JavaRDD[Array[Byte]]] with Serializable { @@ -62,44 +70,45 @@ private[python] class TransformFunction(@transient var pfunc: PythonTransformFun } private def writeObject(out: ObjectOutputStream): Unit = { - assert(PythonDStream.serializer != null, "Serializer has not been registered!") - val bytes = PythonDStream.serializer.serialize(pfunc) + val bytes = PythonTransformFunctionSerializer.serialize(pfunc) out.writeInt(bytes.length) out.write(bytes) } private def readObject(in: ObjectInputStream): Unit = { - assert(PythonDStream.serializer != null, "Serializer has not been registered!") val length = in.readInt() val bytes = new Array[Byte](length) in.readFully(bytes) - pfunc = PythonDStream.serializer.deserialize(bytes) + pfunc = PythonTransformFunctionSerializer.deserialize(bytes) } } /** - * Interface for Python Serializer to serialize PythonTransformFunction + * Helpers for PythonTransformFunctionSerializer */ -private[python] trait PythonTransformFunctionSerializer { - def dumps(id: String): Array[Byte] // - def loads(bytes: Array[Byte]): PythonTransformFunction -} +private[python] object PythonTransformFunctionSerializer { + + // A serializer in Python, used to serialize PythonTransformFunction + private var serializer: PythonTransformFunctionSerializer = _ + + // Register a serializer from Python, should be called during initialization + def register(ser: PythonTransformFunctionSerializer): Unit = { + serializer = ser + } -/** - * Wrapper for PythonTransformFunctionSerializer - */ -private[python] class TransformFunctionSerializer(pser: PythonTransformFunctionSerializer) { def serialize(func: PythonTransformFunction): Array[Byte] = { + assert(serializer != null, "Serializer has not been registered!") // get the id of PythonTransformFunction in py4j val h = Proxy.getInvocationHandler(func.asInstanceOf[Proxy]) val f = h.getClass().getDeclaredField("id") f.setAccessible(true) val id = f.get(h).asInstanceOf[String] - pser.dumps(id) + serializer.dumps(id) } def deserialize(bytes: Array[Byte]): PythonTransformFunction = { - pser.loads(bytes) + assert(serializer != null, "Serializer has not been registered!") + serializer.loads(bytes) } } @@ -108,12 +117,10 @@ private[python] class 
TransformFunctionSerializer(pser: PythonTransformFunctionS */ private[python] object PythonDStream { - // A serializer in Python, used to serialize PythonTransformFunction - var serializer: TransformFunctionSerializer = _ - - // Register a serializer from Python, should be called during initialization - def registerSerializer(ser: PythonTransformFunctionSerializer) = { - serializer = new TransformFunctionSerializer(ser) + // can not access PythonTransformFunctionSerializer.register() via Py4j + // Py4JError: PythonTransformFunctionSerializerregister does not exist in the JVM + def registerSerializer(ser: PythonTransformFunctionSerializer): Unit = { + PythonTransformFunctionSerializer.register(ser) } // helper function for DStream.foreachRDD(), @@ -207,7 +214,10 @@ private[python] class PythonTransformed2DStream( override def dependencies = List(parent, parent2) override def compute(validTime: Time): Option[RDD[Array[Byte]]] = { - func(parent.getOrCompute(validTime), parent2.getOrCompute(validTime), validTime) + val empty: RDD[_] = ssc.sparkContext.emptyRDD + val rdd1 = parent.getOrCompute(validTime).getOrElse(empty) + val rdd2 = parent2.getOrCompute(validTime).getOrElse(empty) + func(Some(rdd1), Some(rdd2), validTime) } val asJavaDStream = JavaDStream.fromDStream(this) From 4d0ea8bf5df513d5d1f4250286ca328192018f08 Mon Sep 17 00:00:00 2001 From: Davies Liu Date: Thu, 2 Oct 2014 00:10:38 -0700 Subject: [PATCH 336/347] clear reference of SparkEnv after stop --- core/src/main/scala/org/apache/spark/SparkEnv.scala | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/core/src/main/scala/org/apache/spark/SparkEnv.scala b/core/src/main/scala/org/apache/spark/SparkEnv.scala index 009ed64775844..57874df3819b2 100644 --- a/core/src/main/scala/org/apache/spark/SparkEnv.scala +++ b/core/src/main/scala/org/apache/spark/SparkEnv.scala @@ -91,6 +91,9 @@ class SparkEnv ( // actorSystem.awaitTermination() // Note that blockTransferService is stopped by BlockManager since it is started by it. + + // clear all the references in ThreadLocal object + SparkEnv.reset() } private[spark] @@ -119,7 +122,7 @@ class SparkEnv ( } object SparkEnv extends Logging { - private val env = new ThreadLocal[SparkEnv] + @volatile private var env = new ThreadLocal[SparkEnv] @volatile private var lastSetSparkEnv : SparkEnv = _ private[spark] val driverActorSystemName = "sparkDriver" @@ -130,6 +133,12 @@ object SparkEnv extends Logging { env.set(e) } + // clear all the threadlocal references + private[spark] def reset(): Unit = { + env = new ThreadLocal[SparkEnv] + lastSetSparkEnv = null + } + /** * Returns the ThreadLocal SparkEnv, if non-null. Else returns the SparkEnv * previously set in any thread. 
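Outside the test suite, the checkpoint-recovery flow exercised above looks roughly like the sketch below. The paths, batch interval and checkpoint interval are illustrative assumptions; the updater mirrors the iterator-style updateStateByKey signature used in the test.

    from pyspark import SparkConf, SparkContext
    from pyspark.streaming import StreamingContext

    checkpoint_dir = "/tmp/streaming-checkpoint"   # illustrative path
    input_dir = "/tmp/streaming-input"             # illustrative path

    def create_context():
        conf = SparkConf().set("spark.default.parallelism", 1)
        sc = SparkContext(conf=conf)
        ssc = StreamingContext(sc, 1)
        ssc.checkpoint(checkpoint_dir)

        def updater(it):
            # running count per key; `s` is None for keys seen the first time
            for k, vs, s in it:
                yield (k, sum(vs, s or 0))

        counts = ssc.textFileStream(input_dir).map(lambda x: (x, 1))
        state = counts.updateStateByKey(updater)
        state.checkpoint(10)
        state.saveAsTextFiles("/tmp/streaming-output/state")
        return ssc

    # On a first run this calls create_context(); after a restart it rebuilds
    # the context (and its Python transform functions) from the checkpoint.
    ssc = StreamingContext.getOrCreate(checkpoint_dir, create_context)
    ssc.start()
    ssc.awaitTermination()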
From c7bbbced7ba2d45e5fb2c1452920de11bd5138a8 Mon Sep 17 00:00:00 2001 From: Davies Liu Date: Thu, 2 Oct 2014 08:01:53 -0700 Subject: [PATCH 337/347] fix sphinx docs --- python/docs/epytext.py | 2 +- python/docs/index.rst | 1 + python/docs/modules.rst | 3 ++ python/docs/pyspark.rst | 3 +- python/pyspark/streaming/__init__.py | 2 + python/pyspark/streaming/context.py | 24 +++++----- python/pyspark/streaming/dstream.py | 65 ++++++++++++++-------------- 7 files changed, 54 insertions(+), 46 deletions(-) diff --git a/python/docs/epytext.py b/python/docs/epytext.py index 61d731bff570d..19fefbfc057a4 100644 --- a/python/docs/epytext.py +++ b/python/docs/epytext.py @@ -5,7 +5,7 @@ (r"L{([\w.()]+)}", r":class:`\1`"), (r"[LC]{(\w+\.\w+)\(\)}", r":func:`\1`"), (r"C{([\w.()]+)}", r":class:`\1`"), - (r"[IBCM]{(.+)}", r"`\1`"), + (r"[IBCM]{([^}]+)}", r"`\1`"), ('pyspark.rdd.RDD', 'RDD'), ) diff --git a/python/docs/index.rst b/python/docs/index.rst index 25b3f9bd93e63..e0f4e5c192acf 100644 --- a/python/docs/index.rst +++ b/python/docs/index.rst @@ -13,6 +13,7 @@ Contents: pyspark pyspark.sql + pyspark.streaming pyspark.mllib diff --git a/python/docs/modules.rst b/python/docs/modules.rst index 183564659fbcf..04dce62be5f49 100644 --- a/python/docs/modules.rst +++ b/python/docs/modules.rst @@ -5,3 +5,6 @@ :maxdepth: 4 pyspark + pyspark.sql + pyspark.streaming + pyspark.mllib diff --git a/python/docs/pyspark.rst b/python/docs/pyspark.rst index a68bd62433085..e81be3b6cb796 100644 --- a/python/docs/pyspark.rst +++ b/python/docs/pyspark.rst @@ -7,8 +7,9 @@ Subpackages .. toctree:: :maxdepth: 1 - pyspark.mllib pyspark.sql + pyspark.streaming + pyspark.mllib Contents -------- diff --git a/python/pyspark/streaming/__init__.py b/python/pyspark/streaming/__init__.py index 00d2823525992..d2644a1d4ffab 100644 --- a/python/pyspark/streaming/__init__.py +++ b/python/pyspark/streaming/__init__.py @@ -17,3 +17,5 @@ from pyspark.streaming.context import StreamingContext from pyspark.streaming.dstream import DStream + +__all__ = ['StreamingContext', 'DStream'] diff --git a/python/pyspark/streaming/context.py b/python/pyspark/streaming/context.py index b84e12ebac1dc..aabbbd958080a 100644 --- a/python/pyspark/streaming/context.py +++ b/python/pyspark/streaming/context.py @@ -71,7 +71,7 @@ class StreamingContext(object): """ Main entry point for Spark Streaming functionality. A StreamingContext represents the connection to a Spark cluster, and can be used to create - L{DStream}s various input sources. It can be from an existing L{SparkContext}. + L{DStream} various input sources. It can be from an existing L{SparkContext}. After creating and transforming DStreams, the streaming computation can be started and stopped using `context.start()` and `context.stop()`, respectively. `context.awaitTransformation()` allows the current thread @@ -180,8 +180,8 @@ def stop(self, stopSparkContext=True, stopGraceFully=False): Stop the execution of the streams, with option of ensuring all received data has been processed. - @param stopSparkContext Stop the associated SparkContext or not - @param stopGracefully Stop gracefully by waiting for the processing + @param stopSparkContext: Stop the associated SparkContext or not + @param stopGracefully: Stop gracefully by waiting for the processing of all received data to be completed """ self._jssc.stop(stopSparkContext, stopGraceFully) @@ -197,7 +197,7 @@ def remember(self, duration): the RDDs (if the developer wishes to query old data outside the DStream computation). 
- @param duration Minimum duration (in seconds) that each DStream + @param duration: Minimum duration (in seconds) that each DStream should remember its RDDs """ self._jssc.remember(self._jduration(duration)) @@ -207,7 +207,7 @@ def checkpoint(self, directory): Sets the context to periodically checkpoint the DStream operations for master fault-tolerance. The graph will be checkpointed every batch interval. - @param directory HDFS-compatible directory where the checkpoint data + @param directory: HDFS-compatible directory where the checkpoint data will be reliably stored """ self._jssc.checkpoint(directory) @@ -215,12 +215,12 @@ def checkpoint(self, directory): def socketTextStream(self, hostname, port, storageLevel=StorageLevel.MEMORY_AND_DISK_SER_2): """ Create an input from TCP source hostname:port. Data is received using - a TCP socket and receive byte is interpreted as UTF8 encoded '\n' delimited + a TCP socket and receive byte is interpreted as UTF8 encoded ``\\n`` delimited lines. - @param hostname Hostname to connect to for receiving data - @param port Port to connect to for receiving data - @param storageLevel Storage level to use for storing the received objects + @param hostname: Hostname to connect to for receiving data + @param port: Port to connect to for receiving data + @param storageLevel: Storage level to use for storing the received objects """ jlevel = self._sc._getJavaStorageLevel(storageLevel) return DStream(self._jssc.socketTextStream(hostname, port, jlevel), self, @@ -249,9 +249,9 @@ def queueStream(self, rdds, oneAtATime=True, default=None): NOTE: changes to the queue after the stream is created will not be recognized. - @param rdds Queue of RDDs - @param oneAtATime pick one rdd each time or pick all of them once. - @param default The default rdd if no more in rdds + @param rdds: Queue of RDDs + @param oneAtATime: pick one rdd each time or pick all of them once. + @param default: The default rdd if no more in rdds """ if default and not isinstance(default, RDD): default = self._sc.parallelize(default) diff --git a/python/pyspark/streaming/dstream.py b/python/pyspark/streaming/dstream.py index f8ebb7e68d8d7..a77e8f505e147 100644 --- a/python/pyspark/streaming/dstream.py +++ b/python/pyspark/streaming/dstream.py @@ -284,7 +284,7 @@ def transform(self, func): on each RDD of 'this' DStream. `func` can have one argument of `rdd`, or have two arguments of - (`time`, `rdd`) + (`time`, `rdd`) """ resue = False if func.func_code.co_argcount == 1: @@ -328,7 +328,8 @@ def _slideDuration(self): def union(self, other): """ Return a new DStream by unifying data of another DStream with this DStream. - @param other Another DStream having the same interval (i.e., slideDuration) + + @param other: Another DStream having the same interval (i.e., slideDuration) as this DStream. """ if self._slideDuration != other._slideDuration: @@ -348,11 +349,11 @@ def cogroup(self, other, numPartitions=None): def join(self, other, numPartitions=None): """ - Return a new DStream by applying 'join' between RDDs of `this` DStream and + Return a new DStream by applying 'join' between RDDs of `this` DStream and `other` DStream. Hash partitioning is used to generate the RDDs with `numPartitions` - partitions. + partitions. 
""" if numPartitions is None: numPartitions = self.ctx.defaultParallelism @@ -360,11 +361,11 @@ def join(self, other, numPartitions=None): def leftOuterJoin(self, other, numPartitions=None): """ - Return a new DStream by applying 'left outer join' between RDDs of `this` DStream and + Return a new DStream by applying 'left outer join' between RDDs of `this` DStream and `other` DStream. Hash partitioning is used to generate the RDDs with `numPartitions` - partitions. + partitions. """ if numPartitions is None: numPartitions = self.ctx.defaultParallelism @@ -372,11 +373,11 @@ def leftOuterJoin(self, other, numPartitions=None): def rightOuterJoin(self, other, numPartitions=None): """ - Return a new DStream by applying 'right outer join' between RDDs of `this` DStream and + Return a new DStream by applying 'right outer join' between RDDs of `this` DStream and `other` DStream. Hash partitioning is used to generate the RDDs with `numPartitions` - partitions. + partitions. """ if numPartitions is None: numPartitions = self.ctx.defaultParallelism @@ -384,11 +385,11 @@ def rightOuterJoin(self, other, numPartitions=None): def fullOuterJoin(self, other, numPartitions=None): """ - Return a new DStream by applying 'full outer join' between RDDs of `this` DStream and + Return a new DStream by applying 'full outer join' between RDDs of `this` DStream and `other` DStream. Hash partitioning is used to generate the RDDs with `numPartitions` - partitions. + partitions. """ if numPartitions is None: numPartitions = self.ctx.defaultParallelism @@ -424,9 +425,9 @@ def window(self, windowDuration, slideDuration=None): Return a new DStream in which each RDD contains all the elements in seen in a sliding window of time over this DStream. - @param windowDuration width of the window; must be a multiple of this DStream's + @param windowDuration: width of the window; must be a multiple of this DStream's batching interval - @param slideDuration sliding interval of the window (i.e., the interval after which + @param slideDuration: sliding interval of the window (i.e., the interval after which the new DStream will generate RDDs); must be a multiple of this DStream's batching interval """ @@ -448,13 +449,13 @@ def reduceByWindow(self, reduceFunc, invReduceFunc, windowDuration, slideDuratio 2. "inverse reduce" the old values that left the window (e.g., subtracting old counts) This is more efficient than `invReduceFunc` is None. 
- @param reduceFunc associative reduce function - @param invReduceFunc inverse reduce function of `reduceFunc` - @param windowDuration width of the window; must be a multiple of this DStream's - batching interval - @param slideDuration sliding interval of the window (i.e., the interval after which - the new DStream will generate RDDs); must be a multiple of this - DStream's batching interval + @param reduceFunc: associative reduce function + @param invReduceFunc: inverse reduce function of `reduceFunc` + @param windowDuration: width of the window; must be a multiple of this DStream's + batching interval + @param slideDuration: sliding interval of the window (i.e., the interval after which + the new DStream will generate RDDs); must be a multiple of this + DStream's batching interval """ keyed = self.map(lambda x: (1, x)) reduced = keyed.reduceByKeyAndWindow(reduceFunc, invReduceFunc, @@ -478,12 +479,12 @@ def countByValueAndWindow(self, windowDuration, slideDuration, numPartitions=Non Return a new DStream in which each RDD contains the count of distinct elements in RDDs in a sliding window over this DStream. - @param windowDuration width of the window; must be a multiple of this DStream's + @param windowDuration: width of the window; must be a multiple of this DStream's batching interval - @param slideDuration sliding interval of the window (i.e., the interval after which + @param slideDuration: sliding interval of the window (i.e., the interval after which the new DStream will generate RDDs); must be a multiple of this DStream's batching interval - @param numPartitions number of partitions of each RDD in the new DStream. + @param numPartitions: number of partitions of each RDD in the new DStream. """ keyed = self.map(lambda x: (x, 1)) counted = keyed.reduceByKeyAndWindow(operator.add, operator.sub, @@ -495,12 +496,12 @@ def groupByKeyAndWindow(self, windowDuration, slideDuration, numPartitions=None) Return a new DStream by applying `groupByKey` over a sliding window. Similar to `DStream.groupByKey()`, but applies it over a sliding window. - @param windowDuration width of the window; must be a multiple of this DStream's + @param windowDuration: width of the window; must be a multiple of this DStream's batching interval - @param slideDuration sliding interval of the window (i.e., the interval after which + @param slideDuration: sliding interval of the window (i.e., the interval after which the new DStream will generate RDDs); must be a multiple of this DStream's batching interval - @param numPartitions Number of partitions of each RDD in the new DStream. + @param numPartitions: Number of partitions of each RDD in the new DStream. """ ls = self.mapValues(lambda x: [x]) grouped = ls.reduceByKeyAndWindow(lambda a, b: a.extend(b) or a, lambda a, b: a[len(b):], @@ -519,15 +520,15 @@ def reduceByKeyAndWindow(self, func, invFunc, windowDuration, slideDuration=None `invFunc` can be None, then it will reduce all the RDDs in window, could be slower than having `invFunc`. 
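For example (a sketch, assuming `pairs` is a DStream of (key, 1) pairs as in the earlier sketches; durations are in seconds):

    windowed = pairs.reduceByKeyAndWindow(
        lambda a, b: a + b,   # fold values entering the window
        lambda a, b: a - b,   # "inverse" function for values leaving the window
        windowDuration=30,
        slideDuration=10)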
- @param reduceFunc associative reduce function - @param invReduceFunc inverse function of `reduceFunc` - @param windowDuration width of the window; must be a multiple of this DStream's + @param reduceFunc: associative reduce function + @param invReduceFunc: inverse function of `reduceFunc` + @param windowDuration: width of the window; must be a multiple of this DStream's batching interval - @param slideDuration sliding interval of the window (i.e., the interval after which + @param slideDuration: sliding interval of the window (i.e., the interval after which the new DStream will generate RDDs); must be a multiple of this DStream's batching interval - @param numPartitions number of partitions of each RDD in the new DStream. - @param filterFunc function to filter expired key-value pairs; + @param numPartitions: number of partitions of each RDD in the new DStream. + @param filterFunc: function to filter expired key-value pairs; only pairs that satisfy the function are retained set this to null if you do not want to filter """ @@ -567,7 +568,7 @@ def updateStateByKey(self, updateFunc, numPartitions=None): Return a new "state" DStream where the state for each key is updated by applying the given function on the previous state of the key and the new values of the key. - @param updateFunc State update function ([(k, vs, s)] -> [(k, s)]). + @param updateFunc: State update function ([(k, vs, s)] -> [(k, s)]). If `s` is None, then `k` will be eliminated. """ if numPartitions is None: From be5e5ffdc5d2606042f09adb8d0fff08ddc4b85d Mon Sep 17 00:00:00 2001 From: Davies Liu Date: Thu, 2 Oct 2014 08:27:52 -0700 Subject: [PATCH 338/347] merge branch of env, make tests stable. --- python/pyspark/streaming/tests.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/python/pyspark/streaming/tests.py b/python/pyspark/streaming/tests.py index ff5986776a94e..6a7dfd574701d 100644 --- a/python/pyspark/streaming/tests.py +++ b/python/pyspark/streaming/tests.py @@ -485,9 +485,6 @@ class TestCheckpoint(PySparkStreamingTestCase): def setUp(self): pass - def tearDown(self): - pass - def test_get_or_create(self): inputd = tempfile.mkdtemp() outputd = tempfile.mkdtemp() + "/" @@ -545,7 +542,6 @@ def check_output(n): ssc = StreamingContext.getOrCreate(cpd, setup) ssc.start() check_output(3) - ssc.stop(True, True) if __name__ == "__main__": From d05871e912ee4828a4ac68a6a0ceed0454e44722 Mon Sep 17 00:00:00 2001 From: Davies Liu Date: Thu, 2 Oct 2014 12:46:50 -0700 Subject: [PATCH 339/347] remove reuse of PythonRDD --- python/pyspark/streaming/dstream.py | 28 ++++++------- python/pyspark/streaming/tests.py | 4 +- .../streaming/api/python/PythonDStream.scala | 39 ++++--------------- 3 files changed, 20 insertions(+), 51 deletions(-) diff --git a/python/pyspark/streaming/dstream.py b/python/pyspark/streaming/dstream.py index a77e8f505e147..fddfd757b8674 100644 --- a/python/pyspark/streaming/dstream.py +++ b/python/pyspark/streaming/dstream.py @@ -286,13 +286,11 @@ def transform(self, func): `func` can have one argument of `rdd`, or have two arguments of (`time`, `rdd`) """ - resue = False if func.func_code.co_argcount == 1: - reuse = True oldfunc = func func = lambda t, rdd: oldfunc(rdd) assert func.func_code.co_argcount == 2, "func should take one or two arguments" - return TransformedDStream(self, func, reuse) + return TransformedDStream(self, func) def transformWith(self, func, other, keepSerializer=False): """ @@ -597,26 +595,23 @@ class TransformedDStream(DStream): Multiple continuous transformations of DStream can be 
combined into one transformation. """ - def __init__(self, prev, func, reuse=False): + def __init__(self, prev, func): ssc = prev._ssc self._ssc = ssc self.ctx = ssc._sc self._jrdd_deserializer = self.ctx.serializer self.is_cached = False self.is_checkpointed = False + self._jdstream_val = None if (isinstance(prev, TransformedDStream) and not prev.is_cached and not prev.is_checkpointed): prev_func = prev.func - old_func = func - func = lambda t, rdd: old_func(t, prev_func(t, rdd)) - reuse = reuse and prev.reuse - prev = prev.prev - - self.prev = prev - self.func = func - self.reuse = reuse - self._jdstream_val = None + self.func = lambda t, rdd: func(t, prev_func(t, rdd)) + self.prev = prev.prev + else: + self.prev = prev + self.func = func @property def _jdstream(self): @@ -624,7 +619,6 @@ def _jdstream(self): return self._jdstream_val jfunc = TransformFunction(self.ctx, self.func, self.prev._jrdd_deserializer) - jdstream = self.ctx._jvm.PythonTransformedDStream(self.prev._jdstream.dstream(), - jfunc, self.reuse).asJavaDStream() - self._jdstream_val = jdstream - return jdstream + dstream = self.ctx._jvm.PythonTransformedDStream(self.prev._jdstream.dstream(), jfunc) + self._jdstream_val = dstream.asJavaDStream() + return self._jdstream_val diff --git a/python/pyspark/streaming/tests.py b/python/pyspark/streaming/tests.py index 6a7dfd574701d..a839faecf9a16 100644 --- a/python/pyspark/streaming/tests.py +++ b/python/pyspark/streaming/tests.py @@ -504,7 +504,7 @@ def setup(): return ssc cpd = tempfile.mkdtemp("test_streaming_cps") - ssc = StreamingContext.getOrCreate(cpd, setup) + self.ssc = ssc = StreamingContext.getOrCreate(cpd, setup) ssc.start() def check_output(n): @@ -539,7 +539,7 @@ def check_output(n): ssc.stop(True, True) time.sleep(1) - ssc = StreamingContext.getOrCreate(cpd, setup) + self.ssc = ssc = StreamingContext.getOrCreate(cpd, setup) ssc.start() check_output(3) diff --git a/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala index 59552bb0a2205..96b84b45b2ebf 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala @@ -157,43 +157,18 @@ private[python] abstract class PythonDStream( /** * Transformed DStream in Python. - * - * If `reuse` is true and the result of the `func` is an PythonRDD, then it will cache it - * as an template for future use, this can reduce the Python callbacks. 
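The fusion performed in TransformedDStream.__init__ above is plain function composition in the (time, rdd) -> rdd shape. A minimal, framework-free sketch of the idea, with plain lists standing in for RDDs (names and data are invented for illustration):

    # Two transform steps written in the (time, rdd) -> rdd shape used above;
    # plain lists stand in for RDDs so the sketch runs without Spark.
    step1 = lambda t, rdd: [x for x in rdd if x % 2 == 0]
    step2 = lambda t, rdd: [(t, x) for x in rdd]

    def fuse(prev_func, func):
        # Same trick as TransformedDStream.__init__: fold the upstream function
        # into the new one so the JVM only ever sees a single Python callback.
        return lambda t, rdd: func(t, prev_func(t, rdd))

    fused = fuse(step1, step2)
    print fused(0, range(6))   # [(0, 0), (0, 2), (0, 4)]

Note that the real code only fuses when the upstream stream is neither cached nor checkpointed, presumably because fusing past a cache or checkpoint point would bypass it.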
*/ private[python] class PythonTransformedDStream ( parent: DStream[_], - @transient pfunc: PythonTransformFunction, - var reuse: Boolean = false) + @transient pfunc: PythonTransformFunction) extends PythonDStream(parent, pfunc) { - // rdd returned by func - var lastResult: PythonRDD = _ - override def compute(validTime: Time): Option[RDD[Array[Byte]]] = { val rdd = parent.getOrCompute(validTime) - if (rdd.isEmpty) { - return None - } - if (reuse && lastResult != null) { - // use the previous result as the template to generate new RDD - Some(lastResult.copyTo(rdd.get)) + if (rdd.isDefined) { + func(rdd, validTime) } else { - val r = func(rdd, validTime) - if (reuse && r.isDefined && lastResult == null) { - // try to use the result as a template - r.get match { - case pyrdd: PythonRDD => - if (pyrdd.firstParent == rdd) { - // only one PythonRDD - lastResult = pyrdd - } else { - // maybe have multiple stages, don't check it anymore - reuse = false - } - } - } - r + None } } } @@ -209,10 +184,10 @@ private[python] class PythonTransformed2DStream( val func = new TransformFunction(pfunc) - override def slideDuration: Duration = parent.slideDuration - override def dependencies = List(parent, parent2) + override def slideDuration: Duration = parent.slideDuration + override def compute(validTime: Time): Option[RDD[Array[Byte]]] = { val empty: RDD[_] = ssc.sparkContext.emptyRDD val rdd1 = parent.getOrCompute(validTime).getOrElse(empty) @@ -220,7 +195,7 @@ private[python] class PythonTransformed2DStream( func(Some(rdd1), Some(rdd2), validTime) } - val asJavaDStream = JavaDStream.fromDStream(this) + val asJavaDStream = JavaDStream.fromDStream(this) } /** From 37fe06fb743a1934d834d603e04b678110bc0fd5 Mon Sep 17 00:00:00 2001 From: Davies Liu Date: Thu, 2 Oct 2014 15:43:46 -0700 Subject: [PATCH 340/347] use random port for callback server --- python/pyspark/streaming/context.py | 30 +++++++++++----- .../streaming/api/python/PythonDStream.scala | 36 +++++++++++++++---- 2 files changed, 51 insertions(+), 15 deletions(-) diff --git a/python/pyspark/streaming/context.py b/python/pyspark/streaming/context.py index aabbbd958080a..7f99d38771ce8 100644 --- a/python/pyspark/streaming/context.py +++ b/python/pyspark/streaming/context.py @@ -18,7 +18,7 @@ import sys from py4j.java_collections import ListConverter -from py4j.java_gateway import java_import +from py4j.java_gateway import java_import, JavaObject from pyspark import RDD, SparkConf from pyspark.serializers import UTF8Deserializer, CloudPickleSerializer @@ -38,6 +38,8 @@ def _daemonize_callback_server(): from exiting if it's not shutdown. The following code replace `start()` of CallbackServer with a new version, which set daemon=True for this thread. 
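The daemon=True detail just described is easy to demonstrate in isolation: a non-daemon thread running an accept-style loop would keep the interpreter alive forever, while a daemon thread is simply discarded at exit. A small sketch with no Py4J involved:

    import threading
    import time

    def serve_forever():
        # Stands in for the callback server's accept loop.
        while True:
            time.sleep(1)

    t = threading.Thread(target=serve_forever)
    t.daemon = True    # without this, the script would hang at exit
    t.start()

    print "main thread finished; the daemon thread will not block exit"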
+ + Also, it will update the port number (0) with real port """ # TODO: create a patch for Py4J import socket @@ -54,8 +56,11 @@ def start(self): 1) try: self.server_socket.bind((self.address, self.port)) - except Exception: - msg = 'An error occurred while trying to start the callback server' + if not self.port: + # update port with real port + self.port = self.server_socket.getsockname()[1] + except Exception as e: + msg = 'An error occurred while trying to start the callback server: %s' % e logger.exception(msg) raise Py4JNetworkError(msg) @@ -105,15 +110,24 @@ def _jduration(self, seconds): def _ensure_initialized(cls): SparkContext._ensure_initialized() gw = SparkContext._gateway - # start callback server - # getattr will fallback to JVM - if "_callback_server" not in gw.__dict__: - _daemonize_callback_server() - gw._start_callback_server(gw._python_proxy_port) java_import(gw.jvm, "org.apache.spark.streaming.*") java_import(gw.jvm, "org.apache.spark.streaming.api.java.*") java_import(gw.jvm, "org.apache.spark.streaming.api.python.*") + + # start callback server + # getattr will fallback to JVM, so we cannot test by hasattr() + if "_callback_server" not in gw.__dict__: + _daemonize_callback_server() + # use random port + gw._start_callback_server(0) + # gateway with real port + gw._python_proxy_port = gw._callback_server.port + # get the GatewayServer object in JVM by ID + jgws = JavaObject("GATEWAY_SERVER", gw._gateway_client) + # update the port of CallbackClient with real port + gw.jvm.PythonDStream.updatePythonGatewayPort(jgws, gw._python_proxy_port) + # register serializer for TransformFunction # it happens before creating SparkContext when loading from checkpointing cls._transformerSerializer = TransformFunctionSerializer( diff --git a/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala index 96b84b45b2ebf..e171fb5730616 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala @@ -24,6 +24,8 @@ import scala.collection.JavaConversions._ import scala.collection.JavaConverters._ import scala.language.existentials +import py4j.GatewayServer + import org.apache.spark.api.java._ import org.apache.spark.api.python._ import org.apache.spark.rdd.RDD @@ -88,10 +90,14 @@ private[python] class TransformFunction(@transient var pfunc: PythonTransformFun */ private[python] object PythonTransformFunctionSerializer { - // A serializer in Python, used to serialize PythonTransformFunction + /** + * A serializer in Python, used to serialize PythonTransformFunction + */ private var serializer: PythonTransformFunctionSerializer = _ - // Register a serializer from Python, should be called during initialization + /* + * Register a serializer from Python, should be called during initialization + */ def register(ser: PythonTransformFunctionSerializer): Unit = { serializer = ser } @@ -117,20 +123,36 @@ private[python] object PythonTransformFunctionSerializer { */ private[python] object PythonDStream { - // can not access PythonTransformFunctionSerializer.register() via Py4j - // Py4JError: PythonTransformFunctionSerializerregister does not exist in the JVM + /** + * can not access PythonTransformFunctionSerializer.register() via Py4j + * Py4JError: PythonTransformFunctionSerializerregister does not exist in the JVM + */ def registerSerializer(ser: 
PythonTransformFunctionSerializer): Unit = { PythonTransformFunctionSerializer.register(ser) } - // helper function for DStream.foreachRDD(), - // cannot be `foreachRDD`, it will confusing py4j + /** + * Update the port of callback client to `port` + */ + def updatePythonGatewayPort(gws: GatewayServer, port: Int): Unit = { + val cl = gws.getCallbackClient + val f = cl.getClass.getDeclaredField("port") + f.setAccessible(true) + f.setInt(cl, port) + } + + /** + * helper function for DStream.foreachRDD(), + * cannot be `foreachRDD`, it will confusing py4j + */ def callForeachRDD(jdstream: JavaDStream[Array[Byte]], pfunc: PythonTransformFunction) { val func = new TransformFunction((pfunc)) jdstream.dstream.foreachRDD((rdd, time) => func(Some(rdd), time)) } - // convert list of RDD into queue of RDDs, for ssc.queueStream() + /** + * convert list of RDD into queue of RDDs, for ssc.queueStream() + */ def toRDDQueue(rdds: JArrayList[JavaRDD[Array[Byte]]]): java.util.Queue[JavaRDD[Array[Byte]]] = { val queue = new java.util.LinkedList[JavaRDD[Array[Byte]]] rdds.forall(queue.add(_)) From e108ec114eb1a14c6e2387761da8e55bee4b3c83 Mon Sep 17 00:00:00 2001 From: Davies Liu Date: Thu, 2 Oct 2014 22:51:18 -0700 Subject: [PATCH 341/347] address comments --- .../apache/spark/api/python/PythonRDD.scala | 8 -- python/pyspark/rdd.py | 2 + python/pyspark/streaming/context.py | 38 +++--- python/pyspark/streaming/dstream.py | 112 +++++++++--------- python/pyspark/streaming/tests.py | 4 +- .../streaming/api/python/PythonDStream.scala | 2 +- 6 files changed, 80 insertions(+), 86 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala b/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala index fd6e3406a3b7e..f36a651dc2d8f 100644 --- a/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala +++ b/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala @@ -25,8 +25,6 @@ import java.util.{List => JList, ArrayList => JArrayList, Map => JMap, Collectio import scala.collection.JavaConversions._ import scala.collection.mutable import scala.language.existentials -import scala.reflect.ClassTag -import scala.util.{Try, Success, Failure} import net.razorvine.pickle.{Pickler, Unpickler} @@ -52,12 +50,6 @@ private[spark] class PythonRDD( accumulator: Accumulator[JList[Array[Byte]]]) extends RDD[Array[Byte]](parent) { - // create a new PythonRDD with same Python setting but different parent. 
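The random-port scheme in the hunks above rests on a standard socket idiom: binding to port 0 asks the OS for a free ephemeral port, and getsockname() reports which one was assigned, which is then pushed back into the JVM's callback client via updatePythonGatewayPort(). The idiom on its own, as a standalone sketch (host and message are invented):

    import socket

    server_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    server_socket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)

    # Port 0 means "any free port"; the kernel picks one at bind time.
    server_socket.bind(("127.0.0.1", 0))
    real_port = server_socket.getsockname()[1]
    print "callback server would listen on port %d" % real_port

    server_socket.close()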
- def copyTo(rdd: RDD[_]): PythonRDD = { - new PythonRDD(rdd, command, envVars, pythonIncludes, preservePartitoning, - pythonExec, broadcastVars, accumulator) - } - val bufferSize = conf.getInt("spark.buffer.size", 65536) val reuse_worker = conf.getBoolean("spark.python.worker.reuse", true) diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py index dc6497772e502..77e8fb1773fd1 100644 --- a/python/pyspark/rdd.py +++ b/python/pyspark/rdd.py @@ -787,6 +787,8 @@ def sum(self): >>> sc.parallelize([1.0, 2.0, 3.0]).sum() 6.0 """ + if not self.getNumPartitions(): + return 0 # empty RDD can not been reduced return self.mapPartitions(lambda x: [sum(x)]).reduce(operator.add) def count(self): diff --git a/python/pyspark/streaming/context.py b/python/pyspark/streaming/context.py index 7f99d38771ce8..dc9dc41121935 100644 --- a/python/pyspark/streaming/context.py +++ b/python/pyspark/streaming/context.py @@ -84,17 +84,18 @@ class StreamingContext(object): """ _transformerSerializer = None - def __init__(self, sparkContext, duration=None, jssc=None): + def __init__(self, sparkContext, batchDuration=None, jssc=None): """ Create a new StreamingContext. @param sparkContext: L{SparkContext} object. - @param duration: number of seconds. + @param batchDuration: the time interval (in seconds) at which streaming + data will be divided into batches """ self._sc = sparkContext self._jvm = self._sc._jvm - self._jssc = jssc or self._initialize_context(self._sc, duration) + self._jssc = jssc or self._initialize_context(self._sc, batchDuration) def _initialize_context(self, sc, duration): self._ensure_initialized() @@ -134,26 +135,27 @@ def _ensure_initialized(cls): SparkContext._active_spark_context, CloudPickleSerializer(), gw) @classmethod - def getOrCreate(cls, path, setupFunc): + def getOrCreate(cls, checkpointPath, setupFunc): """ - Get the StreamingContext from checkpoint file at `path`, or setup - it by `setupFunc`. + Either recreate a StreamingContext from checkpoint data or create a new StreamingContext. + If checkpoint data exists in the provided `checkpointPath`, then StreamingContext will be + recreated from the checkpoint data. If the data does not exist, then the provided setupFunc + will be used to create a JavaStreamingContext. - :param path: directory of checkpoint - :param setupFunc: a function used to create StreamingContext and - setup DStreams. 
- :return: a StreamingContext + @param checkpointPath Checkpoint directory used in an earlier JavaStreamingContext program + @param setupFunc Function to create a new JavaStreamingContext and setup DStreams """ - if not os.path.exists(path) or not os.path.isdir(path) or not os.listdir(path): + # TODO: support checkpoint in HDFS + if not os.path.exists(checkpointPath) or not os.listdir(checkpointPath): ssc = setupFunc() - ssc.checkpoint(path) + ssc.checkpoint(checkpointPath) return ssc cls._ensure_initialized() gw = SparkContext._gateway try: - jssc = gw.jvm.JavaStreamingContext(path) + jssc = gw.jvm.JavaStreamingContext(checkpointPath) except Exception: print >>sys.stderr, "failed to load StreamingContext from checkpoint" raise @@ -249,12 +251,12 @@ def textFileStream(self, directory): """ return DStream(self._jssc.textFileStream(directory), self, UTF8Deserializer()) - def _check_serialzers(self, rdds): + def _check_serializers(self, rdds): # make sure they have same serializer if len(set(rdd._jrdd_deserializer for rdd in rdds)) > 1: for i in range(len(rdds)): # reset them to sc.serializer - rdds[i] = rdds[i].map(lambda x: x, preservesPartitioning=True) + rdds[i] = rdds[i]._reserialize() def queueStream(self, rdds, oneAtATime=True, default=None): """ @@ -275,7 +277,7 @@ def queueStream(self, rdds, oneAtATime=True, default=None): if rdds and not isinstance(rdds[0], RDD): rdds = [self._sc.parallelize(input) for input in rdds] - self._check_serialzers(rdds) + self._check_serializers(rdds) jrdds = ListConverter().convert([r._jrdd for r in rdds], SparkContext._gateway._gateway_client) @@ -313,6 +315,10 @@ def union(self, *dstreams): raise ValueError("should have at least one DStream to union") if len(dstreams) == 1: return dstreams[0] + if len(set(s._jrdd_deserializer for s in dstreams)) > 1: + raise ValueError("All DStreams should have same serializer") + if len(set(s._slideDuration for s in dstreams)) > 1: + raise ValueError("All DStreams should have same slide duration") first = dstreams[0] jrest = ListConverter().convert([d._jdstream for d in dstreams[1:]], SparkContext._gateway._gateway_client) diff --git a/python/pyspark/streaming/dstream.py b/python/pyspark/streaming/dstream.py index fddfd757b8674..824131739cce3 100644 --- a/python/pyspark/streaming/dstream.py +++ b/python/pyspark/streaming/dstream.py @@ -53,7 +53,7 @@ class DStream(object): def __init__(self, jdstream, ssc, jrdd_deserializer): self._jdstream = jdstream self._ssc = ssc - self.ctx = ssc._sc + self._sc = ssc._sc self._jrdd_deserializer = jrdd_deserializer self.is_cached = False self.is_checkpointed = False @@ -69,13 +69,7 @@ def count(self): Return a new DStream in which each RDD has a single element generated by counting each RDD of this DStream. """ - return self.mapPartitions(lambda i: [sum(1 for _ in i)])._sum() - - def _sum(self): - """ - Add up the elements in this DStream. - """ - return self.mapPartitions(lambda x: [sum(x)]).reduce(operator.add) + return self.mapPartitions(lambda i: [sum(1 for _ in i)]).reduce(operator.add) def filter(self, f): """ @@ -130,7 +124,7 @@ def reduceByKey(self, func, numPartitions=None): Return a new DStream by applying reduceByKey to each RDD. 
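A hedged usage sketch of the getOrCreate() contract documented above: the setup function only runs when no usable checkpoint exists under the given directory. Paths, app name and batch interval below are invented for the example.

    from pyspark import SparkContext
    from pyspark.streaming import StreamingContext

    checkpoint_dir = "/tmp/streaming-checkpoint"            # hypothetical path

    def create_context():
        sc = SparkContext(appName="GetOrCreateSketch")
        ssc = StreamingContext(sc, 1)
        lines = ssc.textFileStream("/tmp/streaming-input")  # hypothetical path
        lines.count().pprint()
        ssc.checkpoint(checkpoint_dir)
        return ssc

    # First run: create_context() builds the graph and starts checkpointing.
    # Restarts: the context is rebuilt from checkpoint_dir and create_context()
    # is skipped entirely.
    ssc = StreamingContext.getOrCreate(checkpoint_dir, create_context)
    ssc.start()
    ssc.awaitTermination()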
""" if numPartitions is None: - numPartitions = self.ctx.defaultParallelism + numPartitions = self._sc.defaultParallelism return self.combineByKey(lambda x: x, func, func, numPartitions) def combineByKey(self, createCombiner, mergeValue, mergeCombiners, @@ -139,7 +133,7 @@ def combineByKey(self, createCombiner, mergeValue, mergeCombiners, Return a new DStream by applying combineByKey to each RDD. """ if numPartitions is None: - numPartitions = self.ctx.defaultParallelism + numPartitions = self._sc.defaultParallelism def func(rdd): return rdd.combineByKey(createCombiner, mergeValue, mergeCombiners, numPartitions) @@ -156,7 +150,7 @@ def foreachRDD(self, func): """ Apply a function to each RDD in this DStream. """ - jfunc = TransformFunction(self.ctx, func, self._jrdd_deserializer) + jfunc = TransformFunction(self._sc, func, self._jrdd_deserializer) api = self._ssc._jvm.PythonDStream api.callForeachRDD(self._jdstream, jfunc) @@ -216,7 +210,7 @@ def persist(self, storageLevel): Persist the RDDs of this DStream with the given storage level """ self.is_cached = True - javaStorageLevel = self.ctx._getJavaStorageLevel(storageLevel) + javaStorageLevel = self._sc._getJavaStorageLevel(storageLevel) self._jdstream.persist(javaStorageLevel) return self @@ -236,7 +230,7 @@ def groupByKey(self, numPartitions=None): Return a new DStream by applying groupByKey on each RDD. """ if numPartitions is None: - numPartitions = self.ctx.defaultParallelism + numPartitions = self._sc.defaultParallelism return self.transform(lambda rdd: rdd.groupByKey(numPartitions)) def countByValue(self): @@ -262,21 +256,22 @@ def saveAsTextFile(t, rdd): raise return self.foreachRDD(saveAsTextFile) - def _saveAsPickleFiles(self, prefix, suffix=None): - """ - Save each RDD in this DStream as at binary file, the elements are - serialized by pickle. - """ - def saveAsPickleFile(t, rdd): - path = rddToFileName(prefix, suffix, t) - try: - rdd.saveAsPickleFile(path) - except Py4JJavaError as e: - # after recovered from checkpointing, the foreachRDD may - # be called twice - if 'FileAlreadyExistsException' not in str(e): - raise - return self.foreachRDD(saveAsPickleFile) + # TODO: uncomment this until we have ssc.pickleFileStream() + # def saveAsPickleFiles(self, prefix, suffix=None): + # """ + # Save each RDD in this DStream as at binary file, the elements are + # serialized by pickle. 
+ # """ + # def saveAsPickleFile(t, rdd): + # path = rddToFileName(prefix, suffix, t) + # try: + # rdd.saveAsPickleFile(path) + # except Py4JJavaError as e: + # # after recovered from checkpointing, the foreachRDD may + # # be called twice + # if 'FileAlreadyExistsException' not in str(e): + # raise + # return self.foreachRDD(saveAsPickleFile) def transform(self, func): """ @@ -304,10 +299,10 @@ def transformWith(self, func, other, keepSerializer=False): oldfunc = func func = lambda t, a, b: oldfunc(a, b) assert func.func_code.co_argcount == 3, "func should take two or three arguments" - jfunc = TransformFunction(self.ctx, func, self._jrdd_deserializer, other._jrdd_deserializer) - dstream = self.ctx._jvm.PythonTransformed2DStream(self._jdstream.dstream(), + jfunc = TransformFunction(self._sc, func, self._jrdd_deserializer, other._jrdd_deserializer) + dstream = self._sc._jvm.PythonTransformed2DStream(self._jdstream.dstream(), other._jdstream.dstream(), jfunc) - jrdd_serializer = self._jrdd_deserializer if keepSerializer else self.ctx.serializer + jrdd_serializer = self._jrdd_deserializer if keepSerializer else self._sc.serializer return DStream(dstream.asJavaDStream(), self._ssc, jrdd_serializer) def repartition(self, numPartitions): @@ -336,61 +331,61 @@ def union(self, other): def cogroup(self, other, numPartitions=None): """ - Return a new DStream by applying 'cogroup' between RDDs of `this` + Return a new DStream by applying 'cogroup' between RDDs of this DStream and `other` DStream. Hash partitioning is used to generate the RDDs with `numPartitions` partitions. """ if numPartitions is None: - numPartitions = self.ctx.defaultParallelism + numPartitions = self._sc.defaultParallelism return self.transformWith(lambda a, b: a.cogroup(b, numPartitions), other) def join(self, other, numPartitions=None): """ - Return a new DStream by applying 'join' between RDDs of `this` DStream and + Return a new DStream by applying 'join' between RDDs of this DStream and `other` DStream. Hash partitioning is used to generate the RDDs with `numPartitions` partitions. """ if numPartitions is None: - numPartitions = self.ctx.defaultParallelism + numPartitions = self._sc.defaultParallelism return self.transformWith(lambda a, b: a.join(b, numPartitions), other) def leftOuterJoin(self, other, numPartitions=None): """ - Return a new DStream by applying 'left outer join' between RDDs of `this` DStream and + Return a new DStream by applying 'left outer join' between RDDs of this DStream and `other` DStream. Hash partitioning is used to generate the RDDs with `numPartitions` partitions. """ if numPartitions is None: - numPartitions = self.ctx.defaultParallelism + numPartitions = self._sc.defaultParallelism return self.transformWith(lambda a, b: a.leftOuterJoin(b, numPartitions), other) def rightOuterJoin(self, other, numPartitions=None): """ - Return a new DStream by applying 'right outer join' between RDDs of `this` DStream and + Return a new DStream by applying 'right outer join' between RDDs of this DStream and `other` DStream. Hash partitioning is used to generate the RDDs with `numPartitions` partitions. 
""" if numPartitions is None: - numPartitions = self.ctx.defaultParallelism + numPartitions = self._sc.defaultParallelism return self.transformWith(lambda a, b: a.rightOuterJoin(b, numPartitions), other) def fullOuterJoin(self, other, numPartitions=None): """ - Return a new DStream by applying 'full outer join' between RDDs of `this` DStream and + Return a new DStream by applying 'full outer join' between RDDs of this DStream and `other` DStream. Hash partitioning is used to generate the RDDs with `numPartitions` partitions. """ if numPartitions is None: - numPartitions = self.ctx.defaultParallelism + numPartitions = self._sc.defaultParallelism return self.transformWith(lambda a, b: a.fullOuterJoin(b, numPartitions), other) def _jtime(self, timestamp): @@ -398,7 +393,7 @@ def _jtime(self, timestamp): """ if isinstance(timestamp, datetime): timestamp = time.mktime(timestamp.timetuple()) - return self.ctx._jvm.Time(long(timestamp * 1000)) + return self._sc._jvm.Time(long(timestamp * 1000)) def slice(self, begin, end): """ @@ -407,7 +402,7 @@ def slice(self, begin, end): `begin`, `end` could be datetime.datetime() or unix_timestamp """ jrdds = self._jdstream.slice(self._jtime(begin), self._jtime(end)) - return [RDD(jrdd, self.ctx, self._jrdd_deserializer) for jrdd in jrdds] + return [RDD(jrdd, self._sc, self._jrdd_deserializer) for jrdd in jrdds] def _validate_window_param(self, window, slide): duration = self._jdstream.dstream().slideDuration().milliseconds() @@ -532,7 +527,7 @@ def reduceByKeyAndWindow(self, func, invFunc, windowDuration, slideDuration=None """ self._validate_window_param(windowDuration, slideDuration) if numPartitions is None: - numPartitions = self.ctx.defaultParallelism + numPartitions = self._sc.defaultParallelism reduced = self.reduceByKey(func, numPartitions) @@ -548,18 +543,18 @@ def invReduceFunc(t, a, b): joined = a.leftOuterJoin(b, numPartitions) return joined.mapValues(lambda (v1, v2): invFunc(v1, v2) if v2 is not None else v1) - jreduceFunc = TransformFunction(self.ctx, reduceFunc, reduced._jrdd_deserializer) + jreduceFunc = TransformFunction(self._sc, reduceFunc, reduced._jrdd_deserializer) if invReduceFunc: - jinvReduceFunc = TransformFunction(self.ctx, invReduceFunc, reduced._jrdd_deserializer) + jinvReduceFunc = TransformFunction(self._sc, invReduceFunc, reduced._jrdd_deserializer) else: jinvReduceFunc = None if slideDuration is None: slideDuration = self._slideDuration - dstream = self.ctx._jvm.PythonReducedWindowedDStream(reduced._jdstream.dstream(), + dstream = self._sc._jvm.PythonReducedWindowedDStream(reduced._jdstream.dstream(), jreduceFunc, jinvReduceFunc, self._ssc._jduration(windowDuration), self._ssc._jduration(slideDuration)) - return DStream(dstream.asJavaDStream(), self._ssc, self.ctx.serializer) + return DStream(dstream.asJavaDStream(), self._ssc, self._sc.serializer) def updateStateByKey(self, updateFunc, numPartitions=None): """ @@ -570,7 +565,7 @@ def updateStateByKey(self, updateFunc, numPartitions=None): If `s` is None, then `k` will be eliminated. 
""" if numPartitions is None: - numPartitions = self.ctx.defaultParallelism + numPartitions = self._sc.defaultParallelism def reduceFunc(t, a, b): if a is None: @@ -581,10 +576,10 @@ def reduceFunc(t, a, b): state = g.mapPartitions(lambda x: updateFunc(x)) return state.filter(lambda (k, v): v is not None) - jreduceFunc = TransformFunction(self.ctx, reduceFunc, - self.ctx.serializer, self._jrdd_deserializer) - dstream = self.ctx._jvm.PythonStateDStream(self._jdstream.dstream(), jreduceFunc) - return DStream(dstream.asJavaDStream(), self._ssc, self.ctx.serializer) + jreduceFunc = TransformFunction(self._sc, reduceFunc, + self._sc.serializer, self._jrdd_deserializer) + dstream = self._sc._jvm.PythonStateDStream(self._jdstream.dstream(), jreduceFunc) + return DStream(dstream.asJavaDStream(), self._ssc, self._sc.serializer) class TransformedDStream(DStream): @@ -596,10 +591,9 @@ class TransformedDStream(DStream): one transformation. """ def __init__(self, prev, func): - ssc = prev._ssc - self._ssc = ssc - self.ctx = ssc._sc - self._jrdd_deserializer = self.ctx.serializer + self._ssc = prev._ssc + self._sc = self._ssc._sc + self._jrdd_deserializer = self._sc.serializer self.is_cached = False self.is_checkpointed = False self._jdstream_val = None @@ -618,7 +612,7 @@ def _jdstream(self): if self._jdstream_val is not None: return self._jdstream_val - jfunc = TransformFunction(self.ctx, self.func, self.prev._jrdd_deserializer) - dstream = self.ctx._jvm.PythonTransformedDStream(self.prev._jdstream.dstream(), jfunc) + jfunc = TransformFunction(self._sc, self.func, self.prev._jrdd_deserializer) + dstream = self._sc._jvm.PythonTransformedDStream(self.prev._jdstream.dstream(), jfunc) self._jdstream_val = dstream.asJavaDStream() return self._jdstream_val diff --git a/python/pyspark/streaming/tests.py b/python/pyspark/streaming/tests.py index a839faecf9a16..9f5cdff5ed809 100644 --- a/python/pyspark/streaming/tests.py +++ b/python/pyspark/streaming/tests.py @@ -496,11 +496,11 @@ def updater(it): def setup(): conf = SparkConf().set("spark.default.parallelism", 1) sc = SparkContext(conf=conf) - ssc = StreamingContext(sc, 0.2) + ssc = StreamingContext(sc, 0.5) dstream = ssc.textFileStream(inputd).map(lambda x: (x, 1)) wc = dstream.updateStateByKey(updater) wc.map(lambda x: "%s,%d" % x).saveAsTextFiles(outputd + "test") - wc.checkpoint(.2) + wc.checkpoint(.5) return ssc cpd = tempfile.mkdtemp("test_streaming_cps") diff --git a/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala index e171fb5730616..696dfb969a48a 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala @@ -36,7 +36,7 @@ import org.apache.spark.streaming.api.java._ /** - * Interface for Python callback function with three arguments + * Interface for Python callback function which is used to transform RDDs */ private[python] trait PythonTransformFunction { def call(time: Long, rdds: JList[_]): JavaRDD[Array[Byte]] From 52c535b0696b3861222a7bd6608bb3f6f4db64c3 Mon Sep 17 00:00:00 2001 From: Davies Liu Date: Fri, 3 Oct 2014 08:54:33 -0700 Subject: [PATCH 342/347] remove fix for sum() --- python/pyspark/rdd.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py index 77e8fb1773fd1..dc6497772e502 100644 --- a/python/pyspark/rdd.py +++ b/python/pyspark/rdd.py @@ -787,8 
+787,6 @@ def sum(self): >>> sc.parallelize([1.0, 2.0, 3.0]).sum() 6.0 """ - if not self.getNumPartitions(): - return 0 # empty RDD can not been reduced return self.mapPartitions(lambda x: [sum(x)]).reduce(operator.add) def count(self): From bebeb4aa6df42b6a72ffa7afb574891d2ce46c59 Mon Sep 17 00:00:00 2001 From: Davies Liu Date: Tue, 7 Oct 2014 16:44:57 -0700 Subject: [PATCH 343/347] address all comments --- python/pyspark/streaming/dstream.py | 8 ++++---- python/pyspark/streaming/util.py | 17 ++++++++++++++++- .../streaming/api/python/PythonDStream.scala | 11 ++++++++--- 3 files changed, 28 insertions(+), 8 deletions(-) diff --git a/python/pyspark/streaming/dstream.py b/python/pyspark/streaming/dstream.py index 824131739cce3..4533c5d541a51 100644 --- a/python/pyspark/streaming/dstream.py +++ b/python/pyspark/streaming/dstream.py @@ -174,7 +174,7 @@ def takeAndPrint(time, rdd): def mapValues(self, f): """ Return a new DStream by applying a map function to the value of - each key-value pairs in 'this' DStream without changing the key. + each key-value pairs in this DStream without changing the key. """ map_values_fn = lambda (k, v): (k, f(v)) return self.map(map_values_fn, preservesPartitioning=True) @@ -182,7 +182,7 @@ def mapValues(self, f): def flatMapValues(self, f): """ Return a new DStream by applying a flatmap function to the value - of each key-value pairs in 'this' DStream without changing the key. + of each key-value pairs in this DStream without changing the key. """ flat_map_fn = lambda (k, v): ((k, x) for x in f(v)) return self.flatMap(flat_map_fn, preservesPartitioning=True) @@ -276,7 +276,7 @@ def saveAsTextFile(t, rdd): def transform(self, func): """ Return a new DStream in which each RDD is generated by applying a function - on each RDD of 'this' DStream. + on each RDD of this DStream. `func` can have one argument of `rdd`, or have two arguments of (`time`, `rdd`) @@ -290,7 +290,7 @@ def transform(self, func): def transformWith(self, func, other, keepSerializer=False): """ Return a new DStream in which each RDD is generated by applying a function - on each RDD of 'this' DStream and 'other' DStream. + on each RDD of this DStream and 'other' DStream. `func` can have two arguments of (`rdd_a`, `rdd_b`) or have three arguments of (`time`, `rdd_a`, `rdd_b`) diff --git a/python/pyspark/streaming/util.py b/python/pyspark/streaming/util.py index aecf7f71fdbc7..86ee5aa04f252 100644 --- a/python/pyspark/streaming/util.py +++ b/python/pyspark/streaming/util.py @@ -24,7 +24,12 @@ class TransformFunction(object): """ - This class is for py4j callback. + This class wraps a function RDD[X] -> RDD[Y] that was passed to + DStream.transform(), allowing it to be called from Java via Py4J's + callback server. + + Java calls this function with a sequence of JavaRDDs and this function + returns a single JavaRDD pointer back to Java. """ _emptyRDD = None @@ -63,6 +68,16 @@ class Java: class TransformFunctionSerializer(object): + """ + This class implements a serializer for PythonTransformFunction Java + objects. + + This is necessary because the Java PythonTransformFunction objects are + actually Py4J references to Python objects and thus are not directly + serializable. When Java needs to serialize a PythonTransformFunction, + it uses this class to invoke Python, which returns the serialized function + as a byte array. 
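The TransformFunction machinery described above is what receives the callback handed to DStream.transform(); on the Python side that callback may take either the RDD alone or a (time, rdd) pair. A brief usage sketch (stream contents invented):

    from pyspark import SparkContext
    from pyspark.streaming import StreamingContext

    sc = SparkContext(appName="TransformSketch")
    ssc = StreamingContext(sc, 1)
    nums = ssc.queueStream([sc.parallelize(range(10))])

    # One-argument form: only the batch RDD is passed in.
    evens = nums.transform(lambda rdd: rdd.filter(lambda x: x % 2 == 0))

    # Two-argument form: the batch time is passed as well.
    stamped = nums.transform(lambda t, rdd: rdd.map(lambda x: (t, x)))

    evens.pprint()
    stamped.pprint()
    ssc.start()
    ssc.awaitTermination()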
+ """ def __init__(self, ctx, serializer, gateway=None): self.ctx = ctx self.serializer = serializer diff --git a/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala index 696dfb969a48a..213dff6a76354 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala @@ -51,10 +51,12 @@ private[python] trait PythonTransformFunctionSerializer { } /** - * Wrapper for PythonTransformFunction + * Wraps a PythonTransformFunction (which is a Python object accessed through Py4J) + * so that it looks like a Scala function and can be transparently serialized and + * deserialized by Java. */ private[python] class TransformFunction(@transient var pfunc: PythonTransformFunction) - extends function.Function2[JList[JavaRDD[_]], Time, JavaRDD[Array[Byte]]] with Serializable { + extends function.Function2[JList[JavaRDD[_]], Time, JavaRDD[Array[Byte]]] { def apply(rdd: Option[RDD[_]], time: Time): Option[RDD[Array[Byte]]] = { Option(pfunc.call(time.milliseconds, List(rdd.map(JavaRDD.fromRDD(_)).orNull).asJava)) @@ -87,6 +89,9 @@ private[python] class TransformFunction(@transient var pfunc: PythonTransformFun /** * Helpers for PythonTransformFunctionSerializer + * + * PythonTransformFunctionSerializer is logically a singleton that's happens to be + * implemented as a Python object. */ private[python] object PythonTransformFunctionSerializer { @@ -119,7 +124,7 @@ private[python] object PythonTransformFunctionSerializer { } /** - * Helper functions + * Helper functions, which are called from Python via Py4J. */ private[python] object PythonDStream { From 02d05751ea281d377ce52ad39ccd30e518d2ff5a Mon Sep 17 00:00:00 2001 From: Davies Liu Date: Fri, 10 Oct 2014 14:27:24 -0700 Subject: [PATCH 344/347] add wrapper for foreachRDD() --- python/pyspark/streaming/dstream.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/python/pyspark/streaming/dstream.py b/python/pyspark/streaming/dstream.py index 4533c5d541a51..5d0dface2f043 100644 --- a/python/pyspark/streaming/dstream.py +++ b/python/pyspark/streaming/dstream.py @@ -150,6 +150,9 @@ def foreachRDD(self, func): """ Apply a function to each RDD in this DStream. """ + if func.func_code.co_argcount == 1: + old_func = func + func = lambda t, rdd: old_func(rdd) jfunc = TransformFunction(self._sc, func, self._jrdd_deserializer) api = self._ssc._jvm.PythonDStream api.callForeachRDD(self._jdstream, jfunc) From 3e2492b9b95e0cc0e3427265f71f069000cc43f7 Mon Sep 17 00:00:00 2001 From: Davies Liu Date: Fri, 10 Oct 2014 15:02:29 -0700 Subject: [PATCH 345/347] change updateStateByKey() to easy API --- .../streaming/stateful_network_wordcount.py | 57 +++++++++++++++++++ python/pyspark/streaming/dstream.py | 10 ++-- python/pyspark/streaming/tests.py | 22 ++++--- 3 files changed, 72 insertions(+), 17 deletions(-) create mode 100644 examples/src/main/python/streaming/stateful_network_wordcount.py diff --git a/examples/src/main/python/streaming/stateful_network_wordcount.py b/examples/src/main/python/streaming/stateful_network_wordcount.py new file mode 100644 index 0000000000000..7bd1512180920 --- /dev/null +++ b/examples/src/main/python/streaming/stateful_network_wordcount.py @@ -0,0 +1,57 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. 
See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +""" + Counts words in UTF8 encoded, '\n' delimited text received from the + network every second. + + Usage: stateful_network_wordcount.py + and describe the TCP server that Spark Streaming + would connect to receive data. + + To run this on your local machine, you need to first run a Netcat server + `$ nc -lk 9999` + and then run the example + `$ bin/spark-submit examples/src/main/python/streaming/stateful_network_wordcount.py \ + localhost 9999` +""" + +import sys + +from pyspark import SparkContext +from pyspark.streaming import StreamingContext + +if __name__ == "__main__": + if len(sys.argv) != 3: + print >> sys.stderr, "Usage: stateful_network_wordcount.py " + exit(-1) + sc = SparkContext(appName="PythonStreamingNetworkWordCount") + ssc = StreamingContext(sc, 1) + ssc.checkpoint("checkpoint") + + def updateFunc(new_values, last_sum): + return sum(new_values) + (last_sum or 0) + + lines = ssc.socketTextStream(sys.argv[1], int(sys.argv[2])) + running_counts = lines.flatMap(lambda line: line.split(" "))\ + .map(lambda word: (word, 1))\ + .updateStateByKey(updateFunc) + + running_counts.pprint() + + ssc.start() + ssc.awaitTermination() diff --git a/python/pyspark/streaming/dstream.py b/python/pyspark/streaming/dstream.py index 5d0dface2f043..5ae5cf07f0137 100644 --- a/python/pyspark/streaming/dstream.py +++ b/python/pyspark/streaming/dstream.py @@ -564,19 +564,19 @@ def updateStateByKey(self, updateFunc, numPartitions=None): Return a new "state" DStream where the state for each key is updated by applying the given function on the previous state of the key and the new values of the key. - @param updateFunc: State update function ([(k, vs, s)] -> [(k, s)]). - If `s` is None, then `k` will be eliminated. + @param updateFunc: State update function. If this function returns None, then + corresponding state key-value pair will be eliminated. 
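The None behaviour in the sentence above lends itself to a concrete illustration: with the per-key (new_values, state) signature, expiring idle keys is just a matter of returning None from the updater. The stream and expiry rule below are invented for the sketch.

    from pyspark import SparkContext
    from pyspark.streaming import StreamingContext

    sc = SparkContext(appName="ExpiringStateSketch")
    ssc = StreamingContext(sc, 1)
    ssc.checkpoint("checkpoint")          # updateStateByKey needs checkpointing

    events = ssc.queueStream([sc.parallelize([("a", 1), ("b", 1)]),
                              sc.parallelize([("a", 1)])])

    def updater(new_values, count):
        if not new_values:                # key saw no activity this batch
            return None                   # returning None drops it from the state
        return (count or 0) + sum(new_values)

    events.updateStateByKey(updater).pprint()
    ssc.start()
    ssc.awaitTermination()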
""" if numPartitions is None: numPartitions = self._sc.defaultParallelism def reduceFunc(t, a, b): if a is None: - g = b.groupByKey(numPartitions).map(lambda (k, vs): (k, list(vs), None)) + g = b.groupByKey(numPartitions).mapValues(lambda vs: (list(vs), None)) else: g = a.cogroup(b, numPartitions) - g = g.map(lambda (k, (va, vb)): (k, list(vb), list(va)[0] if len(va) else None)) - state = g.mapPartitions(lambda x: updateFunc(x)) + g = g.mapValues(lambda (va, vb): (list(vb), list(va)[0] if len(va) else None)) + state = g.mapValues(lambda (vs, s): updateFunc(vs, s)) return state.filter(lambda (k, v): v is not None) jreduceFunc = TransformFunction(self._sc, reduceFunc, diff --git a/python/pyspark/streaming/tests.py b/python/pyspark/streaming/tests.py index 9f5cdff5ed809..0e5c1a3b3c2ad 100644 --- a/python/pyspark/streaming/tests.py +++ b/python/pyspark/streaming/tests.py @@ -119,7 +119,7 @@ def _sort_result_based_on_key(self, outputs): output.sort(key=lambda x: x[0]) -class TestBasicOperations(PySparkStreamingTestCase): +class BasicOperationTests(PySparkStreamingTestCase): def test_map(self): """Basic operation test for DStream.map.""" @@ -340,15 +340,13 @@ def func(a, b): expected = [[('a', (1, None)), ('b', (2, 3)), ('c', (None, 4))]] self._test_func(input, func, expected, True, input2) - def update_state_by_key(self): + def test_update_state_by_key(self): - def updater(it): - for k, vs, s in it: - if not s: - s = vs - else: - s.extend(vs) - yield (k, s) + def updater(vs, s): + if not s: + s = [] + s.extend(vs) + return s input = [[('k', i)] for i in range(5)] @@ -360,7 +358,7 @@ def func(dstream): self._test_func(input, func, expected) -class TestWindowFunctions(PySparkStreamingTestCase): +class WindowFunctionTests(PySparkStreamingTestCase): timeout = 20 @@ -417,7 +415,7 @@ def test_reduce_by_invalid_window(self): self.assertRaises(ValueError, lambda: d1.reduceByKeyAndWindow(None, None, 1, 0.1)) -class TestStreamingContext(PySparkStreamingTestCase): +class StreamingContextTests(PySparkStreamingTestCase): duration = 0.1 @@ -480,7 +478,7 @@ def func(rdds): self.assertEqual([2, 3, 1], self._take(dstream, 3)) -class TestCheckpoint(PySparkStreamingTestCase): +class CheckpointTests(PySparkStreamingTestCase): def setUp(self): pass From 331ecced6f61ad5183da5830f94f584bcc74e479 Mon Sep 17 00:00:00 2001 From: Davies Liu Date: Fri, 10 Oct 2014 22:25:09 -0700 Subject: [PATCH 346/347] fix example --- .../src/main/python/streaming/stateful_network_wordcount.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/src/main/python/streaming/stateful_network_wordcount.py b/examples/src/main/python/streaming/stateful_network_wordcount.py index 7bd1512180920..18a9a5a452ffb 100644 --- a/examples/src/main/python/streaming/stateful_network_wordcount.py +++ b/examples/src/main/python/streaming/stateful_network_wordcount.py @@ -39,7 +39,7 @@ if len(sys.argv) != 3: print >> sys.stderr, "Usage: stateful_network_wordcount.py " exit(-1) - sc = SparkContext(appName="PythonStreamingNetworkWordCount") + sc = SparkContext(appName="PythonStreamingStatefulNetworkWordCount") ssc = StreamingContext(sc, 1) ssc.checkpoint("checkpoint") From 64561e4e503eafb958f6769383ba3b37edbe5fa2 Mon Sep 17 00:00:00 2001 From: Davies Liu Date: Fri, 10 Oct 2014 22:47:46 -0700 Subject: [PATCH 347/347] fix tests --- python/pyspark/streaming/tests.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/python/pyspark/streaming/tests.py b/python/pyspark/streaming/tests.py index 0e5c1a3b3c2ad..a8d876d0fa3b3 
100644 --- a/python/pyspark/streaming/tests.py +++ b/python/pyspark/streaming/tests.py @@ -487,9 +487,8 @@ def test_get_or_create(self): inputd = tempfile.mkdtemp() outputd = tempfile.mkdtemp() + "/" - def updater(it): - for k, vs, s in it: - yield (k, sum(vs, s or 0)) + def updater(vs, s): + return sum(vs, s or 0) def setup(): conf = SparkConf().set("spark.default.parallelism", 1)
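As a closing illustration of the data flow behind the final updateStateByKey() implementation, the same merge can be written against plain dictionaries: new values are grouped per key, combined with the previous state, and keys whose updater returns None are dropped. Everything below is a standalone sketch, not code from the patch.

    def merge_state(prev_state, new_batch, update_func):
        # prev_state: {key: state}; new_batch: list of (key, value) pairs.
        grouped = {}
        for k, v in new_batch:
            grouped.setdefault(k, []).append(v)
        out = {}
        for k in set(prev_state) | set(grouped):
            s = update_func(grouped.get(k, []), prev_state.get(k))
            if s is not None:             # None drops the key, as in the DStream code
                out[k] = s
        return out

    counts = merge_state({"a": 2}, [("a", 1), ("b", 1)],
                         lambda vs, s: sum(vs, s or 0))
    print counts    # {'a': 3, 'b': 1}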