Changes from all commits (41 commits)
2e0b308
initial commit of cogroup
d80tb7 Jun 20, 2019
64ff5ac
minor tidy up
d80tb7 Jun 20, 2019
6d039e3
removed incorrect test
d80tb7 Jun 21, 2019
d8a5c5d
tidies up test, fixed output cols
d80tb7 Jun 25, 2019
73188f6
removed incorrect file
d80tb7 Jun 25, 2019
690fa14
Revert: removed incorrect test
d80tb7 Jun 25, 2019
c86b2bf
Merge branch 'master' of https://github.com/d80tb7/spark into SPARK-2…
d80tb7 Jun 25, 2019
e3b66ac
fix for resolving key cols
d80tb7 Jun 25, 2019
8007fa6
common trait for grouped pandas udfs
d80tb7 Jun 27, 2019
d4cf6d0
poc using arrow streams
d80tb7 Jun 27, 2019
87aeb92
more unit tests for cogroup
d80tb7 Jun 27, 2019
e7528d0
argspec includes grouping key
d80tb7 Jul 2, 2019
b85ec75
fixed tests und
d80tb7 Jul 2, 2019
6a8ecff
keys now handled properly. Validation of udf. More tests
d80tb7 Jul 2, 2019
d2da787
formatting
d80tb7 Jul 2, 2019
7321141
fixed scalastyle errors
d80tb7 Jul 2, 2019
6bbe31c
updated grouped map to new args format
d80tb7 Jul 2, 2019
b444ff7
Merge branch 'master' of https://github.com/apache/spark into SPARK-2…
d80tb7 Jul 2, 2019
94be574
some code review fixes
d80tb7 Jul 11, 2019
9241639
Merge branch 'master' of https://github.com/apache/spark into SPARK-2…
d80tb7 Jul 11, 2019
3de551f
more code review fixes
d80tb7 Jul 11, 2019
300b53a
more code review fixes
d80tb7 Jul 11, 2019
7d161ba
fix comment on PandasCogroupSerializer
d80tb7 Jul 11, 2019
d1a6366
formatting
d80tb7 Jul 11, 2019
a201161
Merge branch 'master' of https://github.com/apache/spark into SPARK-2…
d80tb7 Jul 19, 2019
3e4bc95
python style fixes
d80tb7 Jul 19, 2019
307e664
added doc
d80tb7 Jul 19, 2019
7558b8d
Merge branch 'master' of https://github.com/apache/spark into SPARK-2…
d80tb7 Jul 23, 2019
19360c4
minor formatting
d80tb7 Jul 23, 2019
28493b4
a couple more unit tests
d80tb7 Jul 23, 2019
d6d11e4
minor formatting
d80tb7 Jul 23, 2019
a62a1e3
more doc
d80tb7 Jul 25, 2019
ec78284
added comment to cogroup func
d80tb7 Jul 25, 2019
1a9ff58
fixed python style
d80tb7 Jul 25, 2019
c0d2919
review comments
d80tb7 Aug 20, 2019
4cd5c70
review comments scala
d80tb7 Aug 20, 2019
e025375
Merge branch 'master' of https://github.com/apache/spark into SPARK-2…
d80tb7 Aug 20, 2019
dd1ffaf
python formatting
d80tb7 Aug 20, 2019
733b592
review comments (mainly formatting)
d80tb7 Sep 8, 2019
51dcbdc
Merge branch 'master' of https://github.com/apache/spark into SPARK-2…
d80tb7 Sep 8, 2019
1b966fd
couple more format changes
d80tb7 Sep 15, 2019
@@ -48,6 +48,7 @@ private[spark] object PythonEvalType {
val SQL_WINDOW_AGG_PANDAS_UDF = 203
val SQL_SCALAR_PANDAS_ITER_UDF = 204
val SQL_MAP_PANDAS_ITER_UDF = 205
val SQL_COGROUPED_MAP_PANDAS_UDF = 206

def toString(pythonEvalType: Int): String = pythonEvalType match {
case NON_UDF => "NON_UDF"
@@ -58,6 +59,7 @@ private[spark] object PythonEvalType {
case SQL_WINDOW_AGG_PANDAS_UDF => "SQL_WINDOW_AGG_PANDAS_UDF"
case SQL_SCALAR_PANDAS_ITER_UDF => "SQL_SCALAR_PANDAS_ITER_UDF"
case SQL_MAP_PANDAS_ITER_UDF => "SQL_MAP_PANDAS_ITER_UDF"
case SQL_COGROUPED_MAP_PANDAS_UDF => "SQL_COGROUPED_MAP_PANDAS_UDF"
}
}

1 change: 1 addition & 0 deletions python/pyspark/rdd.py
@@ -75,6 +75,7 @@ class PythonEvalType(object):
SQL_WINDOW_AGG_PANDAS_UDF = 203
SQL_SCALAR_PANDAS_ITER_UDF = 204
SQL_MAP_PANDAS_ITER_UDF = 205
SQL_COGROUPED_MAP_PANDAS_UDF = 206


def portable_hash(x):
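The Python constant mirrors the Scala SQL_COGROUPED_MAP_PANDAS_UDF = 206 above; the JVM and the Python worker agree on the UDF type by this numeric value. A quick illustrative check (assumes a pyspark installation that includes this change):

    from pyspark.rdd import PythonEvalType

    # Must agree with the Scala PythonEvalType constant (206).
    assert PythonEvalType.SQL_COGROUPED_MAP_PANDAS_UDF == 206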
26 changes: 26 additions & 0 deletions python/pyspark/serializers.py
@@ -401,6 +401,32 @@ def __repr__(self):
return "ArrowStreamPandasUDFSerializer"


class CogroupUDFSerializer(ArrowStreamPandasUDFSerializer):

def load_stream(self, stream):
"""
Deserialize cogrouped ArrowRecordBatches and yield each cogroup as a tuple of
two lists of pandas.Series.
"""
import pyarrow as pa
dataframes_in_group = None

while dataframes_in_group is None or dataframes_in_group > 0:
dataframes_in_group = read_int(stream)

if dataframes_in_group == 2:
batch1 = [batch for batch in ArrowStreamSerializer.load_stream(self, stream)]
batch2 = [batch for batch in ArrowStreamSerializer.load_stream(self, stream)]
yield (
[self.arrow_to_pandas(c) for c in pa.Table.from_batches(batch1).itercolumns()],
[self.arrow_to_pandas(c) for c in pa.Table.from_batches(batch2).itercolumns()]
)

elif dataframes_in_group != 0:
raise ValueError(
'Invalid number of pandas.DataFrames in group {0}'.format(dataframes_in_group))


class BatchedSerializer(Serializer):

"""
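For reference, a minimal self-contained sketch (not Spark code) of the framing this serializer consumes: for each cogroup the JVM writes a big-endian int 2 followed by two Arrow IPC streams (left group, then right group), and a final 0 terminates the stream. The helper names write_group and read_groups are illustrative assumptions; only pandas and pyarrow are required:

    import io
    import struct

    import pandas as pd
    import pyarrow as pa

    def write_group(sink, left_df, right_df):
        # Frame one cogroup: a count, then one Arrow IPC stream per side.
        sink.write(struct.pack("!i", 2))
        for df in (left_df, right_df):
            batch = pa.RecordBatch.from_pandas(df)
            with pa.ipc.new_stream(sink, batch.schema) as writer:
                writer.write_batch(batch)

    def read_groups(source):
        # Mirror of CogroupUDFSerializer.load_stream: 2 means a group follows,
        # 0 terminates, anything else is a protocol error.
        while True:
            (n,) = struct.unpack("!i", source.read(4))
            if n == 0:
                return
            if n != 2:
                raise ValueError("Invalid number of pandas.DataFrames in group %d" % n)
            left = pa.ipc.open_stream(source).read_pandas()
            right = pa.ipc.open_stream(source).read_pandas()
            yield left, right

    buf = io.BytesIO()
    write_group(buf,
                pd.DataFrame({"id": [1], "v1": [1.0]}),
                pd.DataFrame({"id": [1], "v2": ["x"]}))
    buf.write(struct.pack("!i", 0))  # end-of-stream marker
    buf.seek(0)
    for left, right in read_groups(buf):
        print(left)
        print(right)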
98 changes: 98 additions & 0 deletions python/pyspark/sql/cogroup.py
@@ -0,0 +1,98 @@
#
Review comment (Member): Seems like we don't generate documentation for this; the entry cannot be clicked in the generated API docs (screenshot). It should be either documented at python/docs/pyspark.sql.rst or imported at pyspark/sql/__init__.py and included in __all__.

Review comment (Member): +1 for adding it to pyspark/sql/__init__.py and including it in __all__, since this is what group.py does.

# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from pyspark import since
from pyspark.rdd import PythonEvalType
from pyspark.sql.column import Column
from pyspark.sql.dataframe import DataFrame


class CoGroupedData(object):
"""
A logical grouping of two :class:`GroupedData`,
created by :func:`GroupedData.cogroup`.

.. note:: Experimental

.. versionadded:: 3.0
"""

def __init__(self, gd1, gd2):
self._gd1 = gd1
self._gd2 = gd2
self.sql_ctx = gd1.sql_ctx

@since(3.0)
def apply(self, udf):
"""
Applies a function to each cogroup using a pandas udf and returns the result
as a `DataFrame`.

The user-defined function should take two `pandas.DataFrame` objects and return another
`pandas.DataFrame`. For each side of the cogroup, all columns are passed together
as a `pandas.DataFrame` to the user-function, and the returned `pandas.DataFrame`
objects are combined as a :class:`DataFrame`.

The returned `pandas.DataFrame` can be of arbitrary length and its schema must match the
returnType of the pandas udf.

.. note:: This function requires a full shuffle. All the data of a cogroup will be loaded
into memory, so the user should be aware of the potential OOM risk if data is skewed
and certain groups are too large to fit in memory.

.. note:: Experimental

:param udf: a cogrouped map user-defined function returned by
:func:`pyspark.sql.functions.pandas_udf`.

>>> from pyspark.sql.functions import pandas_udf, PandasUDFType
>>> df1 = spark.createDataFrame(
... [(20000101, 1, 1.0), (20000101, 2, 2.0), (20000102, 1, 3.0), (20000102, 2, 4.0)],
... ("time", "id", "v1"))
>>> df2 = spark.createDataFrame(
... [(20000101, 1, "x"), (20000101, 2, "y")],
... ("time", "id", "v2"))
Review comment (Member): indentation nit
>>> @pandas_udf("time int, id int, v1 double, v2 string", PandasUDFType.COGROUPED_MAP)
Review comment (Member): We should skip this test and run the doctests:

  1. add this to dev/sparktestsupport/modules.py at pyspark_sql

  2. add:

    def main():
        doctest.testmod(...)
        ...

Review comment (Member): We are currently skipping all doctests for Pandas UDFs, right? We could add the module, but then we'd need to skip each test individually, which might be more consistent with the rest of PySpark.

Review comment (Member): Can we document the case when the arguments are three (when the function also receives the grouping key)?

(A sketch of the three-argument form follows this file's diff.)

... def asof_join(l, r):
... return pd.merge_asof(l, r, on="time", by="id")
>>> df1.groupby("id").cogroup(df2.groupby("id")).apply(asof_join).show()
+--------+---+---+---+
| time| id| v1| v2|
+--------+---+---+---+
|20000101| 1|1.0| x|
|20000102| 1|3.0| x|
|20000101| 2|2.0| y|
|20000102| 2|4.0| y|
+--------+---+---+---+

.. seealso:: :meth:`pyspark.sql.functions.pandas_udf`

"""
# Columns are special because hasattr always returns True
if isinstance(udf, Column) or not hasattr(udf, 'func') \
or udf.evalType != PythonEvalType.SQL_COGROUPED_MAP_PANDAS_UDF:
raise ValueError("Invalid udf: the udf argument must be a pandas_udf of type "
"COGROUPED_MAP.")
all_cols = self._extract_cols(self._gd1) + self._extract_cols(self._gd2)
udf_column = udf(*all_cols)
jdf = self._gd1._jgd.flatMapCoGroupsInPandas(self._gd2._jgd, udf_column._jc.expr())
return DataFrame(jdf, self.sql_ctx)

@staticmethod
def _extract_cols(gd):
df = gd._df
return [df[col] for col in df.columns]
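Regarding the review comment above about three arguments: a minimal sketch of that form, in which the UDF also receives the grouping key as its first argument (per the "argspec includes grouping key" commit). Treating the key as a tuple of values is an assumption here, not something this diff confirms:

    import pandas as pd
    from pyspark.sql.functions import pandas_udf, PandasUDFType

    @pandas_udf("time int, id int, v1 double, v2 string", PandasUDFType.COGROUPED_MAP)
    def asof_join(key, l, r):
        # key: the grouping key for this cogroup (assumed to be a tuple, e.g. (1,))
        return pd.merge_asof(l, r, on="time", by="id")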
5 changes: 4 additions & 1 deletion python/pyspark/sql/functions.py
@@ -2814,6 +2814,8 @@ class PandasUDFType(object):

GROUPED_MAP = PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF

COGROUPED_MAP = PythonEvalType.SQL_COGROUPED_MAP_PANDAS_UDF

GROUPED_AGG = PythonEvalType.SQL_GROUPED_AGG_PANDAS_UDF

MAP_ITER = PythonEvalType.SQL_MAP_PANDAS_ITER_UDF
@@ -3320,7 +3322,8 @@ def pandas_udf(f=None, returnType=None, functionType=None):
PythonEvalType.SQL_SCALAR_PANDAS_ITER_UDF,
PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF,
PythonEvalType.SQL_GROUPED_AGG_PANDAS_UDF,
PythonEvalType.SQL_MAP_PANDAS_ITER_UDF]:
PythonEvalType.SQL_MAP_PANDAS_ITER_UDF,
PythonEvalType.SQL_COGROUPED_MAP_PANDAS_UDF]:
raise ValueError("Invalid functionType: "
"functionType must be one the values from PandasUDFType")

12 changes: 11 additions & 1 deletion python/pyspark/sql/group.py
@@ -22,6 +22,7 @@
from pyspark.sql.column import Column, _to_seq
from pyspark.sql.dataframe import DataFrame
from pyspark.sql.types import *
from pyspark.sql.cogroup import CoGroupedData

__all__ = ["GroupedData"]

@@ -218,6 +219,15 @@ def pivot(self, pivot_col, values=None):
jgd = self._jgd.pivot(pivot_col, values)
return GroupedData(jgd, self._df)

@since(3.0)
def cogroup(self, other):
"""
Cogroups this group with another group so that we can run cogrouped operations.

See :class:`CoGroupedData` for the operations that can be run.
"""
return CoGroupedData(self, other)
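A minimal usage sketch (assumes an active SparkSession bound to `spark`); `cogroup` itself only pairs the two groupings, and the actual work happens in the subsequent `apply`:

    left = spark.createDataFrame([(1, 1.0), (2, 2.0)], ("id", "v1"))
    right = spark.createDataFrame([(1, "x")], ("id", "v2"))
    cogrouped = left.groupby("id").cogroup(right.groupby("id"))
    # cogrouped is a CoGroupedData; call .apply(<COGROUPED_MAP pandas_udf>) to compute.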

@since(2.3)
def apply(self, udf):
"""
Expand All @@ -232,7 +242,7 @@ def apply(self, udf):
The returned `pandas.DataFrame` can be of arbitrary length and its schema must match the
returnType of the pandas udf.

- .. note:: This function requires a full shuffle. all the data of a group will be loaded
+ .. note:: This function requires a full shuffle. All the data of a group will be loaded
into memory, so the user should be aware of the potential OOM risk if data is skewed
and certain groups are too large to fit in memory.
