
Commit 3f855ec

changed toPandas to use Arrow with a pure Python pipeline, since hybrid IPC in Arrow is not ready
1 parent 4227ec6 commit 3f855ec

File tree

2 files changed: +50 -30 lines

python/pyspark/sql/dataframe.py

Lines changed: 48 additions & 2 deletions
@@ -346,7 +346,7 @@ def collect(self):
     @ignore_unicode_prefix
     @since(2.0)
     def collectAsArrow(self):
-        """Returns all the records as an Arrow
+        """Returns all the records as an ArrowRecordBatch
         """
         with SCCallSiteSync(self._sc) as css:
             port = self._jdf.collectAsArrowToPython()
@@ -1531,8 +1531,54 @@ def toPandas(self, useArrow=True):
         1    5  Bob
         """
         import pandas as pd
+
         if useArrow:
-            return self.collectAsArrow().to_pandas()
+            import io
+            from pyarrow.array import from_pylist
+            from pyarrow.table import RecordBatch
+            from pyarrow.ipc import ArrowFileReader, ArrowFileWriter
+
+            names = self.columns  # capture for closure
+
+            # reduce a partition to a serialized ArrowRecordBatch
+            def reducePartition(iterator):
+                cols = [[] for _ in range(len(names))]
+                for row in iterator:
+                    for i in range(len(row)):
+                        cols[i].append(row[i])
+
+                arrs = [from_pylist(c) for c in cols]
+                batch = RecordBatch.from_arrays(names, arrs)
+                sink = io.BytesIO()
+                writer = ArrowFileWriter(sink, batch.schema)
+                writer.write_record_batch(batch)
+                writer.close()
+                yield sink.getvalue()
+
+            # convert partitions to serialized ArrowRecordBatches and collect the byte arrays
+            batch_bytes = self.rdd.mapPartitions(reducePartition).collect()
+
+            def read_batch(b):
+                reader = ArrowFileReader(bytes(b))
+                return reader.get_record_batch(0)
+
+            # deserialize each ArrowRecordBatch and create a Pandas DataFrame from it
+            frames = [read_batch(b).to_pandas() for b in batch_bytes]
+
+            # merge all DataFrames into one
+            return pd.concat(frames, ignore_index=True)
+
+            # ~ alternate to concat ~
+            # batch = read_batch(batch_bytes[0])
+            # pdf = batch.to_pandas()
+            # for i in range(1, len(batch_bytes)):
+            #     batch = read_batch(batch_bytes[i])
+            #     pdf = pdf.append(batch.to_pandas(), ignore_index=True)
+            #
+            # return pdf
+
+            # TODO - use the Arrow hybrid (Java -> C++) pipeline once it is ready
+            # return self.collectAsArrow().to_pandas()
         else:
             return pd.DataFrame.from_records(self.collect(), columns=self.columns)

python/pyspark/sql/tests.py

Lines changed: 2 additions & 28 deletions
@@ -1971,37 +1971,11 @@ def setUpClass(cls):
         ReusedPySparkTestCase.setUpClass()
         cls.spark = SparkSession(cls.sc)

-    '''
-    # TODO - remove, just testing pyarrow api
-    def test_no_ser(self):
-        import io
-        import pandas as pd
-        from pandas.util.testing import assert_frame_equal
-        from pyarrow.ipc import ArrowFileReader, ArrowFileWriter
-        pdf = pd.DataFrame({'test': [1.5]})
-        batch = pyarrow.RecordBatch.from_pandas(pdf)
-        sink = io.BytesIO()
-        writer = ArrowFileWriter(sink, batch.schema)
-        writer.write_record_batch(batch)
-        writer.close()
-        data = [[bytearray(sink.getvalue())]]
-        schema = StructType([StructField('test', BinaryType())])
-        df = self.spark.createDataFrame(data, schema=schema)
-        rows = df.collect()
-        reader = ArrowFileReader(bytes(rows[0][0]))
-        batch_rt = reader.get_record_batch(0)
-        pdf_rt = batch_rt.to_pandas()
-        assert_frame_equal(pdf, pdf_rt)
-    '''
-
-    def test_arrow_round_trip(self):
+    def test_arrow_toPandas(self):
         df = self.spark.createDataFrame([(1, "1"), (2, "2"), (1, "2"), (1, "2")], ["key", "value"])
         pdf = df.toPandas(useArrow=False)
         pdf_arrow = df.toPandas(useArrow=True)
-        # TODO - compare Pandas DataFrames
-        print(pdf)
-        print(pdf_arrow)
-        self.assertTrue(False)
+        self.assertTrue(pdf.equals(pdf_arrow))


 if __name__ == "__main__":
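
Note on the comparison: pdf.equals(pdf_arrow) yields only a pass/fail boolean. A minimal alternative sketch using assert_frame_equal (the same helper the deleted test imported), which reports the differing values on failure; it assumes the same SparkSession fixture as the test above.

    from pandas.util.testing import assert_frame_equal  # moved to pandas.testing in later pandas

    def test_arrow_toPandas(self):
        # build a small DataFrame and convert it both with and without Arrow
        df = self.spark.createDataFrame([(1, "1"), (2, "2"), (1, "2"), (1, "2")], ["key", "value"])
        pdf = df.toPandas(useArrow=False)
        pdf_arrow = df.toPandas(useArrow=True)
        # unlike assertTrue(pdf.equals(...)), this prints which cells differ on failure
        assert_frame_equal(pdf, pdf_arrow)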
