Commit eec1a3c
[SPARK-23299][SQL][PYSPARK] Fix __repr__ behaviour for Rows
This PR is meant to replace #20503, which lay dormant for a while. The solution in the original PR is still valid, so this is just that patch rebased onto the current master. Original summary follows.

## What changes were proposed in this pull request?

Fix `__repr__` behaviour for Rows. Row's `__repr__` assumes the data is a string when the column name is missing. Examples:

```
>>> from pyspark.sql.types import Row
>>> Row("Alice", "11")
<Row(Alice, 11)>
>>> Row(name="Alice", age=11)
Row(age=11, name='Alice')
>>> Row("Alice", 11)
<snip stack trace>
TypeError: sequence item 1: expected string, int found
```

This is because `Row()`, when called without column names, assumes everything is a string.

## How was this patch tested?

Manually tested, and a unit test was added to `python/pyspark/sql/tests/test_types.py`.

Closes #24448 from tbcs/SPARK-23299.

Lead-authored-by: Tibor Csögör <[email protected]>
Co-authored-by: Shashwat Anand <[email protected]>
Signed-off-by: Holden Karau <[email protected]>
1 parent 6ef4530 commit eec1a3c
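The bug and the fix can be sketched with a minimal stand-alone class. Note `MiniRow` below is a hypothetical stand-in for illustration, not the actual `pyspark.sql.types.Row` (which has much more machinery); it only mirrors the two `__repr__` branches the patch touches:

```python
class MiniRow(tuple):
    """Hypothetical stand-in for pyspark.sql.types.Row (illustration only)."""

    def __new__(cls, *args, **kwargs):
        if kwargs:
            # Keyword form: Row stores values sorted by field name.
            names = sorted(kwargs)
            row = tuple.__new__(cls, [kwargs[k] for k in names])
            row.__fields__ = names
            return row
        # Positional form: values only, no field names.
        return tuple.__new__(cls, args)

    def __repr__(self):
        if hasattr(self, "__fields__"):
            return "Row(%s)" % ", ".join("%s=%r" % (k, v)
                                         for k, v in zip(self.__fields__, tuple(self)))
        # Before the patch this branch was ", ".join(self), which raises
        # TypeError for non-string fields; using %r handles any value.
        return "<Row(%s)>" % ", ".join("%r" % field for field in self)


print(repr(MiniRow("Alice", 11)))           # <Row('Alice', 11)>
print(repr(MiniRow(name="Alice", age=11)))  # Row(age=11, name='Alice')
```

With the old `", ".join(self)` branch, the first call would have raised `TypeError: sequence item 1: expected string, int found`, exactly as in the report above.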

File tree

2 files changed: +25 −2 lines changed

python/pyspark/sql/tests/test_types.py

Lines changed: 12 additions & 0 deletions

```diff
@@ -1,3 +1,4 @@
+# -*- encoding: utf-8 -*-
 #
 # Licensed to the Apache Software Foundation (ASF) under one or more
 # contributor license agreements.  See the NOTICE file distributed with
@@ -739,6 +740,17 @@ def test_timestamp_microsecond(self):
         tst = TimestampType()
         self.assertEqual(tst.toInternal(datetime.datetime.max) % 1000000, 999999)
 
+    # regression test for SPARK-23299
+    def test_row_without_column_name(self):
+        row = Row("Alice", 11)
+        self.assertEqual(repr(row), "<Row('Alice', 11)>")
+
+        # test __repr__ with unicode values
+        if sys.version_info.major >= 3:
+            self.assertEqual(repr(Row("数", "量")), "<Row('数', '量')>")
+        else:
+            self.assertEqual(repr(Row(u"数", u"量")), r"<Row(u'\u6570', u'\u91cf')>")
+
     def test_empty_row(self):
         row = Row()
         self.assertEqual(len(row), 0)
```

python/pyspark/sql/types.py

Lines changed: 13 additions & 2 deletions

```diff
@@ -1435,13 +1435,24 @@ class Row(tuple):
 
     >>> Person = Row("name", "age")
     >>> Person
-    <Row(name, age)>
+    <Row('name', 'age')>
     >>> 'name' in Person
     True
     >>> 'wrong_key' in Person
     False
     >>> Person("Alice", 11)
     Row(name='Alice', age=11)
+
+    This form can also be used to create rows as tuple values, i.e. with unnamed
+    fields. Beware that such Row objects have different equality semantics:
+
+    >>> row1 = Row("Alice", 11)
+    >>> row2 = Row(name="Alice", age=11)
+    >>> row1 == row2
+    False
+    >>> row3 = Row(a="Alice", b=11)
+    >>> row1 == row3
+    True
     """
 
     def __new__(self, *args, **kwargs):
@@ -1549,7 +1560,7 @@ def __repr__(self):
             return "Row(%s)" % ", ".join("%s=%r" % (k, v)
                                          for k, v in zip(self.__fields__, tuple(self)))
         else:
-            return "<Row(%s)>" % ", ".join(self)
+            return "<Row(%s)>" % ", ".join("%r" % field for field in self)
```
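The equality caveat in the new docstring follows from `Row` being a `tuple` subclass: `==` compares positional values only and ignores field names, and keyword-constructed Rows store their values sorted by field name. A plain-tuple illustration of that behaviour (a sketch, not using PySpark itself):

```python
# Row subclasses tuple, so == compares element values positionally and
# never looks at field names. Keyword-built Rows sort fields by name,
# which is what makes the docstring's comparisons come out as they do.

row1 = ("Alice", 11)   # Row("Alice", 11): values kept in call order
row2 = (11, "Alice")   # Row(name="Alice", age=11): sorted keys -> (age, name)
row3 = ("Alice", 11)   # Row(a="Alice", b=11): sorted keys -> (a, b)

print(row1 == row2)  # False: kwarg sorting reordered the values
print(row1 == row3)  # True: same values in the same positions
```

This is why mixing positional and keyword `Row` construction in the same comparison is easy to get wrong: field names `a`/`b` versus `name`/`age` are irrelevant, only the resulting value order matters.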
