From 3084be1de3ff58a9258dacfb8d7cf575df3fb3c9 Mon Sep 17 00:00:00 2001 From: hyukjinkwon Date: Sat, 6 Oct 2018 18:59:46 +0800 Subject: [PATCH 1/3] Internally document type conversion between Python data and SQL types in UDFs --- python/pyspark/sql/functions.py | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py index 7685264b2d4d..e27b4b71713f 100644 --- a/python/pyspark/sql/functions.py +++ b/python/pyspark/sql/functions.py @@ -2733,6 +2733,33 @@ def udf(f=None, returnType=StringType()): | 8| JOHN DOE| 22| +----------+--------------+------------+ """ + + # The following table shows most of Python data and SQL type conversions in normal UDFs that + # are not yet visible to the user. Some of behaviors are buggy and might be changed in the near + # future. The table might have to be eventually documented externally. + # Please see SPARK-25666's PR to see the codes in order to generate the table below. + # + # +-----------------------------+--------------+----------+------+-------+------+----------+--------------------+-----------------------------+----------+----------------------+---------+--------------------+--------------+----------+--------------+-------------+-------------+ # noqa + # |SQL Type \ Python Value(Type)|None(NoneType)|True(bool)|1(int)|1(long)|a(str)|a(unicode)| 1970-01-01(date)|1970-01-01 00:00:00(datetime)|1.0(float)|array('i', [1])(array)|[1](list)| (1,)(tuple)|ABC(bytearray)|1(Decimal)|{'a': 1}(dict)|Row(a=1)(Row)|Row(a=1)(Row)| # noqa + # +-----------------------------+--------------+----------+------+-------+------+----------+--------------------+-----------------------------+----------+----------------------+---------+--------------------+--------------+----------+--------------+-------------+-------------+ # noqa + # | null| None| None| None| None| None| None| None| None| None| None| None| None| None| None| None| X| X| # noqa + # | boolean| None| True| None| None| None| None| None| None| None| None| None| None| None| None| None| X| X| # noqa + # | tinyint| None| None| 1| 1| None| None| None| None| None| None| None| None| None| None| None| X| X| # noqa + # | smallint| None| None| 1| 1| None| None| None| None| None| None| None| None| None| None| None| X| X| # noqa + # | int| None| None| 1| 1| None| None| None| None| None| None| None| None| None| None| None| X| X| # noqa + # | bigint| None| None| 1| 1| None| None| None| None| None| None| None| None| None| None| None| X| X| # noqa + # | string| None| true| 1| 1| a| a|java.util.Gregori...| java.util.Gregori...| 1.0| [I@7f1970e1| [1]|[Ljava.lang.Objec...| [B@284838a9| 1| {a=1}| X| X| # noqa + # | date| None| X| X| X| X| X| 1970-01-01| 1970-01-01| X| X| X| X| X| X| X| X| X| # noqa + # | timestamp| None| X| X| X| X| X| X| 1970-01-01 00:00:00| X| X| X| X| X| X| X| X| X| # noqa + # | float| None| None| None| None| None| None| None| None| 1.0| None| None| None| None| None| None| X| X| # noqa + # | double| None| None| None| None| None| None| None| None| 1.0| None| None| None| None| None| None| X| X| # noqa + # | array| None| None| None| None| None| None| None| None| None| [1]| [1]| [1]| [65, 66, 67]| None| None| X| X| # noqa + # | binary| None| None| None| None| a| a| None| None| None| None| None| None| ABC| None| None| X| X| # noqa + # | decimal(10,0)| None| None| None| None| None| None| None| None| None| None| None| None| None| 1| None| X| X| # noqa + # | map| None| None| None| None| None| None| None| None| None| None| None| None| None| None| {u'a': 1}| X| X| # noqa + # | struct<_1:int>| None| X| X| X| X| X| X| X| X| X|Row(_1=1)| Row(_1=1)| X| X| Row(_1=None)| Row(_1=1)| Row(_1=1)| # noqa + # +-----------------------------+--------------+----------+------+-------+------+----------+--------------------+-----------------------------+----------+----------------------+---------+--------------------+--------------+----------+--------------+-------------+-------------+ # noqa + # decorator @udf, @udf(), @udf(dataType()) if f is None or isinstance(f, (str, DataType)): # If DataType has been passed as a positional argument From 3aa010320b50fecded7f103292c1b93daf9f3754 Mon Sep 17 00:00:00 2001 From: hyukjinkwon Date: Mon, 8 Oct 2018 13:04:41 +0800 Subject: [PATCH 2/3] Address comments --- python/pyspark/sql/functions.py | 46 +++++++++++++++++++-------------- 1 file changed, 26 insertions(+), 20 deletions(-) diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py index e27b4b71713f..a9dccfdbbb69 100644 --- a/python/pyspark/sql/functions.py +++ b/python/pyspark/sql/functions.py @@ -2739,26 +2739,32 @@ def udf(f=None, returnType=StringType()): # future. The table might have to be eventually documented externally. # Please see SPARK-25666's PR to see the codes in order to generate the table below. # - # +-----------------------------+--------------+----------+------+-------+------+----------+--------------------+-----------------------------+----------+----------------------+---------+--------------------+--------------+----------+--------------+-------------+-------------+ # noqa - # |SQL Type \ Python Value(Type)|None(NoneType)|True(bool)|1(int)|1(long)|a(str)|a(unicode)| 1970-01-01(date)|1970-01-01 00:00:00(datetime)|1.0(float)|array('i', [1])(array)|[1](list)| (1,)(tuple)|ABC(bytearray)|1(Decimal)|{'a': 1}(dict)|Row(a=1)(Row)|Row(a=1)(Row)| # noqa - # +-----------------------------+--------------+----------+------+-------+------+----------+--------------------+-----------------------------+----------+----------------------+---------+--------------------+--------------+----------+--------------+-------------+-------------+ # noqa - # | null| None| None| None| None| None| None| None| None| None| None| None| None| None| None| None| X| X| # noqa - # | boolean| None| True| None| None| None| None| None| None| None| None| None| None| None| None| None| X| X| # noqa - # | tinyint| None| None| 1| 1| None| None| None| None| None| None| None| None| None| None| None| X| X| # noqa - # | smallint| None| None| 1| 1| None| None| None| None| None| None| None| None| None| None| None| X| X| # noqa - # | int| None| None| 1| 1| None| None| None| None| None| None| None| None| None| None| None| X| X| # noqa - # | bigint| None| None| 1| 1| None| None| None| None| None| None| None| None| None| None| None| X| X| # noqa - # | string| None| true| 1| 1| a| a|java.util.Gregori...| java.util.Gregori...| 1.0| [I@7f1970e1| [1]|[Ljava.lang.Objec...| [B@284838a9| 1| {a=1}| X| X| # noqa - # | date| None| X| X| X| X| X| 1970-01-01| 1970-01-01| X| X| X| X| X| X| X| X| X| # noqa - # | timestamp| None| X| X| X| X| X| X| 1970-01-01 00:00:00| X| X| X| X| X| X| X| X| X| # noqa - # | float| None| None| None| None| None| None| None| None| 1.0| None| None| None| None| None| None| X| X| # noqa - # | double| None| None| None| None| None| None| None| None| 1.0| None| None| None| None| None| None| X| X| # noqa - # | array| None| None| None| None| None| None| None| None| None| [1]| [1]| [1]| [65, 66, 67]| None| None| X| X| # noqa - # | binary| None| None| None| None| a| a| None| None| None| None| None| None| ABC| None| None| X| X| # noqa - # | decimal(10,0)| None| None| None| None| None| None| None| None| None| None| None| None| None| 1| None| X| X| # noqa - # | map| None| None| None| None| None| None| None| None| None| None| None| None| None| None| {u'a': 1}| X| X| # noqa - # | struct<_1:int>| None| X| X| X| X| X| X| X| X| X|Row(_1=1)| Row(_1=1)| X| X| Row(_1=None)| Row(_1=1)| Row(_1=1)| # noqa - # +-----------------------------+--------------+----------+------+-------+------+----------+--------------------+-----------------------------+----------+----------------------+---------+--------------------+--------------+----------+--------------+-------------+-------------+ # noqa + # +-----------------------------+--------------+----------+------+-------+---------------+---------------+--------------------+-----------------------------+----------+----------------------+---------+--------------------+-----------------+------------+--------------+------------------+----------------------+ # noqa + # |SQL Type \ Python Value(Type)|None(NoneType)|True(bool)|1(int)|1(long)| a(str)| a(unicode)| 1970-01-01(date)|1970-01-01 00:00:00(datetime)|1.0(float)|array('i', [1])(array)|[1](list)| (1,)(tuple)| ABC(bytearray)| 1(Decimal)|{'a': 1}(dict)|Row(kwargs=1)(Row)|Row(namedtuple=1)(Row)| # noqa + # +-----------------------------+--------------+----------+------+-------+---------------+---------------+--------------------+-----------------------------+----------+----------------------+---------+--------------------+-----------------+------------+--------------+------------------+----------------------+ # noqa + # | boolean| None| True| None| None| None| None| None| None| None| None| None| None| None| None| None| X| X| # noqa + # | tinyint| None| None| 1| 1| None| None| None| None| None| None| None| None| None| None| None| X| X| # noqa + # | smallint| None| None| 1| 1| None| None| None| None| None| None| None| None| None| None| None| X| X| # noqa + # | int| None| None| 1| 1| None| None| None| None| None| None| None| None| None| None| None| X| X| # noqa + # | bigint| None| None| 1| 1| None| None| None| None| None| None| None| None| None| None| None| X| X| # noqa + # | string| None| u'true'| u'1'| u'1'| u'a'| u'a'|u'java.util.Grego...| u'java.util.Grego...| u'1.0'| u'[I@24a83055'| u'[1]'|u'[Ljava.lang.Obj...| u'[B@49093632'| u'1'| u'{a=1}'| X| X| # noqa + # | date| None| X| X| X| X| X|datetime.date(197...| datetime.date(197...| X| X| X| X| X| X| X| X| X| # noqa + # | timestamp| None| X| X| X| X| X| X| datetime.datetime...| X| X| X| X| X| X| X| X| X| # noqa + # | float| None| None| None| None| None| None| None| None| 1.0| None| None| None| None| None| None| X| X| # noqa + # | double| None| None| None| None| None| None| None| None| 1.0| None| None| None| None| None| None| X| X| # noqa + # | array| None| None| None| None| None| None| None| None| None| [1]| [1]| [1]| [65, 66, 67]| None| None| X| X| # noqa + # | binary| None| None| None| None|bytearray(b'a')|bytearray(b'a')| None| None| None| None| None| None|bytearray(b'ABC')| None| None| X| X| # noqa + # | decimal(10,0)| None| None| None| None| None| None| None| None| None| None| None| None| None|Decimal('1')| None| X| X| # noqa + # | map| None| None| None| None| None| None| None| None| None| None| None| None| None| None| {u'a': 1}| X| X| # noqa + # | struct<_1:int>| None| X| X| X| X| X| X| X| X| X|Row(_1=1)| Row(_1=1)| X| X| Row(_1=None)| Row(_1=1)| Row(_1=1)| # noqa + # +-----------------------------+--------------+----------+------+-------+---------------+---------------+--------------------+-----------------------------+----------+----------------------+---------+--------------------+-----------------+------------+--------------+------------------+----------------------+ # noqa + # + # Note: DDL formatted string is used for 'SQL Type' for simplicity. This string can be + # used in `returnType`. + # Note: The values inside of the table are generated by `repr`. + # Note: Python 2 was used to generate this table since it is used to check the backward + # compatibility often in practice. + # Note: 'X' means it throws an exception during the conversion. # decorator @udf, @udf(), @udf(dataType()) if f is None or isinstance(f, (str, DataType)): From 6ee69a37827737760a662a5a9d03f7b5b37fc39e Mon Sep 17 00:00:00 2001 From: hyukjinkwon Date: Mon, 8 Oct 2018 13:07:48 +0800 Subject: [PATCH 3/3] was -> is --- python/pyspark/sql/functions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py index a9dccfdbbb69..3ee008641200 100644 --- a/python/pyspark/sql/functions.py +++ b/python/pyspark/sql/functions.py @@ -2762,7 +2762,7 @@ def udf(f=None, returnType=StringType()): # Note: DDL formatted string is used for 'SQL Type' for simplicity. This string can be # used in `returnType`. # Note: The values inside of the table are generated by `repr`. - # Note: Python 2 was used to generate this table since it is used to check the backward + # Note: Python 2 is used to generate this table since it is used to check the backward # compatibility often in practice. # Note: 'X' means it throws an exception during the conversion.