From 376149aa33b16f39f21fc209d11aef232faa2bbd Mon Sep 17 00:00:00 2001
From: Bryan Cutler
Date: Fri, 25 May 2018 10:19:04 -0700
Subject: [PATCH 1/8] add experimental to pandas_udf

---
 python/pyspark/sql/functions.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py
index fbc8a2d038f8f..443f40e4d8f1d 100644
--- a/python/pyspark/sql/functions.py
+++ b/python/pyspark/sql/functions.py
@@ -2448,6 +2448,8 @@ def udf(f=None, returnType=StringType()):
 @since(2.3)
 def pandas_udf(f=None, returnType=None, functionType=None):
     """
+    .. note:: Experimental
+
     Creates a vectorized user defined function (UDF).
 
     :param f: user-defined function. A python function if used as a standalone function

From fb05282803f5c470c262263c2d5669f9a2f914db Mon Sep 17 00:00:00 2001
From: Bryan Cutler
Date: Sat, 26 May 2018 00:15:18 -0700
Subject: [PATCH 2/8] add experimental note to other arrow functionality

---
 python/pyspark/sql/dataframe.py | 2 ++
 python/pyspark/sql/group.py     | 2 ++
 python/pyspark/sql/session.py   | 2 ++
 3 files changed, 6 insertions(+)

diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py
index 213dc158f9328..808235ab25440 100644
--- a/python/pyspark/sql/dataframe.py
+++ b/python/pyspark/sql/dataframe.py
@@ -1975,6 +1975,8 @@ def toPandas(self):
         .. note:: This method should only be used if the resulting Pandas's DataFrame is
             expected to be small, as all the data is loaded into the driver's memory.
 
+        .. note:: Usage with spark.sql.execution.arrow.enabled=True is experimental.
+
         >>> df.toPandas()  # doctest: +SKIP
            age   name
         0    2  Alice
diff --git a/python/pyspark/sql/group.py b/python/pyspark/sql/group.py
index 3505065b648f2..0906c9c6b329a 100644
--- a/python/pyspark/sql/group.py
+++ b/python/pyspark/sql/group.py
@@ -236,6 +236,8 @@ def apply(self, udf):
         into memory, so the user should be aware of the potential OOM risk if data is skewed
         and certain groups are too large to fit in memory.
 
+        .. note:: Experimental
+
         :param udf: a grouped map user-defined function returned by
             :func:`pyspark.sql.functions.pandas_udf`.
 
diff --git a/python/pyspark/sql/session.py b/python/pyspark/sql/session.py
index 13d6e2e53dbd0..d675a240172a7 100644
--- a/python/pyspark/sql/session.py
+++ b/python/pyspark/sql/session.py
@@ -584,6 +584,8 @@ def createDataFrame(self, data, schema=None, samplingRatio=None, verifySchema=Tr
         .. versionchanged:: 2.1
            Added verifySchema.
 
+        .. note:: Usage with spark.sql.execution.arrow.enabled=True is experimental.
+
         >>> l = [('Alice', 1)]
         >>> spark.createDataFrame(l).collect()
         [Row(_1=u'Alice', _2=1)]

From 0bce5227c4ba6d2257878643cb4b8e8b75a334b9 Mon Sep 17 00:00:00 2001
From: Bryan Cutler
Date: Sat, 26 May 2018 15:03:51 -0700
Subject: [PATCH 3/8] moved placement of note in pandas udf

---
 python/pyspark/sql/functions.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py
index 443f40e4d8f1d..efcce25a08e04 100644
--- a/python/pyspark/sql/functions.py
+++ b/python/pyspark/sql/functions.py
@@ -2448,8 +2448,6 @@ def udf(f=None, returnType=StringType()):
 @since(2.3)
 def pandas_udf(f=None, returnType=None, functionType=None):
     """
-    .. note:: Experimental
-
     Creates a vectorized user defined function (UDF).
 
     :param f: user-defined function. A python function if used as a standalone function
@@ -2458,6 +2456,8 @@ def pandas_udf(f=None, returnType=None, functionType=None):
     :param functionType: an enum value in :class:`pyspark.sql.functions.PandasUDFType`.
                          Default: SCALAR.
 
+    .. note:: Experimental
+
     The function type of the UDF can be one of the following:
 
     1. SCALAR

From a1e70aa04d698d8fef6d20da6cdfc5867d074462 Mon Sep 17 00:00:00 2001
From: Bryan Cutler
Date: Sat, 26 May 2018 15:04:07 -0700
Subject: [PATCH 4/8] added line in migration guide

---
 docs/sql-programming-guide.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md
index fc26562ff33da..d7213cce31953 100644
--- a/docs/sql-programming-guide.md
+++ b/docs/sql-programming-guide.md
@@ -1826,6 +1826,7 @@ working with timestamps in `pandas_udf`s to get the best performance, see
   - In version 2.3 and earlier, Spark converts Parquet Hive tables by default but ignores table properties like `TBLPROPERTIES (parquet.compression 'NONE')`. This happens for ORC Hive table properties like `TBLPROPERTIES (orc.compress 'NONE')` in case of `spark.sql.hive.convertMetastoreOrc=true`, too. Since Spark 2.4, Spark respects Parquet/ORC specific table properties while converting Parquet/ORC Hive tables. As an example, `CREATE TABLE t(id int) STORED AS PARQUET TBLPROPERTIES (parquet.compression 'NONE')` would generate Snappy parquet files during insertion in Spark 2.3, and in Spark 2.4, the result would be uncompressed parquet files.
   - Since Spark 2.0, Spark converts Parquet Hive tables by default for better performance. Since Spark 2.4, Spark converts ORC Hive tables by default, too. It means Spark uses its own ORC support by default instead of Hive SerDe. As an example, `CREATE TABLE t(id int) STORED AS ORC` would be handled with Hive SerDe in Spark 2.3, and in Spark 2.4, it would be converted into Spark's ORC data source table and ORC vectorization would be applied. To set `false` to `spark.sql.hive.convertMetastoreOrc` restores the previous behavior.
   - In version 2.3 and earlier, CSV rows are considered as malformed if at least one column value in the row is malformed. CSV parser dropped such rows in the DROPMALFORMED mode or outputs an error in the FAILFAST mode. Since Spark 2.4, CSV row is considered as malformed only when it contains malformed column values requested from CSV datasource, other values can be ignored. As an example, CSV file contains the "id,name" header and one row "1234". In Spark 2.4, selection of the id column consists of a row with one column value 1234 but in Spark 2.3 and earlier it is empty in the DROPMALFORMED mode. To restore the previous behavior, set `spark.sql.csv.parser.columnPruning.enabled` to `false`.
+  - As of version 2.3.1 Arrow functionality, including `pandas_udf` and `toPandas()`/`createDataFrame()` with "spark.sql.execution.arrow.enabled=True," has been marked as experimental. These are still evolving and not currently recommended for use in production.
 
 ## Upgrading From Spark SQL 2.2 to 2.3
 

From 38a765d96e723afe9d318098f6af540397664597 Mon Sep 17 00:00:00 2001
From: Bryan Cutler
Date: Sat, 26 May 2018 20:57:05 -0700
Subject: [PATCH 5/8] Made section in migration guide for 2.3.1+

---
 docs/sql-programming-guide.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md
index d7213cce31953..0adc407ab8789 100644
--- a/docs/sql-programming-guide.md
+++ b/docs/sql-programming-guide.md
@@ -1826,6 +1826,8 @@ working with timestamps in `pandas_udf`s to get the best performance, see
   - In version 2.3 and earlier, Spark converts Parquet Hive tables by default but ignores table properties like `TBLPROPERTIES (parquet.compression 'NONE')`. This happens for ORC Hive table properties like `TBLPROPERTIES (orc.compress 'NONE')` in case of `spark.sql.hive.convertMetastoreOrc=true`, too. Since Spark 2.4, Spark respects Parquet/ORC specific table properties while converting Parquet/ORC Hive tables. As an example, `CREATE TABLE t(id int) STORED AS PARQUET TBLPROPERTIES (parquet.compression 'NONE')` would generate Snappy parquet files during insertion in Spark 2.3, and in Spark 2.4, the result would be uncompressed parquet files.
   - Since Spark 2.0, Spark converts Parquet Hive tables by default for better performance. Since Spark 2.4, Spark converts ORC Hive tables by default, too. It means Spark uses its own ORC support by default instead of Hive SerDe. As an example, `CREATE TABLE t(id int) STORED AS ORC` would be handled with Hive SerDe in Spark 2.3, and in Spark 2.4, it would be converted into Spark's ORC data source table and ORC vectorization would be applied. To set `false` to `spark.sql.hive.convertMetastoreOrc` restores the previous behavior.
   - In version 2.3 and earlier, CSV rows are considered as malformed if at least one column value in the row is malformed. CSV parser dropped such rows in the DROPMALFORMED mode or outputs an error in the FAILFAST mode. Since Spark 2.4, CSV row is considered as malformed only when it contains malformed column values requested from CSV datasource, other values can be ignored. As an example, CSV file contains the "id,name" header and one row "1234". In Spark 2.4, selection of the id column consists of a row with one column value 1234 but in Spark 2.3 and earlier it is empty in the DROPMALFORMED mode. To restore the previous behavior, set `spark.sql.csv.parser.columnPruning.enabled` to `false`.
+
+## Upgrading From Spark SQL 2.3.0 to 2.3.1 and Above
   - As of version 2.3.1 Arrow functionality, including `pandas_udf` and `toPandas()`/`createDataFrame()` with "spark.sql.execution.arrow.enabled=True," has been marked as experimental. These are still evolving and not currently recommended for use in production.
 
 ## Upgrading From Spark SQL 2.2 to 2.3

From f977293d4b441b25dd00684571935f2894207645 Mon Sep 17 00:00:00 2001
From: Bryan Cutler
Date: Sat, 26 May 2018 20:58:22 -0700
Subject: [PATCH 6/8] forgot newline

---
 docs/sql-programming-guide.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md
index 0adc407ab8789..d461ee024c0e4 100644
--- a/docs/sql-programming-guide.md
+++ b/docs/sql-programming-guide.md
@@ -1828,6 +1828,7 @@ working with timestamps in `pandas_udf`s to get the best performance, see
   - In version 2.3 and earlier, CSV rows are considered as malformed if at least one column value in the row is malformed. CSV parser dropped such rows in the DROPMALFORMED mode or outputs an error in the FAILFAST mode. Since Spark 2.4, CSV row is considered as malformed only when it contains malformed column values requested from CSV datasource, other values can be ignored. As an example, CSV file contains the "id,name" header and one row "1234". In Spark 2.4, selection of the id column consists of a row with one column value 1234 but in Spark 2.3 and earlier it is empty in the DROPMALFORMED mode. To restore the previous behavior, set `spark.sql.csv.parser.columnPruning.enabled` to `false`.
 
 ## Upgrading From Spark SQL 2.3.0 to 2.3.1 and Above
+
   - As of version 2.3.1 Arrow functionality, including `pandas_udf` and `toPandas()`/`createDataFrame()` with "spark.sql.execution.arrow.enabled=True," has been marked as experimental. These are still evolving and not currently recommended for use in production.
 
 ## Upgrading From Spark SQL 2.2 to 2.3

From 84b52f3f10afa2a9d5d386fd1090c1dcf993d59d Mon Sep 17 00:00:00 2001
From: Bryan Cutler
Date: Sat, 26 May 2018 21:02:39 -0700
Subject: [PATCH 7/8] use backticks for conf

---
 docs/sql-programming-guide.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md
index d461ee024c0e4..a972f23df279e 100644
--- a/docs/sql-programming-guide.md
+++ b/docs/sql-programming-guide.md
@@ -1829,7 +1829,7 @@ working with timestamps in `pandas_udf`s to get the best performance, see
 
 ## Upgrading From Spark SQL 2.3.0 to 2.3.1 and Above
 
-  - As of version 2.3.1 Arrow functionality, including `pandas_udf` and `toPandas()`/`createDataFrame()` with "spark.sql.execution.arrow.enabled=True," has been marked as experimental. These are still evolving and not currently recommended for use in production.
+  - As of version 2.3.1 Arrow functionality, including `pandas_udf` and `toPandas()`/`createDataFrame()` with `spark.sql.execution.arrow.enabled` set to `True`, has been marked as experimental. These are still evolving and not currently recommended for use in production.
 
 ## Upgrading From Spark SQL 2.2 to 2.3
 

From 6b873309b3db7022cfe47561c47d7ae320a46d65 Mon Sep 17 00:00:00 2001
From: Bryan Cutler
Date: Sun, 27 May 2018 21:11:30 -0700
Subject: [PATCH 8/8] fix case in heading

---
 docs/sql-programming-guide.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md
index a972f23df279e..50600861912b1 100644
--- a/docs/sql-programming-guide.md
+++ b/docs/sql-programming-guide.md
@@ -1827,7 +1827,7 @@ working with timestamps in `pandas_udf`s to get the best performance, see
   - Since Spark 2.0, Spark converts Parquet Hive tables by default for better performance. Since Spark 2.4, Spark converts ORC Hive tables by default, too. It means Spark uses its own ORC support by default instead of Hive SerDe. As an example, `CREATE TABLE t(id int) STORED AS ORC` would be handled with Hive SerDe in Spark 2.3, and in Spark 2.4, it would be converted into Spark's ORC data source table and ORC vectorization would be applied. To set `false` to `spark.sql.hive.convertMetastoreOrc` restores the previous behavior.
   - In version 2.3 and earlier, CSV rows are considered as malformed if at least one column value in the row is malformed. CSV parser dropped such rows in the DROPMALFORMED mode or outputs an error in the FAILFAST mode. Since Spark 2.4, CSV row is considered as malformed only when it contains malformed column values requested from CSV datasource, other values can be ignored. As an example, CSV file contains the "id,name" header and one row "1234". In Spark 2.4, selection of the id column consists of a row with one column value 1234 but in Spark 2.3 and earlier it is empty in the DROPMALFORMED mode. To restore the previous behavior, set `spark.sql.csv.parser.columnPruning.enabled` to `false`.
 
-## Upgrading From Spark SQL 2.3.0 to 2.3.1 and Above
+## Upgrading From Spark SQL 2.3.0 to 2.3.1 and above
 
   - As of version 2.3.1 Arrow functionality, including `pandas_udf` and `toPandas()`/`createDataFrame()` with `spark.sql.execution.arrow.enabled` set to `True`, has been marked as experimental. These are still evolving and not currently recommended for use in production.
 
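
---

Reviewer note (not part of the patches): the Arrow-backed features these patches mark experimental can be exercised end to end with a short PySpark script. The sketch below is illustrative only; it assumes a Spark 2.3+ installation with `pandas` and `pyarrow` available, and the session settings, sample data, and UDF names (`plus_one`, `center`) are made up for the example.

```python
import pandas as pd

from pyspark.sql import SparkSession
from pyspark.sql.functions import pandas_udf, PandasUDFType

# Build a session with the experimental Arrow conversion path enabled --
# the conf the notes above flag for toPandas() and createDataFrame().
spark = (
    SparkSession.builder
    .appName("arrow-experimental-demo")
    .config("spark.sql.execution.arrow.enabled", "true")
    .getOrCreate()
)

# createDataFrame() from a pandas DataFrame takes the Arrow path when enabled.
pdf = pd.DataFrame({"id": [1, 1, 2], "v": [1.0, 2.0, 3.0]})
df = spark.createDataFrame(pdf)

# A SCALAR pandas_udf receives and returns pandas.Series, one Arrow batch
# at a time, instead of operating row by row.
@pandas_udf("double", PandasUDFType.SCALAR)
def plus_one(v):
    return v + 1

# A GROUPED_MAP pandas_udf (used with GroupedData.apply, annotated in
# group.py above) maps a pandas DataFrame per group to a pandas DataFrame.
@pandas_udf("id long, v double", PandasUDFType.GROUPED_MAP)
def center(group):
    return group.assign(v=group["v"] - group["v"].mean())

df.withColumn("v_plus_one", plus_one(df["v"])).show()
df.groupby("id").apply(center).show()

# toPandas() also takes the Arrow path while the conf is enabled.
print(df.toPandas())

spark.stop()
```

With `spark.sql.execution.arrow.enabled` left at its default of `false`, the script still runs, but `toPandas()` and `createDataFrame()` fall back to the slower row-based conversion; the conf gates only those conversion paths, while `pandas_udf` execution uses Arrow regardless.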