From 196d14fdce1cbea059a7d3e734d1d87c4c6db5d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20J=C3=B8rgensen?= <47577197+bjornjorgensen@users.noreply.github.com> Date: Sun, 23 Jan 2022 21:49:35 +0100 Subject: [PATCH 01/14] SPARK-37981 Add note for deleting Null and NaN ### What changes were proposed in this pull request? This is for SPARK-37981: Deletes columns with all Null as default. See also https://github.com/apache/spark/pull/26098 User HyukjinKwon reviewed this on 21 Oct 2019 "Hey, you should document this in DataFrameWrtier, DataStreamWrtier, readwriter.py" ### Why are the changes needed? Users need to know why their column(s) with all NaN or Null are gone. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? No. --- python/pyspark/pandas/generic.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/pyspark/pandas/generic.py b/python/pyspark/pandas/generic.py index 2dac5b056aba0..267cd46c7b497 100644 --- a/python/pyspark/pandas/generic.py +++ b/python/pyspark/pandas/generic.py @@ -904,6 +904,8 @@ def to_json( .. note:: output JSON format is different from pandas'. It always use `orient='records'` for its output. This behaviour might have to change in the near future. + + .. note:: If column have only NaN or Null values. The column well be deleted. Note NaN's and None will be converted to null and datetime objects will be converted to UNIX timestamps. From a073be4620bfdca2bbf299c5b19a1cc846c1f438 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20J=C3=B8rgensen?= <47577197+bjornjorgensen@users.noreply.github.com> Date: Mon, 24 Jan 2022 15:01:02 +0100 Subject: [PATCH 02/14] Set ignoreNullFields to False The default for ignoreNullFields has been set to False to prevent columns with only NaN or Null from being deleted when saving to JSON. 
--- python/pyspark/pandas/generic.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/python/pyspark/pandas/generic.py b/python/pyspark/pandas/generic.py index 267cd46c7b497..e47ecf7ab596c 100644 --- a/python/pyspark/pandas/generic.py +++ b/python/pyspark/pandas/generic.py @@ -889,6 +889,7 @@ def to_json( lines: bool = True, partition_cols: Optional[Union[str, List[str]]] = None, index_col: Optional[Union[str, List[str]]] = None, + ignoreNullFields: bool = False, **options: Any, ) -> Optional[str]: """ @@ -948,6 +949,7 @@ def to_json( the options in PySpark's API documentation for `spark.write.json(...)`. It has a higher priority and overwrites all other options. This parameter only works when `path` is specified. + ignoreNullFields: if set to True and path is provided, writer omits columns with all NaN or Null values. Returns -------- @@ -980,6 +982,8 @@ def to_json( 0 a 1 c """ + options["ignoreNullFields"] = ignoreNullFields + if "options" in options and isinstance(options.get("options"), dict) and len(options) == 1: options = options.get("options") From 670c6905551df33a2fa7c41ba2da9fb0e1634d99 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20J=C3=B8rgensen?= <47577197+bjornjorgensen@users.noreply.github.com> Date: Mon, 24 Jan 2022 15:21:40 +0100 Subject: [PATCH 03/14] Removed note --- python/pyspark/pandas/generic.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/pyspark/pandas/generic.py b/python/pyspark/pandas/generic.py index e47ecf7ab596c..4cbecf932e89b 100644 --- a/python/pyspark/pandas/generic.py +++ b/python/pyspark/pandas/generic.py @@ -906,7 +906,6 @@ def to_json( .. note:: output JSON format is different from pandas'. It always use `orient='records'` for its output. This behaviour might have to change in the near future. - .. note:: If column have only NaN or Null values. The column well be deleted. Note NaN's and None will be converted to null and datetime objects will be converted to UNIX timestamps. 
From da2da56cf8e4f196e96f851283e94f50479e5313 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20J=C3=B8rgensen?= <47577197+bjornjorgensen@users.noreply.github.com> Date: Mon, 24 Jan 2022 15:23:39 +0100 Subject: [PATCH 04/14] Removed extra space. --- python/pyspark/pandas/generic.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/pyspark/pandas/generic.py b/python/pyspark/pandas/generic.py index 4cbecf932e89b..c4d54282c7a06 100644 --- a/python/pyspark/pandas/generic.py +++ b/python/pyspark/pandas/generic.py @@ -906,7 +906,6 @@ def to_json( .. note:: output JSON format is different from pandas'. It always use `orient='records'` for its output. This behaviour might have to change in the near future. - Note NaN's and None will be converted to null and datetime objects will be converted to UNIX timestamps. From fc659ca9173722dc2761fc629a5c51ea2abdb96d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20J=C3=B8rgensen?= <47577197+bjornjorgensen@users.noreply.github.com> Date: Mon, 24 Jan 2022 21:01:34 +0100 Subject: [PATCH 05/14] Fix new line --- python/pyspark/pandas/generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyspark/pandas/generic.py b/python/pyspark/pandas/generic.py index c4d54282c7a06..73dcd711c374d 100644 --- a/python/pyspark/pandas/generic.py +++ b/python/pyspark/pandas/generic.py @@ -905,7 +905,7 @@ def to_json( .. note:: output JSON format is different from pandas'. It always use `orient='records'` for its output. This behaviour might have to change in the near future. - + Note NaN's and None will be converted to null and datetime objects will be converted to UNIX timestamps. From d6f5b53f0e3a77efa760526350ad18b93172fa48 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20J=C3=B8rgensen?= <47577197+bjornjorgensen@users.noreply.github.com> Date: Tue, 25 Jan 2022 10:19:12 +0100 Subject: [PATCH 06/14] Updated options to more Pandas-specific defaults as a dict. 
--- python/pyspark/pandas/generic.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/pyspark/pandas/generic.py b/python/pyspark/pandas/generic.py index 73dcd711c374d..bce92703d30c7 100644 --- a/python/pyspark/pandas/generic.py +++ b/python/pyspark/pandas/generic.py @@ -980,7 +980,8 @@ def to_json( 0 a 1 c """ - options["ignoreNullFields"] = ignoreNullFields + default_options: Dict[str, Any] = {"ignoreNullFields": False} + options = {**default_options, **options} if "options" in options and isinstance(options.get("options"), dict) and len(options) == 1: options = options.get("options") From d8de0c583808efc0b3c04ce4e62362c5705e848e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20J=C3=B8rgensen?= <47577197+bjornjorgensen@users.noreply.github.com> Date: Tue, 25 Jan 2022 11:00:48 +0100 Subject: [PATCH 07/14] Set it as a note. --- python/pyspark/pandas/generic.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/pyspark/pandas/generic.py b/python/pyspark/pandas/generic.py index bce92703d30c7..7a2ba30107f01 100644 --- a/python/pyspark/pandas/generic.py +++ b/python/pyspark/pandas/generic.py @@ -905,6 +905,8 @@ def to_json( .. note:: output JSON format is different from pandas'. It always use `orient='records'` for its output. This behaviour might have to change in the near future. + + .. note:: Set ignoreNullFields keyword argument to `True` and path is provided, writer omits columns with all NaN or Null values. Note NaN's and None will be converted to null and datetime objects will be converted to UNIX timestamps. @@ -947,7 +949,6 @@ def to_json( the options in PySpark's API documentation for `spark.write.json(...)`. It has a higher priority and overwrites all other options. This parameter only works when `path` is specified. - ignoreNullFields: if set to True and path is provided, writer omits columns with all NaN or Null values. 
Returns -------- From 8ffbef254c7f0edd96398716ec2455d21462b868 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20J=C3=B8rgensen?= <47577197+bjornjorgensen@users.noreply.github.com> Date: Tue, 25 Jan 2022 11:03:13 +0100 Subject: [PATCH 08/14] Fix indenting --- python/pyspark/pandas/generic.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/pyspark/pandas/generic.py b/python/pyspark/pandas/generic.py index 7a2ba30107f01..9d4a2d5bc2d41 100644 --- a/python/pyspark/pandas/generic.py +++ b/python/pyspark/pandas/generic.py @@ -906,7 +906,8 @@ def to_json( .. note:: output JSON format is different from pandas'. It always use `orient='records'` for its output. This behaviour might have to change in the near future. - .. note:: Set ignoreNullFields keyword argument to `True` and path is provided, writer omits columns with all NaN or Null values. + .. note:: Set ignoreNullFields keyword argument to `True` and path is provided, + writer omits columns with all NaN or Null values. Note NaN's and None will be converted to null and datetime objects will be converted to UNIX timestamps. From 0d4794c56932b77ad947a01b357b891cbfc8ebee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20J=C3=B8rgensen?= <47577197+bjornjorgensen@users.noreply.github.com> Date: Tue, 25 Jan 2022 12:08:47 +0100 Subject: [PATCH 09/14] Update python/pyspark/pandas/generic.py Co-authored-by: Hyukjin Kwon --- python/pyspark/pandas/generic.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/pyspark/pandas/generic.py b/python/pyspark/pandas/generic.py index 9d4a2d5bc2d41..b969286f1d8a1 100644 --- a/python/pyspark/pandas/generic.py +++ b/python/pyspark/pandas/generic.py @@ -906,8 +906,8 @@ def to_json( .. note:: output JSON format is different from pandas'. It always use `orient='records'` for its output. This behaviour might have to change in the near future. - .. 
note:: Set ignoreNullFields keyword argument to `True` and path is provided, - writer omits columns with all NaN or Null values. + .. note:: Set `ignoreNullFields` keyword argument to `True` to omit `None` or `NaN` values + when writing JSON objects. It works only when `path` is provided. Note NaN's and None will be converted to null and datetime objects will be converted to UNIX timestamps. From fec39c787386440d61c75cae9ea7d3a9a8e71b9c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20J=C3=B8rgensen?= <47577197+bjornjorgensen@users.noreply.github.com> Date: Tue, 25 Jan 2022 12:08:58 +0100 Subject: [PATCH 10/14] Update python/pyspark/pandas/generic.py Co-authored-by: Hyukjin Kwon --- python/pyspark/pandas/generic.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/pyspark/pandas/generic.py b/python/pyspark/pandas/generic.py index b969286f1d8a1..b8dc7e70ec65e 100644 --- a/python/pyspark/pandas/generic.py +++ b/python/pyspark/pandas/generic.py @@ -889,7 +889,6 @@ def to_json( lines: bool = True, partition_cols: Optional[Union[str, List[str]]] = None, index_col: Optional[Union[str, List[str]]] = None, - ignoreNullFields: bool = False, **options: Any, ) -> Optional[str]: """ From 99100ac098bca78b275bc5952bc3ea3c8bbcd665 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20J=C3=B8rgensen?= <47577197+bjornjorgensen@users.noreply.github.com> Date: Tue, 25 Jan 2022 19:39:17 +0100 Subject: [PATCH 11/14] moved merging default_options after if-clause --- python/pyspark/pandas/generic.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/python/pyspark/pandas/generic.py b/python/pyspark/pandas/generic.py index b8dc7e70ec65e..7bf349109614b 100644 --- a/python/pyspark/pandas/generic.py +++ b/python/pyspark/pandas/generic.py @@ -981,11 +981,12 @@ def to_json( 0 a 1 c """ - default_options: Dict[str, Any] = {"ignoreNullFields": False} - options = {**default_options, **options} if "options" in options and isinstance(options.get("options"), dict) and 
len(options) == 1: options = options.get("options") + + default_options: Dict[str, Any] = {"ignoreNullFields": False} + options = {**default_options, **options} if not lines: raise NotImplementedError("lines=False is not implemented yet.") From cd6737be0aff1609e17ca86e0fabc34cac699b53 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20J=C3=B8rgensen?= <47577197+bjornjorgensen@users.noreply.github.com> Date: Tue, 25 Jan 2022 19:42:41 +0100 Subject: [PATCH 12/14] Removed new line --- python/pyspark/pandas/generic.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/pyspark/pandas/generic.py b/python/pyspark/pandas/generic.py index 7bf349109614b..ab79c64f8d027 100644 --- a/python/pyspark/pandas/generic.py +++ b/python/pyspark/pandas/generic.py @@ -981,7 +981,6 @@ def to_json( 0 a 1 c """ - if "options" in options and isinstance(options.get("options"), dict) and len(options) == 1: options = options.get("options") From d992250d93b16ac5301c6b0a0456cad31bddd76b Mon Sep 17 00:00:00 2001 From: bjornjorgensen Date: Fri, 28 Jan 2022 19:23:05 +0100 Subject: [PATCH 13/14] Merge remote-tracking branch 'origin/SPARK-37981-Add-note-for-deleting-Null-and-NaN' --- python/pyspark/pandas/generic.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/pyspark/pandas/generic.py b/python/pyspark/pandas/generic.py index ab79c64f8d027..c459f033f8c31 100644 --- a/python/pyspark/pandas/generic.py +++ b/python/pyspark/pandas/generic.py @@ -904,7 +904,7 @@ def to_json( .. note:: output JSON format is different from pandas'. It always use `orient='records'` for its output. This behaviour might have to change in the near future. - + .. note:: Set `ignoreNullFields` keyword argument to `True` to omit `None` or `NaN` values when writing JSON objects. It works only when `path` is provided. 
@@ -983,7 +983,7 @@ def to_json( """ if "options" in options and isinstance(options.get("options"), dict) and len(options) == 1: options = options.get("options") - + default_options: Dict[str, Any] = {"ignoreNullFields": False} options = {**default_options, **options} From c2df06568c5f4ed8efbed1c00bdffdc99476c624 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20J=C3=B8rgensen?= <47577197+bjornjorgensen@users.noreply.github.com> Date: Sat, 29 Jan 2022 14:20:10 +0100 Subject: [PATCH 14/14] Add Dict, to import --- python/pyspark/pandas/generic.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/pyspark/pandas/generic.py b/python/pyspark/pandas/generic.py index c459f033f8c31..63ce25ec5f2b2 100644 --- a/python/pyspark/pandas/generic.py +++ b/python/pyspark/pandas/generic.py @@ -24,6 +24,7 @@ from typing import ( Any, Callable, + Dict, Iterable, IO, List,