
Commit a349d07

New approach for the no-value concept, taken from NumPy
1 parent 30295bf commit a349d07

4 files changed: +98 −10 lines

python/pyspark/__init__.py

Lines changed: 1 addition & 0 deletions

@@ -54,6 +54,7 @@
 from pyspark.taskcontext import TaskContext
 from pyspark.profiler import Profiler, BasicProfiler
 from pyspark.version import __version__
+from pyspark._globals import _NoValue
 
 
 def since(version):
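
As a quick sanity check of what this one-line change enables, the sentinel is now importable from the top-level package (a minimal sketch, assuming a PySpark build that includes this commit):

    # Assumes a PySpark build containing this commit.
    from pyspark import _NoValue
    from pyspark._globals import _NoValueType

    print(repr(_NoValue))              # <no value>
    print(_NoValueType() is _NoValue)  # True: __new__ always returns the one instance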

python/pyspark/_globals.py

Lines changed: 70 additions & 0 deletions

@@ -0,0 +1,70 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+"""
+Module defining global singleton classes.
+
+This module raises a RuntimeError if an attempt to reload it is made. In that
+way the identities of the classes defined here are fixed and will remain so
+even if pyspark itself is reloaded. In particular, a function like the following
+will still work correctly after pyspark is reloaded:
+
+    def foo(arg=pyspark._NoValue):
+        if arg is pyspark._NoValue:
+            ...
+
+See gh-7844 for a discussion of the reload problem that motivated this module.
+
+Note that this approach is taken from NumPy.
+"""
+
+__ALL__ = ['_NoValue']
+
+
+# Disallow reloading this module so as to preserve the identities of the
+# classes defined here.
+if '_is_loaded' in globals():
+    raise RuntimeError('Reloading pyspark._globals is not allowed')
+_is_loaded = True
+
+
+class _NoValueType(object):
+    """Special keyword value.
+
+    The instance of this class may be used as the default value assigned to a
+    deprecated keyword in order to check if it has been given a user-defined
+    value.
+
+    This class was copied from NumPy.
+    """
+    __instance = None
+
+    def __new__(cls):
+        # ensure that only one instance exists
+        if not cls.__instance:
+            cls.__instance = super(_NoValueType, cls).__new__(cls)
+        return cls.__instance
+
+    # needed for python 2 to preserve identity through a pickle
+    def __reduce__(self):
+        return (self.__class__, ())
+
+    def __repr__(self):
+        return "<no value>"
+
+
+_NoValue = _NoValueType()
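
For context, a minimal standalone sketch of the sentinel pattern this module enables. The class mirrors the one above; the function foo is illustrative only. The key point is that `value is _NoValue` is true only when the caller omits the argument, so an explicit None still counts as a real user value:

    class _NoValueType(object):
        """Singleton sentinel distinguishing 'argument omitted' from explicit None."""
        __instance = None

        def __new__(cls):
            # Always hand back the same instance, so identity checks stay valid.
            if not cls.__instance:
                cls.__instance = super(_NoValueType, cls).__new__(cls)
            return cls.__instance

    _NoValue = _NoValueType()

    def foo(arg=_NoValue):
        if arg is _NoValue:
            return "omitted"
        return "given: %r" % (arg,)

    print(foo())      # omitted
    print(foo(None))  # given: None -- an explicit None is a real value

The reload guard above exists for the same reason: if pyspark._globals were re-executed, a second _NoValue instance would appear and these identity checks would silently fail, hence the RuntimeError on reload.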

python/pyspark/sql/dataframe.py

Lines changed: 21 additions & 5 deletions

@@ -27,7 +27,7 @@
 
 import warnings
 
-from pyspark import copy_func, since
+from pyspark import copy_func, since, _NoValue
 from pyspark.rdd import RDD, _load_from_socket, ignore_unicode_prefix
 from pyspark.serializers import ArrowSerializer, BatchedSerializer, PickleSerializer, \
     UTF8Deserializer
@@ -1532,7 +1532,7 @@ def fillna(self, value, subset=None):
         return DataFrame(self._jdf.na().fill(value, self._jseq(subset)), self.sql_ctx)
 
     @since(1.4)
-    def replace(self, to_replace, value=None, subset=None):
+    def replace(self, to_replace, value=_NoValue, subset=None):
         """Returns a new :class:`DataFrame` replacing a value with another value.
         :func:`DataFrame.replace` and :func:`DataFrameNaFunctions.replace` are
         aliases of each other.
@@ -1545,8 +1545,8 @@ def replace(self, to_replace, value=None, subset=None):
 
         :param to_replace: bool, int, long, float, string, list or dict.
             Value to be replaced.
-            If the value is a dict, then `value` is ignored and `to_replace` must be a
-            mapping between a value and a replacement.
+            If the value is a dict, then `value` is ignored or can be omitted, and `to_replace`
+            must be a mapping between a value and a replacement.
         :param value: bool, int, long, float, string, list or None.
             The replacement value must be a bool, int, long, float, string or None. If `value` is a
             list, `value` should be of the same length and type as `to_replace`.
@@ -1577,6 +1577,16 @@ def replace(self, to_replace, value=None, subset=None):
         |null|  null|null|
         +----+------+----+
 
+        >>> df4.na.replace({'Alice': None}).show()
+        +----+------+----+
+        | age|height|name|
+        +----+------+----+
+        |  10|    80|null|
+        |   5|  null| Bob|
+        |null|  null| Tom|
+        |null|  null|null|
+        +----+------+----+
+
         >>> df4.na.replace(['Alice', 'Bob'], ['A', 'B'], 'name').show()
         +----+------+----+
         | age|height|name|
@@ -1587,6 +1597,12 @@ def replace(self, to_replace, value=None, subset=None):
         |null|  null|null|
         +----+------+----+
         """
+        if value is _NoValue:
+            if isinstance(to_replace, dict):
+                value = None
+            else:
+                raise TypeError("value argument is required when to_replace is not a dictionary.")
+
         # Helper functions
         def all_of(types):
             """Given a type or tuple of types and a sequence of xs
@@ -2047,7 +2063,7 @@ def fill(self, value, subset=None):
 
     fill.__doc__ = DataFrame.fillna.__doc__
 
-    def replace(self, to_replace, value, subset=None):
+    def replace(self, to_replace, value=_NoValue, subset=None):
         return self.df.replace(to_replace, value, subset)
 
     replace.__doc__ = DataFrame.replace.__doc__
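
Taken together, a short sketch of the user-visible behavior after this change (assumes an active SparkSession named `spark`; the toy DataFrame is made up for illustration):

    df = spark.createDataFrame([("Alice", 10), ("Bob", 5)], ["name", "age"])

    # Dict form: `value` can now be omitted entirely.
    df.na.replace({"Alice": "Bob"}).show()

    # Non-dict form without `value` now fails fast instead of silently
    # replacing matches with None, as the removed test in tests.py below expected:
    try:
        df.na.replace(["Alice", "Bob"])
    except TypeError as e:
        print(e)  # value argument is required when to_replace is not a dictionary.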

python/pyspark/sql/tests.py

Lines changed: 6 additions & 5 deletions

@@ -2243,11 +2243,6 @@ def test_replace(self):
             .replace(False, True).first())
         self.assertTupleEqual(row, (True, True))
 
-        # replace list while value is not given (default to None)
-        row = self.spark.createDataFrame(
-            [(u'Alice', 10, 80.0)], schema).replace(["Alice", "Bob"]).first()
-        self.assertTupleEqual(row, (None, 10, 80.0))
-
         # replace string with None and then drop None rows
         row = self.spark.createDataFrame(
             [(u'Alice', 10, 80.0)], schema).replace(u'Alice', None).dropna()
@@ -2283,6 +2278,12 @@ def test_replace(self):
             self.spark.createDataFrame(
                 [(u'Alice', 10, 80.1)], schema).replace({u"Alice": u"Bob", 10: 20}).first()
 
+        with self.assertRaisesRegexp(
+                TypeError,
+                'value argument is required when to_replace is not a dictionary.'):
+            self.spark.createDataFrame(
+                [(u'Alice', 10, 80.0)], schema).replace(["Alice", "Bob"]).first()
+
     def test_capture_analysis_exception(self):
         self.assertRaises(AnalysisException, lambda: self.spark.sql("select abc"))
         self.assertRaises(AnalysisException, lambda: self.df.selectExpr("a + b"))
