From 2d2394a3f2e730aa773dc26b14c5f5eaefbfb67e Mon Sep 17 00:00:00 2001 From: Mathias Hauser Date: Mon, 8 Jun 2020 16:24:46 +0200 Subject: [PATCH 1/9] enumerate exclude_dims --- xarray/core/computation.py | 39 ++++++++++++++++++++++++++++++++------ 1 file changed, 33 insertions(+), 6 deletions(-) diff --git a/xarray/core/computation.py b/xarray/core/computation.py index 28bf818e4a3..841e52ebbf8 100644 --- a/xarray/core/computation.py +++ b/xarray/core/computation.py @@ -56,14 +56,18 @@ class _UFuncSignature: __slots__ = ( "input_core_dims", "output_core_dims", + "exclude_dims", "_all_input_core_dims", "_all_output_core_dims", "_all_core_dims", ) - def __init__(self, input_core_dims, output_core_dims=((),)): + def __init__( + self, input_core_dims, output_core_dims=((),), exclude_dims=frozenset() + ): self.input_core_dims = tuple(tuple(a) for a in input_core_dims) self.output_core_dims = tuple(tuple(a) for a in output_core_dims) + self.exclude_dims = exclude_dims self._all_input_core_dims = None self._all_output_core_dims = None self._all_core_dims = None @@ -103,6 +107,7 @@ def __eq__(self, other): return ( self.input_core_dims == other.input_core_dims and self.output_core_dims == other.output_core_dims + and self.exclude_dims == other.exclude_dims ) except AttributeError: return False @@ -111,12 +116,32 @@ def __ne__(self, other): return not self == other def __repr__(self): - return "{}({!r}, {!r})".format( - type(self).__name__, list(self.input_core_dims), list(self.output_core_dims) + return "{}({!r}, {!r}, {!r})".format( + type(self).__name__, + list(self.input_core_dims), + list(self.output_core_dims), + self.exclude_dims, ) def __str__(self): - lhs = ",".join("({})".format(",".join(dims)) for dims in self.input_core_dims) + input_core_dims = self.input_core_dims + + # enumerate to make the exclude_dims unique + if self.exclude_dims: + counter = Counter() + + def _enumerate(dim): + if dim in self.exclude_dims: + n = counter[dim] + counter.update([dim]) + dim = f"{dim}_{n}" + return dim + + input_core_dims = [ + [_enumerate(dim) for dim in arg] for arg in input_core_dims + ] + + lhs = ",".join("({})".format(",".join(dims)) for dims in input_core_dims) rhs = ",".join("({})".format(",".join(dims)) for dims in self.output_core_dims) return f"{lhs}->{rhs}" @@ -136,7 +161,9 @@ def to_gufunc_string(self): ["dim%d" % dims_map[dim] for dim in core_dims] for core_dims in self.output_core_dims ] - alt_signature = type(self)(input_core_dims, output_core_dims) + exclude_dims = frozenset(["dim%d" % dims_map[dim] for dim in self.exclude_dims]) + + alt_signature = type(self)(input_core_dims, output_core_dims, exclude_dims) return str(alt_signature) @@ -992,7 +1019,7 @@ def earth_mover_distance(first_samples, if kwargs is None: kwargs = {} - signature = _UFuncSignature(input_core_dims, output_core_dims) + signature = _UFuncSignature(input_core_dims, output_core_dims, exclude_dims) if exclude_dims and not exclude_dims <= signature.all_core_dims: raise ValueError( From 36d75b4419a1bbff090803852563867ac2e779e6 Mon Sep 17 00:00:00 2001 From: Mathias Hauser Date: Mon, 8 Jun 2020 16:25:01 +0200 Subject: [PATCH 2/9] add tests --- xarray/tests/test_computation.py | 52 ++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/xarray/tests/test_computation.py b/xarray/tests/test_computation.py index 4eed464d2dc..9e095bd10ba 100644 --- a/xarray/tests/test_computation.py +++ b/xarray/tests/test_computation.py @@ -45,6 +45,18 @@ def test_signature_properties(): assert _UFuncSignature([["x"]]) != _UFuncSignature([["y"]]) +def test_signature_properties_exclude(): + sig = _UFuncSignature([["x"], ["x", "y"]], [["z"]], set("x")) + assert sig.input_core_dims == (("x",), ("x", "y")) + assert sig.output_core_dims == (("z",),) + assert sig.all_input_core_dims == frozenset(["x", "y"]) + assert sig.all_output_core_dims == frozenset(["z"]) + assert sig.num_inputs == 2 + assert sig.num_outputs == 1 + assert str(sig) == "(x_0),(x_1,y)->(z)" + assert sig.to_gufunc_string() == "(dim0_0),(dim0_1,dim1)->(dim2)" + + def test_result_name(): class Named: def __init__(self, name=None): @@ -817,6 +829,46 @@ def test_vectorize_dask(): assert_identical(expected, actual) +def pandas_median_add(x, y): + # function which can consume input of unequal length + return pd.Series(x).median() + pd.Series(y).median() + + +def test_vectorize_exclude_dims(): + # GH 3890 + data_array_a = xr.DataArray([[0, 1, 2], [1, 2, 3]], dims=("x", "y")) + data_array_b = xr.DataArray([[0, 1, 2, 3, 4], [1, 2, 3, 4, 5]], dims=("x", "y")) + expected = xr.DataArray([3, 5], dims=["x"]) + actual = apply_ufunc( + pandas_median_add, + data_array_a, + data_array_b, + input_core_dims=[["y"], ["y"]], + vectorize=True, + exclude_dims=set("y"), + ) + assert_identical(expected, actual) + + +@requires_dask +def test_vectorize_exclude_dims_dask(): + # GH 3890 + data_array_a = xr.DataArray([[0, 1, 2], [1, 2, 3]], dims=("x", "y")) + data_array_b = xr.DataArray([[0, 1, 2, 3, 4], [1, 2, 3, 4, 5]], dims=("x", "y")) + expected = xr.DataArray([3, 5], dims=["x"]) + actual = apply_ufunc( + pandas_median_add, + data_array_a.chunk({"x": 1}), + data_array_b.chunk({"x": 1}), + input_core_dims=[["y"], ["y"]], + exclude_dims=set("y"), + vectorize=True, + dask="parallelized", + output_dtypes=[float], + ) + assert_identical(expected, actual) + + @requires_dask def test_vectorize_dask_new_output_dims(): # regression test for GH3574 From 1cc1ada3bb7ce9d62bd3415e24b2935e69932a23 Mon Sep 17 00:00:00 2001 From: Mathias Hauser Date: Mon, 8 Jun 2020 16:53:03 +0200 Subject: [PATCH 3/9] xfail dask test (depends on 4060) --- xarray/tests/test_computation.py | 1 + 1 file changed, 1 insertion(+) diff --git a/xarray/tests/test_computation.py b/xarray/tests/test_computation.py index 04ef7976a5e..ebafae5974c 100644 --- a/xarray/tests/test_computation.py +++ b/xarray/tests/test_computation.py @@ -851,6 +851,7 @@ def test_vectorize_exclude_dims(): assert_identical(expected, actual) +@pytest.mark.xfail(reason="Should work after GH: 4060") @requires_dask def test_vectorize_exclude_dims_dask(): # GH 3890 From 36814e4a590193715df87f532f4a58675f3dd5ae Mon Sep 17 00:00:00 2001 From: Mathias Hauser Date: Mon, 8 Jun 2020 16:53:18 +0200 Subject: [PATCH 4/9] add whats new --- doc/whats-new.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 85e73e1b7e8..5f891e3c0f1 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -149,6 +149,8 @@ Bug fixes By `Benoit Bovy `_. - Fix :py:func:`open_rasterio` for ``WarpedVRT`` with specified ``src_crs``. (:pull:`4104`) By `Dave Cole `_. +- Fix :py:func:`~xarray.apply_ufunc` with ``vectorize=True`` and ``exclude_dims`` (:issue:`3890`). + By `Mathias Hauser `_. Documentation ~~~~~~~~~~~~~ From b5270a6ad06855c585a7c24a29c2275478beb877 Mon Sep 17 00:00:00 2001 From: Mathias Hauser Date: Wed, 19 Aug 2020 10:26:01 +0200 Subject: [PATCH 5/9] add tests again (removed in merge) --- xarray/tests/test_computation.py | 42 ++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/xarray/tests/test_computation.py b/xarray/tests/test_computation.py index ec9e9f8534d..dbb0c231cc0 100644 --- a/xarray/tests/test_computation.py +++ b/xarray/tests/test_computation.py @@ -907,6 +907,48 @@ def test_vectorize_dask_dtype_meta(): assert np.float == actual.dtype +def pandas_median_add(x, y): + # function which can consume input of unequal length + return pd.Series(x).median() + pd.Series(y).median() + + +def test_vectorize_exclude_dims(): + # GH 3890 + data_array_a = xr.DataArray([[0, 1, 2], [1, 2, 3]], dims=("x", "y")) + data_array_b = xr.DataArray([[0, 1, 2, 3, 4], [1, 2, 3, 4, 5]], dims=("x", "y")) + + expected = xr.DataArray([3, 5], dims=["x"]) + actual = apply_ufunc( + pandas_median_add, + data_array_a, + data_array_b, + input_core_dims=[["y"], ["y"]], + vectorize=True, + exclude_dims=set("y"), + ) + assert_identical(expected, actual) + + +@requires_dask +def test_vectorize_exclude_dims_dask(): + # GH 3890 + data_array_a = xr.DataArray([[0, 1, 2], [1, 2, 3]], dims=("x", "y")) + data_array_b = xr.DataArray([[0, 1, 2, 3, 4], [1, 2, 3, 4, 5]], dims=("x", "y")) + + expected = xr.DataArray([3, 5], dims=["x"]) + actual = apply_ufunc( + pandas_median_add, + data_array_a.chunk({"x": 1}), + data_array_b.chunk({"x": 1}), + input_core_dims=[["y"], ["y"]], + exclude_dims=set("y"), + vectorize=True, + dask="parallelized", + output_dtypes=[float], + ) + assert_identical(expected, actual) + + with raises_regex(TypeError, "Only xr.DataArray is supported"): xr.corr(xr.Dataset(), xr.Dataset()) From 505f634bb751a3d44e33762c51ae9b52d8b39ea2 Mon Sep 17 00:00:00 2001 From: Mathias Hauser Date: Wed, 19 Aug 2020 11:40:19 +0200 Subject: [PATCH 6/9] move exclude_dims to to_gufunc_string --- xarray/core/computation.py | 66 ++++++++++++++++++-------------------- 1 file changed, 32 insertions(+), 34 deletions(-) diff --git a/xarray/core/computation.py b/xarray/core/computation.py index 8beb6581da2..57eda9a00e4 100644 --- a/xarray/core/computation.py +++ b/xarray/core/computation.py @@ -58,18 +58,14 @@ class _UFuncSignature: __slots__ = ( "input_core_dims", "output_core_dims", - "exclude_dims", "_all_input_core_dims", "_all_output_core_dims", "_all_core_dims", ) - def __init__( - self, input_core_dims, output_core_dims=((),), exclude_dims=frozenset() - ): + def __init__(self, input_core_dims, output_core_dims=((),)): self.input_core_dims = tuple(tuple(a) for a in input_core_dims) self.output_core_dims = tuple(tuple(a) for a in output_core_dims) - self.exclude_dims = exclude_dims self._all_input_core_dims = None self._all_output_core_dims = None self._all_core_dims = None @@ -115,7 +111,6 @@ def __eq__(self, other): return ( self.input_core_dims == other.input_core_dims and self.output_core_dims == other.output_core_dims - and self.exclude_dims == other.exclude_dims ) except AttributeError: return False @@ -124,36 +119,18 @@ def __ne__(self, other): return not self == other def __repr__(self): - return "{}({!r}, {!r}, {!r})".format( + return "{}({!r}, {!r})".format( type(self).__name__, list(self.input_core_dims), list(self.output_core_dims), - self.exclude_dims, ) def __str__(self): - input_core_dims = self.input_core_dims - - # enumerate to make the exclude_dims unique - if self.exclude_dims: - counter = Counter() - - def _enumerate(dim): - if dim in self.exclude_dims: - n = counter[dim] - counter.update([dim]) - dim = f"{dim}_{n}" - return dim - - input_core_dims = [ - [_enumerate(dim) for dim in arg] for arg in input_core_dims - ] - - lhs = ",".join("({})".format(",".join(dims)) for dims in input_core_dims) + lhs = ",".join("({})".format(",".join(dims)) for dims in self.input_core_dims) rhs = ",".join("({})".format(",".join(dims)) for dims in self.output_core_dims) return f"{lhs}->{rhs}" - def to_gufunc_string(self): + def to_gufunc_string(self, exclude_dims=frozenset()): """Create an equivalent signature string for a NumPy gufunc. Unlike __str__, handles dimensions that don't map to Python @@ -167,9 +144,26 @@ def to_gufunc_string(self): [self.dims_map[dim] for dim in core_dims] for core_dims in self.output_core_dims ] - exclude_dims = frozenset(["dim%d" % dims_map[dim] for dim in self.exclude_dims]) - alt_signature = type(self)(input_core_dims, output_core_dims, exclude_dims) + # enumerate input_core_dims contained in exclude_dims to make them unique + if exclude_dims: + + exclude_dims = [self.dims_map[dim] for dim in exclude_dims] + + counter = Counter() + + def _enumerate(dim): + if dim in exclude_dims: + n = counter[dim] + counter.update([dim]) + dim = f"{dim}_{n}" + return dim + + input_core_dims = [ + [_enumerate(dim) for dim in arg] for arg in input_core_dims + ] + + alt_signature = type(self)(input_core_dims, output_core_dims) return str(alt_signature) @@ -572,10 +566,12 @@ def broadcast_compat_data( return data -def _vectorize(func, signature, output_dtypes): +def _vectorize(func, signature, output_dtypes, exclude_dims): if signature.all_core_dims: func = np.vectorize( - func, otypes=output_dtypes, signature=signature.to_gufunc_string() + func, + otypes=output_dtypes, + signature=signature.to_gufunc_string(exclude_dims), ) else: func = np.vectorize(func, otypes=output_dtypes) @@ -650,7 +646,7 @@ def func(*arrays): res = da.apply_gufunc( numpy_func, - signature.to_gufunc_string(), + signature.to_gufunc_string(exclude_dims), *arrays, vectorize=vectorize, output_dtypes=output_dtypes, @@ -676,7 +672,9 @@ def func(*arrays): ) else: if vectorize: - func = _vectorize(func, signature, output_dtypes=output_dtypes) + func = _vectorize( + func, signature, output_dtypes=output_dtypes, exclude_dims=exclude_dims + ) result_data = func(*input_data) @@ -996,7 +994,7 @@ def earth_mover_distance(first_samples, if kwargs is None: kwargs = {} - signature = _UFuncSignature(input_core_dims, output_core_dims, exclude_dims) + signature = _UFuncSignature(input_core_dims, output_core_dims) if exclude_dims: if not isinstance(exclude_dims, set): From 0cd20bb1230ecd9cd313b4dfac82561c783d1b32 Mon Sep 17 00:00:00 2001 From: Mathias Hauser Date: Wed, 19 Aug 2020 11:40:39 +0200 Subject: [PATCH 7/9] adapt tests --- xarray/tests/test_computation.py | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/xarray/tests/test_computation.py b/xarray/tests/test_computation.py index dbb0c231cc0..32505f24ac4 100644 --- a/xarray/tests/test_computation.py +++ b/xarray/tests/test_computation.py @@ -45,22 +45,13 @@ def test_signature_properties(): assert sig.num_outputs == 1 assert str(sig) == "(x),(x,y)->(z)" assert sig.to_gufunc_string() == "(dim0),(dim0,dim1)->(dim2)" + assert ( + sig.to_gufunc_string(exclude_dims=set("x")) == "(dim0_0),(dim0_1,dim1)->(dim2)" + ) # dimension names matter assert _UFuncSignature([["x"]]) != _UFuncSignature([["y"]]) -def test_signature_properties_exclude(): - sig = _UFuncSignature([["x"], ["x", "y"]], [["z"]], set("x")) - assert sig.input_core_dims == (("x",), ("x", "y")) - assert sig.output_core_dims == (("z",),) - assert sig.all_input_core_dims == frozenset(["x", "y"]) - assert sig.all_output_core_dims == frozenset(["z"]) - assert sig.num_inputs == 2 - assert sig.num_outputs == 1 - assert str(sig) == "(x_0),(x_1,y)->(z)" - assert sig.to_gufunc_string() == "(dim0_0),(dim0_1,dim1)->(dim2)" - - def test_result_name(): class Named: def __init__(self, name=None): From ebbbc6eabefba5f296274dc60b94a25e16ba3fb1 Mon Sep 17 00:00:00 2001 From: Mathias Hauser Date: Wed, 19 Aug 2020 11:41:06 +0200 Subject: [PATCH 8/9] move whats new to 16.1 --- doc/whats-new.rst | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 60332748b3c..1a618df48cd 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -65,7 +65,8 @@ Bug fixes and :py:meth:`DataArray.str.wrap` (:issue:`4334`). By `Mathias Hauser `_. - Fixed overflow issue causing incorrect results in computing means of :py:class:`cftime.datetime` arrays (:issue:`4341`). By `Spencer Clark `_. - +- Fix :py:func:`xarray.apply_ufunc` with ``vectorize=True`` and ``exclude_dims`` (:issue:`3890`). + By `Mathias Hauser `_. Documentation ~~~~~~~~~~~~~ @@ -272,8 +273,6 @@ Bug fixes By `Deepak Cherian `_ - Fix :py:func:`open_rasterio` for ``WarpedVRT`` with specified ``src_crs``. (:pull:`4104`) By `Dave Cole `_. -- Fix :py:func:`~xarray.apply_ufunc` with ``vectorize=True`` and ``exclude_dims`` (:issue:`3890`). - By `Mathias Hauser `_. Documentation ~~~~~~~~~~~~~ From b42766005d26b50c6e59e02e669e5574b3e6700d Mon Sep 17 00:00:00 2001 From: Mathias Hauser Date: Wed, 19 Aug 2020 11:45:03 +0200 Subject: [PATCH 9/9] update docstring --- xarray/core/computation.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/xarray/core/computation.py b/xarray/core/computation.py index 57eda9a00e4..a21915630c6 100644 --- a/xarray/core/computation.py +++ b/xarray/core/computation.py @@ -135,6 +135,8 @@ def to_gufunc_string(self, exclude_dims=frozenset()): Unlike __str__, handles dimensions that don't map to Python identifiers. + + Also creates unique names for input_core_dims contained in exclude_dims. """ input_core_dims = [ [self.dims_map[dim] for dim in core_dims]