diff --git a/doc/source/user_guide/window.rst b/doc/source/user_guide/window.rst index be9c04ae5d4f3..5efb3f40f5018 100644 --- a/doc/source/user_guide/window.rst +++ b/doc/source/user_guide/window.rst @@ -157,6 +157,18 @@ By default the labels are set to the right edge of the window, but a s.rolling(window=5, center=True).mean() +This can also be applied to datetime-like indices. +.. versionadded:: 1.3 +.. ipython:: python + + df = pd.DataFrame( + {"A": [0, 1, 2, 3, 4]}, index=pd.date_range("2020", periods=5, freq="1D") + ) + df + df.rolling("2D", center=False).mean() + df.rolling("2D", center=True).mean() + + .. _window.endpoints: Rolling window endpoints diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 63902b53ea36d..44266c55c70eb 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -134,6 +134,23 @@ a copy will no longer be made (:issue:`32960`) The default behavior when not passing ``copy`` will remain unchanged, i.e. a copy will be made. +Centered Datetime-Like Rolling Windows +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +When performing rolling calculations on :class:`DataFrame` and :class:`Series` +objects with a datetime-like index, a centered datetime-like window can now be +used (:issue:`38780`). +For example: + +.. ipython:: python + + df = pd.DataFrame( + {"A": [0, 1, 2, 3, 4]}, index=pd.date_range("2020", periods=5, freq="1D") + ) + df + df.rolling("2D", center=True).mean() + + .. _whatsnew_130.enhancements.other: Other enhancements diff --git a/pandas/_libs/window/indexers.pyx b/pandas/_libs/window/indexers.pyx index 67b196b7cb179..5e2b137db64a6 100644 --- a/pandas/_libs/window/indexers.pyx +++ b/pandas/_libs/window/indexers.pyx @@ -14,7 +14,7 @@ def calculate_variable_window_bounds( int64_t num_values, int64_t window_size, object min_periods, # unused but here to match get_window_bounds signature - object center, # unused but here to match get_window_bounds signature + bint center, object closed, const int64_t[:] index ): @@ -32,8 +32,8 @@ def calculate_variable_window_bounds( min_periods : object ignored, exists for compatibility - center : object - ignored, exists for compatibility + center : bint + center the rolling window on the current observation closed : str string of side of the window that should be closed @@ -46,7 +46,8 @@ def calculate_variable_window_bounds( (ndarray[int64], ndarray[int64]) """ cdef: - bint left_closed = False, right_closed = False + bint left_closed = False + bint right_closed = False ndarray[int64_t, ndim=1] start, end int64_t start_bound, end_bound, index_growth_sign = 1 Py_ssize_t i, j @@ -77,14 +78,27 @@ def calculate_variable_window_bounds( # right endpoint is open else: end[0] = 0 + if center: + for j in range(0, num_values + 1): + if (index[j] == index[0] + index_growth_sign * window_size / 2 and + right_closed): + end[0] = j + 1 + break + elif index[j] >= index[0] + index_growth_sign * window_size / 2: + end[0] = j + break with nogil: # start is start of slice interval (including) # end is end of slice interval (not including) for i in range(1, num_values): - end_bound = index[i] - start_bound = index[i] - index_growth_sign * window_size + if center: + end_bound = index[i] + index_growth_sign * window_size / 2 + start_bound = index[i] - index_growth_sign * window_size / 2 + else: + end_bound = index[i] + start_bound = index[i] - index_growth_sign * window_size # left endpoint is closed if left_closed: @@ -98,14 +112,27 @@ def calculate_variable_window_bounds( start[i] = j break + # for centered window advance the end bound until we are + # outside the constraint + if center: + for j in range(end[i - 1], num_values + 1): + if j == num_values: + end[i] = j + elif ((index[j] - end_bound) * index_growth_sign == 0 and + right_closed): + end[i] = j + 1 + break + elif (index[j] - end_bound) * index_growth_sign >= 0: + end[i] = j + break # end bound is previous end # or current index - if (index[end[i - 1]] - end_bound) * index_growth_sign <= 0: + elif (index[end[i - 1]] - end_bound) * index_growth_sign <= 0: end[i] = i + 1 else: end[i] = end[i - 1] # right endpoint is open - if not right_closed: + if not right_closed and not center: end[i] -= 1 return start, end diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index b482934dd25d2..f11544cb62e97 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -378,7 +378,9 @@ def _get_window_indexer(self) -> BaseIndexer: return self.window if self._win_freq_i8 is not None: return VariableWindowIndexer( - index_array=self._index_array, window_size=self._win_freq_i8 + index_array=self._index_array, + window_size=self._win_freq_i8, + center=self.center, ) return FixedWindowIndexer(window_size=self.window) @@ -1470,13 +1472,6 @@ def validate(self): self._validate_monotonic() - # we don't allow center - if self.center: - raise NotImplementedError( - "center is not implemented for " - "datetimelike and offset based windows" - ) - # this will raise ValueError on non-fixed freqs try: freq = to_offset(self.window) diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index cfd09d0842418..9abae632e5da3 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -80,7 +80,8 @@ def test_constructor_with_timedelta_window(window): # GH 15440 n = 10 df = DataFrame( - {"value": np.arange(n)}, index=date_range("2015-12-24", periods=n, freq="D") + {"value": np.arange(n)}, + index=date_range("2015-12-24", periods=n, freq="D"), ) expected_data = np.append([0.0, 1.0], np.arange(3.0, 27.0, 3)) @@ -99,7 +100,8 @@ def test_constructor_timedelta_window_and_minperiods(window, raw): # GH 15305 n = 10 df = DataFrame( - {"value": np.arange(n)}, index=date_range("2017-08-08", periods=n, freq="D") + {"value": np.arange(n)}, + index=date_range("2017-08-08", periods=n, freq="D"), ) expected = DataFrame( {"value": np.append([np.NaN, 1.0], np.arange(3.0, 27.0, 3))}, @@ -130,15 +132,105 @@ def test_closed_fixed(closed, arithmetic_win_operators): df_fixed = DataFrame({"A": [0, 1, 2, 3, 4]}) df_time = DataFrame({"A": [0, 1, 2, 3, 4]}, index=date_range("2020", periods=5)) - result = getattr(df_fixed.rolling(2, closed=closed, min_periods=1), func_name)() - expected = getattr(df_time.rolling("2D", closed=closed), func_name)().reset_index( - drop=True - ) + result = getattr( + df_fixed.rolling(2, closed=closed, min_periods=1), + func_name, + )() + expected = getattr( + df_time.rolling("2D", closed=closed, min_periods=1), + func_name, + )().reset_index(drop=True) tm.assert_frame_equal(result, expected) -def test_closed_fixed_binary_col(): +@pytest.mark.parametrize( + "closed, window_selections", + [ + ( + "both", + [ + [True, True, False, False, False], + [True, True, True, False, False], + [False, True, True, True, False], + [False, False, True, True, True], + [False, False, False, True, True], + ], + ), + ( + "left", + [ + [True, False, False, False, False], + [True, True, False, False, False], + [False, True, True, False, False], + [False, False, True, True, False], + [False, False, False, True, True], + ], + ), + ( + "right", + [ + [True, True, False, False, False], + [False, True, True, False, False], + [False, False, True, True, False], + [False, False, False, True, True], + [False, False, False, False, True], + ], + ), + ( + "neither", + [ + [True, False, False, False, False], + [False, True, False, False, False], + [False, False, True, False, False], + [False, False, False, True, False], + [False, False, False, False, True], + ], + ), + ], +) +def test_datetimelike_centered_selections( + closed, window_selections, arithmetic_win_operators +): + # GH 34315 + func_name = arithmetic_win_operators + df_time = DataFrame( + {"A": [0.0, 1.0, 2.0, 3.0, 4.0]}, index=date_range("2020", periods=5) + ) + + expected = DataFrame( + {"A": [getattr(df_time["A"].iloc[s], func_name)() for s in window_selections]}, + index=date_range("2020", periods=5), + ) + + if func_name == "sem": + kwargs = {"ddof": 0} + else: + kwargs = {} + + result = getattr( + df_time.rolling("2D", closed=closed, min_periods=1, center=True), + func_name, + )(**kwargs) + + tm.assert_frame_equal(result, expected, check_dtype=False) + + +def test_even_number_window_alignment(): + # see discussion in GH 38780 + s = Series(range(3), index=date_range(start="2020-01-01", freq="D", periods=3)) + + # behavior of index- and datetime-based windows differs here! + # s.rolling(window=2, min_periods=1, center=True).mean() + + result = s.rolling(window="2D", min_periods=1, center=True).mean() + + expected = Series([0.5, 1.5, 2], index=s.index) + + tm.assert_series_equal(result, expected) + + +def test_closed_fixed_binary_col(center): # GH 34315 data = [0, 1, 1, 0, 0, 1, 0, 1] df = DataFrame( @@ -146,13 +238,19 @@ def test_closed_fixed_binary_col(): index=date_range(start="2020-01-01", freq="min", periods=len(data)), ) - rolling = df.rolling(window=len(df), closed="left", min_periods=1) - result = rolling.mean() + if center: + expected_data = [2 / 3, 0.5, 0.4, 0.5, 0.428571, 0.5, 0.571429, 0.5] + else: + expected_data = [np.nan, 0, 0.5, 2 / 3, 0.5, 0.4, 0.5, 0.428571] + expected = DataFrame( - [np.nan, 0, 0.5, 2 / 3, 0.5, 0.4, 0.5, 0.428571], + expected_data, columns=["binary_col"], - index=date_range(start="2020-01-01", freq="min", periods=len(data)), + index=date_range(start="2020-01-01", freq="min", periods=len(expected_data)), ) + + rolling = df.rolling(window=len(df), closed="left", min_periods=1, center=center) + result = rolling.mean() tm.assert_frame_equal(result, expected) @@ -180,7 +278,8 @@ def test_closed_one_entry(func): def test_closed_one_entry_groupby(func): # GH24718 ser = DataFrame( - data={"A": [1, 1, 2], "B": [3, 2, 1]}, index=date_range("2000", periods=3) + data={"A": [1, 1, 2], "B": [3, 2, 1]}, + index=date_range("2000", periods=3), ) result = getattr( ser.groupby("A", sort=False)["B"].rolling("10D", closed="left"), func @@ -207,7 +306,8 @@ def test_closed_one_entry_groupby(func): def test_closed_min_max_datetime(input_dtype, func, closed, expected): # see gh-21704 ser = Series( - data=np.arange(10).astype(input_dtype), index=date_range("2000", periods=10) + data=np.arange(10).astype(input_dtype), + index=date_range("2000", periods=10), ) result = getattr(ser.rolling("3D", closed=closed), func)() @@ -397,7 +497,96 @@ def test_rolling_datetime(axis_frame, tz_naive_fixture): tm.assert_frame_equal(result, expected) -def test_rolling_window_as_string(using_array_manager): +@pytest.mark.parametrize( + "center, expected_data", + [ + ( + True, + ( + [88.0] * 7 + + [97.0] * 9 + + [98.0] + + [99.0] * 21 + + [95.0] * 16 + + [93.0] * 5 + + [89.0] * 5 + + [96.0] * 21 + + [94.0] * 14 + + [90.0] * 13 + + [88.0] * 2 + + [90.0] * 9 + + [96.0] * 21 + + [95.0] * 6 + + [91.0] + + [87.0] * 6 + + [92.0] * 21 + + [83.0] * 2 + + [86.0] * 10 + + [87.0] * 5 + + [98.0] * 21 + + [97.0] * 14 + + [93.0] * 7 + + [87.0] * 4 + + [86.0] * 4 + + [95.0] * 21 + + [85.0] * 14 + + [83.0] * 2 + + [76.0] * 5 + + [81.0] * 2 + + [98.0] * 21 + + [95.0] * 14 + + [91.0] * 7 + + [86.0] + + [93.0] * 3 + + [95.0] * 29 + + [77.0] * 2 + ), + ), + ( + False, + ( + [np.nan] * 2 + + [88.0] * 16 + + [97.0] * 9 + + [98.0] + + [99.0] * 21 + + [95.0] * 16 + + [93.0] * 5 + + [89.0] * 5 + + [96.0] * 21 + + [94.0] * 14 + + [90.0] * 13 + + [88.0] * 2 + + [90.0] * 9 + + [96.0] * 21 + + [95.0] * 6 + + [91.0] + + [87.0] * 6 + + [92.0] * 21 + + [83.0] * 2 + + [86.0] * 10 + + [87.0] * 5 + + [98.0] * 21 + + [97.0] * 14 + + [93.0] * 7 + + [87.0] * 4 + + [86.0] * 4 + + [95.0] * 21 + + [85.0] * 14 + + [83.0] * 2 + + [76.0] * 5 + + [81.0] * 2 + + [98.0] * 21 + + [95.0] * 14 + + [91.0] * 7 + + [86.0] + + [93.0] * 3 + + [95.0] * 20 + ), + ), + ], +) +def test_rolling_window_as_string(center, expected_data, using_array_manager): # see gh-22590 date_today = datetime.now() days = date_range(date_today, date_today + timedelta(365), freq="D") @@ -408,53 +597,15 @@ def test_rolling_window_as_string(using_array_manager): df = DataFrame({"DateCol": days, "metric": data}) df.set_index("DateCol", inplace=True) - result = df.rolling(window="21D", min_periods=2, closed="left")["metric"].agg("max") - - expData = ( - [np.nan] * 2 - + [88.0] * 16 - + [97.0] * 9 - + [98.0] - + [99.0] * 21 - + [95.0] * 16 - + [93.0] * 5 - + [89.0] * 5 - + [96.0] * 21 - + [94.0] * 14 - + [90.0] * 13 - + [88.0] * 2 - + [90.0] * 9 - + [96.0] * 21 - + [95.0] * 6 - + [91.0] - + [87.0] * 6 - + [92.0] * 21 - + [83.0] * 2 - + [86.0] * 10 - + [87.0] * 5 - + [98.0] * 21 - + [97.0] * 14 - + [93.0] * 7 - + [87.0] * 4 - + [86.0] * 4 - + [95.0] * 21 - + [85.0] * 14 - + [83.0] * 2 - + [76.0] * 5 - + [81.0] * 2 - + [98.0] * 21 - + [95.0] * 14 - + [91.0] * 7 - + [86.0] - + [93.0] * 3 - + [95.0] * 20 - ) + result = df.rolling(window="21D", min_periods=2, closed="left", center=center)[ + "metric" + ].agg("max") index = days.rename("DateCol") if not using_array_manager: # INFO(ArrayManager) preserves the frequence of the index index = index._with_freq(None) - expected = Series(expData, index=index, name="metric") + expected = Series(expected_data, index=index, name="metric") tm.assert_series_equal(result, expected) @@ -622,8 +773,18 @@ def test_iter_rolling_on_dataframe(expected, window): 3, 1, ), - (Series([1, 2, 3]), [([1], [0]), ([1, 2], [0, 1]), ([2, 3], [1, 2])], 2, 1), - (Series([1, 2, 3]), [([1], [0]), ([1, 2], [0, 1]), ([2, 3], [1, 2])], 2, 2), + ( + Series([1, 2, 3]), + [([1], [0]), ([1, 2], [0, 1]), ([2, 3], [1, 2])], + 2, + 1, + ), + ( + Series([1, 2, 3]), + [([1], [0]), ([1, 2], [0, 1]), ([2, 3], [1, 2])], + 2, + 2, + ), (Series([1, 2, 3]), [([1], [0]), ([2], [1]), ([3], [2])], 1, 0), (Series([1, 2, 3]), [([1], [0]), ([2], [1]), ([3], [2])], 1, 1), (Series([1, 2]), [([1], [0]), ([1, 2], [0, 1])], 2, 0), @@ -795,7 +956,18 @@ def test_rolling_numerical_too_large_numbers(): ds[2] = -9e33 result = ds.rolling(5).mean() expected = Series( - [np.nan, np.nan, np.nan, np.nan, -1.8e33, -1.8e33, -1.8e33, 5.0, 6.0, 7.0], + [ + np.nan, + np.nan, + np.nan, + np.nan, + -1.8e33, + -1.8e33, + -1.8e33, + 5.0, + 6.0, + 7.0, + ], index=dates, ) tm.assert_series_equal(result, expected) @@ -811,7 +983,8 @@ def test_rolling_mixed_dtypes_axis_1(func, value): df["c"] = 1.0 result = getattr(df.rolling(window=2, min_periods=1, axis=1), func)() expected = DataFrame( - {"a": [1.0, 1.0], "b": [value, value], "c": [value, value]}, index=[1, 2] + {"a": [1.0, 1.0], "b": [value, value], "c": [value, value]}, + index=[1, 2], ) tm.assert_frame_equal(result, expected) @@ -895,7 +1068,7 @@ def test_rolling_sem(frame_or_series): result = obj.rolling(2, min_periods=1).sem() if isinstance(result, DataFrame): result = Series(result[0].values) - expected = Series([np.nan] + [0.707107] * 2) + expected = Series([np.nan] + [0.7071067811865476] * 2) tm.assert_series_equal(result, expected) @@ -1009,8 +1182,14 @@ def test_rolling_decreasing_indices(method): 318.0, ], ), - ("mean", [float("nan"), 7.5, float("nan"), 21.5, 6.0, 9.166667, 13.0, 17.5]), - ("sum", [float("nan"), 30.0, float("nan"), 86.0, 30.0, 55.0, 91.0, 140.0]), + ( + "mean", + [float("nan"), 7.5, float("nan"), 21.5, 6.0, 9.166667, 13.0, 17.5], + ), + ( + "sum", + [float("nan"), 30.0, float("nan"), 86.0, 30.0, 55.0, 91.0, 140.0], + ), ( "skew", [ @@ -1074,7 +1253,10 @@ def get_window_bounds(self, num_values, min_periods, center, closed): @pytest.mark.parametrize( ("index", "window"), - [([0, 1, 2, 3, 4], 2), (date_range("2001-01-01", freq="D", periods=5), "2D")], + [ + ([0, 1, 2, 3, 4], 2), + (date_range("2001-01-01", freq="D", periods=5), "2D"), + ], ) def test_rolling_corr_timedelta_index(index, window): # GH: 31286 diff --git a/pandas/tests/window/test_timeseries_window.py b/pandas/tests/window/test_timeseries_window.py index 0782ef2f4ce7b..7cd319480083b 100644 --- a/pandas/tests/window/test_timeseries_window.py +++ b/pandas/tests/window/test_timeseries_window.py @@ -83,12 +83,6 @@ def test_invalid_minp(self, minp): with pytest.raises(ValueError, match=msg): self.regular.rolling(window="1D", min_periods=minp) - def test_invalid_center_datetimelike(self): - # center is not implemented - msg = "center is not implemented for datetimelike and offset based windows" - with pytest.raises(NotImplementedError, match=msg): - self.regular.rolling(window="1D", center=True) - def test_on(self): df = self.regular