diff --git a/doc/source/whatsnew/v0.18.1.txt b/doc/source/whatsnew/v0.18.1.txt index 70a1ad4a335ea..24bd6101e96aa 100644 --- a/doc/source/whatsnew/v0.18.1.txt +++ b/doc/source/whatsnew/v0.18.1.txt @@ -43,3 +43,4 @@ Performance Improvements Bug Fixes ~~~~~~~~~ +- Bug in ``value_counts`` where ``normalize=True`` divided by the total number of observations, including missing values, even when ``dropna=True`` (:issue:`12558`) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index ad1efa21e8280..6661017ba1b4a 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -342,7 +342,14 @@ def value_counts(values, sort=True, ascending=False, normalize=False, result = result.sort_values(ascending=ascending) if normalize: - result = result / float(values.size) + if dropna: + # NaT's dropped above if time, so don't need to do again. + if not com.is_datetime_or_timedelta_dtype(dtype) \ + and not is_period and not is_datetimetz: + + result = result / float(Series(values).count()) + else: + result = result / float(values.size) return result diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index b9a4384603cda..20d8a3b088611 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -517,6 +517,33 @@ def test_dropna(self): pd.Series([10.3, 5., 5., None]).value_counts(dropna=False), pd.Series([2, 1, 1], index=[5., 10.3, np.nan])) + def test_normalize(self): + # Issue 12558 + tm.assert_series_equal( + pd.Series([5., 10.3, 10.3, 10.3, np.nan]).value_counts( + dropna=True, normalize=True), + pd.Series([0.75, 0.25], index=[10.3, 5.])) + + tm.assert_series_equal( + pd.Series([5., 10.3, 10.3, 10.3, np.nan]).value_counts( + dropna=True, normalize=False), + pd.Series([3, 1], index=[10.3, 5.])) + + tm.assert_series_equal( + pd.Series([5., 10.3, 10.3, 10.3, np.nan]).value_counts( + dropna=False, normalize=True), + pd.Series([0.6, 0.2, 0.2], index=[10.3, 5., np.nan])) + + tm.assert_series_equal( + pd.Series([5., 10.3, 10.3, 10.3, np.nan]).value_counts( + 
dropna=False, normalize=False), + pd.Series([3, 1, 1], index=[10.3, 5., np.nan])) + + s = pd.Series(['2015-01-03T00:00:00.000000000+0000', pd.NaT]) + tm.assert_series_equal( + s.value_counts(dropna=True, normalize=True), + pd.Series([1], index=[s.loc[0]])) + class GroupVarTestMixin(object):