diff --git a/doc/source/whatsnew/v0.18.1.txt b/doc/source/whatsnew/v0.18.1.txt index 70a1ad4a335ea..5aa1d30979cfb 100644 --- a/doc/source/whatsnew/v0.18.1.txt +++ b/doc/source/whatsnew/v0.18.1.txt @@ -43,3 +43,5 @@ Performance Improvements Bug Fixes ~~~~~~~~~ + +- Bug in ``value_counts`` when ``normalize=True`` and ``dropna=True`` where nulls still contributed to the normalized count (:issue:`12558`) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index ad1efa21e8280..de38c0c3940fd 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -342,7 +342,7 @@ def value_counts(values, sort=True, ascending=False, normalize=False, result = result.sort_values(ascending=ascending) if normalize: - result = result / float(values.size) + result = result / float(counts.sum()) return result diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index c8598639d9fad..f3fe5a5a2d5d8 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -11,7 +11,7 @@ callable, map ) from pandas import compat - +from pandas.compat.numpy_compat import _np_version_under1p8 from pandas.core.base import (PandasObject, SelectionMixin, GroupByError, DataError, SpecificationError) from pandas.core.categorical import Categorical @@ -2949,8 +2949,18 @@ def value_counts(self, normalize=False, sort=True, ascending=False, if normalize: out = out.astype('float') - acc = rep(np.diff(np.r_[idx, len(ids)])) - out /= acc[mask] if dropna else acc + d = np.diff(np.r_[idx, len(ids)]) + if dropna: + m = ids[lab == -1] + if _np_version_under1p8: + mi, ml = algos.factorize(m) + d[ml] = d[ml] - np.bincount(mi) + else: + np.add.at(d, m, -1) + acc = rep(d)[mask] + else: + acc = rep(d) + out /= acc if sort and bins is None: cat = ids[inc][mask] if dropna else ids[inc] diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index b9a4384603cda..5c83cdb1493dc 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -517,6 +517,22 @@ def test_dropna(self): pd.Series([10.3, 5., 5., None]).value_counts(dropna=False), pd.Series([2, 1, 1], index=[5., 10.3, np.nan])) + def test_value_counts_normalized(self): + # GH12558 + s = Series([1, 2, np.nan, np.nan, np.nan]) + dtypes = (np.float64, np.object, 'M8[ns]') + for t in dtypes: + s_typed = s.astype(t) + result = s_typed.value_counts(normalize=True, dropna=False) + expected = Series([0.6, 0.2, 0.2], + index=Series([np.nan, 2.0, 1.0], dtype=t)) + tm.assert_series_equal(result, expected) + + result = s_typed.value_counts(normalize=True, dropna=True) + expected = Series([0.5, 0.5], + index=Series([2.0, 1.0], dtype=t)) + tm.assert_series_equal(result, expected) + class GroupVarTestMixin(object):