From 4b35ff06b14ec197dc6c40082188b41961a713f3 Mon Sep 17 00:00:00 2001 From: TomAugspurger Date: Fri, 28 Mar 2014 15:45:35 -0500 Subject: [PATCH] API/BUG Raise ValueError when stacking nonunique levels Should raise a ValueError when (un)stacking a DataFrame on a nonunique level. Previous behavior was to raise a KeyError (not deliberately). Closes #6729. --- doc/source/release.rst | 2 ++ doc/source/v0.14.0.txt | 3 ++- pandas/core/index.py | 7 +++++++ pandas/core/reshape.py | 14 +++++++++++++- pandas/tests/test_frame.py | 10 ++++++++++ pandas/tests/test_index.py | 7 +++++++ 6 files changed, 41 insertions(+), 2 deletions(-) diff --git a/doc/source/release.rst b/doc/source/release.rst index 13aef1a5d8fdb..21c30d68a29d9 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -149,6 +149,8 @@ API Changes - Define and document the order of column vs index names in query/eval (:issue:`6676`) - ``DataFrame.sort`` now places NaNs at the beginning or end of the sort according to the ``na_position`` parameter. (:issue:`3917`) +- ``stack`` and ``unstack`` now raise a ``ValueError`` when the ``level`` keyword refers + to a non-unique item in the ``Index`` (previously raised a ``KeyError``). (:issue:`6738`) - all offset operations now return ``Timestamp`` types (rather than datetime), Business/Week frequencies were incorrect (:issue:`4069`) - ``Series.iteritems()`` is now lazy (returns an iterator rather than a list). This was the documented behavior prior to 0.14. (:issue:`6760`) diff --git a/doc/source/v0.14.0.txt b/doc/source/v0.14.0.txt index b0ea3cb770a64..ad538b3c01dae 100644 --- a/doc/source/v0.14.0.txt +++ b/doc/source/v0.14.0.txt @@ -206,7 +206,8 @@ API changes - ``Panel.shift`` now uses ``NDFrame.shift``. It no longer drops the ``nan`` data and retains its original shape. (:issue:`4867`) - Added ``nunique`` and ``value_counts`` functions to ``Index`` for counting unique elements. 
(:issue:`6734`) - +- ``stack`` and ``unstack`` now raise a ``ValueError`` when the ``level`` keyword refers + to a non-unique item in the ``Index`` (previously raised a ``KeyError``). .. _whatsnew_0140.sql: diff --git a/pandas/core/index.py b/pandas/core/index.py index 7edd2c6646535..72465040077b2 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -2317,6 +2317,13 @@ def _set_names(self, values, validate=True): names = property( fset=_set_names, fget=_get_names, doc="Names of levels in MultiIndex") + def _reference_duplicate_name(self, name): + """ + Returns True if the name referred to in self.names is duplicated. + """ + # count the times name equals an element in self.names. + return np.sum(name == np.asarray(self.names)) > 1 + def _format_native_types(self, **kwargs): return self.tolist() diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py index 0d06e9253ce1f..7dc266617c5fd 100644 --- a/pandas/core/reshape.py +++ b/pandas/core/reshape.py @@ -1,6 +1,5 @@ # pylint: disable=E1101,E1103 # pylint: disable=W0703,W0622,W0613,W0201 - from pandas.compat import range, zip from pandas import compat import itertools @@ -69,6 +68,13 @@ def __init__(self, values, index, level=-1, value_columns=None): raise ValueError('must pass column labels for multi-column data') self.index = index + + if isinstance(self.index, MultiIndex): + if index._reference_duplicate_name(level): + msg = ("Ambiguous reference to {0}. The index " + "names are not unique.".format(level)) + raise ValueError(msg) + self.level = self.index._get_level_number(level) levels = index.levels @@ -497,6 +503,12 @@ def stack(frame, level=-1, dropna=True): stacked : Series """ N, K = frame.shape + if isinstance(frame.columns, MultiIndex): + if frame.columns._reference_duplicate_name(level): + msg = ("Ambiguous reference to {0}. 
The column " + "names are not unique.".format(level)) + raise ValueError(msg) + if isinstance(level, int) and level < 0: level += frame.columns.nlevels diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index aa8350dfdfe78..5c8ae497f9117 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -11387,6 +11387,16 @@ def test_unstack_dtypes(self): expected = Series({'float64' : 2, 'object' : 2}) assert_series_equal(result, expected) + def test_unstack_non_unique_index_names(self): + idx = MultiIndex.from_tuples([('a', 'b'), ('c', 'd')], + names=['c1', 'c1']) + df = DataFrame([1, 2], index=idx) + with tm.assertRaises(ValueError): + df.unstack('c1') + + with tm.assertRaises(ValueError): + df.T.stack('c1') + def test_reset_index(self): stacked = self.frame.stack()[::2] stacked = DataFrame({'foo': stacked, 'bar': stacked}) diff --git a/pandas/tests/test_index.py b/pandas/tests/test_index.py index c6c405306afb8..74ca5d0fe9276 100644 --- a/pandas/tests/test_index.py +++ b/pandas/tests/test_index.py @@ -1534,6 +1534,13 @@ def test_names(self): level_names = [level.name for level in index.levels] self.assertEqual(ind_names, level_names) + def test_reference_duplicate_name(self): + idx = MultiIndex.from_tuples([('a', 'b'), ('c', 'd')], names=['x', 'x']) + self.assertTrue(idx._reference_duplicate_name('x')) + + idx = MultiIndex.from_tuples([('a', 'b'), ('c', 'd')], names=['x', 'y']) + self.assertFalse(idx._reference_duplicate_name('x')) + def test_astype(self): expected = self.index.copy() actual = self.index.astype('O')