diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt
index c41da4d67afe5..df7a4cb46b0ec 100644
--- a/doc/source/whatsnew/v0.22.0.txt
+++ b/doc/source/whatsnew/v0.22.0.txt
@@ -89,6 +89,7 @@ Bug Fixes
 
 - Bug in ``pd.read_msgpack()`` with a non existent file is passed in Python 2 (:issue:`15296`)
 - Bug in ``DataFrame.groupby`` where key as tuple in a ``MultiIndex`` were interpreted as a list of keys (:issue:`17979`)
+- Bug in :func:`read_csv` where a ``MultiIndex`` with duplicate columns was not being mangled appropriately (:issue:`18062`)
 
 Conversion
 ^^^^^^^^^^
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
index 1b6414ea974fa..79422ddcaf609 100755
--- a/pandas/io/parsers.py
+++ b/pandas/io/parsers.py
@@ -1106,6 +1106,24 @@ def _is_index_col(col):
     return col is not None and col is not False
 
 
+def _is_potential_multi_index(columns):
+    """
+    Check whether or not the `columns` parameter
+    could be converted into a MultiIndex.
+
+    Parameters
+    ----------
+    columns : array-like
+        Object which may or may not be convertible into a MultiIndex
+
+    Returns
+    -------
+    boolean : Whether or not columns could become a MultiIndex
+    """
+    return bool(len(columns) and not isinstance(columns, MultiIndex) and
+                all(isinstance(c, tuple) for c in columns))
+
+
 def _evaluate_usecols(usecols, names):
     """
     Check whether or not the 'usecols' parameter
@@ -1374,6 +1392,7 @@ def _maybe_dedup_names(self, names):
         if self.mangle_dupe_cols:
             names = list(names)  # so we can index
             counts = defaultdict(int)
+            is_potential_mi = _is_potential_multi_index(names)
 
             for i, col in enumerate(names):
                 cur_count = counts[col]
@@ -1381,7 +1400,11 @@ def _maybe_dedup_names(self, names):
 
                 while cur_count > 0:
                     counts[col] = cur_count + 1
-                    col = '%s.%d' % (col, cur_count)
+                    if is_potential_mi:
+                        # mangle only the last level so col stays a tuple
+                        col = col[:-1] + ('%s.%d' % (col[-1], cur_count),)
+                    else:
+                        col = '%s.%d' % (col, cur_count)
                     cur_count = counts[col]
 
                 names[i] = col
@@ -1391,9 +1414,7 @@ def _maybe_dedup_names(self, names):
 
     def _maybe_make_multi_index_columns(self, columns, col_names=None):
         # possibly create a column mi here
-        if (not self.tupleize_cols and len(columns) and
-                not isinstance(columns, MultiIndex) and
-                all([isinstance(c, tuple) for c in columns])):
+        if not self.tupleize_cols and _is_potential_multi_index(columns):
             columns = MultiIndex.from_tuples(columns, names=col_names)
         return columns
 
diff --git a/pandas/tests/io/parser/header.py b/pandas/tests/io/parser/header.py
index ff3beb70b774f..58dae112c59b7 100644
--- a/pandas/tests/io/parser/header.py
+++ b/pandas/tests/io/parser/header.py
@@ -290,3 +290,30 @@ def test_singleton_header(self):
         df = self.read_csv(StringIO(data), header=[0])
         expected = DataFrame({"a": [0, 1], "b": [1, 2], "c": [2, 3]})
         tm.assert_frame_equal(df, expected)
+
+    def test_mangles_multi_index(self):
+        # See GH 18062
+        data = """A,A,A,B\none,one,one,two\n0,40,34,0.1"""
+        df = self.read_csv(StringIO(data), header=[0, 1])
+        expected = DataFrame([[0, 40, 34, 0.1]],
+                             columns=MultiIndex.from_tuples(
+                                 [('A', 'one'), ('A', 'one.1'),
+                                  ('A', 'one.2'), ('B', 'two')]))
+        tm.assert_frame_equal(df, expected)
+
+        data = """A,A,A,B\none,one,one.1,two\n0,40,34,0.1"""
+        df = self.read_csv(StringIO(data), header=[0, 1])
+        expected = DataFrame([[0, 40, 34, 0.1]],
+                             columns=MultiIndex.from_tuples(
+                                 [('A', 'one'), ('A', 'one.1'),
+                                  ('A', 'one.1.1'), ('B', 'two')]))
+        tm.assert_frame_equal(df, expected)
+
+        data = """A,A,A,B,B\none,one,one.1,two,two\n0,40,34,0.1,0.1"""
+        df = self.read_csv(StringIO(data), header=[0, 1])
+        expected = DataFrame([[0, 40, 34, 0.1, 0.1]],
+                             columns=MultiIndex.from_tuples(
+                                 [('A', 'one'), ('A', 'one.1'),
+                                  ('A', 'one.1.1'), ('B', 'two'),
+                                  ('B', 'two.1')]))
+        tm.assert_frame_equal(df, expected)