From c1c47413e0033d1f4c6ca26c4e68f1ebf57a8294 Mon Sep 17 00:00:00 2001 From: araraonline Date: Mon, 1 Oct 2018 15:27:43 -0300 Subject: [PATCH 1/4] Regression tests for sort=None One thing not tested is the behavior when it is impossible to sort (different dtypes being compared). In summary, it will warn, but only when some kind of monotonicity condition occurs. The behavior for duplicates is not explicitly tested, but it behaves almost the same as for when there are no duplicates. The new code should fix it better. To consider later, test whole dataframes instead of just indexes... Might catch some unexpected errors. --- pandas/tests/test_regression.py | 137 ++++++++++++++++++++++++++++++++ 1 file changed, 137 insertions(+) create mode 100644 pandas/tests/test_regression.py diff --git a/pandas/tests/test_regression.py b/pandas/tests/test_regression.py new file mode 100644 index 0000000000000..f6afa2db1e3c4 --- /dev/null +++ b/pandas/tests/test_regression.py @@ -0,0 +1,137 @@ +"""Regression tests for DataFrame.append(other, sort=None), the default value +""" + +import numpy as np +import pytest + +from pandas import DataFrame, Index, Series +from pandas.testing import assert_frame_equal + + +class TestAppendSortNone(object): + + def generate_frames(self, compare, special): + if compare == 'lt': + if special: + df1 = DataFrame([[11, 12]], columns=[2, 1]) + df2 = DataFrame([[13, 14, 15]], columns=[3, 2, 1]) + else: + df1 = DataFrame([[11, 12]], columns=list('ba')) + df2 = DataFrame([[13, 14, 15]], columns=list('cba')) + elif compare == 'eq': + if special: + df1 = DataFrame([[11, 12, 13]], columns=[3, 2, 1]) + df2 = DataFrame([[14, 15, 16]], columns=[3, 2, 1]) + else: + df1 = DataFrame([[11, 12, 13]], columns=list('cba')) + df2 = DataFrame([[14, 15, 16]], columns=list('cba')) + elif compare == 'gt': + if special: + df1 = DataFrame([[11, 12, 13]], columns=[3, 2, 1]) + df2 = DataFrame([[14, 15]], columns=[2, 1]) + else: + df1 = DataFrame([[11, 12, 13]], 
columns=list('cba')) + df2 = DataFrame([[14, 15]], columns=list('ba')) + + # avoid upcasting problems + df1 = df1.astype('float64') + df2 = df2.astype('float64') + + return df1, df2 + + def merge_indexes(self, idx1, idx2, sort): + len1 = idx1.size + len2 = idx2.size + + if len1 < len2: + # match 'lt' in self.generate_frames + vals1 = idx1.tolist() + vals2 = [idx2.tolist()[0]] + result = Index(vals1 + vals2) + else: + result = idx1.copy() + + return result.sort_values() if sort else result + + def merge_frames(self, df1, df2, sort): + new_index = self.merge_indexes(df1.columns, df2.columns, sort) + df1 = df1.reindex(new_index, axis=1) + df2 = df2.reindex(new_index, axis=1) + + values = np.vstack([df1.values[0, :], df2.values[0, :]]) + result = DataFrame(values, columns=new_index) + return result + + @pytest.mark.parametrize('input_type', ['series', 'dict']) + @pytest.mark.parametrize('special', [True, False]) + @pytest.mark.parametrize('compare', ['lt', 'eq', 'gt']) + def test_append_series_dict(self, compare, special, input_type): + # When appending a Series or dict, the resulting columns come unsorted + # and no warning is raised. + + sorts = False + warns = False + + df1, df2 = self.generate_frames(compare, special) + if input_type == 'series': + other = df2.loc[0] + else: + other = df2.loc[0].to_dict() + + ctx = pytest.warns(FutureWarning) if warns else pytest.warns(None) + expected = self.merge_frames(df1, df2, sorts) + with ctx: + result = df1.append(other, ignore_index=True, sort=None) + assert_frame_equal(result, expected) + + @pytest.mark.parametrize('input_type', ['[series]', '[dict]']) + @pytest.mark.parametrize('special', [True, False]) + @pytest.mark.parametrize('compare', ['lt', 'eq', 'gt']) + def test_append_list_of_series_dict(self, compare, special, input_type): + # When appending a list of Series or list of dicts, the behavior is + # as specified below. 
+ + if compare in ('gt', 'eq'): + sorts = False + warns = False + else: + sorts = True + warns = not special + + df1, df2 = self.generate_frames(compare, special) + if input_type == '[series]': + other = [df2.loc[0]] + else: + other = [df2.loc[0].to_dict()] + + ctx = pytest.warns(FutureWarning) if warns else pytest.warns(None) + expected = self.merge_frames(df1, df2, sorts) + with ctx: + result = df1.append(other, ignore_index=True, sort=None) + assert_frame_equal(result, expected) + + @pytest.mark.parametrize('input_type', ['dataframe', '[dataframe]']) + @pytest.mark.parametrize('special', [True, False]) + @pytest.mark.parametrize('compare', ['lt', 'eq', 'gt']) + def test_append_dframe_list_of_dframe(self, compare, special, input_type): + # When appenindg a DataFrame of list of DataFrames, the behavior is as + # specified below. + + if compare == 'eq': + sorts = False + warns = False + else: + sorts = True + warns = not special + + df1, df2 = self.generate_frames(compare, special) + if input_type == 'dataframe': + other = df2 + else: + other = [df2] + + ctx = pytest.warns(FutureWarning) if warns else pytest.warns(None) + expected = self.merge_frames(df1, df2, sorts) + with ctx: + result = df1.append(other, ignore_index=True, sort=None) + assert_frame_equal(result, expected) From 16a47c0767a554258eb55fa8cd1d13ac54bbeb7b Mon Sep 17 00:00:00 2001 From: araraonline Date: Mon, 1 Oct 2018 16:33:08 -0300 Subject: [PATCH 2/4] Behavior for sort=None --- pandas/core/frame.py | 39 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 38 insertions(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 0e58a5d1b3591..4db4239ea1b54 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6423,7 +6423,44 @@ def _append_list_of_frames(self, other, *args, **kwargs): from pandas.core.indexes.api import _normalize_dataframes from pandas.core.reshape.concat import concat - # TODO: sorting behavior when sort=None + # sorting behavior when sort=None 
+ # TODO: remove when kwarg value changes + if sort is None: + # establish desired behavior + if _obj_type in (dict, Series): + # dict/ser + + sort = False + warn = False + elif _item_type in (dict, Series): + # [dict]/[ser] + + if (self.columns.get_indexer(other[0].columns) >= 0).all(): + # self.columns >= other[0].columns + sort = False + warn = False + else: + sort = True + types = [df.columns.dtype for df in [self] + other] + common = find_common_type(types) + warn = (common == object) + else: + # frame/[frame] + + if all(self.columns.equals(df.columns) for df in other): + # all values the same + sort = False + warn = False + else: + sort = True + types = [df.columns.dtype for df in [self] + other] + common = find_common_type(types) + warn = (common == object) + + # warn if necessary + if warn: + from pandas.core.indexes.api import _sort_msg + warnings.warn(_sort_msg, FutureWarning) # The behavior of concat is a bit problematic as it is. To get around, # we prepare the DataFrames before feeding them into concat. 
From 39bd8adcca451f77642338a879001279784fb58f Mon Sep 17 00:00:00 2001 From: araraonline Date: Mon, 1 Oct 2018 18:25:09 -0300 Subject: [PATCH 3/4] Add tests for duplicate columns --- pandas/tests/test_regression.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/pandas/tests/test_regression.py b/pandas/tests/test_regression.py index f6afa2db1e3c4..6cce187fe756f 100644 --- a/pandas/tests/test_regression.py +++ b/pandas/tests/test_regression.py @@ -32,6 +32,15 @@ def generate_frames(self, compare, special): else: df1 = DataFrame([[11, 12, 13]], columns=list('cba')) df2 = DataFrame([[14, 15]], columns=list('ba')) + elif compare == 'dups': + # special category for duplicates + # assumes compare = 'eq' + if special: + df1 = DataFrame([[11, 12, 13]], columns=[3, 3, 1]) + df2 = DataFrame([[14, 15, 16]], columns=[3, 3, 1]) + else: + df1 = DataFrame([[11, 12, 13]], columns=list('cca')) + df2 = DataFrame([[14, 15, 16]], columns=list('cca')) # avoid upcasting problems df1 = df1.astype('float64') @@ -64,7 +73,7 @@ def merge_frames(self, df1, df2, sort): @pytest.mark.parametrize('input_type', ['series', 'dict']) @pytest.mark.parametrize('special', [True, False]) - @pytest.mark.parametrize('compare', ['lt', 'eq', 'gt']) + @pytest.mark.parametrize('compare', ['lt', 'eq', 'gt', 'dups']) def test_append_series_dict(self, compare, special, input_type): # When appending a Series or dict, the resulting columns come unsorted # and no warning is raised. 
@@ -77,6 +86,8 @@ def test_append_series_dict(self, compare, special, input_type): other = df2.loc[0] else: other = df2.loc[0].to_dict() + if compare == 'dups': + return ctx = pytest.warns(FutureWarning) if warns else pytest.warns(None) expected = self.merge_frames(df1, df2, sorts) @@ -86,7 +97,7 @@ def test_append_series_dict(self, compare, special, input_type): @pytest.mark.parametrize('input_type', ['[series]', '[dict]']) @pytest.mark.parametrize('special', [True, False]) - @pytest.mark.parametrize('compare', ['lt', 'eq', 'gt']) + @pytest.mark.parametrize('compare', ['lt', 'eq', 'gt']) # dups won't work def test_append_list_of_series_dict(self, compare, special, input_type): # When appending a list of Series or list of dicts, the behavior is # as specified below. @@ -112,12 +123,12 @@ def test_append_list_of_series_dict(self, compare, special, input_type): @pytest.mark.parametrize('input_type', ['dataframe', '[dataframe]']) @pytest.mark.parametrize('special', [True, False]) - @pytest.mark.parametrize('compare', ['lt', 'eq', 'gt']) + @pytest.mark.parametrize('compare', ['lt', 'eq', 'gt', 'dups']) def test_append_dframe_list_of_dframe(self, compare, special, input_type): # When appenindg a DataFrame of list of DataFrames, the behavior is as # specified below. 
- if compare == 'eq': + if compare in ('dups', 'eq'): sorts = False warns = False else: From 2a054b59ac45cf1b19a481197e960a8b1ae111d5 Mon Sep 17 00:00:00 2001 From: araraonline Date: Mon, 1 Oct 2018 18:30:42 -0300 Subject: [PATCH 4/4] Move regression tests to test_append.py --- pandas/tests/reshape/test_append.py | 143 +++++++++++++++++++++++++++ pandas/tests/test_regression.py | 148 ---------------------------- 2 files changed, 143 insertions(+), 148 deletions(-) delete mode 100644 pandas/tests/test_regression.py diff --git a/pandas/tests/reshape/test_append.py b/pandas/tests/reshape/test_append.py index 17e02fdffcc72..d2e07ec8d92ff 100644 --- a/pandas/tests/reshape/test_append.py +++ b/pandas/tests/reshape/test_append.py @@ -4,6 +4,7 @@ import pytest import pandas as pd +from pandas import DataFrame, Index, Series from pandas.core.indexes.base import InvalidIndexError from pandas.util.testing import assert_frame_equal @@ -328,6 +329,148 @@ def test_no_unecessary_upcast(self, sort): assert_frame_equal(result, expected) +class TestAppendSortNone(object): + """Regression tests to preserve the behavior of sort=None + """ + + def generate_frames(self, compare, special): + if compare == 'lt': + if special: + df1 = DataFrame([[11, 12]], columns=[2, 1]) + df2 = DataFrame([[13, 14, 15]], columns=[3, 2, 1]) + else: + df1 = DataFrame([[11, 12]], columns=list('ba')) + df2 = DataFrame([[13, 14, 15]], columns=list('cba')) + elif compare == 'eq': + if special: + df1 = DataFrame([[11, 12, 13]], columns=[3, 2, 1]) + df2 = DataFrame([[14, 15, 16]], columns=[3, 2, 1]) + else: + df1 = DataFrame([[11, 12, 13]], columns=list('cba')) + df2 = DataFrame([[14, 15, 16]], columns=list('cba')) + elif compare == 'gt': + if special: + df1 = DataFrame([[11, 12, 13]], columns=[3, 2, 1]) + df2 = DataFrame([[14, 15]], columns=[2, 1]) + else: + df1 = DataFrame([[11, 12, 13]], columns=list('cba')) + df2 = DataFrame([[14, 15]], columns=list('ba')) + elif compare == 'dups': + # special category for 
duplicates + # assumes compare = 'eq' + if special: + df1 = DataFrame([[11, 12, 13]], columns=[3, 3, 1]) + df2 = DataFrame([[14, 15, 16]], columns=[3, 3, 1]) + else: + df1 = DataFrame([[11, 12, 13]], columns=list('cca')) + df2 = DataFrame([[14, 15, 16]], columns=list('cca')) + + # avoid upcasting problems + df1 = df1.astype('float64') + df2 = df2.astype('float64') + + return df1, df2 + + def merge_indexes(self, idx1, idx2, sort): + len1 = idx1.size + len2 = idx2.size + + if len1 < len2: + # match 'lt' in self.generate_frames + vals1 = idx1.tolist() + vals2 = [idx2.tolist()[0]] + result = Index(vals1 + vals2) + else: + result = idx1.copy() + + return result.sort_values() if sort else result + + def merge_frames(self, df1, df2, sort): + new_index = self.merge_indexes(df1.columns, df2.columns, sort) + df1 = df1.reindex(new_index, axis=1) + df2 = df2.reindex(new_index, axis=1) + + values = np.vstack([df1.values[0, :], df2.values[0, :]]) + result = DataFrame(values, columns=new_index) + return result + + @pytest.mark.parametrize('input_type', ['series', 'dict']) + @pytest.mark.parametrize('special', [True, False]) + @pytest.mark.parametrize('compare', ['lt', 'eq', 'gt', 'dups']) + def test_append_series_dict(self, compare, special, input_type): + # When appending a Series or dict, the resulting columns come unsorted + # and no warning is raised. 
+ + sorts = False + warns = False + + df1, df2 = self.generate_frames(compare, special) + if input_type == 'series': + other = df2.loc[0] + else: + other = df2.loc[0].to_dict() + if compare == 'dups': + return + + ctx = pytest.warns(FutureWarning) if warns else pytest.warns(None) + expected = self.merge_frames(df1, df2, sorts) + with ctx: + result = df1.append(other, ignore_index=True, sort=None) + assert_frame_equal(result, expected) + + @pytest.mark.parametrize('input_type', ['[series]', '[dict]']) + @pytest.mark.parametrize('special', [True, False]) + @pytest.mark.parametrize('compare', ['lt', 'eq', 'gt']) # dups won't work + def test_append_list_of_series_dict(self, compare, special, input_type): + # When appending a list of Series or list of dicts, the behavior is + # as specified below. + + if compare in ('gt', 'eq'): + sorts = False + warns = False + else: + sorts = True + warns = not special + + df1, df2 = self.generate_frames(compare, special) + if input_type == '[series]': + other = [df2.loc[0]] + else: + other = [df2.loc[0].to_dict()] + + ctx = pytest.warns(FutureWarning) if warns else pytest.warns(None) + expected = self.merge_frames(df1, df2, sorts) + with ctx: + result = df1.append(other, ignore_index=True, sort=None) + assert_frame_equal(result, expected) + + @pytest.mark.parametrize('input_type', ['dataframe', '[dataframe]']) + @pytest.mark.parametrize('special', [True, False]) + @pytest.mark.parametrize('compare', ['lt', 'eq', 'gt', 'dups']) + def test_append_dframe_list_of_dframe(self, compare, special, input_type): + # When appenindg a DataFrame of list of DataFrames, the behavior is as + # specified below. 
+ + if compare in ('dups', 'eq'): + sorts = False + warns = False + else: + sorts = True + warns = not special + + df1, df2 = self.generate_frames(compare, special) + if input_type == 'dataframe': + other = df2 + else: + other = [df2] + + ctx = pytest.warns(FutureWarning) if warns else pytest.warns(None) + expected = self.merge_frames(df1, df2, sorts) + with ctx: + result = df1.append(other, ignore_index=True, sort=None) + assert_frame_equal(result, expected) + + class TestAppendColumnsIndex(object): @pytest.mark.parametrize('idx_name3', [None, 'foo', 'bar', 'baz']) @pytest.mark.parametrize('idx_name2', [None, 'foo', 'bar', 'baz']) diff --git a/pandas/tests/test_regression.py b/pandas/tests/test_regression.py deleted file mode 100644 index 6cce187fe756f..0000000000000 --- a/pandas/tests/test_regression.py +++ /dev/null @@ -1,148 +0,0 @@ -"""Regression tests for DataFrame.append(other, sort=None), the default value -""" - -import numpy as np -import pytest - -from pandas import DataFrame, Index, Series -from pandas.testing import assert_frame_equal - - -class TestAppendSortNone(object): - - def generate_frames(self, compare, special): - if compare == 'lt': - if special: - df1 = DataFrame([[11, 12]], columns=[2, 1]) - df2 = DataFrame([[13, 14, 15]], columns=[3, 2, 1]) - else: - df1 = DataFrame([[11, 12]], columns=list('ba')) - df2 = DataFrame([[13, 14, 15]], columns=list('cba')) - elif compare == 'eq': - if special: - df1 = DataFrame([[11, 12, 13]], columns=[3, 2, 1]) - df2 = DataFrame([[14, 15, 16]], columns=[3, 2, 1]) - else: - df1 = DataFrame([[11, 12, 13]], columns=list('cba')) - df2 = DataFrame([[14, 15, 16]], columns=list('cba')) - elif compare == 'gt': - if special: - df1 = DataFrame([[11, 12, 13]], columns=[3, 2, 1]) - df2 = DataFrame([[14, 15]], columns=[2, 1]) - else: - df1 = DataFrame([[11, 12, 13]], columns=list('cba')) - df2 = DataFrame([[14, 15]], columns=list('ba')) - elif compare == 'dups': - # special category for duplicates - # assumes compare = 
'eq' - if special: - df1 = DataFrame([[11, 12, 13]], columns=[3, 3, 1]) - df2 = DataFrame([[14, 15, 16]], columns=[3, 3, 1]) - else: - df1 = DataFrame([[11, 12, 13]], columns=list('cca')) - df2 = DataFrame([[14, 15, 16]], columns=list('cca')) - - # avoid upcasting problems - df1 = df1.astype('float64') - df2 = df2.astype('float64') - - return df1, df2 - - def merge_indexes(self, idx1, idx2, sort): - len1 = idx1.size - len2 = idx2.size - - if len1 < len2: - # match 'lt' in self.generate_frames - vals1 = idx1.tolist() - vals2 = [idx2.tolist()[0]] - result = Index(vals1 + vals2) - else: - result = idx1.copy() - - return result.sort_values() if sort else result - - def merge_frames(self, df1, df2, sort): - new_index = self.merge_indexes(df1.columns, df2.columns, sort) - df1 = df1.reindex(new_index, axis=1) - df2 = df2.reindex(new_index, axis=1) - - values = np.vstack([df1.values[0, :], df2.values[0, :]]) - result = DataFrame(values, columns=new_index) - return result - - @pytest.mark.parametrize('input_type', ['series', 'dict']) - @pytest.mark.parametrize('special', [True, False]) - @pytest.mark.parametrize('compare', ['lt', 'eq', 'gt', 'dups']) - def test_append_series_dict(self, compare, special, input_type): - # When appending a Series or dict, the resulting columns come unsorted - # and no warning is raised. 
- - sorts = False - warns = False - - df1, df2 = self.generate_frames(compare, special) - if input_type == 'series': - other = df2.loc[0] - else: - other = df2.loc[0].to_dict() - if compare == 'dups': - return - - ctx = pytest.warns(FutureWarning) if warns else pytest.warns(None) - expected = self.merge_frames(df1, df2, sorts) - with ctx: - result = df1.append(other, ignore_index=True, sort=None) - assert_frame_equal(result, expected) - - @pytest.mark.parametrize('input_type', ['[series]', '[dict]']) - @pytest.mark.parametrize('special', [True, False]) - @pytest.mark.parametrize('compare', ['lt', 'eq', 'gt']) # dups won't work - def test_append_list_of_series_dict(self, compare, special, input_type): - # When appending a list of Series or list of dicts, the behavior is - # as specified below. - - if compare in ('gt', 'eq'): - sorts = False - warns = False - else: - sorts = True - warns = not special - - df1, df2 = self.generate_frames(compare, special) - if input_type == '[series]': - other = [df2.loc[0]] - else: - other = [df2.loc[0].to_dict()] - - ctx = pytest.warns(FutureWarning) if warns else pytest.warns(None) - expected = self.merge_frames(df1, df2, sorts) - with ctx: - result = df1.append(other, ignore_index=True, sort=None) - assert_frame_equal(result, expected) - - @pytest.mark.parametrize('input_type', ['dataframe', '[dataframe]']) - @pytest.mark.parametrize('special', [True, False]) - @pytest.mark.parametrize('compare', ['lt', 'eq', 'gt', 'dups']) - def test_append_dframe_list_of_dframe(self, compare, special, input_type): - # When appenindg a DataFrame of list of DataFrames, the behavior is as - # specified below. 
- - if compare in ('dups', 'eq'): - sorts = False - warns = False - else: - sorts = True - warns = not special - - df1, df2 = self.generate_frames(compare, special) - if input_type == 'dataframe': - other = df2 - else: - other = [df2] - - ctx = pytest.warns(FutureWarning) if warns else pytest.warns(None) - expected = self.merge_frames(df1, df2, sorts) - with ctx: - result = df1.append(other, ignore_index=True, sort=None) - assert_frame_equal(result, expected)