From c338620fdce13ed59776192096e57bf78d65a32b Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Sun, 21 Sep 2025 17:28:12 -0700 Subject: [PATCH 1/3] Reduce extensionarray data length from 100 to 10 --- .../tests/arrays/boolean/test_arithmetic.py | 2 +- .../tests/arrays/boolean/test_comparison.py | 2 +- pandas/tests/arrays/boolean/test_reduction.py | 2 +- pandas/tests/arrays/floating/conftest.py | 7 +--- pandas/tests/arrays/integer/conftest.py | 2 +- pandas/tests/extension/base/getitem.py | 2 +- pandas/tests/extension/base/interface.py | 4 +-- pandas/tests/extension/base/methods.py | 3 +- pandas/tests/extension/base/ops.py | 4 +-- pandas/tests/extension/conftest.py | 4 +-- pandas/tests/extension/decimal/array.py | 4 +-- .../tests/extension/decimal/test_decimal.py | 10 +++--- pandas/tests/extension/json/array.py | 4 +-- pandas/tests/extension/json/test_json.py | 6 ++-- pandas/tests/extension/list/array.py | 6 ++-- pandas/tests/extension/list/test_list.py | 6 ++-- pandas/tests/extension/test_arrow.py | 34 +++++++++---------- pandas/tests/extension/test_categorical.py | 6 ++-- pandas/tests/extension/test_datetime.py | 2 +- pandas/tests/extension/test_interval.py | 11 +++--- pandas/tests/extension/test_masked.py | 16 +++------ pandas/tests/extension/test_numpy.py | 6 ++-- pandas/tests/extension/test_period.py | 2 +- pandas/tests/extension/test_sparse.py | 14 ++++---- pandas/tests/extension/test_string.py | 4 +-- 25 files changed, 74 insertions(+), 89 deletions(-) diff --git a/pandas/tests/arrays/boolean/test_arithmetic.py b/pandas/tests/arrays/boolean/test_arithmetic.py index 9ff690cdc914d..9553c4cf7ce5f 100644 --- a/pandas/tests/arrays/boolean/test_arithmetic.py +++ b/pandas/tests/arrays/boolean/test_arithmetic.py @@ -11,7 +11,7 @@ def data(): """Fixture returning boolean array with valid and missing values.""" return pd.array( - [True, False] * 4 + [np.nan] + [True, False] * 44 + [np.nan] + [True, False], + [True, False] + [np.nan] + [True, False] * 2 + [np.nan] + [True, False], dtype="boolean", ) diff --git a/pandas/tests/arrays/boolean/test_comparison.py b/pandas/tests/arrays/boolean/test_comparison.py index 2eeb9da574b1e..8c35a77253494 100644 --- a/pandas/tests/arrays/boolean/test_comparison.py +++ b/pandas/tests/arrays/boolean/test_comparison.py @@ -11,7 +11,7 @@ def data(): """Fixture returning boolean array with valid and missing data""" return pd.array( - [True, False] * 4 + [np.nan] + [True, False] * 44 + [np.nan] + [True, False], + [True, False] + [np.nan] + [True, False] * 2 + [np.nan] + [True, False], dtype="boolean", ) diff --git a/pandas/tests/arrays/boolean/test_reduction.py b/pandas/tests/arrays/boolean/test_reduction.py index 7071c6f8844e4..8cb5035e264f6 100644 --- a/pandas/tests/arrays/boolean/test_reduction.py +++ b/pandas/tests/arrays/boolean/test_reduction.py @@ -8,7 +8,7 @@ def data(): """Fixture returning boolean array, with valid and missing values.""" return pd.array( - [True, False] * 4 + [np.nan] + [True, False] * 44 + [np.nan] + [True, False], + [True, False] + [np.nan] + [True, False] * 2 + [np.nan] + [True, False], dtype="boolean", ) diff --git a/pandas/tests/arrays/floating/conftest.py b/pandas/tests/arrays/floating/conftest.py index fc29e3bfd9962..9b6f65275ae2e 100644 --- a/pandas/tests/arrays/floating/conftest.py +++ b/pandas/tests/arrays/floating/conftest.py @@ -1,4 +1,3 @@ -import numpy as np import pytest import pandas as pd @@ -18,11 +17,7 @@ def dtype(request): def data(dtype): """Fixture returning 'data' array according to parametrized float 'dtype'""" return pd.array( - list(np.arange(0.1, 0.9, 0.1)) - + [pd.NA] - + list(np.arange(1, 9.8, 0.1)) - + [pd.NA] - + [9.9, 10.0], + [0.1, 0.2] + [pd.NA] + [1.0, 1.1, 1.2, 1.3] + [pd.NA] + [9.9, 10.0], dtype=dtype, ) diff --git a/pandas/tests/arrays/integer/conftest.py b/pandas/tests/arrays/integer/conftest.py index 79275522d49f5..8b1ae06d4aafc 100644 --- a/pandas/tests/arrays/integer/conftest.py +++ b/pandas/tests/arrays/integer/conftest.py @@ -39,7 +39,7 @@ def data(dtype): Used to test dtype conversion with and without missing values. """ return pd.array( - list(range(8)) + [pd.NA] + list(range(10, 98)) + [pd.NA] + [99, 100], + [0, 1] + [pd.NA] + [10, 11, 12, 13] + [pd.NA] + [99, 100], dtype=dtype, ) diff --git a/pandas/tests/extension/base/getitem.py b/pandas/tests/extension/base/getitem.py index 1f3680bf67e90..09e8d8ba93092 100644 --- a/pandas/tests/extension/base/getitem.py +++ b/pandas/tests/extension/base/getitem.py @@ -408,7 +408,7 @@ def test_take_series(self, data): result = s.take([0, -1]) expected = pd.Series( data._from_sequence([data[0], data[len(data) - 1]], dtype=s.dtype), - index=range(0, 198, 99), + index=s.index.take([0, -1]), ) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/extension/base/interface.py b/pandas/tests/extension/base/interface.py index 79eb64b5a654f..883c0ba8e35b6 100644 --- a/pandas/tests/extension/base/interface.py +++ b/pandas/tests/extension/base/interface.py @@ -19,10 +19,10 @@ class BaseInterfaceTests: # ------------------------------------------------------------------------ def test_len(self, data): - assert len(data) == 100 + assert len(data) == 10 def test_size(self, data): - assert data.size == 100 + assert data.size == 10 def test_ndim(self, data): assert data.ndim == 1 diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index 90ec84a30a129..084ee61243fd0 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -39,7 +39,6 @@ def test_value_counts_default_dropna(self, data): @pytest.mark.parametrize("dropna", [True, False]) def test_value_counts(self, all_data, dropna): - all_data = all_data[:10] if dropna: other = all_data[~all_data.isna()] else: @@ -52,7 +51,7 @@ def test_value_counts(self, all_data, dropna): def test_value_counts_with_normalize(self, data): # GH 33172 - data = data[:10].unique() + data = data.unique() values = np.array(data[~data.isna()]) ser = pd.Series(data, dtype=data.dtype) diff --git a/pandas/tests/extension/base/ops.py b/pandas/tests/extension/base/ops.py index 222ff42d45052..583435b674ba1 100644 --- a/pandas/tests/extension/base/ops.py +++ b/pandas/tests/extension/base/ops.py @@ -250,9 +250,7 @@ class BaseUnaryOpsTests(BaseOpsUtil): def test_invert(self, data): ser = pd.Series(data, name="name") try: - # 10 is an arbitrary choice here, just avoid iterating over - # the whole array to trim test runtime - [~x for x in data[:10]] + [~x for x in data] except TypeError: # scalars don't support invert -> we don't expect the vectorized # operation to succeed diff --git a/pandas/tests/extension/conftest.py b/pandas/tests/extension/conftest.py index 97fb5a0bc5066..1376af5e51a6b 100644 --- a/pandas/tests/extension/conftest.py +++ b/pandas/tests/extension/conftest.py @@ -14,7 +14,7 @@ def dtype(): @pytest.fixture def data(): """ - Length-100 array for this type. + Length-10 array for this type. * data[0] and data[1] should both be non missing * data[0] and data[1] should not be equal @@ -25,7 +25,7 @@ def data(): @pytest.fixture def data_for_twos(dtype): """ - Length-100 array in which all the elements are two. + Length-10 array in which all the elements are two. Call pytest.skip in your fixture if the dtype does not support divmod. """ diff --git a/pandas/tests/extension/decimal/array.py b/pandas/tests/extension/decimal/array.py index 6f7733ad7693e..aaba6295e5815 100644 --- a/pandas/tests/extension/decimal/array.py +++ b/pandas/tests/extension/decimal/array.py @@ -301,8 +301,8 @@ def to_decimal(values, context=None): return DecimalArray([decimal.Decimal(x) for x in values], context=context) -def make_data(): - return [decimal.Decimal(val) for val in np.random.default_rng(2).random(100)] +def make_data(n: int): + return [decimal.Decimal(val) for val in np.random.default_rng(2).random(n)] DecimalArray._add_arithmetic_ops() diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py index 39ce93d37da45..538c025546426 100644 --- a/pandas/tests/extension/decimal/test_decimal.py +++ b/pandas/tests/extension/decimal/test_decimal.py @@ -25,12 +25,12 @@ def dtype(): @pytest.fixture def data(): - return DecimalArray(make_data()) + return DecimalArray(make_data(10)) @pytest.fixture def data_for_twos(): - return DecimalArray([decimal.Decimal(2) for _ in range(100)]) + return DecimalArray([decimal.Decimal(2) for _ in range(10)]) @pytest.fixture @@ -340,7 +340,7 @@ def test_groupby_agg(): # Ensure that the result of agg is inferred to be decimal dtype # https://github.com/pandas-dev/pandas/issues/29141 - data = make_data()[:5] + data = make_data(5) df = pd.DataFrame( {"id1": [0, 0, 0, 1, 1], "id2": [0, 1, 0, 1, 1], "decimals": DecimalArray(data)} ) @@ -377,7 +377,7 @@ def DecimalArray__my_sum(self): monkeypatch.setattr(DecimalArray, "my_sum", DecimalArray__my_sum, raising=False) - data = make_data()[:5] + data = make_data(5) df = pd.DataFrame({"id": [0, 0, 0, 1, 1], "decimals": DecimalArray(data)}) expected = pd.Series(to_decimal([data[0] + data[1] + data[2], data[3] + data[4]])) @@ -399,7 +399,7 @@ def DecimalArray__array__(self, dtype=None): monkeypatch.setattr(DecimalArray, "__array__", DecimalArray__array__, raising=False) - data = make_data() + data = make_data(10) s = pd.Series(DecimalArray(data)) df = pd.DataFrame({"a": s, "b": range(len(s))}) diff --git a/pandas/tests/extension/json/array.py b/pandas/tests/extension/json/array.py index bc30ba4ef7769..828e4415bd295 100644 --- a/pandas/tests/extension/json/array.py +++ b/pandas/tests/extension/json/array.py @@ -255,7 +255,7 @@ def _pad_or_backfill(self, *, method, limit=None, copy=True): return super()._pad_or_backfill(method=method, limit=limit, copy=copy) -def make_data(): +def make_data(n: int): # TODO: Use a regular dict. See _NDFrameIndexer._setitem_with_indexer rng = np.random.default_rng(2) return [ @@ -265,5 +265,5 @@ def make_data(): for _ in range(rng.integers(0, 10)) ] ) - for _ in range(100) + for _ in range(n) ] diff --git a/pandas/tests/extension/json/test_json.py b/pandas/tests/extension/json/test_json.py index 5e1980c202f62..0d3b4e1a7cde6 100644 --- a/pandas/tests/extension/json/test_json.py +++ b/pandas/tests/extension/json/test_json.py @@ -26,8 +26,8 @@ def dtype(): @pytest.fixture def data(): - """Length-100 PeriodArray for semantics test.""" - data = make_data() + """Length-10 JSONArray for semantics test.""" + data = make_data(10) # Why the while loop? NumPy is unable to construct an ndarray from # equal-length ndarrays. Many of our operations involve coercing the @@ -36,7 +36,7 @@ def data(): # the first two elements, so that's what we'll check. while len(data[0]) == len(data[1]): - data = make_data() + data = make_data(10) return JSONArray(data) diff --git a/pandas/tests/extension/list/array.py b/pandas/tests/extension/list/array.py index 009462a1406ec..6da5d1cf25fad 100644 --- a/pandas/tests/extension/list/array.py +++ b/pandas/tests/extension/list/array.py @@ -125,12 +125,12 @@ def _concat_same_type(cls, to_concat): return cls(data) -def make_data(): +def make_data(n: int): # TODO: Use a regular dict. See _NDFrameIndexer._setitem_with_indexer rng = np.random.default_rng(2) - data = np.empty(100, dtype=object) + data = np.empty(n, dtype=object) data[:] = [ [rng.choice(list(string.ascii_letters)) for _ in range(rng.integers(0, 10))] - for _ in range(100) + for _ in range(n) ] return data diff --git a/pandas/tests/extension/list/test_list.py b/pandas/tests/extension/list/test_list.py index ac396cd3c60d4..3cb1c1916492c 100644 --- a/pandas/tests/extension/list/test_list.py +++ b/pandas/tests/extension/list/test_list.py @@ -15,11 +15,11 @@ def dtype(): @pytest.fixture def data(): - """Length-100 ListArray for semantics test.""" - data = make_data() + """Length-10 ListArray for semantics test.""" + data = make_data(10) while len(data[0]) == len(data[1]): - data = make_data() + data = make_data(10) return ListArray(data) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 36dd91195d241..66f17c147ff71 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -93,57 +93,57 @@ def dtype(request): def data(dtype): pa_dtype = dtype.pyarrow_dtype if pa.types.is_boolean(pa_dtype): - data = [True, False] * 4 + [None] + [True, False] * 44 + [None] + [True, False] + data = [True, False] + [None] + [True, False] * 2 + [None] + [True, False] elif pa.types.is_floating(pa_dtype): - data = [1.0, 0.0] * 4 + [None] + [-2.0, -1.0] * 44 + [None] + [0.5, 99.5] + data = [1.0, 0.0] + [None] + [-2.0, -1.0] * 2 + [None] + [0.5, 99.5] elif pa.types.is_signed_integer(pa_dtype): - data = [1, 0] * 4 + [None] + [-2, -1] * 44 + [None] + [1, 99] + data = [1, 0] + [None] + [-2, -1] * 2 + [None] + [1, 99] elif pa.types.is_unsigned_integer(pa_dtype): - data = [1, 0] * 4 + [None] + [2, 1] * 44 + [None] + [1, 99] + data = [1, 0] + [None] + [2, 1] * 2 + [None] + [1, 99] elif pa.types.is_decimal(pa_dtype): data = ( - [Decimal("1"), Decimal("0.0")] * 4 + [Decimal("1"), Decimal("0.0")] + [None] - + [Decimal("-2.0"), Decimal("-1.0")] * 44 + + [Decimal("-2.0"), Decimal("-1.0")] * 2 + [None] + [Decimal("0.5"), Decimal("33.123")] ) elif pa.types.is_date(pa_dtype): data = ( - [date(2022, 1, 1), date(1999, 12, 31)] * 4 + [date(2022, 1, 1), date(1999, 12, 31)] + [None] - + [date(2022, 1, 1), date(2022, 1, 1)] * 44 + + [date(2022, 1, 1), date(2022, 1, 1)] * 2 + [None] + [date(1999, 12, 31), date(1999, 12, 31)] ) elif pa.types.is_timestamp(pa_dtype): data = ( - [datetime(2020, 1, 1, 1, 1, 1, 1), datetime(1999, 1, 1, 1, 1, 1, 1)] * 4 + [datetime(2020, 1, 1, 1, 1, 1, 1), datetime(1999, 1, 1, 1, 1, 1, 1)] + [None] - + [datetime(2020, 1, 1, 1), datetime(1999, 1, 1, 1)] * 44 + + [datetime(2020, 1, 1, 1), datetime(1999, 1, 1, 1)] * 2 + [None] + [datetime(2020, 1, 1), datetime(1999, 1, 1)] ) elif pa.types.is_duration(pa_dtype): data = ( - [timedelta(1), timedelta(1, 1)] * 4 + [timedelta(1), timedelta(1, 1)] + [None] - + [timedelta(-1), timedelta(0)] * 44 + + [timedelta(-1), timedelta(0)] * 2 + [None] + [timedelta(-10), timedelta(10)] ) elif pa.types.is_time(pa_dtype): data = ( - [time(12, 0), time(0, 12)] * 4 + [time(12, 0), time(0, 12)] + [None] - + [time(0, 0), time(1, 1)] * 44 + + [time(0, 0), time(1, 1)] * 2 + [None] + [time(0, 5), time(5, 0)] ) elif pa.types.is_string(pa_dtype): - data = ["a", "b"] * 4 + [None] + ["1", "2"] * 44 + [None] + ["!", ">"] + data = ["a", "b"] + [None] + ["1", "2"] * 2 + [None] + ["!", ">"] elif pa.types.is_binary(pa_dtype): - data = [b"a", b"b"] * 4 + [None] + [b"1", b"2"] * 44 + [None] + [b"!", b">"] + data = [b"a", b"b"] + [None] + [b"1", b"2"] * 2 + [None] + [b"!", b">"] else: raise NotImplementedError return pd.array(data, dtype=dtype) @@ -264,7 +264,7 @@ def data_for_twos(data): or pa.types.is_decimal(pa_dtype) or pa.types.is_duration(pa_dtype) ): - return pd.array([2] * 100, dtype=data.dtype) + return pd.array([2] * 10, dtype=data.dtype) # tests will be xfailed where 2 is not a valid scalar for pa_dtype return data # TODO: skip otherwise? diff --git a/pandas/tests/extension/test_categorical.py b/pandas/tests/extension/test_categorical.py index 8f8af607585df..275f8e2f859a4 100644 --- a/pandas/tests/extension/test_categorical.py +++ b/pandas/tests/extension/test_categorical.py @@ -28,9 +28,9 @@ from pandas.tests.extension import base -def make_data(): +def make_data(n: int): while True: - values = np.random.default_rng(2).choice(list(string.ascii_letters), size=100) + values = np.random.default_rng(2).choice(list(string.ascii_letters), size=n) # ensure we meet the requirements # 1. first two not null # 2. first and second are different @@ -51,7 +51,7 @@ def data(): * data[0] and data[1] should both be non missing * data[0] and data[1] should not be equal """ - return Categorical(make_data()) + return Categorical(make_data(10)) @pytest.fixture diff --git a/pandas/tests/extension/test_datetime.py b/pandas/tests/extension/test_datetime.py index 356d5352f41f4..ab9eff220914d 100644 --- a/pandas/tests/extension/test_datetime.py +++ b/pandas/tests/extension/test_datetime.py @@ -33,7 +33,7 @@ def dtype(): @pytest.fixture def data(dtype): data = DatetimeArray._from_sequence( - pd.date_range("2000", periods=100, tz=dtype.tz), dtype=dtype + pd.date_range("2000", periods=10, tz=dtype.tz), dtype=dtype ) return data diff --git a/pandas/tests/extension/test_interval.py b/pandas/tests/extension/test_interval.py index 8d437fc5d238b..c457d702ba043 100644 --- a/pandas/tests/extension/test_interval.py +++ b/pandas/tests/extension/test_interval.py @@ -31,10 +31,9 @@ import pandas as pd -def make_data(): - N = 100 - left_array = np.random.default_rng(2).uniform(size=N).cumsum() - right_array = left_array + np.random.default_rng(2).uniform(size=N) +def make_data(n: int): + left_array = np.random.default_rng(2).uniform(size=n).cumsum() + right_array = left_array + np.random.default_rng(2).uniform(size=n) return [Interval(left, right) for left, right in zip(left_array, right_array)] @@ -45,8 +44,8 @@ def dtype(): @pytest.fixture def data(): - """Length-100 PeriodArray for semantics test.""" - return IntervalArray(make_data()) + """Length-10 IntervalArray for semantics test.""" + return IntervalArray(make_data(10)) @pytest.fixture diff --git a/pandas/tests/extension/test_masked.py b/pandas/tests/extension/test_masked.py index e3764b2514680..073f4e63a422b 100644 --- a/pandas/tests/extension/test_masked.py +++ b/pandas/tests/extension/test_masked.py @@ -61,21 +61,15 @@ def make_data(): - return list(range(1, 9)) + [pd.NA] + list(range(10, 98)) + [pd.NA] + [99, 100] + return [0, 1] + [pd.NA] + [10, 11, 12, 13] + [pd.NA] + [99, 100] def make_float_data(): - return ( - list(np.arange(0.1, 0.9, 0.1)) - + [pd.NA] - + list(np.arange(1, 9.8, 0.1)) - + [pd.NA] - + [9.9, 10.0] - ) + return [0.1, 0.2] + [pd.NA] + [1.0, 1.1, 1.2, 1.3] + [pd.NA] + [9.9, 10.0] def make_bool_data(): - return [True, False] * 4 + [np.nan] + [True, False] * 44 + [np.nan] + [True, False] + return [True, False] + [np.nan] + [True, False] * 2 + [np.nan] + [True, False] @pytest.fixture( @@ -111,8 +105,8 @@ def data(dtype): @pytest.fixture def data_for_twos(dtype): if dtype.kind == "b": - return pd.array(np.ones(100), dtype=dtype) - return pd.array(np.ones(100) * 2, dtype=dtype) + return pd.array(np.ones(10), dtype=dtype) + return pd.array(np.ones(10) * 2, dtype=dtype) @pytest.fixture diff --git a/pandas/tests/extension/test_numpy.py b/pandas/tests/extension/test_numpy.py index 5fe761cd702b1..691ce9341b788 100644 --- a/pandas/tests/extension/test_numpy.py +++ b/pandas/tests/extension/test_numpy.py @@ -77,8 +77,8 @@ def allow_in_pandas(monkeypatch): @pytest.fixture def data(allow_in_pandas, dtype): if dtype.numpy_dtype == "object": - return pd.Series([(i,) for i in range(100)]).array - return NumpyExtensionArray(np.arange(1, 101, dtype=dtype._dtype)) + return pd.Series([(i,) for i in range(10)]).array + return NumpyExtensionArray(np.arange(1, 11, dtype=dtype._dtype)) @pytest.fixture @@ -143,7 +143,7 @@ def data_for_grouping(allow_in_pandas, dtype): def data_for_twos(dtype): if dtype.kind == "O": pytest.skip(f"{dtype} is not a numeric dtype") - arr = np.ones(100) * 2 + arr = np.ones(10) * 2 return NumpyExtensionArray._from_sequence(arr, dtype=dtype) diff --git a/pandas/tests/extension/test_period.py b/pandas/tests/extension/test_period.py index 31496c1b7db3d..a3be4e2b4420a 100644 --- a/pandas/tests/extension/test_period.py +++ b/pandas/tests/extension/test_period.py @@ -43,7 +43,7 @@ def dtype(request): @pytest.fixture def data(dtype): - return PeriodArray(np.arange(1970, 2070), dtype=dtype) + return PeriodArray(np.arange(1970, 1980), dtype=dtype) @pytest.fixture diff --git a/pandas/tests/extension/test_sparse.py b/pandas/tests/extension/test_sparse.py index b7685a61d4937..72f9a13910cc2 100644 --- a/pandas/tests/extension/test_sparse.py +++ b/pandas/tests/extension/test_sparse.py @@ -24,12 +24,12 @@ from pandas.tests.extension import base -def make_data(fill_value): +def make_data(fill_value, n: int): rng = np.random.default_rng(2) if np.isnan(fill_value): - data = rng.uniform(size=100) + data = rng.uniform(size=n) else: - data = rng.integers(1, 100, size=100, dtype=int) + data = rng.integers(1, 100, size=n, dtype=int) if data[0] == data[1]: data[0] += 1 @@ -44,14 +44,14 @@ def dtype(): @pytest.fixture(params=[0, np.nan]) def data(request): - """Length-100 PeriodArray for semantics test.""" - res = SparseArray(make_data(request.param), fill_value=request.param) + """Length-10 SparseArray for semantics test.""" + res = SparseArray(make_data(request.param, 10), fill_value=request.param) return res @pytest.fixture def data_for_twos(): - return SparseArray(np.ones(100) * 2) + return SparseArray(np.ones(10) * 2) @pytest.fixture(params=[0, np.nan]) @@ -66,7 +66,7 @@ def data_repeated(request): def gen(count): for _ in range(count): - yield SparseArray(make_data(request.param), fill_value=request.param) + yield SparseArray(make_data(request.param, 10), fill_value=request.param) return gen diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index c49063a5a9a68..e373ff12c4086 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -65,9 +65,9 @@ def dtype(string_dtype_arguments): @pytest.fixture def data(dtype, chunked): - strings = np.random.default_rng(2).choice(list(string.ascii_letters), size=100) + strings = np.random.default_rng(2).choice(list(string.ascii_letters), size=10) while strings[0] == strings[1]: - strings = np.random.default_rng(2).choice(list(string.ascii_letters), size=100) + strings = np.random.default_rng(2).choice(list(string.ascii_letters), size=10) arr = dtype.construct_array_type()._from_sequence(strings, dtype=dtype) return maybe_split_array(arr, chunked) From fca462e1455cc5e1f70a312d434b7f2f33cddd3b Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 22 Sep 2025 13:39:43 -0700 Subject: [PATCH 2/3] Adjust where the nulls should be, generalize some tests --- .../tests/arrays/boolean/test_arithmetic.py | 2 +- .../tests/arrays/boolean/test_comparison.py | 2 +- pandas/tests/arrays/boolean/test_reduction.py | 2 +- pandas/tests/arrays/floating/conftest.py | 2 +- pandas/tests/arrays/integer/conftest.py | 2 +- pandas/tests/extension/base/casting.py | 10 +++++- pandas/tests/extension/base/printing.py | 2 +- pandas/tests/extension/base/reshaping.py | 4 ++- pandas/tests/extension/base/setitem.py | 14 ++++---- pandas/tests/extension/json/test_json.py | 5 ++- pandas/tests/extension/test_arrow.py | 32 +++++++++---------- pandas/tests/extension/test_masked.py | 6 ++-- 12 files changed, 46 insertions(+), 37 deletions(-) diff --git a/pandas/tests/arrays/boolean/test_arithmetic.py b/pandas/tests/arrays/boolean/test_arithmetic.py index 9553c4cf7ce5f..312dfb72e0950 100644 --- a/pandas/tests/arrays/boolean/test_arithmetic.py +++ b/pandas/tests/arrays/boolean/test_arithmetic.py @@ -11,7 +11,7 @@ def data(): """Fixture returning boolean array with valid and missing values.""" return pd.array( - [True, False] + [np.nan] + [True, False] * 2 + [np.nan] + [True, False], + [True, False] * 2 + [np.nan] + [True, False] + [np.nan] + [True, False], dtype="boolean", ) diff --git a/pandas/tests/arrays/boolean/test_comparison.py b/pandas/tests/arrays/boolean/test_comparison.py index 8c35a77253494..ed1d4414cb03c 100644 --- a/pandas/tests/arrays/boolean/test_comparison.py +++ b/pandas/tests/arrays/boolean/test_comparison.py @@ -11,7 +11,7 @@ def data(): """Fixture returning boolean array with valid and missing data""" return pd.array( - [True, False] + [np.nan] + [True, False] * 2 + [np.nan] + [True, False], + [True, False] * 2 + [np.nan] + [True, False] + [np.nan] + [True, False], dtype="boolean", ) diff --git a/pandas/tests/arrays/boolean/test_reduction.py b/pandas/tests/arrays/boolean/test_reduction.py index 8cb5035e264f6..696ae1df4c9fd 100644 --- a/pandas/tests/arrays/boolean/test_reduction.py +++ b/pandas/tests/arrays/boolean/test_reduction.py @@ -8,7 +8,7 @@ def data(): """Fixture returning boolean array, with valid and missing values.""" return pd.array( - [True, False] + [np.nan] + [True, False] * 2 + [np.nan] + [True, False], + [True, False] * 2 + [np.nan] + [True, False] + [np.nan] + [True, False], dtype="boolean", ) diff --git a/pandas/tests/arrays/floating/conftest.py b/pandas/tests/arrays/floating/conftest.py index 9b6f65275ae2e..21dae4123814a 100644 --- a/pandas/tests/arrays/floating/conftest.py +++ b/pandas/tests/arrays/floating/conftest.py @@ -17,7 +17,7 @@ def dtype(request): def data(dtype): """Fixture returning 'data' array according to parametrized float 'dtype'""" return pd.array( - [0.1, 0.2] + [pd.NA] + [1.0, 1.1, 1.2, 1.3] + [pd.NA] + [9.9, 10.0], + [0.1, 0.2, 0.3, 0.4] + [pd.NA] + [1.0, 1.1] + [pd.NA] + [9.9, 10.0], dtype=dtype, ) diff --git a/pandas/tests/arrays/integer/conftest.py b/pandas/tests/arrays/integer/conftest.py index 8b1ae06d4aafc..f9371cf648fc0 100644 --- a/pandas/tests/arrays/integer/conftest.py +++ b/pandas/tests/arrays/integer/conftest.py @@ -39,7 +39,7 @@ def data(dtype): Used to test dtype conversion with and without missing values. """ return pd.array( - [0, 1] + [pd.NA] + [10, 11, 12, 13] + [pd.NA] + [99, 100], + [0, 1, 2, 3] + [pd.NA] + [10, 11] + [pd.NA] + [99, 100], dtype=dtype, ) diff --git a/pandas/tests/extension/base/casting.py b/pandas/tests/extension/base/casting.py index 8e3f21e1a4f56..8e1d6109e2119 100644 --- a/pandas/tests/extension/base/casting.py +++ b/pandas/tests/extension/base/casting.py @@ -57,9 +57,17 @@ def test_astype_str(self, data): ) def test_astype_string(self, data, nullable_string_dtype): # GH-33465, GH#45326 as of 2.0 we decode bytes instead of calling str(obj) + def as_str(x): + if isinstance(x, bytes): + return x.decode() + elif x is data.dtype.na_value: + return x + else: + return str(x) + result = pd.Series(data[:5]).astype(nullable_string_dtype) expected = pd.Series( - [str(x) if not isinstance(x, bytes) else x.decode() for x in data[:5]], + [as_str(x) for x in data[:5]], dtype=nullable_string_dtype, ) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/extension/base/printing.py b/pandas/tests/extension/base/printing.py index b20236ec107b0..f963c21ceea37 100644 --- a/pandas/tests/extension/base/printing.py +++ b/pandas/tests/extension/base/printing.py @@ -13,7 +13,7 @@ def test_array_repr(self, data, size): if size == "small": data = data[:5] else: - data = type(data)._concat_same_type([data] * 5) + data = type(data)._concat_same_type([data] * 20) result = repr(data) assert type(data).__name__ in result diff --git a/pandas/tests/extension/base/reshaping.py b/pandas/tests/extension/base/reshaping.py index a760cbc3995b3..6bddaaddff88e 100644 --- a/pandas/tests/extension/base/reshaping.py +++ b/pandas/tests/extension/base/reshaping.py @@ -303,7 +303,9 @@ def test_stack(self, data, columns, future_stack): ) @pytest.mark.parametrize("obj", ["series", "frame"]) def test_unstack(self, data, index, obj): - data = data[: len(index)] + final_length = min(len(index), len(data)) + index = index[:final_length] + data = data[:final_length] if obj == "series": ser = pd.Series(data, index=index) else: diff --git a/pandas/tests/extension/base/setitem.py b/pandas/tests/extension/base/setitem.py index b4dbe04b04374..b273c9b9f092a 100644 --- a/pandas/tests/extension/base/setitem.py +++ b/pandas/tests/extension/base/setitem.py @@ -136,13 +136,13 @@ def test_setitem_iloc_scalar_mixed(self, data): def test_setitem_iloc_scalar_single(self, data): df = pd.DataFrame({"B": data}) - df.iloc[10, 0] = data[1] - assert df.loc[10, "B"] == data[1] + df.iloc[9, 0] = data[1] + assert df.loc[9, "B"] == data[1] def test_setitem_iloc_scalar_multiple_homogoneous(self, data): df = pd.DataFrame({"A": data, "B": data}) - df.iloc[10, 1] = data[1] - assert df.loc[10, "B"] == data[1] + df.iloc[9, 1] = data[1] + assert df.loc[9, "B"] == data[1] @pytest.mark.parametrize( "mask", @@ -281,9 +281,9 @@ def test_setitem_mask_broadcast(self, data, setter): else: # __setitem__ target = ser - target[mask] = data[10] - assert ser[0] == data[10] - assert ser[1] == data[10] + target[mask] = data[9] + assert ser[0] == data[9] + assert ser[1] == data[9] def test_setitem_expand_columns(self, data): df = pd.DataFrame({"A": data}) diff --git a/pandas/tests/extension/json/test_json.py b/pandas/tests/extension/json/test_json.py index 0d3b4e1a7cde6..cdb98c5342ecb 100644 --- a/pandas/tests/extension/json/test_json.py +++ b/pandas/tests/extension/json/test_json.py @@ -190,9 +190,8 @@ def test_ffill_limit_area( ) def test_value_counts(self, all_data, dropna, request): - if len(all_data) == 100 or dropna: - mark = pytest.mark.xfail(reason="unhashable") - request.applymarker(mark) + if len(all_data) == 10 or dropna: + request.applymarker(unhashable) super().test_value_counts(all_data, dropna) @unhashable diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 66f17c147ff71..1863771dff593 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -93,57 +93,57 @@ def dtype(request): def data(dtype): pa_dtype = dtype.pyarrow_dtype if pa.types.is_boolean(pa_dtype): - data = [True, False] + [None] + [True, False] * 2 + [None] + [True, False] + data = [True, False] * 2 + [None] + [True, False] + [None] + [True, False] elif pa.types.is_floating(pa_dtype): - data = [1.0, 0.0] + [None] + [-2.0, -1.0] * 2 + [None] + [0.5, 99.5] + data = [1.0, 0.0] * 2 + [None] + [-2.0, -1.0] + [None] + [0.5, 99.5] elif pa.types.is_signed_integer(pa_dtype): - data = [1, 0] + [None] + [-2, -1] * 2 + [None] + [1, 99] + data = [1, 0] * 2 + [None] + [-2, -1] + [None] + [1, 99] elif pa.types.is_unsigned_integer(pa_dtype): - data = [1, 0] + [None] + [2, 1] * 2 + [None] + [1, 99] + data = [1, 0] * 2 + [None] + [2, 1] + [None] + [1, 99] elif pa.types.is_decimal(pa_dtype): data = ( - [Decimal("1"), Decimal("0.0")] + [Decimal("1"), Decimal("0.0")] * 2 + [None] - + [Decimal("-2.0"), Decimal("-1.0")] * 2 + + [Decimal("-2.0"), Decimal("-1.0")] + [None] + [Decimal("0.5"), Decimal("33.123")] ) elif pa.types.is_date(pa_dtype): data = ( - [date(2022, 1, 1), date(1999, 12, 31)] + [date(2022, 1, 1), date(1999, 12, 31)] * 2 + [None] - + [date(2022, 1, 1), date(2022, 1, 1)] * 2 + + [date(2022, 1, 1), date(2022, 1, 1)] + [None] + [date(1999, 12, 31), date(1999, 12, 31)] ) elif pa.types.is_timestamp(pa_dtype): data = ( - [datetime(2020, 1, 1, 1, 1, 1, 1), datetime(1999, 1, 1, 1, 1, 1, 1)] + [datetime(2020, 1, 1, 1, 1, 1, 1), datetime(1999, 1, 1, 1, 1, 1, 1)] * 2 + [None] - + [datetime(2020, 1, 1, 1), datetime(1999, 1, 1, 1)] * 2 + + [datetime(2020, 1, 1, 1), datetime(1999, 1, 1, 1)] + [None] + [datetime(2020, 1, 1), datetime(1999, 1, 1)] ) elif pa.types.is_duration(pa_dtype): data = ( - [timedelta(1), timedelta(1, 1)] + [timedelta(1), timedelta(1, 1)] * 2 + [None] - + [timedelta(-1), timedelta(0)] * 2 + + [timedelta(-1), timedelta(0)] + [None] + [timedelta(-10), timedelta(10)] ) elif pa.types.is_time(pa_dtype): data = ( - [time(12, 0), time(0, 12)] + [time(12, 0), time(0, 12)] * 2 + [None] - + [time(0, 0), time(1, 1)] * 2 + + [time(0, 0), time(1, 1)] + [None] + [time(0, 5), time(5, 0)] ) elif pa.types.is_string(pa_dtype): - data = ["a", "b"] + [None] + ["1", "2"] * 2 + [None] + ["!", ">"] + data = ["a", "b"] * 2 + [None] + ["1", "2"] + [None] + ["!", ">"] elif pa.types.is_binary(pa_dtype): - data = [b"a", b"b"] + [None] + [b"1", b"2"] * 2 + [None] + [b"!", b">"] + data = [b"a", b"b"] * 2 + [None] + [b"1", b"2"] + [None] + [b"!", b">"] else: raise NotImplementedError return pd.array(data, dtype=dtype) diff --git a/pandas/tests/extension/test_masked.py b/pandas/tests/extension/test_masked.py index 073f4e63a422b..a288a4438aeec 100644 --- a/pandas/tests/extension/test_masked.py +++ b/pandas/tests/extension/test_masked.py @@ -61,15 +61,15 @@ def make_data(): - return [0, 1] + [pd.NA] + [10, 11, 12, 13] + [pd.NA] + [99, 100] + return [1, 2, 3, 4] + [pd.NA] + [10, 11] + [pd.NA] + [99, 100] def make_float_data(): - return [0.1, 0.2] + [pd.NA] + [1.0, 1.1, 1.2, 1.3] + [pd.NA] + [9.9, 10.0] + return [0.1, 0.2, 0.3, 0.4] + [pd.NA] + [1.0, 1.1] + [pd.NA] + [9.9, 10.0] def make_bool_data(): - return [True, False] + [np.nan] + [True, False] * 2 + [np.nan] + [True, False] + return [True, False] * 2 + [np.nan] + [True, False] + [np.nan] + [True, False] @pytest.fixture( From 6ceec252c4d054c2336559f80616bfa7d012dbcf Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 22 Sep 2025 14:02:10 -0700 Subject: [PATCH 3/3] Fix test_groupby_agg_err_catching --- pandas/tests/groupby/aggregate/test_other.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/groupby/aggregate/test_other.py b/pandas/tests/groupby/aggregate/test_other.py index 1c016143d50c3..3b9f18456c559 100644 --- a/pandas/tests/groupby/aggregate/test_other.py +++ b/pandas/tests/groupby/aggregate/test_other.py @@ -647,7 +647,7 @@ def test_groupby_agg_err_catching(err_cls): to_decimal, ) - data = make_data()[:5] + data = make_data(5) df = DataFrame( {"id1": [0, 0, 0, 1, 1], "id2": [0, 1, 0, 1, 1], "decimals": DecimalArray(data)} )