@@ -36,6 +36,41 @@ def f(a):
3636 return result .reindex (index ).sort_index ()
3737
3838
# Maps the builtin groupby functions to their expected outputs for missing
# categories when they are called on a categorical grouper with
# observed=False. Some functions are expected to return NaN, some zero.
# These expected values can be used across several tests (i.e. they are the
# same for SeriesGroupBy and DataFrameGroupBy) but they should only be
# hardcoded in one place.
#
# NaN is spelled ``np.nan`` on purpose: the ``np.NaN`` alias was removed in
# NumPy 2.0, whereas ``np.nan`` exists in every NumPy release.
_results_for_groupbys_with_missing_categories = {
    "all": np.nan,
    "any": np.nan,
    "count": 0,
    "corrwith": np.nan,
    "first": np.nan,
    "idxmax": np.nan,
    "idxmin": np.nan,
    "last": np.nan,
    "mad": np.nan,
    "max": np.nan,
    "mean": np.nan,
    "median": np.nan,
    "min": np.nan,
    "nth": np.nan,
    "nunique": 0,
    "prod": np.nan,
    "quantile": np.nan,
    "sem": np.nan,
    "size": 0,
    "skew": np.nan,
    "std": np.nan,
    "sum": 0,
    "var": np.nan,
}
72+
73+
3974def test_apply_use_categorical_name (df ):
4075 cats = qcut (df .C , 4 )
4176
@@ -1263,12 +1298,13 @@ def test_series_groupby_on_2_categoricals_unobserved(
12631298 reduction_func : str , observed : bool , request
12641299):
12651300 # GH 17605
1266-
12671301 if reduction_func == "ngroup" :
12681302 pytest .skip ("ngroup is not truly a reduction" )
12691303
12701304 if reduction_func == "corrwith" : # GH 32293
1271- mark = pytest .mark .xfail (reason = "TODO: implemented SeriesGroupBy.corrwith" )
1305+ mark = pytest .mark .xfail (
1306+ reason = "TODO: implemented SeriesGroupBy.corrwith. See GH 32293"
1307+ )
12721308 request .node .add_marker (mark )
12731309
12741310 df = pd .DataFrame (
@@ -1289,36 +1325,30 @@ def test_series_groupby_on_2_categoricals_unobserved(
12891325 assert len (result ) == expected_length
12901326
12911327
1292- @pytest .mark .parametrize (
1293- "func, zero_or_nan" ,
1294- [
1295- ("all" , np .NaN ),
1296- ("any" , np .NaN ),
1297- ("count" , 0 ),
1298- ("first" , np .NaN ),
1299- ("idxmax" , np .NaN ),
1300- ("idxmin" , np .NaN ),
1301- ("last" , np .NaN ),
1302- ("mad" , np .NaN ),
1303- ("max" , np .NaN ),
1304- ("mean" , np .NaN ),
1305- ("median" , np .NaN ),
1306- ("min" , np .NaN ),
1307- ("nth" , np .NaN ),
1308- ("nunique" , 0 ),
1309- ("prod" , np .NaN ),
1310- ("quantile" , np .NaN ),
1311- ("sem" , np .NaN ),
1312- ("size" , 0 ),
1313- ("skew" , np .NaN ),
1314- ("std" , np .NaN ),
1315- ("sum" , np .NaN ),
1316- ("var" , np .NaN ),
1317- ],
1318- )
1319- def test_series_groupby_on_2_categoricals_unobserved_zeroes_or_nans (func , zero_or_nan ):
1328+ def test_series_groupby_on_2_categoricals_unobserved_zeroes_or_nans (
1329+ reduction_func : str , request
1330+ ):
13201331 # GH 17605
13211332 # Tests whether the unobserved categories in the result contain 0 or NaN
1333+
1334+ if reduction_func == "ngroup" :
1335+ pytest .skip ("ngroup is not truly a reduction" )
1336+
1337+ if reduction_func == "corrwith" : # GH 32293
1338+ mark = pytest .mark .xfail (
1339+ reason = "TODO: implemented SeriesGroupBy.corrwith. See GH 32293"
1340+ )
1341+ request .node .add_marker (mark )
1342+
1343+ if reduction_func == "sum" : # GH 31422
1344+ mark = pytest .mark .xfail (
1345+ reason = (
1346+ "sum should return 0 but currently returns NaN. "
1347+ "This is a known bug. See GH 31422."
1348+ )
1349+ )
1350+ request .node .add_marker (mark )
1351+
13221352 df = pd .DataFrame (
13231353 {
13241354 "cat_1" : pd .Categorical (list ("AABB" ), categories = list ("ABC" )),
@@ -1327,12 +1357,14 @@ def test_series_groupby_on_2_categoricals_unobserved_zeroes_or_nans(func, zero_o
13271357 }
13281358 )
13291359 unobserved = [tuple ("AC" ), tuple ("BC" ), tuple ("CA" ), tuple ("CB" ), tuple ("CC" )]
1330- args = {"nth" : [0 ]}.get (func , [])
1360+ args = {"nth" : [0 ]}.get (reduction_func , [])
13311361
13321362 series_groupby = df .groupby (["cat_1" , "cat_2" ], observed = False )["value" ]
1333- agg = getattr (series_groupby , func )
1363+ agg = getattr (series_groupby , reduction_func )
13341364 result = agg (* args )
13351365
1366+ zero_or_nan = _results_for_groupbys_with_missing_categories [reduction_func ]
1367+
13361368 for idx in unobserved :
13371369 val = result .loc [idx ]
13381370 assert (pd .isna (zero_or_nan ) and pd .isna (val )) or (val == zero_or_nan )
@@ -1342,6 +1374,84 @@ def test_series_groupby_on_2_categoricals_unobserved_zeroes_or_nans(func, zero_o
13421374 assert np .issubdtype (result .dtype , np .integer )
13431375
13441376
def test_dataframe_groupby_on_2_categoricals_when_observed_is_true(reduction_func: str):
    # GH 23865
    # GH 27075
    # With observed=True, grouping a DataFrame by two pd.Categorical columns
    # must not put category combinations that are absent from the data on
    # the index of the result.
    if reduction_func == "ngroup":
        pytest.skip("ngroup does not return the Categories on the index")

    frame = pd.DataFrame(
        {
            "cat_1": pd.Categorical(list("AABB"), categories=list("ABC")),
            "cat_2": pd.Categorical(list("1111"), categories=list("12")),
            "value": [0.1, 0.1, 0.1, 0.1],
        }
    )
    grouped = frame.groupby(["cat_1", "cat_2"], observed=True)

    # nth is called with a position and corrwith with the frame itself;
    # every other reduction is called without positional arguments.
    if reduction_func == "nth":
        call_args = [0]
    elif reduction_func == "corrwith":
        call_args = [frame]
    else:
        call_args = []
    reducer = getattr(grouped, reduction_func)
    result = reducer(*call_args)

    # (cat_1, cat_2) pairs that never occur together in ``frame``.
    missing_pairs = [("A", "2"), ("B", "2"), ("C", "1"), ("C", "2")]
    assert not any(pair in result.index for pair in missing_pairs)
1401+
1402+
@pytest.mark.parametrize("observed", [False, None])
def test_dataframe_groupby_on_2_categoricals_when_observed_is_false(
    reduction_func: str, observed: bool, request
):
    # GH 23865
    # GH 27075
    # Ensure that df.groupby, when 'by' is two pd.Categorical variables,
    # returns the categories that are not in df when observed=False/None,
    # and that every reduction fills those rows with the value (NaN or 0)
    # listed in _results_for_groupbys_with_missing_categories.

    if reduction_func == "ngroup":
        pytest.skip("ngroup does not return the Categories on the index")

    if reduction_func == "count":  # GH 35028
        mark = pytest.mark.xfail(
            reason=(
                "DataFrameGroupBy.count returns np.NaN for missing "
                "categories, when it should return 0. See GH 35028"
            )
        )
        request.node.add_marker(mark)

    if reduction_func == "sum":  # GH 31422
        mark = pytest.mark.xfail(
            reason=(
                "sum should return 0 but currently returns NaN. "
                "This is a known bug. See GH 31422."
            )
        )
        request.node.add_marker(mark)

    df = pd.DataFrame(
        {
            "cat_1": pd.Categorical(list("AABB"), categories=list("ABC")),
            "cat_2": pd.Categorical(list("1111"), categories=list("12")),
            "value": [0.1, 0.1, 0.1, 0.1],
        }
    )
    # (cat_1, cat_2) pairs that never occur together in ``df``.
    unobserved_cats = [("A", "2"), ("B", "2"), ("C", "1"), ("C", "2")]

    df_grp = df.groupby(["cat_1", "cat_2"], observed=observed)

    # nth is called with a position and corrwith with the frame itself;
    # every other reduction is called without positional arguments.
    args = {"nth": [0], "corrwith": [df]}.get(reduction_func, [])
    res = getattr(df_grp, reduction_func)(*args)

    expected = _results_for_groupbys_with_missing_categories[reduction_func]

    # Use pd.isna instead of ``expected is np.nan``: the identity check only
    # succeeds for the np.nan singleton itself. Any other NaN object would
    # fall through to the equality branch, where NaN == NaN is False and the
    # assertion fails spuriously.
    if pd.isna(expected):
        assert res.loc[unobserved_cats].isnull().all().all()
    else:
        assert (res.loc[unobserved_cats] == expected).all().all()
1453+
1454+
13451455def test_series_groupby_categorical_aggregation_getitem ():
13461456 # GH 8870
13471457 d = {"foo" : [10 , 8 , 4 , 1 ], "bar" : [10 , 20 , 30 , 40 ], "baz" : ["d" , "c" , "d" , "c" ]}
0 commit comments