@@ -308,66 +308,130 @@ def test_data_frame_value_counts_dropna(
308308 tm .assert_series_equal (result_frame_groupby , expected )
309309
310310
311- @pytest .mark .parametrize ("as_index" , [False , True ])
311+ @pytest .mark .parametrize ("as_index" , [True , False ])
312312@pytest .mark .parametrize (
313313 "observed, expected_index" ,
314314 [
315315 (
316316 False ,
317317 [
318- ("FR" , "male " , "low " ),
319- ("FR" , "female " , "high " ),
320- ("FR" , "male " , "medium " ),
321- ("FR" , "female " , "low " ),
322- ("FR" , "female " , "medium " ),
323- ("FR" , "male " , "high " ),
324- ("US" , "female " , "high " ),
325- ("US" , "male " , "low " ),
326- ("US" , "female " , "low " ),
327- ("US" , "female " , "medium " ),
328- ("US" , "male " , "high " ),
329- ("US" , "male " , "medium " ),
318+ ("FR" , "high " , "female " ),
319+ ("FR" , "high " , "male " ),
320+ ("FR" , "low " , "male " ),
321+ ("FR" , "low " , "female " ),
322+ ("FR" , "medium " , "male " ),
323+ ("FR" , "medium " , "female " ),
324+ ("US" , "high " , "female " ),
325+ ("US" , "high " , "male " ),
326+ ("US" , "low " , "male " ),
327+ ("US" , "low " , "female " ),
328+ ("US" , "medium " , "female " ),
329+ ("US" , "medium " , "male " ),
330330 ],
331331 ),
332332 (
333333 True ,
334334 [
335- ("FR" , "male " , "low " ),
336- ("FR" , "female " , "high " ),
337- ("FR" , "male " , "medium " ),
338- ("US" , "female " , "high " ),
339- ("US" , "male " , "low " ),
335+ ("FR" , "high " , "female " ),
336+ ("FR" , "low " , "male " ),
337+ ("FR" , "medium " , "male " ),
338+ ("US" , "high " , "female " ),
339+ ("US" , "low " , "male " ),
340340 ],
341341 ),
342342 ],
343343)
344344@pytest .mark .parametrize (
345345 "normalize, expected_data" ,
346346 [
347- (False , np .array ([2 , 1 , 1 , 0 , 0 , 0 , 1 , 1 , 0 , 0 , 0 , 0 ], dtype = np .int64 )),
347+ (False , np .array ([1 , 0 , 2 , 0 , 1 , 0 , 1 , 0 , 1 , 0 , 0 , 0 ], dtype = np .int64 )),
348348 (
349349 True ,
350- np .array ([0.5 , 0.25 , 0.25 , 0.0 , 0.0 , 0.0 , 0.5 , 0.5 , 0.0 , 0.0 , 0.0 , 0.0 ]),
350+ # NaN values correspond to non-observed groups
351+ np .array (
352+ [1.0 , 0.0 , 1.0 , 0.0 , 1.0 , 0.0 , 1.0 , 0.0 , 1.0 , 0.0 , np .nan , np .nan ]
353+ ),
351354 ),
352355 ],
353356)
354- def test_categorical (
357+ def test_categorical_groupers (
355358 education_df , as_index , observed , expected_index , normalize , expected_data
356359):
357- # Test categorical data whether or not observed
358- gp = education_df .astype ("category" ).groupby (
359- "country" , as_index = as_index , observed = observed
360+ education_df = education_df .copy ()
361+ education_df ["country" ] = education_df ["country" ].astype ("category" )
362+ education_df ["education" ] = education_df ["education" ].astype ("category" )
363+
364+ gp = education_df .groupby (
365+ ["country" , "education" ], as_index = as_index , observed = observed
360366 )
361367 result = gp .value_counts (normalize = normalize )
362368
363369 expected_series = Series (
364370 data = expected_data [expected_data > 0.0 ] if observed else expected_data ,
371+ index = MultiIndex .from_tuples (
372+ expected_index ,
373+ names = ["country" , "education" , "gender" ],
374+ ),
375+ )
376+ for i in range (2 ):
377+ expected_series .index = expected_series .index .set_levels (
378+ CategoricalIndex (expected_series .index .levels [i ]), level = i
379+ )
380+
381+ if as_index :
382+ tm .assert_series_equal (result , expected_series )
383+ else :
384+ expected = expected_series .reset_index (
385+ name = "proportion" if normalize else "count"
386+ )
387+ tm .assert_frame_equal (result , expected )
388+
389+
390+ @pytest .mark .parametrize ("as_index" , [False , True ])
391+ @pytest .mark .parametrize ("observed" , [False , True ])
392+ @pytest .mark .parametrize (
393+ "normalize, expected_data" ,
394+ [
395+ (False , np .array ([2 , 1 , 1 , 0 , 0 , 0 , 1 , 1 , 0 , 0 , 0 , 0 ], dtype = np .int64 )),
396+ (
397+ True ,
398+ # NaN values correspond to non-observed groups
399+ np .array ([0.5 , 0.25 , 0.25 , 0.0 , 0.0 , 0.0 , 0.5 , 0.5 , 0.0 , 0.0 , 0.0 , 0.0 ]),
400+ ),
401+ ],
402+ )
403+ def test_categorical_values (education_df , as_index , observed , normalize , expected_data ):
404+ # Test non-observed categories are included in the result,
405+ # regardless of `observed`
406+ education_df = education_df .copy ()
407+ education_df ["gender" ] = education_df ["gender" ].astype ("category" )
408+ education_df ["education" ] = education_df ["education" ].astype ("category" )
409+
410+ gp = education_df .groupby ("country" , as_index = as_index , observed = observed )
411+ result = gp .value_counts (normalize = normalize )
412+
413+ expected_index = [
414+ ("FR" , "male" , "low" ),
415+ ("FR" , "female" , "high" ),
416+ ("FR" , "male" , "medium" ),
417+ ("FR" , "female" , "low" ),
418+ ("FR" , "female" , "medium" ),
419+ ("FR" , "male" , "high" ),
420+ ("US" , "female" , "high" ),
421+ ("US" , "male" , "low" ),
422+ ("US" , "female" , "low" ),
423+ ("US" , "female" , "medium" ),
424+ ("US" , "male" , "high" ),
425+ ("US" , "male" , "medium" ),
426+ ]
427+ expected_series = Series (
428+ data = expected_data ,
365429 index = MultiIndex .from_tuples (
366430 expected_index ,
367431 names = ["country" , "gender" , "education" ],
368432 ),
369433 )
370- for i in range (3 ):
434+ for i in range (1 , 3 ):
371435 expected_series .index = expected_series .index .set_levels (
372436 CategoricalIndex (expected_series .index .levels [i ]), level = i
373437 )
0 commit comments