@@ -1324,8 +1324,9 @@ def _get_slice(slob):
1324
1324
1325
1325
def get_group_index (label_list , shape ):
1326
1326
"""
1327
- Gets the offsets into what would be the cartesian product of all
1328
- possible labels given the label_list.
1327
+ For the particular label_list, gets the offsets into the hypothetical list
1328
+ representing the totally ordered cartesian product of all possible label
1329
+ combinations.
1329
1330
"""
1330
1331
if len (label_list ) == 1 :
1331
1332
return label_list [0 ]
@@ -1409,24 +1410,38 @@ def cython_aggregate(values, group_index, ngroups, how='add'):
1409
1410
# sorting levels...cleverly?
1410
1411
1411
1412
def _compress_group_index(group_index, sort=True):
    """
    Compress sparse group offsets into dense ids over the observed groups.

    ``group_index`` holds offsets into the cartesian product of all possible
    labels.  That space can be huge, so rather than work in it directly we
    number only the offsets that actually occur.

    Parameters
    ----------
    group_index : array-like of integers
        One offset per row; passed through ``_ensure_int64`` before hashing.
    sort : bool, default True
        If True, the observed offsets are returned in ascending order and
        the dense ids are renumbered to match that order.  If False, both
        are left in the order produced by the hash table.

    Returns
    -------
    comp_ids : ndarray
        For each entry of ``group_index``, its position in ``obs_group_ids``.
        On the sorted path negative ids are reset to -1.
    obs_group_ids : ndarray of int64
        The distinct offsets that were observed.
    """
    # The table is sized from the raw input, before the int64 coercion.
    hash_table = lib.Int64HashTable(len(group_index))
    group_index = _ensure_int64(group_index)

    # get_labels_groupby appends each newly seen offset to ``seen`` and
    # returns one dense id per row (numbered in order of first appearance).
    seen = []
    comp_ids = hash_table.get_labels_groupby(group_index, seen)
    obs_group_ids = np.array(seen, dtype='i8')

    # Nothing to reorder: caller did not ask for sorting, or no groups.
    if not sort or len(obs_group_ids) == 0:
        return comp_ids, obs_group_ids

    # order[k] is the position in obs_group_ids of the k-th smallest offset.
    order = obs_group_ids.argsort()
    n_obs = len(order)

    # rank_of[j] is the sorted position of obs_group_ids[j], i.e. the
    # inverse permutation of ``order``.
    rank_of = np.empty(n_obs, dtype='i4')
    rank_of.put(order, np.arange(n_obs))

    # Negative ids would wrap around inside ``take``; remember where they
    # are so they can be reset afterwards.
    # NOTE(review): presumably negative ids mark missing labels -- confirm.
    negative = comp_ids < 0

    # Translate first-appearance ids into sorted-order ids.
    comp_ids = rank_of.take(comp_ids)
    np.putmask(comp_ids, negative, -1)

    return comp_ids, obs_group_ids.take(order)
0 commit comments