@@ -256,11 +256,13 @@ def __init__(self, key=None, level=None, freq=None, axis=0, sort=False):
256256 def ax (self ):
257257 return self .grouper
258258
259- def _get_grouper (self , obj ):
259+ def _get_grouper (self , obj , validate = True ):
260260 """
261261 Parameters
262262 ----------
263263 obj : the subject object
264+ validate : boolean, default True
265+ if True, validate the grouper
264266
265267 Returns
266268 -------
@@ -271,7 +273,8 @@ def _get_grouper(self, obj):
271273 self .grouper , exclusions , self .obj = _get_grouper (self .obj , [self .key ],
272274 axis = self .axis ,
273275 level = self .level ,
274- sort = self .sort )
276+ sort = self .sort ,
277+ validate = validate )
275278 return self .binner , self .grouper , self .obj
276279
277280 def _set_grouper (self , obj , sort = False ):
@@ -326,12 +329,6 @@ def _set_grouper(self, obj, sort=False):
326329 self .grouper = ax
327330 return self .grouper
328331
329- def _get_binner_for_grouping (self , obj ):
330- """ default to the standard binner here """
331- group_axis = obj ._get_axis (self .axis )
332- return Grouping (group_axis , None , obj = obj , name = self .key ,
333- level = self .level , sort = self .sort , in_axis = False )
334-
335332 @property
336333 def groups (self ):
337334 return self .grouper .groups
@@ -1733,16 +1730,34 @@ class BaseGrouper(object):
17331730 """
17341731 This is an internal Grouper class, which actually holds
17351732 the generated groups
1733+
1734+ Parameters
1735+ ----------
1736+ axis : int
1737+ the axis to group
1738+ groupings : array of grouping
1739+ all the grouping instances to handle in this grouper
1740+ e.g. when grouping by a list of groupers, pass the whole list here
1741+ sort : boolean, default True
1742+ whether this grouper will give sorted result or not
1743+ group_keys : boolean, default True
1744+ mutated : boolean, default False
1745+ indexer : intp array, optional
1746+ the indexer created by Grouper
1747+ some groupers (e.g. TimeGrouper) sort their axis, so their
1748+ group_info is sorted too; the indexer is needed to reorder it
1749+
17361750 """
17371751
17381752 def __init__ (self , axis , groupings , sort = True , group_keys = True ,
1739- mutated = False ):
1753+ mutated = False , indexer = None ):
17401754 self ._filter_empty_groups = self .compressed = len (groupings ) != 1
17411755 self .axis = axis
17421756 self .groupings = groupings
17431757 self .sort = sort
17441758 self .group_keys = group_keys
17451759 self .mutated = mutated
1760+ self .indexer = indexer
17461761
17471762 @property
17481763 def shape (self ):
@@ -1888,6 +1903,15 @@ def group_info(self):
18881903 comp_ids = _ensure_int64 (comp_ids )
18891904 return comp_ids , obs_group_ids , ngroups
18901905
1906+ @cache_readonly
1907+ def label_info (self ):
1908+ # return the labels of items in original grouped axis
1909+ labels , _ , _ = self .group_info
1910+ if self .indexer is not None :
1911+ sorter = np .lexsort ((labels , self .indexer ))
1912+ labels = labels [sorter ]
1913+ return labels
1914+
18911915 def _get_compressed_labels (self ):
18921916 all_labels = [ping .labels for ping in self .groupings ]
18931917 if len (all_labels ) > 1 :
@@ -2288,11 +2312,42 @@ def generate_bins_generic(values, binner, closed):
22882312
22892313class BinGrouper (BaseGrouper ):
22902314
2291- def __init__ (self , bins , binlabels , filter_empty = False , mutated = False ):
2315+ """
2316+ This is an internal Grouper class
2317+
2318+ Parameters
2319+ ----------
2320+ bins : the split index of binlabels to group the item of axis
2321+ binlabels : the label list
2322+ filter_empty : boolean, default False
2323+ mutated : boolean, default False
2324+ indexer : an intp array
2325+
2326+ Examples
2327+ --------
2328+ bins: [2, 4, 6, 8, 10]
2329+ binlabels: DatetimeIndex(['2005-01-01', '2005-01-03',
2330+ '2005-01-05', '2005-01-07', '2005-01-09'],
2331+ dtype='datetime64[ns]', freq='2D')
2332+
2333+ the group_info, which contains the label of each item in grouped
2334+ axis, the index of label in label list, group number, is
2335+
2336+ (array([0, 0, 1, 1, 2, 2, 3, 3, 4, 4]), array([0, 1, 2, 3, 4]), 5)
2337+
2338+ means that, the grouped axis has 10 items, can be grouped into 5
2339+ labels, the first and second items belong to the first label, the
2340+ third and fourth items belong to the second label, and so on
2341+
2342+ """
2343+
2344+ def __init__ (self , bins , binlabels , filter_empty = False , mutated = False ,
2345+ indexer = None ):
22922346 self .bins = _ensure_int64 (bins )
22932347 self .binlabels = _ensure_index (binlabels )
22942348 self ._filter_empty_groups = filter_empty
22952349 self .mutated = mutated
2350+ self .indexer = indexer
22962351
22972352 @cache_readonly
22982353 def groups (self ):
@@ -2460,6 +2515,19 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None,
24602515 self .grouper , self ._labels , self ._group_index = \
24612516 index ._get_grouper_for_level (self .grouper , level )
24622517
2518+ # a passed Grouper-like: get the grouper directly, the same way a
2519+ # single-grouper groupby does, and use its group_info to get labels
2520+ elif isinstance (self .grouper , Grouper ):
2521+ # get the new grouper; we already have disambiguated
2522+ # what key/level refer to exactly, don't need to
2523+ # check again as we have by this point converted these
2524+ # to an actual value (rather than a pd.Grouper)
2525+ _ , grouper , _ = self .grouper ._get_grouper (self .obj , validate = False )
2526+ if self .name is None :
2527+ self .name = grouper .result_index .name
2528+ self .obj = self .grouper .obj
2529+ self .grouper = grouper
2530+
24632531 else :
24642532 if self .grouper is None and self .name is not None :
24652533 self .grouper = self .obj [self .name ]
@@ -2482,16 +2550,6 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None,
24822550 categories = c ,
24832551 ordered = self .grouper .ordered ))
24842552
2485- # a passed Grouper like
2486- elif isinstance (self .grouper , Grouper ):
2487-
2488- # get the new grouper
2489- grouper = self .grouper ._get_binner_for_grouping (self .obj )
2490- self .obj = self .grouper .obj
2491- self .grouper = grouper
2492- if self .name is None :
2493- self .name = grouper .name
2494-
24952553 # we are done
24962554 if isinstance (self .grouper , Grouping ):
24972555 self .grouper = self .grouper .grouper
@@ -2536,6 +2594,10 @@ def ngroups(self):
25362594
25372595 @cache_readonly
25382596 def indices (self ):
2597+ # we have a list of groupers
2598+ if isinstance (self .grouper , BaseGrouper ):
2599+ return self .grouper .indices
2600+
25392601 values = _ensure_categorical (self .grouper )
25402602 return values ._reverse_indexer ()
25412603
@@ -2553,9 +2615,14 @@ def group_index(self):
25532615
25542616 def _make_labels (self ):
25552617 if self ._labels is None or self ._group_index is None :
2556- labels , uniques = algorithms .factorize (
2557- self .grouper , sort = self .sort )
2558- uniques = Index (uniques , name = self .name )
2618+ # we have a list of groupers
2619+ if isinstance (self .grouper , BaseGrouper ):
2620+ labels = self .grouper .label_info
2621+ uniques = self .grouper .result_index
2622+ else :
2623+ labels , uniques = algorithms .factorize (
2624+ self .grouper , sort = self .sort )
2625+ uniques = Index (uniques , name = self .name )
25592626 self ._labels = labels
25602627 self ._group_index = uniques
25612628
@@ -2566,7 +2633,7 @@ def groups(self):
25662633
25672634
25682635def _get_grouper (obj , key = None , axis = 0 , level = None , sort = True ,
2569- mutated = False ):
2636+ mutated = False , validate = True ):
25702637 """
25712638 create and return a BaseGrouper, which is an internal
25722639 mapping of how to create the grouper indexers.
@@ -2583,6 +2650,8 @@ def _get_grouper(obj, key=None, axis=0, level=None, sort=True,
25832650 are and then creates a Grouping for each one, combined into
25842651 a BaseGrouper.
25852652
2653+ If validate, then check for key/level overlaps
2654+
25862655 """
25872656 group_axis = obj ._get_axis (axis )
25882657
@@ -2707,7 +2776,7 @@ def is_in_obj(gpr):
27072776
27082777 elif is_in_axis (gpr ): # df.groupby('name')
27092778 if gpr in obj :
2710- if gpr in obj .index .names :
2779+ if validate and gpr in obj .index .names :
27112780 warnings .warn (
27122781 ("'%s' is both a column name and an index level.\n "
27132782 "Defaulting to column but "
0 commit comments