@@ -187,7 +187,6 @@ def str_contains(arr, pat, case=True, flags=0, na=np.nan, regex=True):
187187 else :
188188 f = lambda x : pat in x
189189 return _na_map (f , arr , na )
190-
191190
192191
193192def str_startswith (arr , pat , na = np .nan ):
@@ -460,6 +459,46 @@ def f(x):
460459 return result
461460
462461
def str_get_dummies(arr, sep='|'):
    """
    Split each string by sep and return a frame of dummy/indicator variables.

    Parameters
    ----------
    arr : Series
        Values to split. Missing values are replaced by the empty string
        before splitting, so they yield a row of zeros.
    sep : string, default '|'
        Delimiter separating the tags inside each element.

    Returns
    -------
    dummies : DataFrame
        One int64 column per distinct non-empty tag (columns sorted),
        indexed like ``arr``; a cell is 1 when the tag occurs in that
        element and 0 otherwise.

    Examples
    --------
    >>> Series(['a|b', 'a', 'a|c']).str.get_dummies()
       a  b  c
    0  1  1  0
    1  1  0  0
    2  1  0  1

    >>> Series(['a|b', np.nan, 'a|c']).str.get_dummies()
       a  b  c
    0  1  1  0
    1  0  0  0
    2  1  0  1

    See also ``pd.get_dummies``.

    """
    # TODO remove this hack?
    # Missing values become empty strings so they contribute no tags.
    arr = arr.fillna('')
    # Pad every element with the separator so that each tag can later be
    # located as the exact substring ``sep + tag + sep``.
    try:
        arr = sep + arr + sep
    except TypeError:
        # Values that cannot be concatenated with a string are converted
        # to str first.
        arr = sep + arr.astype(str) + sep

    tags = set()
    for ts in arr.str.split(sep):
        tags.update(ts)
    # The separator padding always produces empty pieces; they are not tags.
    tags.discard("")
    tags = sorted(tags)

    # Use a fixed-width integer type: plain ``int`` maps to a
    # platform-dependent width, which would make the result dtype differ
    # between platforms.
    dummies = np.empty((len(arr), len(tags)), dtype=np.int64)

    for i, t in enumerate(tags):
        pat = sep + t + sep
        dummies[:, i] = lib.map_infer(arr.values, lambda x: pat in x)
    return DataFrame(dummies, arr.index, tags)
501+
463502
464503def str_join (arr , sep ):
465504 """
@@ -843,7 +882,7 @@ def contains(self, pat, case=True, flags=0, na=np.nan, regex=True):
843882 result = str_contains (self .series , pat , case = case , flags = flags ,
844883 na = na , regex = regex )
845884 return self ._wrap_result (result )
846-
885+
847886 @copy (str_replace )
848887 def replace (self , pat , repl , n = - 1 , case = True , flags = 0 ):
849888 result = str_replace (self .series , pat , repl , n = n , case = case ,
@@ -899,6 +938,11 @@ def rstrip(self, to_strip=None):
899938 result = str_rstrip (self .series , to_strip )
900939 return self ._wrap_result (result )
901940
941+ @copy (str_get_dummies )
942+ def get_dummies (self , sep = '|' ):
943+ result = str_get_dummies (self .series , sep )
944+ return self ._wrap_result (result )
945+
902946 count = _pat_wrapper (str_count , flags = True )
903947 startswith = _pat_wrapper (str_startswith , na = True )
904948 endswith = _pat_wrapper (str_endswith , na = True )
0 commit comments