33from pandas .compat import range , zip
44from pandas import compat
55import itertools
6+ import re
67
78import numpy as np
89
@@ -877,29 +878,55 @@ def lreshape(data, groups, dropna=True, label=None):
877878 return DataFrame (mdata , columns = id_cols + pivot_cols )
878879
879880
880- def wide_to_long (df , stubnames , i , j ):
881- """
881+ def wide_to_long (df , stubnames , i , j , sep = "" , suffix = '\d+' ):
882+ r """
882883 Wide panel to long format. Less flexible but more user-friendly than melt.
883884
885+ With stubnames ['A', 'B'], this function expects to find one or more
886+ groups of columns with format Asuffix1, Asuffix2,..., Bsuffix1, Bsuffix2,...
887+ You specify what you want to call this suffix in the resulting long format
888+ with `j` (for example `j='year'`)
889+
890+ Each row of these wide variables is assumed to be uniquely identified by
891+ `i` (can be a single column name or a list of column names)
892+
893+ All remaining variables in the data frame are left intact.
894+
884895 Parameters
885896 ----------
886897 df : DataFrame
887898 The wide-format DataFrame
888- stubnames : list
889- A list of stub names . The wide format variables are assumed to
899+ stubnames : str or list-like
900+ The stub name(s) . The wide format variables are assumed to
890901 start with the stub names.
891- i : str
892- The name of the id variable.
902+ i : str or list-like
903+ Column(s) to use as id variable(s)
893904 j : str
894- The name of the subobservation variable.
895- stubend : str
896- Regex to match for the end of the stubs.
905+ The name of the subobservation variable. What you wish to name your
906+ suffix in the long format.
907+ sep : str, default ""
908+ A character indicating the separation of the variable names
909+ in the wide format, to be stripped from the names in the long format.
910+ For example, if your column names are A-suffix1, A-suffix2, you
911+ can strip the hyphen by specifying `sep='-'`
912+
913+ .. versionadded:: 0.20.0
914+
915+ suffix : str, default '\\d+'
916+ A regular expression capturing the wanted suffixes. '\\d+' captures
917+ numeric suffixes. Suffixes with no numbers could be specified with the
918+ negated character class '\\D+'. You can also further disambiguate
919+ suffixes, for example, if your wide variables are of the form
920+ Aone, Btwo,.., and you have an unrelated column Arating, you can
921+ ignore the last one by specifying `suffix='(one|two)'`
922+
923+ .. versionadded:: 0.20.0
897924
898925 Returns
899926 -------
900927 DataFrame
901- A DataFrame that contains each stub name as a variable as well as
902- variables for i and j.
928+ A DataFrame that contains each stub name as a variable, with new index
929+ (i, j)
903930
904931 Examples
905932 --------
@@ -918,7 +945,7 @@ def wide_to_long(df, stubnames, i, j):
918945 0 a d 2.5 3.2 -1.085631 0
919946 1 b e 1.2 1.3 0.997345 1
920947 2 c f 0.7 0.1 0.282978 2
921- >>> wide_to_long(df, ["A", "B"], i="id", j="year")
948+ >>> pd. wide_to_long(df, ["A", "B"], i="id", j="year")
922949 X A B
923950 id year
924951 0 1970 -1.085631 a 2.5
@@ -928,38 +955,151 @@ def wide_to_long(df, stubnames, i, j):
928955 1 1980 0.997345 e 1.3
929956 2 1980 0.282978 f 0.1
930957
958+ With multiple id columns
959+
960+ >>> df = pd.DataFrame({
961+ ... 'famid': [1, 1, 1, 2, 2, 2, 3, 3, 3],
962+ ... 'birth': [1, 2, 3, 1, 2, 3, 1, 2, 3],
963+ ... 'ht1': [2.8, 2.9, 2.2, 2, 1.8, 1.9, 2.2, 2.3, 2.1],
964+ ... 'ht2': [3.4, 3.8, 2.9, 3.2, 2.8, 2.4, 3.3, 3.4, 2.9]
965+ ... })
966+ >>> df
967+ birth famid ht1 ht2
968+ 0 1 1 2.8 3.4
969+ 1 2 1 2.9 3.8
970+ 2 3 1 2.2 2.9
971+ 3 1 2 2.0 3.2
972+ 4 2 2 1.8 2.8
973+ 5 3 2 1.9 2.4
974+ 6 1 3 2.2 3.3
975+ 7 2 3 2.3 3.4
976+ 8 3 3 2.1 2.9
977+ >>> l = pd.wide_to_long(df, stubnames='ht', i=['famid', 'birth'], j='age')
978+ >>> l
979+ ht
980+ famid birth age
981+ 1 1 1 2.8
982+ 2 3.4
983+ 2 1 2.9
984+ 2 3.8
985+ 3 1 2.2
986+ 2 2.9
987+ 2 1 1 2.0
988+ 2 3.2
989+ 2 1 1.8
990+ 2 2.8
991+ 3 1 1.9
992+ 2 2.4
993+ 3 1 1 2.2
994+ 2 3.3
995+ 2 1 2.3
996+ 2 3.4
997+ 3 1 2.1
998+ 2 2.9
999+
1000+ Going from long back to wide just takes some creative use of `unstack`
1001+
1002+ >>> w = l.reset_index().set_index(['famid', 'birth', 'age']).unstack()
1003+ >>> w.columns = pd.Index(w.columns).str.join('')
1004+ >>> w.reset_index()
1005+ famid birth ht1 ht2
1006+ 0 1 1 2.8 3.4
1007+ 1 1 2 2.9 3.8
1008+ 2 1 3 2.2 2.9
1009+ 3 2 1 2.0 3.2
1010+ 4 2 2 1.8 2.8
1011+ 5 2 3 1.9 2.4
1012+ 6 3 1 2.2 3.3
1013+ 7 3 2 2.3 3.4
1014+ 8 3 3 2.1 2.9
1015+
1016+ Unwieldy column names are also handled
1017+
1018+ >>> df = pd.DataFrame({'A(quarterly)-2010': np.random.rand(3),
1019+ ... 'A(quarterly)-2011': np.random.rand(3),
1020+ ... 'B(quarterly)-2010': np.random.rand(3),
1021+ ... 'B(quarterly)-2011': np.random.rand(3),
1022+ ... 'X' : np.random.randint(3, size=3)})
1023+ >>> df['id'] = df.index
1024+ >>> df
1025+ A(quarterly)-2010 A(quarterly)-2011 B(quarterly)-2010 B(quarterly)-2011
1026+ 0 0.531828 0.724455 0.322959 0.293714
1027+ 1 0.634401 0.611024 0.361789 0.630976
1028+ 2 0.849432 0.722443 0.228263 0.092105
1029+ \
1030+ X id
1031+ 0 0 0
1032+ 1 1 1
1033+ 2 2 2
1034+ >>> pd.wide_to_long(df, ['A(quarterly)', 'B(quarterly)'],
1035+ ... i='id', j='year', sep='-')
1036+ X A(quarterly) B(quarterly)
1037+ id year
1038+ 0 2010 0 0.531828 0.322959
1039+ 1 2010 1 0.634401 0.361789
1040+ 2 2010 2 0.849432 0.228263
1041+ 0 2011 0 0.724455 0.293714
1042+ 1 2011 1 0.611024 0.630976
1043+ 2 2011 2 0.722443 0.092105
1044+
1045+ If we have many columns, we could also use a regex to find our
1046+ stubnames and pass that list on to wide_to_long
1047+
1048+ >>> stubnames = set([match[0] for match in
1049+ ... df.columns.str.findall('[A-B]\(.*\)').values
1050+ ... if match != [] ])
1051+ >>> list(stubnames)
1052+ ['B(quarterly)', 'A(quarterly)']
1053+
9311054 Notes
9321055 -----
933- All extra variables are treated as extra id variables . This simply uses
1056+ All extra variables are left untouched . This simply uses
9341057 `pandas.melt` under the hood, but is hard-coded to "do the right thing"
9351058 in a typical case.
9361059 """
937-
938- def get_var_names ( df , regex ):
1060+ def get_var_names ( df , stub , sep , suffix ):
1061+ regex = "^{0}{1}{2}" . format ( re . escape ( stub ), re . escape ( sep ), suffix )
9391062 return df .filter (regex = regex ).columns .tolist ()
9401063
941- def melt_stub (df , stub , i , j ):
942- varnames = get_var_names (df , "^" + stub )
943- newdf = melt (df , id_vars = i , value_vars = varnames , value_name = stub ,
944- var_name = j )
945- newdf_j = newdf [j ].str .replace (stub , "" )
946- try :
947- newdf_j = newdf_j .astype (int )
948- except ValueError :
949- pass
950- newdf [j ] = newdf_j
951- return newdf
952-
953- id_vars = get_var_names (df , "^(?!%s)" % "|" .join (stubnames ))
954- if i not in id_vars :
955- id_vars += [i ]
956-
957- newdf = melt_stub (df , stubnames [0 ], id_vars , j )
958-
959- for stub in stubnames [1 :]:
960- new = melt_stub (df , stub , id_vars , j )
961- newdf = newdf .merge (new , how = "outer" , on = id_vars + [j ], copy = False )
962- return newdf .set_index ([i , j ])
1064+ def melt_stub (df , stub , i , j , value_vars , sep ):
1065+ newdf = melt (df , id_vars = i , value_vars = value_vars ,
1066+ value_name = stub .rstrip (sep ), var_name = j )
1067+ newdf [j ] = Categorical (newdf [j ])
1068+ newdf [j ] = newdf [j ].str .replace (re .escape (stub + sep ), "" )
1069+
1070+ return newdf .set_index (i + [j ])
1071+
1072+ if not is_list_like (stubnames ):
1073+ stubnames = [stubnames ]
1074+ else :
1075+ stubnames = list (stubnames )
1076+ # Validate only after normalising: a bare str stub would otherwise be checked per character
1077+ if any (map (lambda s : s in df .columns .tolist (), stubnames )):
1078+ raise ValueError ("stubname can't be identical to a column name" )
1079+
1080+ if not is_list_like (i ):
1081+ i = [i ]
1082+ else :
1083+ i = list (i )
1084+
1085+ value_vars = list (map (lambda stub :
1086+ get_var_names (df , stub , sep , suffix ), stubnames ))
1087+
1088+ value_vars_flattened = [e for sublist in value_vars for e in sublist ]
1089+ id_vars = list (set (df .columns .tolist ()).difference (value_vars_flattened ))
1090+
1091+ melted = []
1092+ for s , v in zip (stubnames , value_vars ):
1093+ melted .append (melt_stub (df , s , i , j , v , sep ))
1094+ melted = melted [0 ].join (melted [1 :], how = 'outer' )
1095+
1096+ if len (i ) == 1 :
1097+ new = df [id_vars ].set_index (i ).join (melted )
1098+ return new
1099+
1100+ new = df [id_vars ].merge (melted .reset_index (), on = i ).set_index (i + [j ])
1101+
1102+ return new
9631103
9641104
9651105def get_dummies (data , prefix = None , prefix_sep = '_' , dummy_na = False ,
0 commit comments