1313
1414import sys
1515import struct
16+ from dateutil .relativedelta import relativedelta
1617from pandas .core .base import StringMixin
1718from pandas .core .frame import DataFrame
1819from pandas .core .series import Series
1920from pandas .core .categorical import Categorical
2021import datetime
21- from pandas import compat
22+ from pandas import compat , to_timedelta , to_datetime
2223from pandas .compat import lrange , lmap , lzip , text_type , string_types , range , \
2324 zip
24- from pandas import isnull
2525from pandas .io .common import get_filepath_or_buffer
2626from pandas .lib import max_len_string_array , is_string_array
27- from pandas .tslib import NaT
27+ from pandas .tslib import NaT , Timestamp
2828
2929def read_stata (filepath_or_buffer , convert_dates = True ,
3030 convert_categoricals = True , encoding = None , index = None ,
@@ -62,6 +62,7 @@ def read_stata(filepath_or_buffer, convert_dates=True,
6262_date_formats = ["%tc" , "%tC" , "%td" , "%d" , "%tw" , "%tm" , "%tq" , "%th" , "%ty" ]
6363
6464
65+ stata_epoch = datetime .datetime (1960 , 1 , 1 )
6566def _stata_elapsed_date_to_datetime (date , fmt ):
6667 """
6768 Convert from SIF to datetime. http://www.stata.com/help.cgi?datetime
@@ -111,9 +112,7 @@ def _stata_elapsed_date_to_datetime(date, fmt):
111112 #TODO: When pandas supports more than datetime64[ns], this should be improved to use correct range, e.g. datetime[Y] for yearly
112113 if np .isnan (date ):
113114 return NaT
114-
115115 date = int (date )
116- stata_epoch = datetime .datetime (1960 , 1 , 1 )
117116 if fmt in ["%tc" , "tc" ]:
118117 from dateutil .relativedelta import relativedelta
119118 return stata_epoch + relativedelta (microseconds = date * 1000 )
@@ -148,6 +147,158 @@ def _stata_elapsed_date_to_datetime(date, fmt):
148147 raise ValueError ("Date fmt %s not understood" % fmt )
149148
150149
def _stata_elapsed_date_to_datetime_vec(dates, fmt):
    """
    Convert from SIF to datetime. http://www.stata.com/help.cgi?datetime

    Vectorized counterpart of ``_stata_elapsed_date_to_datetime``: converts
    a whole column at once and only falls back to element-wise
    ``datetime.datetime`` arithmetic when the values cannot be represented
    by pandas' nanosecond-resolution timestamps.

    Parameters
    ----------
    dates : array-like
        The Stata Internal Format dates to convert to datetime according to
        fmt.  The input is coerced to a Series and is never modified; NaN
        entries are treated as missing.
    fmt : str
        The format to convert to. Can be, tc, td, tw, tm, tq, th, ty

    Returns
    -------
    converted : Series
        The converted dates, carrying the same index as ``dates`` (or a
        default index when ``dates`` has none).  Missing inputs become NaT
        (NaN for the unconverted ``tC`` format).

    Raises
    ------
    ValueError
        If ``fmt`` is not one of the understood formats.

    Examples
    --------
    >>> _stata_elapsed_date_to_datetime_vec(Series([52.0]), "%tw")
    0   1961-01-01
    dtype: datetime64[ns]

    Notes
    -----
    datetime/c - tc
        milliseconds since 01jan1960 00:00:00.000, assuming 86,400 s/day
    datetime/C - tC - NOT IMPLEMENTED
        milliseconds since 01jan1960 00:00:00.000, adjusted for leap seconds
    date - td
        days since 01jan1960 (01jan1960 = 0)
    weekly date - tw
        weeks since 1960w1
        This assumes 52 weeks in a year, then adds 7 * remainder of the weeks.
        The datetime value is the start of the week in terms of days in the
        year, not ISO calendar weeks.
    monthly date - tm
        months since 1960m1
    quarterly date - tq
        quarters since 1960q1
    half-yearly date - th
        half-years since 1960h1
    yearly date - ty
        years since 0000

    If you don't have pandas with datetime support, then you can't do
    milliseconds accurately.
    """
    # Bounds of what a pandas Timestamp can represent, expressed in the
    # units the individual formats work in.
    MIN_YEAR, MAX_YEAR = Timestamp.min.year, Timestamp.max.year
    MAX_DAY_DELTA = (Timestamp.max - datetime.datetime(1960, 1, 1)).days
    MIN_DAY_DELTA = (Timestamp.min - datetime.datetime(1960, 1, 1)).days
    MIN_MS_DELTA = MIN_DAY_DELTA * 24 * 3600 * 1000
    MAX_MS_DELTA = MAX_DAY_DELTA * 24 * 3600 * 1000

    def convert_year_month_safe(year, month):
        """
        Convert year and month to datetimes, using pandas vectorized versions
        when the date range falls within the range supported by pandas.
        Otherwise it falls back to a slower but more robust method using
        datetime.
        """
        if year.max() < MAX_YEAR and year.min() > MIN_YEAR:
            return to_datetime(100 * year + month, format='%Y%m')
        values = [datetime.datetime(y, m, 1) for y, m in zip(year, month)]
        # Keep the caller's index so the result lines up with the input.
        return Series(values, index=year.index)

    def convert_year_days_safe(year, days):
        """
        Converts year (e.g. 1999) and days since the start of the year to a
        datetime or datetime64 Series
        """
        if year.max() < (MAX_YEAR - 1) and year.min() > MIN_YEAR:
            return to_datetime(year, format='%Y') + to_timedelta(days, unit='d')
        values = [datetime.datetime(y, 1, 1) + relativedelta(days=int(d))
                  for y, d in zip(year, days)]
        return Series(values, index=year.index)

    def convert_delta_safe(base, deltas, unit):
        """
        Convert base dates and deltas to datetimes, using pandas vectorized
        versions if the deltas satisfy restrictions required to be expressed
        as dates in pandas.
        """
        if unit == 'd':
            if deltas.max() > MAX_DAY_DELTA or deltas.min() < MIN_DAY_DELTA:
                values = [base + relativedelta(days=int(d)) for d in deltas]
                return Series(values, index=deltas.index)
        elif unit == 'ms':
            if deltas.max() > MAX_MS_DELTA or deltas.min() < MIN_MS_DELTA:
                values = [base + relativedelta(microseconds=(int(d) * 1000))
                          for d in deltas]
                return Series(values, index=deltas.index)
        else:
            raise ValueError('format not understood')

        base = to_datetime(base)
        deltas = to_timedelta(deltas, unit=unit)
        return base + deltas

    # TODO: If/when pandas supports more than datetime64[ns], this should be
    # improved to use correct range, e.g. datetime[Y] for yearly
    dates = Series(dates)
    # A plain boolean ndarray masks positionally, so it stays valid whatever
    # index the intermediate results carry.
    bad_locs = np.isnan(dates.values)
    has_bad_values = bad_locs.any()
    if has_bad_values:
        # Work on a private copy: the placeholder must never leak into the
        # caller's data.  1.0 is an arbitrary value that survives the int64
        # cast; those positions are overwritten with NaT/NaN before returning.
        dates = dates.copy()
        dates[bad_locs] = 1.0
    dates = dates.astype(np.int64)

    if fmt in ["%tc", "tc"]:  # Delta ms relative to base
        conv_dates = convert_delta_safe(stata_epoch, dates, 'ms')
    elif fmt in ["%tC", "tC"]:
        from warnings import warn

        warn("Encountered %tC format. Leaving in Stata Internal Format.")
        conv_dates = Series(dates, dtype=object)
        if has_bad_values:
            conv_dates[bad_locs] = np.nan
        return conv_dates
    elif fmt in ["%td", "td", "%d", "d"]:  # Delta days relative to base
        conv_dates = convert_delta_safe(stata_epoch, dates, 'd')
    elif fmt in ["%tw", "tw"]:  # does not count leap days - 7 days is a week
        year = stata_epoch.year + dates // 52
        days = (dates % 52) * 7
        conv_dates = convert_year_days_safe(year, days)
    elif fmt in ["%tm", "tm"]:  # Delta months relative to base
        year = stata_epoch.year + dates // 12
        month = (dates % 12) + 1
        conv_dates = convert_year_month_safe(year, month)
    elif fmt in ["%tq", "tq"]:  # Delta quarters relative to base
        year = stata_epoch.year + dates // 4
        month = (dates % 4) * 3 + 1
        conv_dates = convert_year_month_safe(year, month)
    elif fmt in ["%th", "th"]:  # Delta half-years relative to base
        year = stata_epoch.year + dates // 2
        month = (dates % 2) * 6 + 1
        conv_dates = convert_year_month_safe(year, month)
    elif fmt in ["%ty", "ty"]:  # Years -- not delta
        # TODO: Check about negative years, here, and raise or warn if needed
        year = dates
        month = np.ones_like(dates)
        conv_dates = convert_year_month_safe(year, month)
    else:
        raise ValueError("Date fmt %s not understood" % fmt)

    if has_bad_values:  # Restore NaT for bad values
        conv_dates[bad_locs] = NaT
    return conv_dates
301+
151302def _datetime_to_stata_elapsed (date , fmt ):
152303 """
153304 Convert from datetime to SIF. http://www.stata.com/help.cgi?datetime
@@ -477,6 +628,14 @@ def __init__(self, encoding):
477628 'f' : np .float32 (struct .unpack ('<f' , b'\x00 \x00 \x00 \x7f ' )[0 ]),
478629 'd' : np .float64 (struct .unpack ('<d' , b'\x00 \x00 \x00 \x00 \x00 \x00 \xe0 \x7f ' )[0 ])
479630 }
631+ self .NUMPY_TYPE_MAP = \
632+ {
633+ 'b' : 'i1' ,
634+ 'h' : 'i2' ,
635+ 'l' : 'i4' ,
636+ 'f' : 'f4' ,
637+ 'd' : 'f8'
638+ }
480639
481640 # Reserved words cannot be used as variable names
482641 self .RESERVED_WORDS = ('aggregate' , 'array' , 'boolean' , 'break' ,
@@ -759,15 +918,6 @@ def _calcsize(self, fmt):
759918 return (type (fmt ) is int and fmt
760919 or struct .calcsize (self .byteorder + fmt ))
761920
762- def _col_size (self , k = None ):
763- if k is None :
764- return self .col_sizes
765- else :
766- return self .col_sizes [k ]
767-
768- def _unpack (self , fmt , byt ):
769- return struct .unpack (self .byteorder + fmt , byt )[0 ]
770-
771921 def _null_terminate (self , s ):
772922 if compat .PY3 or self ._encoding is not None : # have bytes not strings,
773923 # so must decode
@@ -784,55 +934,6 @@ def _null_terminate(self, s):
784934 except :
785935 return s
786936
787- def _next (self ):
788- typlist = self .typlist
789- if self .has_string_data :
790- data = [None ] * self .nvar
791- for i in range (len (data )):
792- if type (typlist [i ]) is int :
793- data [i ] = self ._null_terminate (
794- self .path_or_buf .read (typlist [i ])
795- )
796- else :
797- data [i ] = self ._unpack (
798- typlist [i ], self .path_or_buf .read (self ._col_size (i ))
799- )
800- return data
801- else :
802- return lmap (
803- lambda i : self ._unpack (typlist [i ],
804- self .path_or_buf .read (
805- self ._col_size (i )
806- )),
807- range (self .nvar )
808- )
809-
810-
811- def _dataset (self ):
812- """
813- Returns a Python generator object for iterating over the dataset.
814-
815-
816- Parameters
817- ----------
818-
819- Returns
820- -------
821- Generator object for iterating over the dataset. Yields each row of
822- observations as a list by default.
823-
824- Notes
825- -----
826- If missing_values is True during instantiation of StataReader then
827- observations with _StataMissingValue(s) are not filtered and should
828- be handled by your applcation.
829- """
830-
831- self .path_or_buf .seek (self .data_location )
832-
833- for i in range (self .nobs ):
834- yield self ._next ()
835-
836937 def _read_value_labels (self ):
837938 if self .format_version >= 117 :
838939 self .path_or_buf .seek (self .seek_value_labels )
@@ -932,27 +1033,32 @@ def data(self, convert_dates=True, convert_categoricals=True, index=None,
9321033 if self .format_version >= 117 :
9331034 self ._read_strls ()
9341035
935- stata_dta = self ._dataset ()
936-
937- data = []
938- for rownum , line in enumerate (stata_dta ):
939- # doesn't handle missing value objects, just casts
940- # None will only work without missing value object.
941- for i , val in enumerate (line ):
942- #NOTE: This will only be scalar types because missing strings
943- # are empty not None in Stata
944- if val is None :
945- line [i ] = np .nan
946- data .append (tuple (line ))
1036+ # Read data
1037+ count = self .nobs
1038+ dtype = [] # Convert struct data types to numpy data type
1039+ for i , typ in enumerate (self .typlist ):
1040+ if typ in self .NUMPY_TYPE_MAP :
1041+ dtype .append (('s' + str (i ), self .NUMPY_TYPE_MAP [typ ]))
1042+ else :
1043+ dtype .append (('s' + str (i ), 'S' + str (typ )))
1044+ dtype = np .dtype (dtype )
1045+ read_len = count * dtype .itemsize
1046+ self .path_or_buf .seek (self .data_location )
1047+ data = np .frombuffer (self .path_or_buf .read (read_len ),dtype = dtype ,count = count )
1048+ self ._data_read = True
9471049
9481050 if convert_categoricals :
9491051 self ._read_value_labels ()
9501052
951- # TODO: Refactor to use a dictionary constructor and the correct dtype from the start?
9521053 if len (data )== 0 :
9531054 data = DataFrame (columns = self .varlist , index = index )
9541055 else :
955- data = DataFrame (data , columns = self .varlist , index = index )
1056+ data = DataFrame .from_records (data , index = index )
1057+ data .columns = self .varlist
1058+
1059+ for col , typ in zip (data , self .typlist ):
1060+ if type (typ ) is int :
1061+ data [col ] = data [col ].apply (self ._null_terminate , convert_dtype = True ,)
9561062
9571063 cols_ = np .where (self .dtyplist )[0 ]
9581064
@@ -1010,8 +1116,7 @@ def data(self, convert_dates=True, convert_categoricals=True, index=None,
10101116 self .fmtlist ))[0 ]
10111117 for i in cols :
10121118 col = data .columns [i ]
1013- data [col ] = data [col ].apply (_stata_elapsed_date_to_datetime ,
1014- args = (self .fmtlist [i ],))
1119+ data [col ] = _stata_elapsed_date_to_datetime_vec (data [col ], self .fmtlist [i ])
10151120
10161121 if convert_categoricals :
10171122 cols = np .where (
0 commit comments