2323from pandas .compat import long , lrange , lmap , lzip
2424from pandas import isnull
2525from pandas .io .common import get_filepath_or_buffer
26-
26+ from pandas . tslib import NaT
2727
2828def read_stata (filepath_or_buffer , convert_dates = True ,
2929 convert_categoricals = True , encoding = None , index = None ):
@@ -48,7 +48,7 @@ def read_stata(filepath_or_buffer, convert_dates=True,
4848
4949 return reader .data (convert_dates , convert_categoricals , index )
5050
# Stata display formats that are treated as dates when reading/writing.
# "%d" is accepted as a daily format alongside "%td".
_date_formats = ["%tc", "%tC", "%td", "%d", "%tw", "%tm", "%tq", "%th", "%ty"]
5252
5353
5454def _stata_elapsed_date_to_datetime (date , fmt ):
@@ -97,6 +97,7 @@ def _stata_elapsed_date_to_datetime(date, fmt):
9797 # numpy types and numpy datetime isn't mature enough / we can't rely on
9898 # pandas version > 0.7.1
9999 #TODO: IIRC relative delta doesn't play well with np.datetime?
100+ #TODO: When pandas supports more than datetime64[ns], this should be improved to use correct range, e.g. datetime[Y] for yearly
100101 if np .isnan (date ):
101102 return np .datetime64 ('nat' )
102103
@@ -109,7 +110,7 @@ def _stata_elapsed_date_to_datetime(date, fmt):
109110 from warnings import warn
110111 warn ("Encountered %tC format. Leaving in Stata Internal Format." )
111112 return date
112- elif fmt in ["%td" , "td" ]:
113+ elif fmt in ["%td" , "td" , "%d" , "d" ]:
113114 return stata_epoch + datetime .timedelta (int (date ))
114115 elif fmt in ["%tw" , "tw" ]: # does not count leap days - 7 days is a week
115116 year = datetime .datetime (stata_epoch .year + date // 52 , 1 , 1 )
@@ -150,6 +151,11 @@ def _datetime_to_stata_elapsed(date, fmt):
150151 if not isinstance (date , datetime .datetime ):
151152 raise ValueError ("date should be datetime.datetime format" )
152153 stata_epoch = datetime .datetime (1960 , 1 , 1 )
154+ # Handle NaTs
155+ if date is NaT :
156+ # Missing value for dates ('.'), assumed always double
157+ # TODO: Should be moved to a constant somewhere, and consolidated
158+ return struct .unpack ('<d' , b'\x00 \x00 \x00 \x00 \x00 \x00 \xe0 \x7f ' )[0 ]
153159 if fmt in ["%tc" , "tc" ]:
154160 delta = date - stata_epoch
155161 return (delta .days * 86400000 + delta .seconds * 1000 +
@@ -175,6 +181,62 @@ def _datetime_to_stata_elapsed(date, fmt):
175181 raise ValueError ("fmt %s not understood" % fmt )
176182
177183
class PossiblePrecisionLoss(Warning):
    """Warning category used when a column conversion may lose precision."""
    pass


# Message template for PossiblePrecisionLoss; filled with the source and
# destination dtype names.
precision_loss_doc = """
Column converted from %s to %s, and some data are outside of the lossless
conversion range. This may result in a loss of precision in the saved data.
"""


def _cast_to_stata_types(data):
    """Checks the dtypes of the columns of a pandas DataFrame for
    compatibility with the data types and ranges supported by Stata, and
    converts if necessary.

    Parameters
    ----------
    data : DataFrame
        The DataFrame to check and convert.  Columns are replaced in place.

    Returns
    -------
    data : DataFrame
        The same DataFrame object, with incompatible columns converted.

    Warns
    -----
    PossiblePrecisionLoss
        If an int64 column has to be stored as float64 and contains values
        whose magnitude exceeds 2 ** 53, i.e. values that float64 cannot be
        guaranteed to represent exactly.

    Notes
    -----
    Numeric columns must be one of int8, int16, int32, float32 or float64, with
    some additional value restrictions on the integer data types.  int8 and
    int16 columns are checked for violations of the value restrictions and
    upcast if needed.  int64 data is not usable in Stata, and so it is
    downcast to int32 whenever the values are in the int32 range, and
    sidecast to float64 when larger than this range.  If the int64 values
    are outside of the range of those perfectly representable as float64
    values, a warning is raised.
    """
    # Every integer with magnitude <= 2 ** 53 fits exactly in the 53-bit
    # significand of a float64; beyond that, neighbouring integers collapse.
    float64_exact_limit = 2 ** 53

    ws = ''
    for col in data:
        dtype = data[col].dtype
        if dtype == np.int8:
            if data[col].max() > 100 or data[col].min() < -127:
                data[col] = data[col].astype(np.int16)
        elif dtype == np.int16:
            if data[col].max() > 32740 or data[col].min() < -32767:
                data[col] = data[col].astype(np.int32)
        elif dtype == np.int64:
            col_max = data[col].max()
            col_min = data[col].min()
            if col_max <= 2147483620 and col_min >= -2147483647:
                data[col] = data[col].astype(np.int32)
            else:
                # Test the original integers, before the cast: once the
                # column is float64 the out-of-range values have already
                # been rounded and can no longer be detected reliably.
                if (col_max > float64_exact_limit or
                        col_min < -float64_exact_limit):
                    ws = precision_loss_doc % ('int64', 'float64')
                data[col] = data[col].astype(np.float64)

    if ws:
        import warnings

        warnings.warn(ws, PossiblePrecisionLoss)

    return data
238+
239+
178240class StataMissingValue (StringMixin ):
179241 """
180242 An observation's missing value.
@@ -193,14 +255,23 @@ class StataMissingValue(StringMixin):
193255 -----
194256 More information: <http://www.stata.com/help.cgi?missing>
195257 """
196-
258+ # TODO: Needs test
197259 def __init__ (self , offset , value ):
198260 self ._value = value
199- if type (value ) is int or type (value ) is long :
200- self ._str = value - offset is 1 and \
201- '.' or ('.' + chr (value - offset + 96 ))
261+ value_type = type (value )
262+ if value_type in int :
263+ loc = value - offset
264+ elif value_type in (float , np .float32 , np .float64 ):
265+ if value <= np .finfo (np .float32 ).max : # float32
266+ conv_str , byte_loc , scale = '<f' , 1 , 8
267+ else :
268+ conv_str , byte_loc , scale = '<d' , 5 , 1
269+ value_bytes = struct .pack (conv_str , value )
270+ loc = (struct .unpack ('<b' , value_bytes [byte_loc ])[0 ] / scale ) + 0
202271 else :
203- self ._str = '.'
272+ # Should never be hit
273+ loc = 0
274+ self ._str = loc is 0 and '.' or ('.' + chr (loc + 96 ))
204275 string = property (lambda self : self ._str ,
205276 doc = "The Stata representation of the missing value: "
206277 "'.', '.a'..'.z'" )
@@ -240,9 +311,9 @@ def __init__(self, encoding):
240311 dict (
241312 lzip (range (1 , 245 ), ['a' + str (i ) for i in range (1 , 245 )]) +
242313 [
243- (251 , np .int16 ),
244- (252 , np .int32 ),
245- (253 , np .int64 ),
314+ (251 , np .int8 ),
315+ (252 , np .int16 ),
316+ (253 , np .int32 ),
246317 (254 , np .float32 ),
247318 (255 , np .float64 )
248319 ]
@@ -253,9 +324,9 @@ def __init__(self, encoding):
253324 (32768 , np .string_ ),
254325 (65526 , np .float64 ),
255326 (65527 , np .float32 ),
256- (65528 , np .int64 ),
257- (65529 , np .int32 ),
258- (65530 , np .int16 )
327+ (65528 , np .int32 ),
328+ (65529 , np .int16 ),
329+ (65530 , np .int8 )
259330 ]
260331 )
261332 self .TYPE_MAP = lrange (251 ) + list ('bhlfd' )
@@ -272,13 +343,19 @@ def __init__(self, encoding):
272343 #NOTE: technically, some of these are wrong. there are more numbers
273344 # that can be represented. it's the 27 ABOVE and BELOW the max listed
274345 # numeric data type in [U] 12.2.2 of the 11.2 manual
275- self .MISSING_VALUES = \
346+ float32_min = b'\xff \xff \xff \xfe '
347+ float32_max = b'\xff \xff \xff \x7e '
348+ float64_min = b'\xff \xff \xff \xff \xff \xff \xef \xff '
349+ float64_max = b'\xff \xff \xff \xff \xff \xff \xdf \x7f '
350+ self .VALID_RANGE = \
276351 {
277352 'b' : (- 127 , 100 ),
278353 'h' : (- 32767 , 32740 ),
279354 'l' : (- 2147483647 , 2147483620 ),
280- 'f' : (- 1.701e+38 , + 1.701e+38 ),
281- 'd' : (- 1.798e+308 , + 8.988e+307 )
355+ 'f' : (np .float32 (struct .unpack ('<f' , float32_min )[0 ]),
356+ np .float32 (struct .unpack ('<f' , float32_max )[0 ])),
357+ 'd' : (np .float64 (struct .unpack ('<d' , float64_min )[0 ]),
358+ np .float64 (struct .unpack ('<d' , float64_max )[0 ]))
282359 }
283360
284361 self .OLD_TYPE_MAPPING = \
@@ -287,6 +364,16 @@ def __init__(self, encoding):
287364 'f' : 254 ,
288365 'b' : 251
289366 }
367+ # These missing values are the generic '.' in Stata, and are used
368+ # to replace nans
369+ self .MISSING_VALUES = \
370+ {
371+ 'b' : 101 ,
372+ 'h' : 32741 ,
373+ 'l' : 2147483621 ,
374+ 'f' : np .float32 (struct .unpack ('<f' , b'\x00 \x00 \x00 \x7f ' )[0 ]),
375+ 'd' : np .float64 (struct .unpack ('<d' , b'\x00 \x00 \x00 \x00 \x00 \x00 \xe0 \x7f ' )[0 ])
376+ }
290377
291378 def _decode_bytes (self , str , errors = None ):
292379 if compat .PY3 or self ._encoding is not None :
@@ -556,8 +643,8 @@ def _col_size(self, k=None):
556643
557644 def _unpack (self , fmt , byt ):
558645 d = struct .unpack (self .byteorder + fmt , byt )[0 ]
559- if fmt [- 1 ] in self .MISSING_VALUES :
560- nmin , nmax = self .MISSING_VALUES [fmt [- 1 ]]
646+ if fmt [- 1 ] in self .VALID_RANGE :
647+ nmin , nmax = self .VALID_RANGE [fmt [- 1 ]]
561648 if d < nmin or d > nmax :
562649 if self ._missing_values :
563650 return StataMissingValue (nmax , d )
@@ -855,11 +942,12 @@ def _dtype_to_stata_type(dtype):
855942 See TYPE_MAP and comments for an explanation. This is also explained in
856943 the dta spec.
857944 1 - 244 are strings of this length
858- 251 - chr(251) - for int8 and int16, byte
859- 252 - chr(252) - for int32, int
860- 253 - chr(253) - for int64, long
861- 254 - chr(254) - for float32, float
862- 255 - chr(255) - double, double
945+ Pandas Stata
946+ 251 - chr(251) - for int8 byte
947+ 252 - chr(252) - for int16 int
948+ 253 - chr(253) - for int32 long
949+ 254 - chr(254) - for float32 float
950+ 255 - chr(255) - for double double
863951
864952 If there are dates to convert, then dtype will already have the correct
865953 type inserted.
@@ -878,8 +966,10 @@ def _dtype_to_stata_type(dtype):
878966 elif dtype == np .int64 :
879967 return chr (253 )
880968 elif dtype == np .int32 :
969+ return chr (253 )
970+ elif dtype == np .int16 :
881971 return chr (252 )
882- elif dtype == np .int8 or dtype == np . int16 :
972+ elif dtype == np .int8 :
883973 return chr (251 )
884974 else : # pragma : no cover
885975 raise ValueError ("Data type %s not currently understood. "
@@ -970,7 +1060,7 @@ def __init__(self, fname, data, convert_dates=None, write_index=True,
9701060 self ._file = _open_file_binary_write (
9711061 fname , self ._encoding or self ._default_encoding
9721062 )
973- self .type_converters = {253 : np .long , 252 : int }
1063+ self .type_converters = {253 : np .int32 , 252 : np . int16 , 251 : np . int8 }
9741064
9751065 def _write (self , to_write ):
9761066 """
@@ -990,11 +1080,14 @@ def __init__(self, data):
9901080 self .data = data
9911081
9921082 def __iter__ (self ):
993- for i , row in data .iterrows ():
994- yield row
1083+ for row in data .itertuples ():
1084+ # First element is index, so remove
1085+ yield row [1 :]
9951086
9961087 if self ._write_index :
9971088 data = data .reset_index ()
1089+ # Check columns for compatibility with Stata
1090+ data = _cast_to_stata_types (data )
9981091 self .datarows = DataFrameRowIter (data )
9991092 self .nobs , self .nvar = data .shape
10001093 self .data = data
@@ -1181,7 +1274,7 @@ def _write_data_dates(self):
11811274 self ._write (var )
11821275 else :
11831276 if isnull (var ): # this only matters for floats
1184- var = MISSING_VALUES [typ ]
1277+ var = MISSING_VALUES [TYPE_MAP [ typ ] ]
11851278 self ._file .write (struct .pack (byteorder + TYPE_MAP [typ ], var ))
11861279
11871280 def _null_terminate (self , s , as_string = False ):
0 commit comments