+# cython: profile=False
+# cython: boundscheck=False, initializedcheck=False
+
 import numpy as np
 cimport numpy as np
 from numpy cimport uint8_t, uint16_t, int8_t, int64_t
@@ -10,19 +13,19 @@ import sas_constants as const
 cdef np.ndarray[uint8_t, ndim=1] rle_decompress(int result_length, np.ndarray[uint8_t, ndim=1] inbuff):
 
     cdef:
-        uint8_t control_byte, x, end_of_first_byte
+        uint8_t control_byte, x
         uint8_t [:] result = np.zeros(result_length, np.uint8)
-        int rpos = 0, ipos = 0, i, nbytes, length = len(inbuff)
+        int rpos = 0, ipos = 0, i, nbytes, end_of_first_byte, length = len(inbuff)
 
     while ipos < length:
         control_byte = inbuff[ipos] & 0xF0
-        end_of_first_byte = int(inbuff[ipos] & 0x0F)
+        end_of_first_byte = <int>(inbuff[ipos] & 0x0F)
         ipos += 1
 
         if control_byte == 0x00:
             if end_of_first_byte != 0:
-                print("Unexpected non-zero end_of_first_byte")
-            nbytes = int(inbuff[ipos]) + 64
+                raise ValueError("Unexpected non-zero end_of_first_byte")
+            nbytes = <int>(inbuff[ipos]) + 64
             ipos += 1
             for i in range(nbytes):
                 result[rpos] = inbuff[ipos]
@@ -31,20 +34,20 @@ cdef np.ndarray[uint8_t, ndim=1] rle_decompress(int result_length, np.ndarray[ui
         elif control_byte == 0x40:
             # not documented
             nbytes = end_of_first_byte * 16
-            nbytes += int(inbuff[ipos])
+            nbytes += <int>(inbuff[ipos])
             ipos += 1
             for i in range(nbytes):
                 result[rpos] = inbuff[ipos]
                 rpos += 1
             ipos += 1
         elif control_byte == 0x60:
-            nbytes = end_of_first_byte*256 + int(inbuff[ipos]) + 17
+            nbytes = end_of_first_byte*256 + <int>(inbuff[ipos]) + 17
             ipos += 1
             for i in range(nbytes):
                 result[rpos] = 0x20
                 rpos += 1
         elif control_byte == 0x70:
-            nbytes = end_of_first_byte*256 + int(inbuff[ipos]) + 17
+            nbytes = end_of_first_byte*256 + <int>(inbuff[ipos]) + 17
             ipos += 1
             for i in range(nbytes):
                 result[rpos] = 0x00
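Aside: the control-byte dispatch above is easier to follow in plain Python. The sketch below is illustrative only. It mirrors just the 0x00 (literal copy), 0x60 (blank fill) and 0x70 (null fill) branches visible in this hunk, the function name is made up, and the real rle_decompress handles the remaining control bytes and writes into a preallocated buffer.

```python
# Pure-Python sketch of the branches shown above, for illustration only.
def rle_decompress_sketch(inbuff):
    result = bytearray()
    ipos = 0
    while ipos < len(inbuff):
        control_byte = inbuff[ipos] & 0xF0       # high nibble selects the command
        end_of_first_byte = inbuff[ipos] & 0x0F  # low nibble extends the count
        ipos += 1
        if control_byte == 0x00:
            # copy the next (inbuff[ipos] + 64) bytes verbatim
            nbytes = inbuff[ipos] + 64
            ipos += 1
            result += inbuff[ipos:ipos + nbytes]
            ipos += nbytes
        elif control_byte == 0x60:
            # emit a run of blanks (0x20)
            nbytes = end_of_first_byte * 256 + inbuff[ipos] + 17
            ipos += 1
            result += b"\x20" * nbytes
        elif control_byte == 0x70:
            # emit a run of null bytes (0x00)
            nbytes = end_of_first_byte * 256 + inbuff[ipos] + 17
            ipos += 1
            result += b"\x00" * nbytes
        else:
            raise ValueError("control byte not covered by this sketch")
    return bytes(result)

assert rle_decompress_sketch(bytes([0x60, 0x01])) == b" " * 18
```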
@@ -99,7 +102,7 @@ cdef np.ndarray[uint8_t, ndim=1] rle_decompress(int result_length, np.ndarray[ui
             raise ValueError("unknown control byte: %v", control_byte)
 
     if len(result) != result_length:
-        print("RLE: %v != %v\n", (len(result), result_length))
+        raise ValueError("RLE: %v != %v", (len(result), result_length))
 
     return np.asarray(result)
 
@@ -162,7 +165,7 @@ cdef np.ndarray[uint8_t, ndim=1] rdc_decompress(int result_length, np.ndarray[ui
             ipos += 1
             cnt += 16
             for k in range(cnt):
-                outbuff[rpos + k] = outbuff[rpos - int(ofs) + k]
+                outbuff[rpos + k] = outbuff[rpos - <int>ofs + k]
             rpos += cnt
 
         # short pattern
@@ -171,7 +174,7 @@ cdef np.ndarray[uint8_t, ndim=1] rdc_decompress(int result_length, np.ndarray[ui
             ofs += <uint16_t>inbuff[ipos] << 4
             ipos += 1
             for k in range(cmd):
-                outbuff[rpos + k] = outbuff[rpos - int(ofs) + k]
+                outbuff[rpos + k] = outbuff[rpos - <int>ofs + k]
             rpos += cmd
 
         else:
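Aside: both pattern branches copy byte-by-byte from an earlier position in the output rather than taking a slice. A minimal sketch (not part of the patch) shows why: when the back-reference offset is smaller than the copy length, source and destination overlap, so the copy must re-read bytes it has just written, which repeats the pattern.

```python
# Illustrative sketch of the overlapping back-reference copy used by rdc_decompress.
def copy_back_reference(outbuff, rpos, ofs, cnt):
    for k in range(cnt):
        outbuff[rpos + k] = outbuff[rpos - ofs + k]
    return rpos + cnt

buf = bytearray(b"AB" + b"\x00" * 6)
copy_back_reference(buf, 2, 2, 6)   # offset 2, length 6: the two-byte pattern repeats
assert bytes(buf) == b"ABABABAB"
```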
@@ -182,6 +185,17 @@ cdef np.ndarray[uint8_t, ndim=1] rdc_decompress(int result_length, np.ndarray[ui
 
     return np.asarray(outbuff)
 
+cdef enum ColumnTypes:
+    column_type_decimal = 1
+    column_type_string = 2
+
+
+# type the page_data types
+cdef int page_meta_type = const.page_meta_type
+cdef int page_mix_types_0 = const.page_mix_types[0]
+cdef int page_mix_types_1 = const.page_mix_types[1]
+cdef int page_data_type = const.page_data_type
+cdef int subheader_pointers_offset = const.subheader_pointers_offset
 
 cdef class Parser(object):
 
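Aside: the new ColumnTypes enum and module-level cdef int constants read values such as const.page_meta_type once at import time, so the per-row comparisons in readline become plain C integer compares instead of Python attribute lookups. The same hoisting idea in pure Python (with a made-up const stand-in) looks like this:

```python
# Illustrative only: hoist the module-attribute lookup out of the hot loop,
# the pure-Python analogue of the cdef int constants added above.
class _Const:                      # stand-in for the sas_constants module
    page_meta_type = 0

const = _Const()

def count_meta_pages(page_types):
    page_meta_type = const.page_meta_type   # looked up once, not once per page
    return sum(1 for t in page_types if t == page_meta_type)

print(count_meta_pages([0, 1, 0, 2]))  # 2
```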
@@ -194,11 +208,16 @@ cdef class Parser(object):
         object[:, :] string_chunk
         char *cached_page
         int current_row_on_page_index
+        int current_page_block_count
+        int current_page_data_subheader_pointers_len
+        int current_page_subheaders_count
         int current_row_in_chunk_index
         int current_row_in_file_index
+        int header_length
         int row_length
         int bit_offset
         int subheader_pointer_length
+        int current_page_type
         bint is_little_endian
         np.ndarray[uint8_t, ndim=1] (*decompress)(int result_length, np.ndarray[uint8_t, ndim=1] inbuff)
         object parser
@@ -208,30 +227,30 @@ cdef class Parser(object):
             int j
             char[:] column_types
 
-        self.current_row_on_page_index = parser._current_row_on_page_index
-        self.current_row_in_chunk_index = parser._current_row_in_chunk_index
-        self.current_row_in_file_index = parser._current_row_in_file_index
         self.parser = parser
+        self.header_length = self.parser.header_length
         self.column_count = parser.column_count
         self.lengths = parser._column_data_lengths
         self.offsets = parser._column_data_offsets
         self.byte_chunk = parser._byte_chunk
         self.string_chunk = parser._string_chunk
         self.row_length = parser.row_length
-        self.cached_page = <char *>parser._cached_page
         self.bit_offset = self.parser._page_bit_offset
         self.subheader_pointer_length = self.parser._subheader_pointer_length
         self.is_little_endian = parser.byte_order == "<"
         self.column_types = np.empty(self.column_count, dtype='int64')
 
+        # page indicators
+        self.update_next_page()
+
         column_types = parser.column_types
 
         # map column types
         for j in range(self.column_count):
             if column_types[j] == b'd':
-                self.column_types[j] = 1
+                self.column_types[j] = column_type_decimal
             elif column_types[j] == b's':
-                self.column_types[j] = 2
+                self.column_types[j] = column_type_string
             else:
                 raise ValueError("unknown column type: %s" % self.parser.columns[j].ctype)
 
@@ -243,6 +262,11 @@ cdef class Parser(object):
         else:
             self.decompress = NULL
 
+        # update to current state of the parser
+        self.current_row_in_chunk_index = parser._current_row_in_chunk_index
+        self.current_row_in_file_index = parser._current_row_in_file_index
+        self.current_row_on_page_index = parser._current_row_on_page_index
+
     def read(self, int nrows):
         cdef:
             bint done
@@ -265,31 +289,39 @@ cdef class Parser(object):
         if done:
             self.cached_page = NULL
         else:
-            self.cached_page = <char *>self.parser._cached_page
-            self.current_row_on_page_index = 0
+            self.update_next_page()
         return done
 
+    cdef update_next_page(self):
+        # update data for the current page
+
+        self.cached_page = <char *>self.parser._cached_page
+        self.current_row_on_page_index = 0
+        self.current_page_type = self.parser._current_page_type
+        self.current_page_block_count = self.parser._current_page_block_count
+        self.current_page_data_subheader_pointers_len = len(self.parser._current_page_data_subheader_pointers)
+        self.current_page_subheaders_count = self.parser._current_page_subheaders_count
+
     cdef bint readline(self):
 
         cdef:
-            int offset, bit_offset, align_correction, subheader_pointer_length
+            int offset, bit_offset, align_correction, subheader_pointer_length, mn
             bint done, flag
 
         bit_offset = self.bit_offset
         subheader_pointer_length = self.subheader_pointer_length
 
         # If there is no page, go to the end of the header and read a page.
         if self.cached_page == NULL:
-            self.parser._path_or_buf.seek(self.parser.header_length)
+            self.parser._path_or_buf.seek(self.header_length)
             done = self.read_next_page()
             if done:
                 return True
 
         # Loop until a data row is read
         while True:
-            if self.parser._current_page_type == const.page_meta_type:
-                flag = (self.current_row_on_page_index >=
-                        len(self.parser._current_page_data_subheader_pointers))
+            if self.current_page_type == page_meta_type:
+                flag = self.current_row_on_page_index >= self.current_page_data_subheader_pointers_len
                 if flag:
                     done = self.read_next_page()
                     if done:
@@ -301,14 +333,14 @@ cdef class Parser(object):
                 self.process_byte_array_with_data(current_subheader_pointer.offset,
                                                   current_subheader_pointer.length)
                 return False
-            elif self.parser._current_page_type in const.page_mix_types:
-                align_correction = (bit_offset + const.subheader_pointers_offset +
-                                    self.parser._current_page_subheaders_count *
+            elif self.current_page_type == page_mix_types_0 or self.current_page_type == page_mix_types_1:
+                align_correction = (bit_offset + subheader_pointers_offset +
+                                    self.current_page_subheaders_count *
                                     subheader_pointer_length)
                 align_correction = align_correction % 8
                 offset = bit_offset + align_correction
-                offset += const.subheader_pointers_offset
-                offset += (self.parser._current_page_subheaders_count *
+                offset += subheader_pointers_offset
+                offset += (self.current_page_subheaders_count *
                            subheader_pointer_length)
                 offset += self.current_row_on_page_index * self.row_length
                 self.process_byte_array_with_data(offset,
@@ -319,27 +351,29 @@ cdef class Parser(object):
                     if done:
                         return True
                 return False
-            elif self.parser._current_page_type == const.page_data_type:
+            elif self.current_page_type == page_data_type:
                 self.process_byte_array_with_data(bit_offset +
-                                                  const.subheader_pointers_offset +
+                                                  subheader_pointers_offset +
                                                   self.current_row_on_page_index *
                                                   self.row_length,
                                                   self.row_length)
                 flag = (self.current_row_on_page_index ==
-                        self.parser._current_page_block_count)
+                        self.current_page_block_count)
                 if flag:
                     done = self.read_next_page()
                     if done:
                         return True
                 return False
             else:
                 raise ValueError("unknown page type: %s",
-                                 self.parser._current_page_type)
+                                 self.current_page_type)
 
     cdef void process_byte_array_with_data(self, int offset, int length):
 
         cdef:
-            int s, j, k, m, jb, js, lngt, start
+            Py_ssize_t j
+            int s, k, m, jb, js, current_row
+            int64_t lngt, start, ct
             np.ndarray[uint8_t, ndim=1] source
             int64_t[:] column_types
             int64_t[:] lengths
@@ -352,6 +386,7 @@ cdef class Parser(object):
         if self.decompress != NULL and (length < self.row_length):
             source = self.decompress(self.row_length, source)
 
+        current_row = self.current_row_in_chunk_index
         column_types = self.column_types
         lengths = self.lengths
         offsets = self.offsets
@@ -365,7 +400,8 @@ cdef class Parser(object):
             if lngt == 0:
                 break
             start = offsets[j]
-            if column_types[j] == 1:
+            ct = column_types[j]
+            if ct == column_type_decimal:
                 # decimal
                 if self.is_little_endian:
                     m = s + 8 - lngt
@@ -374,9 +410,9 @@ cdef class Parser(object):
                 for k in range(lngt):
                     byte_chunk[jb, m + k] = source[start + k]
                 jb += 1
-            elif column_types[j] == 2:
+            elif column_types[j] == column_type_string:
                 # string
-                string_chunk[js, self.current_row_in_chunk_index] = source[start:(start+lngt)].tostring().rstrip()
+                string_chunk[js, current_row] = source[start:(start+lngt)].tostring().rstrip()
                 js += 1
 
         self.current_row_on_page_index += 1
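Aside: for mix-type pages, readline locates row i by skipping the page header, the subheader pointer array and an 8-byte alignment correction, then stepping by row_length. A worked example of that arithmetic, with hypothetical layout values (every number below is made up, not taken from a real SAS file):

```python
# Hypothetical page layout values, for illustration only.
bit_offset = 16                  # self.bit_offset
subheader_pointers_offset = 8    # const.subheader_pointers_offset
subheader_pointer_length = 12    # self.subheader_pointer_length
subheaders_count = 5             # self.current_page_subheaders_count
row_length = 100                 # self.row_length
row_index = 3                    # self.current_row_on_page_index

# Alignment correction applied before the data rows, as in readline.
align_correction = (bit_offset + subheader_pointers_offset +
                    subheaders_count * subheader_pointer_length) % 8

offset = (bit_offset + align_correction + subheader_pointers_offset +
          subheaders_count * subheader_pointer_length +
          row_index * row_length)

print(offset)  # 16 + 4 + 8 + 60 + 300 = 388
```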