@@ -20,13 +20,19 @@ from libc.string cimport (
2020import cython
2121from cython import Py_ssize_t
2222
23- from cpython.bytes cimport PyBytes_AsString
23+ from cpython.bytes cimport (
24+ PyBytes_AsString,
25+ PyBytes_FromString,
26+ )
2427from cpython.exc cimport (
2528 PyErr_Fetch,
2629 PyErr_Occurred,
2730)
2831from cpython.object cimport PyObject
29- from cpython.ref cimport Py_XDECREF
32+ from cpython.ref cimport (
33+ Py_INCREF,
34+ Py_XDECREF,
35+ )
3036from cpython.unicode cimport (
3137 PyUnicode_AsUTF8String,
3238 PyUnicode_Decode,
@@ -143,7 +149,7 @@ cdef extern from "parser/tokenizer.h":
143149 enum : ERROR_OVERFLOW
144150
145151 ctypedef void * (* io_callback)(void * src, size_t nbytes, size_t * bytes_read,
146- int * status)
152+ int * status, const char * encoding_errors )
147153 ctypedef int (* io_cleanup)(void * src)
148154
149155 ctypedef struct parser_t:
@@ -255,8 +261,8 @@ cdef extern from "parser/tokenizer.h":
255261
256262 int parser_trim_buffers(parser_t * self )
257263
258- int tokenize_all_rows(parser_t * self ) nogil
259- int tokenize_nrows(parser_t * self , size_t nrows) nogil
264+ int tokenize_all_rows(parser_t * self , const char * encoding_errors ) nogil
265+ int tokenize_nrows(parser_t * self , size_t nrows, const char * encoding_errors ) nogil
260266
261267 int64_t str_to_int64(char * p_item, int64_t int_min,
262268 int64_t int_max, int * error, char tsep) nogil
@@ -293,7 +299,7 @@ cdef extern from "parser/io.h":
293299 size_t * bytes_read, int * status)
294300
295301 void * buffer_rd_bytes(void * source, size_t nbytes,
296- size_t * bytes_read, int * status)
302+ size_t * bytes_read, int * status, const char * encoding_errors )
297303
298304
299305cdef class TextReader:
@@ -316,6 +322,7 @@ cdef class TextReader:
316322 uint64_t parser_start
317323 list clocks
318324 char * c_encoding
325+ const char * encoding_errors
319326 kh_str_starts_t * false_set
320327 kh_str_starts_t * true_set
321328
@@ -370,10 +377,15 @@ cdef class TextReader:
370377 bint verbose = False ,
371378 bint mangle_dupe_cols = True ,
372379 float_precision = None ,
373- bint skip_blank_lines = True ):
380+ bint skip_blank_lines = True ,
381+ encoding_errors = b" strict" ):
374382
375383 # set encoding for native Python and C library
376384 self .c_encoding = NULL
385+ if isinstance (encoding_errors, str ):
386+ encoding_errors = encoding_errors.encode(" utf-8" )
387+ Py_INCREF(encoding_errors)
388+ self .encoding_errors = PyBytes_AsString(encoding_errors)
377389
378390 self .parser = parser_new()
379391 self .parser.chunksize = tokenize_chunksize
@@ -558,13 +570,7 @@ cdef class TextReader:
558570 pass
559571
560572 def __dealloc__ (self ):
561- parser_free(self .parser)
562- if self .true_set:
563- kh_destroy_str_starts(self .true_set)
564- self .true_set = NULL
565- if self .false_set:
566- kh_destroy_str_starts(self .false_set)
567- self .false_set = NULL
573+ self .close()
568574 parser_del(self .parser)
569575
570576 def close (self ):
@@ -632,7 +638,6 @@ cdef class TextReader:
632638 char * word
633639 object name, old_name
634640 uint64_t hr, data_line = 0
635- char * errors = " strict"
636641 StringPath path = _string_path(self .c_encoding)
637642 list header = []
638643 set unnamed_cols = set ()
@@ -673,11 +678,8 @@ cdef class TextReader:
673678 for i in range (field_count):
674679 word = self .parser.words[start + i]
675680
676- if path == UTF8:
677- name = PyUnicode_FromString(word)
678- elif path == ENCODED:
679- name = PyUnicode_Decode(word, strlen(word),
680- self .c_encoding, errors)
681+ name = PyUnicode_Decode(word, strlen(word),
682+ self .c_encoding, self .encoding_errors)
681683
682684 # We use this later when collecting placeholder names.
683685 old_name = name
@@ -831,7 +833,7 @@ cdef class TextReader:
831833 int status
832834
833835 with nogil:
834- status = tokenize_nrows(self .parser, nrows)
836+ status = tokenize_nrows(self .parser, nrows, self .encoding_errors )
835837
836838 if self .parser.warn_msg != NULL :
837839 print (self .parser.warn_msg, file = sys.stderr)
@@ -859,7 +861,7 @@ cdef class TextReader:
859861 ' the whole file' )
860862 else :
861863 with nogil:
862- status = tokenize_all_rows(self .parser)
864+ status = tokenize_all_rows(self .parser, self .encoding_errors )
863865
864866 if self .parser.warn_msg != NULL :
865867 print (self .parser.warn_msg, file = sys.stderr)
@@ -1201,7 +1203,7 @@ cdef class TextReader:
12011203
12021204 if path == UTF8:
12031205 return _string_box_utf8(self .parser, i, start, end, na_filter,
1204- na_hashset)
1206+ na_hashset, self .encoding_errors )
12051207 elif path == ENCODED:
12061208 return _string_box_decode(self .parser, i, start, end,
12071209 na_filter, na_hashset, self .c_encoding)
@@ -1352,7 +1354,8 @@ cdef inline StringPath _string_path(char *encoding):
13521354
13531355cdef _string_box_utf8(parser_t * parser, int64_t col,
13541356 int64_t line_start, int64_t line_end,
1355- bint na_filter, kh_str_starts_t * na_hashset):
1357+ bint na_filter, kh_str_starts_t * na_hashset,
1358+ const char * encoding_errors):
13561359 cdef:
13571360 int error, na_count = 0
13581361 Py_ssize_t i, lines
@@ -1391,7 +1394,7 @@ cdef _string_box_utf8(parser_t *parser, int64_t col,
13911394 pyval = < object > table.vals[k]
13921395 else :
13931396 # box it. new ref?
1394- pyval = PyUnicode_FromString (word)
1397+ pyval = PyUnicode_Decode (word, strlen(word), " utf-8 " , encoding_errors )
13951398
13961399 k = kh_put_strbox(table, word, & ret)
13971400 table.vals[k] = < PyObject * > pyval
0 commit comments