@@ -290,11 +290,11 @@ def test_empty_header_read(count):
290290 test_empty_header_read (count )
291291
292292 def test_parse_trim_buffers (self ):
293- # This test is part of a bugfix for issue #13703. It attmepts to
293+ # This test is part of a bugfix for issue #13703. It attempts
294294 # to stress the system memory allocator, to cause it to move the
295295 # stream buffer and either let the OS reclaim the region, or let
296296 # other memory requests of parser otherwise modify the contents
297- # of memory space, where it was formely located.
297+ # of memory space, where it was formerly located.
298298 # This test is designed to cause a `segfault` with unpatched
299299 # `tokenizer.c`. Sometimes the test fails on `segfault`, other
300300 # times it fails due to memory corruption, which causes the
@@ -346,7 +346,7 @@ def test_parse_trim_buffers(self):
346346
347347 # Generate the expected output: manually create the dataframe
348348 # by splitting by comma and repeating the `n_lines` times.
349- row = tuple (val_ if val_ else float ( " nan" )
349+ row = tuple (val_ if val_ else np . nan
350350 for val_ in record_ .split ("," ))
351351 expected = pd .DataFrame ([row for _ in range (n_lines )],
352352 dtype = object , columns = None , index = None )
@@ -359,6 +359,15 @@ def test_parse_trim_buffers(self):
359359 # Check for data corruption if there was no segfault
360360 tm .assert_frame_equal (result , expected )
361361
362+ # This extra test was added to replicate the fault in gh-5291.
363+ # Force 'utf-8' encoding, so that `_string_convert` would take
364+ # a different execution branch.
365+ chunks_ = self .read_csv (StringIO (csv_data ), header = None ,
366+ dtype = object , chunksize = chunksize ,
367+ encoding = 'utf_8' )
368+ result = pd .concat (chunks_ , axis = 0 , ignore_index = True )
369+ tm .assert_frame_equal (result , expected )
370+
362371 def test_internal_null_byte (self ):
363372 # see gh-14012
364373 #
0 commit comments