@@ -2661,33 +2661,6 @@ def index_line_ends(s, index=None, offset=0, c='\n'):
     return index
 
 
-def chunks_to_lines(chunks, num_lines_required=None, encoding=None):
-    r"""
-    Parameters
-    ----------
-    chunks : list
-        List of chunks. str and bytes are both supported but should not be mixed (all chunks
-        must have the same type as the first chunk).
-
-    Examples
-    --------
-    >>> chunks = ['a\nb\nc ', 'c\n', 'd ', 'd', '\n', 'e']
-    >>> chunks_to_lines(chunks)
-    ['a', 'b', 'c c', 'd d', 'e']
-    >>> # it should give the same result as join then splitlines (just do it more efficiently)
-    ... ''.join(chunks).splitlines()
-    ['a', 'b', 'c c', 'd d', 'e']
-    """
-    if not chunks:
-        return []
-    if encoding is not None:
-        assert isinstance(chunks[0], bytes)
-        chunks = [chunk.decode(encoding) for chunk in chunks]
-    sep = b'' if isinstance(chunks[0], bytes) else ''
-    lines = sep.join(chunks).splitlines()
-    return lines[:num_lines_required]
-
-
 @adapter_for('_io.TextIOWrapper')
 class TextFileAdapter(AbstractAdapter):
     def __init__(self, data, attributes):
@@ -2760,13 +2733,7 @@ def _index_up_to(self, f, approx_v_stop, chunk_size=4 * MB, max_time=0.5):
             # (once for indexing then again for getting the data)
             chunk = f.read(chunk_size)
             if self._encoding is None:
-                try:
-                    import charset_normalizer
-                    chartset_match = charset_normalizer.from_bytes(chunk).best()
-                    self._encoding = chartset_match.encoding
-                    logger.debug(f"encoding detected as {self._encoding}")
-                except ImportError:
-                    logger.debug("could not import 'charset_normalizer' => cannot detect encoding")
+                self._detect_encoding(chunk)
 
             line_end_char = b'\n'
             index_line_ends(chunk, self._lines_end_index, offset=chunk_start, c=line_end_char)
@@ -2785,6 +2752,15 @@ def _index_up_to(self, f, approx_v_stop, chunk_size=4 * MB, max_time=0.5):
                 self._lines_end_index.append(file_length)
             chunk_start += length_read
 
+    def _detect_encoding(self, chunk):
+        try:
+            import charset_normalizer
+            charset_match = charset_normalizer.from_bytes(chunk).best()
+            self._encoding = charset_match.encoding
+            logger.debug(f"encoding detected as {self._encoding}")
+        except ImportError:
+            logger.debug("could not import 'charset_normalizer' => cannot detect encoding")
+
     def get_vlabels_values(self, start, stop):
         # we need to trigger indexing too (because get_vlabels happens before get_data) so that lines_indexed is correct
         # FIXME: get_data should not trigger indexing too if start/stop are the same
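For context, the charset_normalizer calls used by _detect_encoding can be exercised on their own. A small sketch, with the caveat that best() returns None when no candidate encoding fits (the guess_encoding name and the 'utf-8' fallback are this sketch's assumptions, not the adapter's behaviour):

import charset_normalizer

def guess_encoding(chunk: bytes, default: str = 'utf-8') -> str:
    # from_bytes() scores candidate encodings for the byte payload;
    # best() returns the top-ranked CharsetMatch, or None if nothing fits
    match = charset_normalizer.from_bytes(chunk).best()
    return match.encoding if match is not None else default

print(guess_encoding('héllo wörld\n'.encode('latin-1')))  # e.g. 'cp1252'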
@@ -2813,9 +2789,9 @@ def _get_lines(self, start, stop):
             stop_pos = self._lines_end_index[stop - 1]
             f.seek(start_pos)
             chunk = f.read(stop_pos - start_pos)
-            lines = chunks_to_lines([chunk], stop - start, self._encoding)
+            lines = self._decode_chunks_to_lines([chunk], stop - start)
             # lines = chunk.split(b'\n')
-            # assert len(lines) == stop - start
+            # assert len(lines) == num_required_lines
             return lines
         else:
             pos_last_end = self._lines_end_index[-1]
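The fast path above turns a line range into a single byte-range read: once the byte offset of every line end is known, lines [start, stop) map to one seek() plus one read(). A self-contained sketch of that indexing idea, assuming an index holding the offset of each b'\n' (the names below are illustrative, not the adapter's):

import io

def line_end_offsets(data: bytes) -> list:
    # offsets of every b'\n' in data, in the spirit of index_line_ends()
    ends, pos = [], data.find(b'\n')
    while pos != -1:
        ends.append(pos)
        pos = data.find(b'\n', pos + 1)
    return ends

data = b'alpha\nbeta\ngamma\ndelta\n'
ends = line_end_offsets(data)
f = io.BytesIO(data)
start, stop = 1, 3                # fetch lines [1, 3)
start_pos = ends[start - 1] + 1   # first byte after the previous line end
f.seek(start_pos)
chunk = f.read(ends[stop - 1] - start_pos)
assert chunk.split(b'\n') == [b'beta', b'gamma']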
@@ -2852,13 +2828,46 @@ def _get_lines(self, start, stop):
 
             if approx_start:
                 # +1 and [1:] to remove first line so that we are sure the first line is complete
-                lines = chunks_to_lines(chunks, num_lines_required + 1,
-                                        self._encoding)[1:]
+                n_req_lines = num_lines_required + 1
+                lines = self._decode_chunks_to_lines(chunks, n_req_lines)[1:]
             else:
-                lines = chunks_to_lines(chunks, num_lines_required,
-                                        self._encoding)
+                lines = self._decode_chunks_to_lines(chunks, num_lines_required)
             return lines
 
+    def _decode_chunk(self, chunk: bytes):
+        try:
+            return chunk.decode(self._encoding)
+        except UnicodeDecodeError:
+            old_encoding = self._encoding
+            # try to find another encoding
+            self._detect_encoding(chunk)
+            logger.debug(f"Could not decode chunk using {old_encoding}")
+            logger.debug(f"Trying again using {self._encoding} and ignoring "
+                         f"errors")
+            return chunk.decode(self._encoding, errors='replace')
+
+    def _decode_chunks_to_lines(self, chunks: list, num_required_lines: int):
+        r"""
+        Parameters
+        ----------
+        chunks : list
+            List of chunks. str and bytes are both supported but should not be mixed (all chunks
+            must have the same type as the first chunk).
+        """
+        if not chunks:
+            return []
+
+        # TODO: we could have more efficient code:
+        #  * only decode as many chunks as necessary to get num_required_lines
+        #  * only join as many chunks as necessary to get num_required_lines
+        if self._encoding is not None:
+            assert isinstance(chunks[0], bytes)
+            chunks = [self._decode_chunk(chunk) for chunk in chunks]
+
+        sep = b'' if isinstance(chunks[0], bytes) else ''
+        lines = sep.join(chunks).splitlines()
+        return lines[:num_required_lines]
+
     def get_values(self, h_start, v_start, h_stop, v_stop):
         """*_stop are exclusive"""
         return [[line] for line in self._get_lines(v_start, v_stop)]
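The recovery path in _decode_chunk is easiest to see in isolation: when a chunk does not decode with the current encoding, detection is re-run and the chunk is decoded permissively instead of propagating the error. A rough standalone equivalent (the detect parameter stands in for the re-detection step and is an assumption of this sketch):

def decode_with_fallback(chunk: bytes, encoding: str, detect) -> str:
    try:
        return chunk.decode(encoding)
    except UnicodeDecodeError:
        # the initial guess was wrong for this chunk: re-detect, then
        # decode permissively so bad bytes become U+FFFD instead of raising
        return chunk.decode(detect(chunk) or encoding, errors='replace')

# b'\xff' is invalid UTF-8; with a detector that finds nothing better,
# the bad byte is replaced rather than raised
print(decode_with_fallback(b'caf\xff', 'utf-8', lambda c: None))  # 'caf\ufffd'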