
Commit 7941a29

ENH: made text/csv adapters more robust to decoding errors
On failure, sniff the encoding again and decode using errors='replace'.
1 parent cbb2d11 commit 7941a29
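
The commit message summarizes the approach; here is a minimal standalone sketch of the same pattern (the function name `robust_decode` is illustrative, not part of the commit):

```python
# Hedged sketch of the fallback strategy: decode with the encoding detected so
# far; if that raises, re-sniff the encoding on the offending bytes and decode
# with errors='replace' so undecodable bytes become U+FFFD instead of crashing.
def robust_decode(chunk: bytes, encoding: str) -> str:
    try:
        return chunk.decode(encoding)
    except UnicodeDecodeError:
        try:
            import charset_normalizer
            best = charset_normalizer.from_bytes(chunk).best()
            if best is not None:
                encoding = best.encoding
        except ImportError:
            pass  # no detector available: keep the old encoding, just stop failing hard
        return chunk.decode(encoding, errors='replace')
```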

File tree

1 file changed: +49 −40 lines


larray_editor/arrayadapter.py

Lines changed: 49 additions & 40 deletions
```diff
@@ -2661,33 +2661,6 @@ def index_line_ends(s, index=None, offset=0, c='\n'):
     return index
 
 
-def chunks_to_lines(chunks, num_lines_required=None, encoding=None):
-    r"""
-    Parameters
-    ----------
-    chunks : list
-        List of chunks. str and bytes are both supported but should not be mixed (all chunks must
-        have the same type than the first chunk).
-
-    Examples
-    --------
-    >>> chunks = ['a\nb\nc ', 'c\n', 'd ', 'd', '\n', 'e']
-    >>> chunks_to_lines(chunks)
-    ['a', 'b', 'c c', 'd d', 'e']
-    >>> # it should have the same result than join then splitlines (just do it more efficiently)
-    ... ''.join(chunks).splitlines()
-    ['a', 'b', 'c c', 'd d', 'e']
-    """
-    if not chunks:
-        return []
-    if encoding is not None:
-        assert isinstance(chunks[0], bytes)
-        chunks = [chunk.decode(encoding) for chunk in chunks]
-    sep = b'' if isinstance(chunks[0], bytes) else ''
-    lines = sep.join(chunks).splitlines()
-    return lines[:num_lines_required]
-
-
 @adapter_for('_io.TextIOWrapper')
 class TextFileAdapter(AbstractAdapter):
     def __init__(self, data, attributes):
```
```diff
@@ -2760,13 +2733,7 @@ def _index_up_to(self, f, approx_v_stop, chunk_size=4 * MB, max_time=0.5):
             # (once for indexing then again for getting the data)
             chunk = f.read(chunk_size)
             if self._encoding is None:
-                try:
-                    import charset_normalizer
-                    chartset_match = charset_normalizer.from_bytes(chunk).best()
-                    self._encoding = chartset_match.encoding
-                    logger.debug(f"encoding detected as {self._encoding}")
-                except ImportError:
-                    logger.debug("could not import 'charset_normalizer' => cannot detect encoding")
+                self._detect_encoding(chunk)
 
             line_end_char = b'\n'
             index_line_ends(chunk, self._lines_end_index, offset=chunk_start, c=line_end_char)
```
```diff
@@ -2785,6 +2752,15 @@ def _index_up_to(self, f, approx_v_stop, chunk_size=4 * MB, max_time=0.5):
                 self._lines_end_index.append(file_length)
             chunk_start += length_read
 
+    def _detect_encoding(self, chunk):
+        try:
+            import charset_normalizer
+            chartset_match = charset_normalizer.from_bytes(chunk).best()
+            self._encoding = chartset_match.encoding
+            logger.debug(f"encoding detected as {self._encoding}")
+        except ImportError:
+            logger.debug("could not import 'charset_normalizer' => cannot detect encoding")
+
     def get_vlabels_values(self, start, stop):
         # we need to trigger indexing too (because get_vlabels happens before get_data) so that lines_indexed is correct
         # FIXME: get_data should not trigger indexing too if start/stop are the same
```
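
The extracted `_detect_encoding` helper wraps charset_normalizer's sniffing API. The same call chain, standalone (a sketch; note that `best()` returns `None` when no candidate fits, which the helper does not guard against):

```python
import charset_normalizer  # optional dependency, hence the ImportError handling above

raw = 'héllo wörld\n'.encode('latin-1')
match = charset_normalizer.from_bytes(raw).best()  # best() may return None
if match is not None:
    print(match.encoding)              # e.g. 'cp1252' or another latin-1-compatible codec
    print(raw.decode(match.encoding))  # 'héllo wörld'
```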
```diff
@@ -2813,9 +2789,9 @@ def _get_lines(self, start, stop):
             stop_pos = self._lines_end_index[stop - 1]
             f.seek(start_pos)
             chunk = f.read(stop_pos - start_pos)
-            lines = chunks_to_lines([chunk], stop - start, self._encoding)
+            lines = self._decode_chunks_to_lines([chunk], stop - start)
             # lines = chunk.split(b'\n')
-            # assert len(lines) == stop - start
+            # assert len(lines) == num_required_lines
             return lines
         else:
             pos_last_end = self._lines_end_index[-1]
```
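
Aside on the commented-out `chunk.split(b'\n')` alternative: `splitlines()`, which `_decode_chunks_to_lines` relies on, also handles `\r\n` endings and does not yield a trailing empty item:

```python
>>> 'a\r\nb\n'.split('\n')
['a\r', 'b', '']
>>> 'a\r\nb\n'.splitlines()
['a', 'b']
```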
```diff
@@ -2852,13 +2828,46 @@ def _get_lines(self, start, stop):
 
         if approx_start:
             # +1 and [1:] to remove first line so that we are sure the first line is complete
-            lines = chunks_to_lines(chunks, num_lines_required + 1,
-                                    self._encoding)[1:]
+            n_req_lines = num_lines_required + 1
+            lines = self._decode_chunks_to_lines(chunks, n_req_lines)[1:]
         else:
-            lines = chunks_to_lines(chunks, num_lines_required,
-                                    self._encoding)
+            lines = self._decode_chunks_to_lines(chunks, num_lines_required)
         return lines
 
+    def _decode_chunk(self, chunk: bytes):
+        try:
+            return chunk.decode(self._encoding)
+        except UnicodeDecodeError:
+            old_encoding = self._encoding
+            # try to find another encoding
+            self._detect_encoding(chunk)
+            logger.debug(f"Could not decode chunk using {old_encoding}")
+            logger.debug(f"Trying again using {self._encoding} and ignoring "
+                         f"errors")
+            return chunk.decode(self._encoding, errors='replace')
+
+    def _decode_chunks_to_lines(self, chunks: list, num_required_lines: int):
+        r"""
+        Parameters
+        ----------
+        chunks : list
+            List of chunks. str and bytes are both supported but should not be mixed (all chunks must
+            have the same type than the first chunk).
+        """
+        if not chunks:
+            return []
+
+        # TODO: we could have more efficient code:
+        # * only decode as many chunks as necessary to get num_required_lines
+        # * only join as many chunks as necessary to get num_required_lines
+        if self._encoding is not None:
+            assert isinstance(chunks[0], bytes)
+            chunks = [self._decode_chunk(chunk) for chunk in chunks]
+
+        sep = b'' if isinstance(chunks[0], bytes) else ''
+        lines = sep.join(chunks).splitlines()
+        return lines[:num_required_lines]
+
     def get_values(self, h_start, v_start, h_stop, v_stop):
         """*_stop are exclusive"""
         return [[line] for line in self._get_lines(v_start, v_stop)]
```
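
What `errors='replace'` buys in the fallback path: undecodable bytes are mapped to the U+FFFD replacement character instead of raising, so the file still renders. For example:

```python
>>> b'caf\xe9 au lait'.decode('utf-8')
Traceback (most recent call last):
  ...
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xe9 in position 3: invalid continuation byte
>>> b'caf\xe9 au lait'.decode('utf-8', errors='replace')
'caf� au lait'
```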
