Skip to content

Commit 469776a

Browse files
committed
FEAT: added adapter for .csv.gz files
1 parent 0e911b3 commit 469776a

File tree

1 file changed

+43
-5
lines changed

1 file changed

+43
-5
lines changed

larray_editor/arrayadapter.py

Lines changed: 43 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1001,8 +1001,9 @@ def cell_activated(self, row_idx, column_idx):
10011001
@adapter_for('pathlib.Path')
10021002
def get_path_suffix_adapter(fpath):
10031003
logger.debug(f"get_path_suffix_adapter('{fpath}')")
1004-
if fpath.suffix.lower() in PATH_SUFFIX_ADAPTERS:
1005-
path_adapter_cls, required_module = PATH_SUFFIX_ADAPTERS[fpath.suffix]
1004+
suffix = fpath.suffix.lower()
1005+
if suffix in PATH_SUFFIX_ADAPTERS:
1006+
path_adapter_cls, required_module = PATH_SUFFIX_ADAPTERS[suffix]
10061007
if required_module is not None:
10071008
if required_module not in sys.modules:
10081009
import importlib
@@ -1013,7 +1014,15 @@ def get_path_suffix_adapter(fpath):
10131014
f"which is required to handle {fpath.suffix} "
10141015
f"files")
10151016
return None
1016-
return path_adapter_cls
1017+
# 2 options:
1018+
# - either there is a single adapter for that suffix
1019+
if (isinstance(path_adapter_cls, type) and
1020+
issubclass(path_adapter_cls, AbstractAdapter)):
1021+
return path_adapter_cls
1022+
# - different adapters handle that suffix and/or not all instances can
1023+
# be handled
1024+
else:
1025+
return path_adapter_cls(fpath)
10171026
elif fpath.is_dir():
10181027
return DirectoryPathAdapter
10191028
else:
@@ -2771,8 +2780,12 @@ def _detect_encoding(self, chunk):
27712780
try:
27722781
import charset_normalizer
27732782
chartset_match = charset_normalizer.from_bytes(chunk).best()
2774-
self._encoding = chartset_match.encoding
2775-
logger.debug(f"encoding detected as {self._encoding}")
2783+
if chartset_match is None:
2784+
self._encoding = None
2785+
logger.debug("could not detect encoding from chunk")
2786+
else:
2787+
self._encoding = chartset_match.encoding
2788+
logger.debug(f"encoding detected as {self._encoding}")
27762789
except ImportError:
27772790
logger.debug("could not import 'charset_normalizer' => cannot detect encoding")
27782791

@@ -3198,6 +3211,31 @@ def cell_activated(self, row_idx, column_idx):
31983211
# return self.data.open(info.filename)
31993212

32003213

3214+
class CSVGZPathAdapater(CsvFileAdapter):
    """CsvFileAdapter subclass for gzip-compressed CSV files.

    Only the way the file is opened differs from the parent class: both
    the text handle and the binary handle go through ``gzip.open``.
    """

    @classmethod
    def open(cls, fpath):
        """Return a text-mode handle on the gzip file at *fpath*.

        The file is opened in text mode with a fixed UTF-8 encoding because
        the csv reader needs ``str`` rather than ``bytes``.
        """
        from gzip import open as gzip_open

        text_handle = gzip_open(fpath, mode='rt', encoding='utf-8')
        return text_handle

    @property
    def _binary_file(self):
        """Return a new binary-mode gzip handle for the adapted file.

        The path is taken from ``self.data.name``; presumably ``self.data``
        is the handle returned by ``open`` -- verify against the parent class.
        """
        from gzip import open as gzip_open

        source_name = self.data.name
        return gzip_open(source_name, mode='rb')
3226+
3227+
3228+
@path_adapter_for('.gz', 'gzip')
def dispatch_gzip_path_adapter(gz_path):
    """Choose the adapter class for a ``.gz`` path from its inner suffix.

    Parameters
    ----------
    gz_path : pathlib.Path
        Path whose last suffix is ``.gz``.

    Returns
    -------
    type or None
        ``CSVGZPathAdapater`` when the name without its ``.gz`` extension
        ends in ``.csv`` (case-insensitive), otherwise None.
    """
    # Dropping the trailing ".gz" exposes the suffix of the compressed
    # payload, e.g. "data.csv.gz" -> "data.csv" -> ".csv".
    inner_suffix = gz_path.with_name(gz_path.stem).suffix.lower()
    return CSVGZPathAdapater if inner_suffix == '.csv' else None
3237+
3238+
32013239
@path_adapter_for('.zip', 'zipfile')
32023240
class ZipPathAdapter(ZipFileAdapter):
32033241
@classmethod

0 commit comments

Comments
 (0)