Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 16 additions & 4 deletions datalad_dataverse/tests/test_utils.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
from itertools import product
from pathlib import Path
from unicodedata import lookup

import pytest
from unidecode import unidecode

from ..utils import (
_dataverse_dirname_quote,
Expand All @@ -14,6 +16,7 @@


_test_paths = [
lookup("dog face") + lookup("cat face"),
".x",
"_x",
"..x",
Expand All @@ -32,6 +35,7 @@
"._dir/x",
"_.dir/x",
"__dir/x",
"ä",
"%%;;,_,?-&=",
]

Expand All @@ -49,18 +53,26 @@ def test_format_doi():
format_doi(123)


def _check_simplified_match(path, mangled_path):
    """Assert that ``mangled_path`` decodes to the ASCII form of ``path``.

    The two paths are compared component by component: every mangled
    component, once passed through ``unmangle_path``, must equal the
    ``unidecode`` transliteration of the corresponding original component.
    Components whose mangled name starts with ``__not_representable`` are
    accepted without comparison.

    Parameters
    ----------
    path:
      Original path; must provide ``.parts``.
    mangled_path:
      Result of mangling ``path``; must provide ``.parts``.

    Raises
    ------
    AssertionError
      If the number of components differs, or if any comparable component
      does not decode to the transliterated original.
    """
    # zip() stops at the shorter sequence, so a mangling step that drops or
    # merges components would otherwise go unnoticed.
    assert len(mangled_path.parts) == len(path.parts), (
        f"component count changed by mangling: "
        f"{path.parts!r} -> {mangled_path.parts!r}"
    )
    for mangled_part, part in zip(mangled_path.parts, path.parts):
        if mangled_part.startswith('__not_representable'):
            # Presumably the placeholder used for names without any ASCII
            # transliteration; the original name cannot be recovered from
            # it, so there is nothing to compare.
            continue
        assert str(unmangle_path(mangled_part)) == unidecode(part), (
            f"{mangled_part!r} does not decode to the ASCII form of {part!r}"
        )


def test_path_mangling_identity():
    """Check that mangled test paths decode back to the expected name.

    Pure-ASCII paths must survive a mangle/unmangle round trip unchanged.
    Paths containing non-ASCII characters are transliterated during
    mangling, so for them only the simplified (ASCII) match is verified.
    """
    for p in _test_paths + ['?;#:eee=2.txt']:
        mangled_path = mangle_path(p)
        if p.isascii():
            # Transliteration leaves ASCII-only names untouched, hence the
            # round trip has to be exact for these paths.
            assert Path(p) == unmangle_path(mangled_path)
        _check_simplified_match(Path(p), mangled_path)


def test_path_mangling_sub_dirs():
    """Check mangling of nested paths built from all test path triples.

    For every combination ``p / q / r`` of ``_test_paths`` entries:

    - no directory component of the mangled path may start with a dot,
    - ASCII-only paths must round-trip exactly through ``unmangle_path``,
    - every path must satisfy the simplified (transliterated) match.
    """
    for p, q, r in product(_test_paths, _test_paths, _test_paths):
        path = Path(p) / q / r
        mangled_path = mangle_path(path)
        # Only the directory components are checked here; the last
        # component is the file name.
        for part in mangled_path.parts[:-1]:
            assert part[0] != "."
        if all(element.isascii() for element in (p, q, r)):
            # Non-ASCII names are transliterated by mangling and therefore
            # cannot be recovered exactly; restrict the strict round-trip
            # check to ASCII-only combinations.
            assert unmangle_path(mangled_path) == path
        _check_simplified_match(path, mangled_path)


def test_file_quoting_identity():
Expand Down
9 changes: 6 additions & 3 deletions datalad_dataverse/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from pathlib import Path

from pyDataverse.api import NativeApi
from unidecode import unidecode

from datalad_next.utils import update_specialremote_credential

Expand Down Expand Up @@ -295,7 +296,8 @@ def _dataverse_dirname_quote(dirname: str) -> str:
dataverse, it is encoded as well to prevent name collisions, for example,
between ``.datalad`` and ``datalad``.
"""
quoted_dirname = _dataverse_quote(dirname, DATAVERSE_DIRNAME_SAFE)
ascii_dirname = unidecode(dirname) or f"_not_representable_{len(dirname)}"
quoted_dirname = _dataverse_quote(ascii_dirname, DATAVERSE_DIRNAME_SAFE)
return _encode_leading_dot(quoted_dirname)


Expand All @@ -306,14 +308,15 @@ def _dataverse_filename_quote(filename: str) -> str:
``/``, ``:``, ``*``, ``?``, ``"``, ``<``, ``>``, ``|``, ``;``, and
``#``.

In order to be able to use the some decoding for file names and directory
In order to be able to use the same decoding for file names and directory
names, we also encode leading dots in file names, although that is not
strictly necessary with dataverse, because it would preserve the leading
dots in file names.


"""
quoted_filename = _dataverse_quote(filename, DATAVERSE_FILENAME_SAFE)
ascii_filename = unidecode(filename) or f"_not_representable_{len(filename)}"
quoted_filename = _dataverse_quote(ascii_filename, DATAVERSE_FILENAME_SAFE)
return _encode_leading_dot(quoted_filename)


Expand Down
1 change: 1 addition & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ install_requires =
datalad_next >= 1.0.0b1
datalad >= 0.18.0
pydataverse
Unidecode
packages = find_namespace:
include_package_data = True

Expand Down