Skip to content

Commit ba716f4

Browse files
authored
prevent find from corrupting dircache (#488)
1 parent fef6c0b commit ba716f4

File tree

2 files changed

+78
-60
lines changed

2 files changed

+78
-60
lines changed

gcsfs/core.py

Lines changed: 42 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -1086,54 +1086,55 @@ async def _isdir(self, path):
10861086
async def _find(self, path, withdirs=False, detail=False, prefix="", **kwargs):
10871087
path = self._strip_protocol(path)
10881088
bucket, key = self.split_path(path)
1089-
out, _ = await self._do_list_objects(
1090-
path,
1091-
delimiter=None,
1092-
prefix=prefix,
1093-
)
1094-
if not prefix and not out and key:
1095-
try:
1096-
out = [
1097-
await self._get_object(
1098-
path,
1099-
)
1100-
]
1101-
except FileNotFoundError:
1102-
out = []
1103-
dirs = []
1104-
sdirs = set()
1089+
1090+
if prefix:
1091+
_path = "" if not key else key.rstrip("/") + "/"
1092+
_prefix = f"{_path}{prefix}"
1093+
else:
1094+
_prefix = key
1095+
1096+
objects, _ = await self._do_list_objects(bucket, delimiter="", prefix=_prefix)
1097+
1098+
dirs = {}
11051099
cache_entries = {}
1106-
for o in out:
1107-
par = o["name"]
1108-
while par:
1109-
par = self._parent(par)
1110-
if par not in sdirs:
1111-
if len(par) < len(path):
1112-
break
1113-
sdirs.add(par)
1114-
dirs.append(
1115-
{
1116-
"Key": self.split_path(par)[1],
1117-
"Size": 0,
1118-
"name": par,
1119-
"StorageClass": "DIRECTORY",
1120-
"type": "directory",
1121-
"size": 0,
1122-
}
1123-
)
1124-
# Don't cache "folder-like" objects (ex: "Create Folder" in GCS console) to prevent
1125-
# masking subfiles in subsequent requests.
1126-
if not o["name"].endswith("/"):
1127-
cache_entries.setdefault(par, []).append(o)
1100+
1101+
for obj in objects:
1102+
parent = self._parent(obj["name"])
1103+
previous = obj
1104+
1105+
while parent:
1106+
dir_key = self.split_path(parent)[1]
1107+
if not dir_key:
1108+
break
1109+
1110+
dirs[parent] = {
1111+
"Key": dir_key,
1112+
"Size": 0,
1113+
"name": parent,
1114+
"StorageClass": "DIRECTORY",
1115+
"type": "directory",
1116+
"size": 0,
1117+
}
1118+
1119+
if len(parent) < len(path):
1120+
# don't go above the requested level
1121+
break
1122+
1123+
cache_entries.setdefault(parent, []).append(previous)
1124+
1125+
previous = dirs[parent]
1126+
parent = self._parent(parent)
1127+
11281128
if not prefix:
11291129
self.dircache.update(cache_entries)
11301130

11311131
if withdirs:
1132-
out = sorted(out + dirs, key=lambda x: x["name"])
1132+
objects = sorted(objects + list(dirs.values()), key=lambda x: x["name"])
11331133

11341134
if detail:
1135-
return {o["name"]: o for o in out}
1136-
return [o["name"] for o in out]
1135+
return {o["name"]: o for o in objects}
1136+
1137+
return [o["name"] for o in objects]
11371138

11381139
@retry_request(retries=retries)
11391140
async def _get_file_request(

gcsfs/tests/test_core.py

Lines changed: 36 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -15,11 +15,12 @@
1515

1616
from gcsfs.tests.settings import TEST_BUCKET, TEST_PROJECT, TEST_REQUESTER_PAYS_BUCKET
1717
from gcsfs.tests.conftest import (
18-
files,
19-
csv_files,
20-
text_files,
2118
a,
19+
allfiles,
2220
b,
21+
csv_files,
22+
files,
23+
text_files,
2324
)
2425
from gcsfs.tests.utils import tempdir, tmpfile
2526
from gcsfs.core import GCSFileSystem, quote_plus
@@ -982,26 +983,42 @@ def test_zero_cache_timeout(gcs):
982983
gcs.dircache[f"{TEST_BUCKET}/a"]
983984

984985

985-
def test_find_with_prefix_partial_cache(gcs):
986+
@pytest.mark.parametrize("with_cache", (False, True))
987+
def test_find_with_prefix_partial_cache(gcs, with_cache):
986988
base_dir = f"{TEST_BUCKET}/test_find_with_prefix"
987989
gcs.touch(base_dir + "/test_1")
988990
gcs.touch(base_dir + "/test_2")
989991

990-
for with_cache in (True, False):
991-
# Test once with cached, and once with no cache
992-
gcs.invalidate_cache()
993-
if with_cache:
994-
gcs.ls(base_dir)
995-
precache = dict(gcs.dircache)
996-
assert gcs.find(base_dir, prefix="non_existent_") == []
997-
assert gcs.find(base_dir, prefix="test_") == [
998-
base_dir + "/test_1",
999-
base_dir + "/test_2",
1000-
]
1001-
assert dict(gcs.dircache) == precache # find qwith prefix shouldn't touch cache
1002-
assert gcs.find(base_dir + "/test_1") == [base_dir + "/test_1"]
1003-
assert gcs.find(base_dir + "/non_existent") == []
1004-
assert gcs.find(base_dir + "/non_existent", prefix="more_non_existent") == []
992+
gcs.invalidate_cache()
993+
if with_cache:
994+
gcs.ls(base_dir)
995+
precache = dict(gcs.dircache)
996+
assert gcs.find(base_dir, prefix="non_existent_") == []
997+
assert gcs.find(base_dir, prefix="test_") == [
998+
base_dir + "/test_1",
999+
base_dir + "/test_2",
1000+
]
1001+
assert dict(gcs.dircache) == precache # find qwith prefix shouldn't touch cache
1002+
assert gcs.find(base_dir + "/test_1") == [base_dir + "/test_1"]
1003+
assert gcs.find(base_dir + "/non_existent") == []
1004+
assert gcs.find(base_dir + "/non_existent", prefix="more_non_existent") == []
1005+
1006+
1007+
def test_find_dircache(gcs):
1008+
"""Running `ls` after find should not corrupt the dir cache"""
1009+
assert set(gcs.find(TEST_BUCKET)) == {f"{TEST_BUCKET}/{path}" for path in allfiles}
1010+
assert set(gcs.ls(TEST_BUCKET)) == {
1011+
f"{TEST_BUCKET}/test",
1012+
f"{TEST_BUCKET}/nested",
1013+
f"{TEST_BUCKET}/2014-01-01.csv",
1014+
f"{TEST_BUCKET}/2014-01-02.csv",
1015+
f"{TEST_BUCKET}/2014-01-03.csv",
1016+
}
1017+
assert set(gcs.ls(f"{TEST_BUCKET}/nested")) == {
1018+
f"{TEST_BUCKET}/nested/file1",
1019+
f"{TEST_BUCKET}/nested/file2",
1020+
f"{TEST_BUCKET}/nested/nested2",
1021+
}
10051022

10061023

10071024
def test_percent_file_name(gcs):

0 commit comments

Comments
 (0)