Using Path and glob instead of walk_files (#1069)

krishnakalyan3 · krishnakalyan3 · vincentqb · web-flow · commit d25a4ddff300 · 2020-12-15T14:25:13.000-05:00
- yesno
- librispeech
- libritts
- speechcommands

Co-authored-by: krishnakalyan3 <skalyan@cloudera.com>
Co-authored-by: Vincent Quenneville-Belair <vincentqb@gmail.com>
diff --git a/torchaudio/datasets/librispeech.py b/torchaudio/datasets/librispeech.py
@@ -8,7 +8,6 @@
 from torchaudio.datasets.utils import (
     download_url,
     extract_archive,
-    walk_files,
 )
 
 URL = "train-clean-100"
@@ -125,10 +124,7 @@ def __init__(self,
                     download_url(url, root, hash_value=checksum)
                 extract_archive(archive)
 
-        walker = walk_files(
-            self._path, suffix=self._ext_audio, prefix=False, remove_suffix=True
-        )
-        self._walker = list(walker)
+        self._walker = sorted(str(p.stem) for p in Path(self._path).glob('*/*/*' + self._ext_audio))
 
     def __getitem__(self, n: int) -> Tuple[Tensor, int, str, int, int, int]:
         """Load the n-th sample from the dataset.
diff --git a/torchaudio/datasets/libritts.py b/torchaudio/datasets/libritts.py
@@ -8,7 +8,6 @@
 from torchaudio.datasets.utils import (
     download_url,
     extract_archive,
-    walk_files,
 )
 
 URL = "train-clean-100"
@@ -126,10 +125,7 @@ def __init__(
                     download_url(url, root, hash_value=checksum)
                 extract_archive(archive)
 
-        walker = walk_files(
-            self._path, suffix=self._ext_audio, prefix=False, remove_suffix=True
-        )
-        self._walker = list(walker)
+        self._walker = sorted(str(p.stem) for p in Path(self._path).glob('*/*/*' + self._ext_audio))
 
     def __getitem__(self, n: int) -> Tuple[Tensor, int, str, str, int, int, str]:
         """Load the n-th sample from the dataset.
diff --git a/torchaudio/datasets/speechcommands.py b/torchaudio/datasets/speechcommands.py
@@ -8,7 +8,6 @@
 from torchaudio.datasets.utils import (
     download_url,
     extract_archive,
-    walk_files
 )
 
 FOLDER_IN_ARCHIVE = "SpeechCommands"
@@ -110,15 +109,15 @@ def __init__(self,
             self._walker = _load_list(self._path, "testing_list.txt")
         elif subset == "training":
             excludes = set(_load_list(self._path, "validation_list.txt", "testing_list.txt"))
-            walker = walk_files(self._path, suffix=".wav", prefix=True)
+            walker = sorted(str(p) for p in Path(self._path).glob('*/*.wav'))
             self._walker = [
                 w for w in walker
                 if HASH_DIVIDER in w
                 and EXCEPT_FOLDER not in w
                 and os.path.normpath(w) not in excludes
             ]
         else:
-            walker = walk_files(self._path, suffix=".wav", prefix=True)
+            walker = sorted(str(p) for p in Path(self._path).glob('*/*.wav'))
             self._walker = [w for w in walker if HASH_DIVIDER in w and EXCEPT_FOLDER not in w]
 
     def __getitem__(self, n: int) -> Tuple[Tensor, int, str, str, int]:
diff --git a/torchaudio/datasets/yesno.py b/torchaudio/datasets/yesno.py
@@ -9,7 +9,6 @@
 from torchaudio.datasets.utils import (
     download_url,
     extract_archive,
-    walk_files
 )
 
 URL = "http://www.openslr.org/resources/1/waves_yesno.tar.gz"
@@ -85,10 +84,7 @@ def __init__(self,
                 "Dataset not found. Please use `download=True` to download it."
             )
 
-        walker = walk_files(
-            self._path, suffix=self._ext_audio, prefix=False, remove_suffix=True
-        )
-        self._walker = list(walker)
+        self._walker = sorted(str(p.stem) for p in Path(self._path).glob('*' + self._ext_audio))
 
     def __getitem__(self, n: int) -> Tuple[Tensor, int, List[int]]:
         """Load the n-th sample from the dataset.