Make walk_files return files in sorted order

mthrok · mthrok · commit 54618626ff64 · 2020-07-23T00:52:23.000Z
diff --git a/test/test_datasets.py b/test/test_datasets.py
@@ -1,6 +1,7 @@
 import os
-import unittest
+from pathlib import Path
 
+from torchaudio.datasets import utils as dataset_utils
 from torchaudio.datasets.commonvoice import COMMONVOICE
 from torchaudio.datasets.librispeech import LIBRISPEECH
 from torchaudio.datasets.speechcommands import SPEECHCOMMANDS
@@ -22,6 +23,41 @@
 )
 
 
+class TestWalkFiles(TempDirMixin, TorchaudioTestCase):
+    """Check that ``walk_files`` yields files in alphabetical, top-down order."""
+    root = None  # directory handed to walk_files; set per test in setUp
+    expected = None  # file paths in the order walk_files is expected to yield them
+
+    def _add_file(self, *parts):
+        """Create an empty file at the temp path built from ``parts`` and record it."""
+        path = self.get_temp_path(*parts)
+        self.expected.append(path)
+        Path(path).touch()
+
+    def setUp(self):
+        super().setUp()
+        self.root = self.get_temp_path()
+        self.expected = []
+
+        # Files are created in the order the sorted walk should visit them:
+        # a directory's own files first, then each of its sub directories in turn.
+        for filename in ['a.txt', 'b.txt', 'c.txt']:  # level 1
+            self._add_file(filename)
+        for dir1 in ['d1', 'd2', 'd3']:
+            for filename in ['d.txt', 'e.txt', 'f.txt']:  # level 2
+                self._add_file(dir1, filename)
+            for dir2 in ['d1', 'd2', 'd3']:
+                for filename in ['g.txt', 'h.txt', 'i.txt']:  # level 3
+                    self._add_file(dir1, dir2, filename)
+
+    def test_walk_files(self):
+        """walk_files should traverse files in alphabetical order"""
+        found = [os.path.join(self.root, path) for path in
+                 dataset_utils.walk_files(self.root, '.txt', prefix=True)]
+        # Compare whole lists so that missing or extra files fail the test too.
+        self.assertEqual(found, self.expected)
+
+
 class TestDatasets(TorchaudioTestCase):
     backend = 'default'
     path = get_asset_path()
diff --git a/torchaudio/datasets/gtzan.py b/torchaudio/datasets/gtzan.py
@@ -1064,6 +1064,7 @@ def __init__(
                     continue
 
                 songs_in_genre = os.listdir(fulldir)
+                songs_in_genre.sort()
                 for fname in songs_in_genre:
                     name, ext = os.path.splitext(fname)
                     if ext.lower() == ".wav" and "." in name:
diff --git a/torchaudio/datasets/utils.py b/torchaudio/datasets/utils.py
@@ -264,7 +264,13 @@ def walk_files(root: str,
 
     root = os.path.expanduser(root)
 
-    for dirpath, _, files in os.walk(root):
+    for dirpath, dirs, files in os.walk(root):
+        dirs.sort()
+        # `dirs` is the same list os.walk uses to pick which sub directories to descend into, so
+        # sorting it in place makes os.walk traverse sub directories in alphabetical order
+        # see also
+        # https://stackoverflow.com/questions/6670029/can-i-force-python3s-os-walk-to-visit-directories-in-alphabetical-order-how#comment71993866_6670926
+        files.sort()
         for f in files:
             if f.endswith(suffix):