
Commit 60f0b80

refactors some of the caching logic and cleaners

1 parent b668624

2 files changed (+74 lines, -54 lines)

torchtext/data/datasets_utils.py

Lines changed: 29 additions & 40 deletions
@@ -33,27 +33,23 @@ def _clean_xml_file(f_xml):
                 fd_txt.write(e.text.strip() + '\n')
 
 
-def _clean_inner_xml_file(f_xml, base, stream):
-    """Accepts an XML filename within a tarball and a stream of the byte contents
-    within that file and writes the cleaned contents to a new, untarred file
-    found in the provided base directory.
+def _clean_inner_xml_file(outfile, stream):
+    """Accepts an output filename and a stream of the byte contents of an XML file
+    within a tarball and writes the cleaned contents to a new, untarred file.
 
     Args:
-        f_orig: the full path of the XML file in the archive
-        base: the directory to which the new file should be written
-        stream: the byte datapipe of the contents of f_orig
+        outfile: the path to which the modified stream should be written
+        stream: the byte datapipe of the contents of the archived XML file
 
     Returns: the path to the newly-written file
     """
-    f_txt = os.path.basename(os.path.splitext(f_xml)[0])
-    os.makedirs(base, exist_ok=True)
-    out_file = os.path.join(base, f_txt)
-    with codecs.open(out_file, mode='w', encoding='utf-8') as fd_txt:
+    os.makedirs(os.path.dirname(outfile), exist_ok=True)
+    with codecs.open(outfile, mode='w', encoding='utf-8') as fd_txt:
         root = ET.fromstring(stream.read().decode("utf-8"))[0]
         for doc in root.findall('doc'):
             for e in doc.findall('seg'):
                 fd_txt.write(e.text.strip() + '\n')
-    return os.path.join(base, f_txt)
+    return outfile
 
 
 def _clean_tags_file(f_orig):
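
Note on the refactor above: the cleaner no longer derives an output name from the archive member plus a base directory; the caller passes the full destination path, and the function only ensures the parent directory exists. A minimal sketch of the new contract, using a hypothetical in-memory stream in place of the byte datapipe:

    import io

    # Hypothetical mteval-style XML payload standing in for the archived file's bytes.
    xml_bytes = io.BytesIO(
        b'<mteval><srcset><doc docid="1"><seg id="1"> Guten Morgen </seg></doc></srcset></mteval>'
    )
    # The caller now chooses the destination; the cleaner creates parent dirs as needed.
    path = _clean_inner_xml_file('/tmp/.data/cleaned/IWSLT16.TED.tst2013.de-en.de', xml_bytes)
    assert path == '/tmp/.data/cleaned/IWSLT16.TED.tst2013.de-en.de'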
@@ -73,62 +69,55 @@ def _clean_tags_file(f_orig):
                 fd_txt.write(line.strip() + '\n')
 
 
-def _clean_inner_tags_file(f_orig, base, stream):
-    """Accepts a tags filename within a tarball and a stream of the byte contents
-    within that file and writes the cleaned contents to a new, untarred file
-    found in the provided base directory.
+def _clean_inner_tags_file(outfile, stream):
+    """Accepts an output filename and a stream of the byte contents of a tags file
+    within a tarball and writes the cleaned contents to a new, untarred file.
 
     Args:
-        f_orig: the full path of the tags file in the archive
-        base: the directory to which the new file should be written
-        stream: the byte datapipe of the contents of f_orig
+        outfile: the path to which the modified stream should be written
+        stream: the byte datapipe of the contents of the archived tags file
 
     Returns: the path to the newly-written file
     """
     xml_tags = [
         '<url', '<keywords', '<talkid', '<description', '<reviewer',
         '<translator', '<title', '<speaker', '<doc', '</doc'
     ]
-    f_txt = os.path.join(base, os.path.basename(f_orig.replace('.tags', '')))
-    os.makedirs(base, exist_ok=True)
-    with codecs.open(f_txt, mode='w', encoding='utf-8') as fd_txt:
+    os.makedirs(os.path.dirname(outfile), exist_ok=True)
+    with codecs.open(outfile, mode='w', encoding='utf-8') as fd_txt:
         for line in stream.readlines():
             if not any(tag in line.decode("utf-8") for tag in xml_tags):
                 # TODO: Fix utf-8 next line mark
                 # fd_txt.write(l.strip() + '\n')
                 # fd_txt.write(l.strip() + u"\u0085")
                 # fd_txt.write(l.lstrip())
                 fd_txt.write(line.decode("utf-8").strip() + '\n')
-    return f_txt
+    return outfile
 
 
-def _rewrite_text_file(file, base, stream):
-    """Accepts a text filename within a tarball and a stream of the byte contents
-    within that file and writes the cleaned contents to a new, untarred file
-    found in the provided base directory.
+def _rewrite_text_file(outfile, stream):
+    """Accepts an output filename and a stream of the byte contents of a text file
+    within a tarball and writes the cleaned contents to a new, untarred file.
 
     Args:
-        f_orig: the full path of the text file in the archive
-        base: the directory to which the new file should be written
-        stream: the byte datapipe of the contents of f_orig
+        outfile: the path to which the modified stream should be written
+        stream: the byte datapipe of the contents of the archived text file
 
     Returns: the path to the newly-written file
     """
-    f_txt = os.path.basename(file)
-    os.makedirs(base, exist_ok=True)
-    out_file = os.path.join(base, f_txt)
-    with open(out_file, 'w', encoding='utf-8') as f:
+    os.makedirs(os.path.dirname(outfile), exist_ok=True)
+    with open(outfile, 'w', encoding='utf-8') as f:
         for line in stream.readlines():
-            f.write(line.decode("utf-8"))
-    return out_file
+            f.write(line.decode("utf-8") + "\n")
+    return outfile
 
 
-def _clean_files(fname, base, stream):
+def _clean_files(outfile, fname, stream):
     if 'xml' in fname:
-        return _clean_inner_xml_file(fname, base, stream)
+        return _clean_inner_xml_file(outfile, stream)
     elif "tags" in fname:
-        return _clean_inner_tags_file(fname, base, stream)
-    return _rewrite_text_file(fname, base, stream)
+        return _clean_inner_tags_file(outfile, stream)
+    return _rewrite_text_file(outfile, stream)
 
 
 def _create_data_from_json(data_path):
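
Since `_clean_files` now takes the destination separately from the archive member name, the member name is used purely to pick a cleaning strategy. A small hypothetical call, again with an in-memory stand-in for the stream:

    import io

    # 'tags' in the member name routes this to _clean_inner_tags_file, which drops
    # metadata lines such as <url>...</url> and keeps only the sentence text.
    stream = io.BytesIO(b'<url>http://example.org</url>\nGuten Morgen\n')
    out = _clean_files('/tmp/.data/cleaned/train.de-en.de', 'train.tags.de-en.de', stream)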

torchtext/datasets/iwslt2016.py

Lines changed: 45 additions & 14 deletions
@@ -164,7 +164,7 @@ def IWSLT2016(root='.data', split=('train', 'valid', 'test'), language_pair=('de
     Examples:
         >>> from torchtext.datasets import IWSLT2016
         >>> train_iter, valid_iter, test_iter = IWSLT2016()
-        >>> src_sentence, tgt_sentence = next(train_iter)
+        >>> src_sentence, tgt_sentence = next(iter(train_iter))
 
     """
     if not is_module_available("torchdata"):
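
The docstring fix reflects that the returned datapipes are iterables rather than iterators, so `next()` needs an explicit `iter()`. A minimal illustration of the distinction:

    # Any iterable (a list here, standing in for the datapipe) must be wrapped
    # with iter() before next() can be applied to it.
    train_iter = ['Guten Morgen\n']
    src = next(iter(train_iter))   # works
    # next(train_iter)             # TypeError: 'list' object is not an iterator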
@@ -204,6 +204,17 @@ def IWSLT2016(root='.data', split=('train', 'valid', 'test'), language_pair=('de
     src_eval, tgt_eval = valid_filenames
     src_test, tgt_test = test_filenames
 
+    uncleaned_train_filenames = ('train.tags.{}-{}.{}'.format(src_language, tgt_language, src_language),
+                                 'train.tags.{}-{}.{}'.format(src_language, tgt_language, tgt_language))
+    uncleaned_valid_filenames = ('IWSLT{}.TED.{}.{}-{}.{}.xml'.format(SUPPORTED_DATASETS['year'], valid_set, src_language, tgt_language, src_language),
+                                 'IWSLT{}.TED.{}.{}-{}.{}.xml'.format(SUPPORTED_DATASETS['year'], valid_set, src_language, tgt_language, tgt_language))
+    uncleaned_test_filenames = ('IWSLT{}.TED.{}.{}-{}.{}.xml'.format(SUPPORTED_DATASETS['year'], test_set, src_language, tgt_language, src_language),
+                                'IWSLT{}.TED.{}.{}-{}.{}.xml'.format(SUPPORTED_DATASETS['year'], test_set, src_language, tgt_language, tgt_language))
+
+    uncleaned_src_train, uncleaned_tgt_train = uncleaned_train_filenames
+    uncleaned_src_eval, uncleaned_tgt_eval = uncleaned_valid_filenames
+    uncleaned_src_test, uncleaned_tgt_test = uncleaned_test_filenames
+
     url_dp = IterableWrapper([URL])
     cache_compressed_dp = url_dp.on_disk_cache(
         filepath_fn=lambda x: os.path.join(root, _PATH),
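
For reference, the on_disk_cache/end_caching pair used throughout this function caches a download to disk and skips the network on later runs. A minimal sketch of the idiom under the torchdata API of this era; the URL and target path are hypothetical:

    from torchdata.datapipes.iter import HttpReader, IterableWrapper

    url_dp = IterableWrapper(['https://example.org/archive.tgz'])  # hypothetical URL
    cache_dp = url_dp.on_disk_cache(
        filepath_fn=lambda url: '/tmp/.data/archive.tgz',  # on-disk cache location
    )
    cache_dp = HttpReader(cache_dp)  # only hits the network on a cache miss
    cache_dp = cache_dp.end_caching(mode='wb', same_filepath_fn=True)  # persist bytes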
@@ -215,14 +226,13 @@ def IWSLT2016(root='.data', split=('train', 'valid', 'test'), language_pair=('de
 
     languages = "-".join([src_language, tgt_language])
 
-    inner_iwslt_tar = os.path.join(
-        root, os.path.splitext(_PATH)[0], "texts", src_language, tgt_language, languages
-    ) + ".tgz"
+    # We create the whole filepath here, but only check for the literal filename in the filter
+    # because we're lazily extracting from the outer tarfile. Thus,
+    # /root/2016-01/texts/.../src-tgt.tgz will never be in /root/2016-01.tgz/texts/.../src-tgt.tgz
+    inner_iwslt_tar = os.path.join(root, os.path.splitext(_PATH)[0], "texts", src_language, tgt_language, languages) + ".tgz"
 
-    cache_decompressed_dp = cache_compressed_dp.on_disk_cache(
-        filepath_fn=lambda x: inner_iwslt_tar
-    )
-    cache_decompressed_dp = FileOpener(cache_decompressed_dp, mode="b").read_from_tar().filter(lambda x: inner_iwslt_tar in x[0])
+    cache_decompressed_dp = cache_compressed_dp.on_disk_cache(filepath_fn=lambda x: inner_iwslt_tar)
+    cache_decompressed_dp = FileOpener(cache_decompressed_dp, mode="b").read_from_tar().filter(lambda x: os.path.basename(inner_iwslt_tar) in x[0])
     cache_decompressed_dp = cache_decompressed_dp.end_caching(mode="wb", same_filepath_fn=True)
 
     file_path_by_lang_and_split = {
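
The new comment explains the filter change: paths reported when reading from the tar are rooted inside the archive, so they can never contain the full on-disk target path, only its final component. A small hypothetical illustration of the mismatch:

    import os

    inner_iwslt_tar = '/root/2016-01/texts/de/en/de-en.tgz'    # desired on-disk path
    member = './2016-01/texts/de/en/de-en.tgz'                 # as reported from the tar

    inner_iwslt_tar in member                    # False: the full path never matches
    os.path.basename(inner_iwslt_tar) in member  # True: the literal filename does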
@@ -238,28 +248,49 @@ def IWSLT2016(root='.data', split=('train', 'valid', 'test'), language_pair=('de
         }
     }
 
+    uncleaned_filenames = {
+        src_language: {
+            "train": uncleaned_src_train,
+            "valid": uncleaned_src_eval,
+            "test": uncleaned_src_test,
+        },
+        tgt_language: {
+            "train": uncleaned_tgt_train,
+            "valid": uncleaned_tgt_eval,
+            "test": uncleaned_tgt_test,
+        }
+    }
+
     src_filename = file_path_by_lang_and_split[src_language][split]
+    uncleaned_src_filename = uncleaned_filenames[src_language][split]
+
+    # We create the whole filepath here, but only check for the literal filename in the filter
+    # because we're lazily extracting from the outer tarfile.
     full_src_filepath = os.path.join(root, "2016-01/texts/", src_language, tgt_language, languages, src_filename)
 
     cache_inner_src_decompressed_dp = cache_decompressed_dp.on_disk_cache(filepath_fn=lambda x: full_src_filepath)
     cache_inner_src_decompressed_dp = FileOpener(cache_inner_src_decompressed_dp, mode="b").read_from_tar()
-    cache_inner_src_decompressed_dp = cache_inner_src_decompressed_dp.map(lambda x: _clean_files(x[0], os.path.splitext(os.path.dirname(os.path.dirname(x[0])))[0], x[1]))
-    cache_inner_src_decompressed_dp = cache_inner_src_decompressed_dp.filter(lambda x: full_src_filepath in x)
+    cache_inner_src_decompressed_dp = cache_inner_src_decompressed_dp.filter(lambda x: os.path.basename(uncleaned_src_filename) in x[0])
+    cache_inner_src_decompressed_dp = cache_inner_src_decompressed_dp.map(lambda x: _clean_files(full_src_filepath, x[0], x[1]))
     cache_inner_src_decompressed_dp = FileOpener(cache_inner_src_decompressed_dp, mode="b")
     cache_inner_src_decompressed_dp = cache_inner_src_decompressed_dp.end_caching(mode="wb", same_filepath_fn=True)
 
     tgt_filename = file_path_by_lang_and_split[tgt_language][split]
+    uncleaned_tgt_filename = uncleaned_filenames[tgt_language][split]
+
+    # We create the whole filepath here, but only check for the literal filename in the filter
+    # because we're lazily extracting from the outer tarfile.
     full_tgt_filepath = os.path.join(root, "2016-01/texts/", src_language, tgt_language, languages, tgt_filename)
 
     cache_inner_tgt_decompressed_dp = cache_decompressed_dp.on_disk_cache(filepath_fn=lambda x: full_tgt_filepath)
     cache_inner_tgt_decompressed_dp = FileOpener(cache_inner_tgt_decompressed_dp, mode="b").read_from_tar()
-    cache_inner_tgt_decompressed_dp = cache_inner_tgt_decompressed_dp.map(lambda x: _clean_files(x[0], os.path.splitext(os.path.dirname(os.path.dirname(x[0])))[0], x[1]))
-    cache_inner_tgt_decompressed_dp = cache_inner_tgt_decompressed_dp.filter(lambda x: full_tgt_filepath in x)
+    cache_inner_tgt_decompressed_dp = cache_inner_tgt_decompressed_dp.filter(lambda x: os.path.basename(uncleaned_tgt_filename) in x[0])
+    cache_inner_tgt_decompressed_dp = cache_inner_tgt_decompressed_dp.map(lambda x: _clean_files(full_tgt_filepath, x[0], x[1]))
     cache_inner_tgt_decompressed_dp = FileOpener(cache_inner_tgt_decompressed_dp, mode="b")
     cache_inner_tgt_decompressed_dp = cache_inner_tgt_decompressed_dp.end_caching(mode="wb", same_filepath_fn=True)
 
-    tgt_data_dp = FileOpener(cache_inner_tgt_decompressed_dp, mode="r")
-    src_data_dp = FileOpener(cache_inner_src_decompressed_dp, mode="r")
+    tgt_data_dp = FileOpener(cache_inner_tgt_decompressed_dp, mode="b")
+    src_data_dp = FileOpener(cache_inner_src_decompressed_dp, mode="b")
 
     src_lines = src_data_dp.readlines(return_path=False, strip_newline=False, decode=True)
     tgt_lines = tgt_data_dp.readlines(return_path=False, strip_newline=False, decode=True)
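
The diff stops just before the two line-level datapipes are combined into the (src, tgt) pairs that the docstring example unpacks. Presumably consumption looks roughly like this sketch (the pairing step itself is outside this diff):

    train_iter, valid_iter, test_iter = IWSLT2016(language_pair=('de', 'en'))
    for src_sentence, tgt_sentence in train_iter:
        print(src_sentence.strip(), '->', tgt_sentence.strip())
        break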
