Skip to content
This repository was archived by the owner on Sep 10, 2025. It is now read-only.

Commit a2dc38e

Browse files
committed
Pull out the common absolute paths used for filtering src/tgt files.
1 parent 77839ce commit a2dc38e

File tree

1 file changed

+6
-8
lines changed

1 file changed

+6
-8
lines changed

torchtext/datasets/iwslt2016.py

Lines changed: 6 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -239,24 +239,22 @@ def IWSLT2016(root='.data', split=('train', 'valid', 'test'), language_pair=('de
239239
}
240240

241241
src_filename = file_path_by_lang_and_split[src_language][split]
242+
full_src_filepath = os.path.join(root, "2016-01/texts/", src_language, tgt_language, languages, src_filename)
242243

243-
cache_inner_src_decompressed_dp = cache_decompressed_dp.on_disk_cache(
244-
filepath_fn=lambda x: os.path.join(root, "2016-01/texts/", src_language, tgt_language, languages, src_filename)
245-
)
244+
cache_inner_src_decompressed_dp = cache_decompressed_dp.on_disk_cache(filepath_fn=lambda x: full_src_filepath)
246245
cache_inner_src_decompressed_dp = FileOpener(cache_inner_src_decompressed_dp, mode="b").read_from_tar()
247246
cache_inner_src_decompressed_dp = cache_inner_src_decompressed_dp.map(lambda x: _clean_files(x[0], os.path.splitext(os.path.dirname(os.path.dirname(x[0])))[0], x[1]))
248-
cache_inner_src_decompressed_dp = cache_inner_src_decompressed_dp.filter(lambda x: src_filename in x)
247+
cache_inner_src_decompressed_dp = cache_inner_src_decompressed_dp.filter(lambda x: full_src_filepath in x)
249248
cache_inner_src_decompressed_dp = FileOpener(cache_inner_src_decompressed_dp, mode="b")
250249
cache_inner_src_decompressed_dp = cache_inner_src_decompressed_dp.end_caching(mode="wb", same_filepath_fn=True)
251250

252251
tgt_filename = file_path_by_lang_and_split[tgt_language][split]
252+
full_tgt_filepath = os.path.join(root, "2016-01/texts/", src_language, tgt_language, languages, tgt_filename)
253253

254-
cache_inner_tgt_decompressed_dp = cache_decompressed_dp.on_disk_cache(
255-
filepath_fn=lambda x: os.path.join(root, "2016-01/texts/", src_language, tgt_language, languages, tgt_filename)
256-
)
254+
cache_inner_tgt_decompressed_dp = cache_decompressed_dp.on_disk_cache(filepath_fn=lambda x: full_tgt_filepath)
257255
cache_inner_tgt_decompressed_dp = FileOpener(cache_inner_tgt_decompressed_dp, mode="b").read_from_tar()
258256
cache_inner_tgt_decompressed_dp = cache_inner_tgt_decompressed_dp.map(lambda x: _clean_files(x[0], os.path.splitext(os.path.dirname(os.path.dirname(x[0])))[0], x[1]))
259-
cache_inner_tgt_decompressed_dp = cache_inner_tgt_decompressed_dp.filter(lambda x: tgt_filename in x)
257+
cache_inner_tgt_decompressed_dp = cache_inner_tgt_decompressed_dp.filter(lambda x: full_tgt_filepath in x)
260258
cache_inner_tgt_decompressed_dp = FileOpener(cache_inner_tgt_decompressed_dp, mode="b")
261259
cache_inner_tgt_decompressed_dp = cache_inner_tgt_decompressed_dp.end_caching(mode="wb", same_filepath_fn=True)
262260

0 commit comments

Comments (0)