@@ -392,6 +392,70 @@ def __str__(self):
392392 return self .description
393393
394394
395+ def _generate_iwslt_files_for_lang_and_split (year , src_language , tgt_language , valid_set , test_set ):
396+ train_filenames = (
397+ "train.{}-{}.{}" .format (src_language , tgt_language , src_language ),
398+ "train.{}-{}.{}" .format (src_language , tgt_language , tgt_language )
399+ )
400+ valid_filenames = (
401+ "IWSLT{}.TED.{}.{}-{}.{}" .format (year , valid_set , src_language , tgt_language , src_language ),
402+ "IWSLT{}.TED.{}.{}-{}.{}" .format (year , valid_set , src_language , tgt_language , tgt_language )
403+ )
404+ test_filenames = (
405+ "IWSLT{}.TED.{}.{}-{}.{}" .format (year , test_set , src_language , tgt_language , src_language ),
406+ "IWSLT{}.TED.{}.{}-{}.{}" .format (year , test_set , src_language , tgt_language , tgt_language )
407+ )
408+
409+ src_train , tgt_train = train_filenames
410+ src_eval , tgt_eval = valid_filenames
411+ src_test , tgt_test = test_filenames
412+
413+ uncleaned_train_filenames = (
414+ "train.tags.{}-{}.{}" .format (src_language , tgt_language , src_language ),
415+ "train.tags.{}-{}.{}" .format (src_language , tgt_language , tgt_language )
416+ )
417+ uncleaned_valid_filenames = (
418+ "IWSLT{}.TED.{}.{}-{}.{}.xml" .format (year , valid_set , src_language , tgt_language , src_language ),
419+ "IWSLT{}.TED.{}.{}-{}.{}.xml" .format (year , valid_set , src_language , tgt_language , tgt_language )
420+ )
421+ uncleaned_test_filenames = (
422+ "IWSLT{}.TED.{}.{}-{}.{}.xml" .format (year , test_set , src_language , tgt_language , src_language ),
423+ "IWSLT{}.TED.{}.{}-{}.{}.xml" .format (year , test_set , src_language , tgt_language , tgt_language )
424+ )
425+
426+ uncleaned_src_train , uncleaned_tgt_train = uncleaned_train_filenames
427+ uncleaned_src_eval , uncleaned_tgt_eval = uncleaned_valid_filenames
428+ uncleaned_src_test , uncleaned_tgt_test = uncleaned_test_filenames
429+
430+ file_path_by_lang_and_split = {
431+ src_language : {
432+ "train" : src_train ,
433+ "valid" : src_eval ,
434+ "test" : src_test ,
435+ },
436+ tgt_language : {
437+ "train" : tgt_train ,
438+ "valid" : tgt_eval ,
439+ "test" : tgt_test ,
440+ }
441+ }
442+
443+ uncleaned_filenames_by_lang_and_split = {
444+ src_language : {
445+ "train" : uncleaned_src_train ,
446+ "valid" : uncleaned_src_eval ,
447+ "test" : uncleaned_src_test ,
448+ },
449+ tgt_language : {
450+ "train" : uncleaned_tgt_train ,
451+ "valid" : uncleaned_tgt_eval ,
452+ "test" : uncleaned_tgt_test ,
453+ }
454+ }
455+
456+ return file_path_by_lang_and_split , uncleaned_filenames_by_lang_and_split
457+
458+
395459@functional_datapipe ("read_squad" )
396460class _ParseSQuADQAData (IterDataPipe ):
397461 r"""Iterable DataPipe to parse the contents of a stream of JSON objects
0 commit comments