Muennighoff · Muennighoff · Sep 18, 2022 · Sep 18, 2022 · Sep 18, 2022 · Sep 18, 2022
diff --git a/promptsource/machine_translate.py b/promptsource/machine_translate.py
@@ -3,95 +3,154 @@
 
 from promptsource.templates import Template, TemplateCollection
 
+DS_TO_ENG_PROMPT = {
+    "xcopa": "en",
+    "Muennighoff/xstory_cloze": "en",
+    "Muennighoff/xwinograd": "en",
+    'GEM/wiki_lingua': 'en_en', # Contains correct language names
+    'xnli': 'en',
+    "paws-x": "en",
+    "mlqa": "mlqa.en.en",
+    "xquad": "xquad.en",
+    "khalidalt/tydiqa-primary": "english",
+    "khalidalt/tydiqa-goldp": "english",
+    "pasinit/xlwic": "en",
+    "GEM/xlsum": "english",
+    "GEM/BiSECT": "en",
+}
+
+### ZH Datasets
+
+"""
+DATASETS = [
+    ('xquad', 'xquad.zh'),
+    # Context & Answer is in ZH
+    ('mlqa', 'mlqa.zh.ar'),
+    ('mlqa', 'mlqa.zh.vi'),
+    ('mlqa', 'mlqa.zh.es'),
+    ('mlqa', 'mlqa.zh.en'),
+    ('mlqa', 'mlqa.zh.hi'),
+    ('paws-x', 'zh'),
+    ('clue', 'c3'),
+    ('clue', 'cmrc2018'),
+    ('clue', 'csl'),
+    ('clue', 'drcd'),
+    ('clue', 'tnews'),
+    ('pasinit/xlwic', "xlwic_en_zh"),
+    ('GEM/xlsum', "chinese_simplified"),
+    # ('GEM/xlsum', "chinese_traditional"),
+    # For WikiLingua there are already ZH prompts (except for xp3long prompts)
+    ("xquad", )
+]
 
-### XNLI
+LANG = "zh"
+"""
 
-PROMPTS = [
-    "GPT-3 style",
-    "can we infer",
-    "justified in saying",
-    "guaranteed/possible/impossible",
-    "MNLI crowdsource",
-]
+### ES Datasets
 
-LANGS = [
-    "ar",
-    "es",
-    "fr",
-    "hi",
-    "sw",
-    "ur",
-    "vi",
-    "zh",
+"""
+DATASETS = [
+    ('xquad', 'xquad.es'),
+    # Context & Answer is in ES
+    ('mlqa', 'mlqa.es.es'),
+    ('paws-x', 'es'),
+    ('GEM/xlsum', "spanish"),
+    ('GEM/BiSECT', "es"),
+    #('GEM/wiki_lingua', 'es'),
 ]
+"""
 
-SOURCE_DATASET = TARGET_DATASET = "xnli"
-SOURCE_LANG = "en"
+LANG = "es"
 
 
-### XCOPA
+### FR Datasets
 
-PROMPTS = [
-    "best_option",
-    'C1 or C2? premise, so/because…',
-    "i_am_hesitating",
-    "cause_effect",
-    "plausible_alternatives",
+"""
+DATASETS = [
+    #('xquad', 'xquad.fr'),
+    # Context & Answer is in ZH
+    #('mlqa', 'mlqa.es.es'),
+    ('paws-x', 'fr'),
+    ('GEM/xlsum', "french"),
+    ('GEM/BiSECT', "fr"),
+    #('GEM/wiki_lingua', 'fr'),
+    ('pasinit/xlwic', "xlwic_fr_fr"),
 ]
+"""
 
-LANGS = [
-    "id",
-    "sw",
-    "ta",
-    "vi",
-    "zh",
-]
-
-SOURCE_DATASET = "super_glue/copa"
-SOURCE_LANG = None
-TARGET_DATASET = "xcopa"
+LANG = "fr"
 
-### XSTORY_CLOZE
+### VI Datasets
 
-PROMPTS = [
-    "Answer Given options",
-    'Choose Story Ending',
-    "Story Continuation and Options",
-    "Generate Ending",
-    "Novel Correct Ending",
+"""
+DATASETS = [
+    ('xquad', 'xquad.vi'),
+    # Context & Answer is in VI
+    ('mlqa', 'mlqa.vi.vi'),
+    #('paws-x', 'vi'),
+    ('GEM/xlsum', "vietnamese"),
+    #('GEM/wiki_lingua', 'vi'),
+LANGS = [
+    #"id",
+    #"sw",
+    #"ta",
+    #"vi",
+    #"zh",
+    "tr",
+    "et",
+    "ht",
+    "th",
+    "it",
+    "qu",
+]
+"""
+
+LANG = "vi"
+
+### AR Datasets
+"""
+DATASETS = [
+    ('xquad', 'xquad.ar'),
+    # Context & Answer is in AR
+    ('mlqa', 'mlqa.ar.ar'),
+    #('paws-x', 'vi'),
+    #('GEM/BiSECT', "fr"),
+    #('GEM/wiki_lingua', 'es'),
+    ('khalidalt/tydiqa-primary', 'arabic'),
+    ('khalidalt/tydiqa-goldp', 'arabic'),
 ]
-
 LANGS = [
     "ar",
     "es",
     "eu",
     "hi",
     "id",
     "zh",
+    "my",
+    "ru",
+    "sw",
+    "te",
 ]
 
 SOURCE_DATASET = TARGET_DATASET = "Muennighoff/xstory_cloze"
 SOURCE_LANG = "en"
 
 ### XWINOGRAD
-
 PROMPTS = [
     "Replace",
     "stand for",
     "True or False",
     "does underscore refer to",
     "underscore refer to",
 ]
+LANG = "ar"
+"""
 
-LANGS = [
-    "fr",
-    "pt",
-    "zh",
-]
 
-SOURCE_DATASET = TARGET_DATASET = "Muennighoff/xwinograd"
-SOURCE_LANG = "en"
+# Choose datasets & lang, e.g.
 
+DATASETS = [('GEM/xlsum', "chinese_traditional"),]
+LANG = "zh-Hant"
 
 
 # Path to key
@@ -147,22 +206,28 @@ def normalize_string(zh_string, en_string):
 
 
 template_collection = TemplateCollection()
-source_templates = template_collection.get_dataset(SOURCE_DATASET, SOURCE_LANG)
 
-for lang in LANGS:
-    target_templates = template_collection.get_dataset(TARGET_DATASET, lang)
+for (ds_name, subset_name) in DATASETS:
+
+    subset_name_eng = subset_name
+    if ds_name in DS_TO_ENG_PROMPT:
+        subset_name_eng = DS_TO_ENG_PROMPT[ds_name]
+
+    source_templates = template_collection.get_dataset(ds_name, subset_name_eng)
+    #for lang in LANGS:
+    target_templates = template_collection.get_dataset(ds_name, subset_name)
     for uid, template in source_templates.templates.items():
-        if template.name.strip() not in PROMPTS:
-            continue
-        print(f"Translating {template.name.strip()} to {lang}")
+        #if not("xp3long" in template.name.strip()):# not in PROMPTS:
+        #    continue
+        print(f"Translating {template.name.strip()} to {LANG}")
         answer_choices = []
         if template.answer_choices is not None:
             choices = template.answer_choices.split("|||")
             for c in choices:
-                answer_choices.append(normalize_string(translate(lang, c.strip()), c.strip()))
+                answer_choices.append(normalize_string(translate(LANG, c.strip()), c.strip()))
         or_jinja = template.jinja.strip()
-        jinja = normalize_string(translate(lang, or_jinja), or_jinja)
-        template_name = template.name.strip() + f"_{lang}mt"
+        jinja = normalize_string(translate(LANG, or_jinja), or_jinja)
+        template_name = template.name.strip() + f"_{LANG}mt"
         target_template = Template(
             template_name, jinja=jinja, reference="", answer_choices=" ||| ".join(answer_choices)
         )

diff --git a/promptsource/templates.py b/promptsource/templates.py
@@ -841,6 +841,7 @@ def read_from_file(self) -> Dict:
                 "Please ignore this warning if you are creating new prompts for this dataset."
             )
             return {}
+        print(self.dataset_name, self.yaml_path)
         yaml_dict = yaml.load(open(self.yaml_path, "r"), Loader=yaml.FullLoader)
         return yaml_dict[self.TEMPLATES_KEY]
 

diff --git a/...thon-state-changes/default/templates.yaml → ...raser/python-state-changes/templates.yaml b/...thon-state-changes/default/templates.yaml → ...raser/python-state-changes/templates.yaml
diff --git a/promptsource/templates/GEM/BiSECT/es/templates.yaml b/promptsource/templates/GEM/BiSECT/es/templates.yaml
@@ -0,0 +1,41 @@
+dataset: GEM/BiSECT
+subset: es
+templates:
+  5c715c55-e860-4cd3-b6cc-f2466a79690a: !Template
+    answer_choices: ''
+    id: 5c715c55-e860-4cd3-b6cc-f2466a79690a
+    jinja: '{{source}}. Esta frase es difícil de entender. Una versión más simple
+      con significado equivalente es la siguiente: ||| {{target}}'
+    metadata: !TemplateMetadata
+      choices_in_prompt: null
+      languages: null
+      metrics: null
+      original_task: null
+    name: equimeaning_esmt
+    reference: ''
+  97498532-926b-4adf-b08b-f8f2d57028b6: !Template
+    answer_choices: ''
+    id: 97498532-926b-4adf-b08b-f8f2d57028b6
+    jinja: "Divida y simplifique la siguiente oración conservando su significado completo:\
+      \ {{source}}\n Versión simplificada: ||| {{target}}"
+    metadata: !TemplateMetadata
+      choices_in_prompt: null
+      languages: null
+      metrics: null
+      original_task: null
+    name: fullmeaning_esmt
+    reference: ''
+  d2f29ff0-4bfd-458d-91f3-92a5c1b8c1cd: !Template
+    answer_choices: ''
+    id: d2f29ff0-4bfd-458d-91f3-92a5c1b8c1cd
+    jinja: '{{source}}
+
+      La frase anterior es muy complicada. Por favor, proporcione una versión sinónima
+      simplificada que consta de varias oraciones: ||| {{target}}'
+    metadata: !TemplateMetadata
+      choices_in_prompt: null
+      languages: null
+      metrics: null
+      original_task: null
+    name: synonymous_esmt
+    reference: ''
diff --git a/promptsource/templates/GEM/BiSECT/fr/templates.yaml b/promptsource/templates/GEM/BiSECT/fr/templates.yaml
@@ -0,0 +1,41 @@
+dataset: GEM/BiSECT
+subset: fr
+templates:
+  72ade3f4-3abe-4d5d-8a0e-9e7b0c8024cd: !Template
+    answer_choices: ''
+    id: 72ade3f4-3abe-4d5d-8a0e-9e7b0c8024cd
+    jinja: "Divisez et simplifiez la phrase suivante tout en conservant son sens complet :\
+      \ {{source}}\n Version simplifiée : ||| {{target}}"
+    metadata: !TemplateMetadata
+      choices_in_prompt: null
+      languages: null
+      metrics: null
+      original_task: null
+    name: fullmeaning_frmt
+    reference: ''
+  a244023f-dcec-4352-8026-1404f048eda5: !Template
+    answer_choices: ''
+    id: a244023f-dcec-4352-8026-1404f048eda5
+    jinja: '{{source}}
+
+      La phrase ci-dessus est très compliquée. Veuillez me fournir une version synonyme
+      simplifiée composée de plusieurs phrases : ||| {{target}}'
+    metadata: !TemplateMetadata
+      choices_in_prompt: null
+      languages: null
+      metrics: null
+      original_task: null
+    name: synonymous_frmt
+    reference: ''
+  eea19d70-be10-4cfa-abcf-4dc56753a0aa: !Template
+    answer_choices: ''
+    id: eea19d70-be10-4cfa-abcf-4dc56753a0aa
+    jinja: '{{source}}. Cette phrase est difficile à comprendre. Une version plus
+      simple avec une signification équivalente est la suivante : ||| {{target}}'
+    metadata: !TemplateMetadata
+      choices_in_prompt: null
+      languages: null
+      metrics: null
+      original_task: null
+    name: equimeaning_frmt
+    reference: ''
diff --git a/promptsource/templates/GEM/wiki_lingua/zh/templates.yaml b/promptsource/templates/GEM/wiki_lingua/zh/templates.yaml
@@ -71,3 +71,30 @@ templates:
       original_task: true
     name: summarize_above_zh
     reference: xsum templates
+  dfa7b514-7385-4855-bb90-253073a34fde: !Template
+    answer_choices: null
+    id: dfa7b514-7385-4855-bb90-253073a34fde
+    jinja: "{{target}}\n\n鉴于上述总结，为它写一个详细的文本。||| {{source}}"
+    metadata: !TemplateMetadata
+      choices_in_prompt: false
+      languages: []
+      metrics:
+      - ROUGE
+      - BLEU
+      original_task: true
+    name: xp3longwritearticle_zhmt
+    reference: ''
+  dff8b414-7485-4855-bb90-253073a34fde: !Template
+    answer_choices: null
+    id: dff8b414-7485-4855-bb90-253073a34fde
+    jinja: "{{target}}\n\n我对此很感兴趣，但我只有几分钟的时间。
+      你能不能给我最多前500个字符的详细解释关于那个？ ||| {{source[:500]}}
+    metadata: !TemplateMetadata
+      choices_in_prompt: false
+      languages: []
+      metrics:
+      - ROUGE
+      - BLEU
+      original_task: true
+    name: xp3longchars_zhmt
+    reference: ''