Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
189 changes: 127 additions & 62 deletions promptsource/machine_translate.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,95 +3,154 @@

from promptsource.templates import Template, TemplateCollection

DS_TO_ENG_PROMPT = {
"xcopa": "en",
"Muennighoff/xstory_cloze": "en",
"Muennighoff/xwinograd": "en",
'GEM/wiki_lingua': 'en_en', # Contains correct language names
'xnli': 'en',
"paws-x": "en",
"mlqa": "mlqa.en.en",
"xquad": "xquad.en",
"khalidalt/tydiqa-primary": "english",
"khalidalt/tydiqa-goldp": "english",
"pasinit/xlwic": "en",
"GEM/xlsum": "english",
"GEM/BiSECT": "en",
}

### ZH Datasets

"""
DATASETS = [
('xquad', 'xquad.zh'),
# Context & Answer is in ZH
('mlqa', 'mlqa.zh.ar'),
('mlqa', 'mlqa.zh.vi'),
('mlqa', 'mlqa.zh.es'),
('mlqa', 'mlqa.zh.en'),
('mlqa', 'mlqa.zh.hi'),
('paws-x', 'zh'),
('clue', 'c3'),
('clue', 'cmrc2018'),
('clue', 'csl'),
('clue', 'drcd'),
('clue', 'tnews'),
('pasinit/xlwic', "xlwic_en_zh"),
('GEM/xlsum', "chinese_simplified"),
# ('GEM/xlsum', "chinese_traditional"),
# For WikiLingua there are already ZH prompts (except for xp3long prompts)
("xquad", )
]

### XNLI
LANG = "zh"
"""

PROMPTS = [
"GPT-3 style",
"can we infer",
"justified in saying",
"guaranteed/possible/impossible",
"MNLI crowdsource",
]
### ES Datasets

LANGS = [
"ar",
"es",
"fr",
"hi",
"sw",
"ur",
"vi",
"zh",
"""
DATASETS = [
('xquad', 'xquad.es'),
# Context & Answer is in ES
('mlqa', 'mlqa.es.es'),
('paws-x', 'es'),
('GEM/xlsum', "spanish"),
('GEM/BiSECT', "es"),
#('GEM/wiki_lingua', 'es'),
]
"""

SOURCE_DATASET = TARGET_DATASET = "xnli"
SOURCE_LANG = "en"
LANG = "es"


### XCOPA
### FR Datasets

PROMPTS = [
"best_option",
'C1 or C2? premise, so/because…',
"i_am_hesitating",
"cause_effect",
"plausible_alternatives",
"""
DATASETS = [
#('xquad', 'xquad.fr'),
# Context & Answer is in ZH
#('mlqa', 'mlqa.es.es'),
('paws-x', 'fr'),
('GEM/xlsum', "french"),
('GEM/BiSECT', "fr"),
#('GEM/wiki_lingua', 'fr'),
('pasinit/xlwic', "xlwic_fr_fr"),
]
"""

LANGS = [
"id",
"sw",
"ta",
"vi",
"zh",
]

SOURCE_DATASET = "super_glue/copa"
SOURCE_LANG = None
TARGET_DATASET = "xcopa"
LANG = "fr"

### XSTORY_CLOZE
### VI Datasets

PROMPTS = [
"Answer Given options",
'Choose Story Ending',
"Story Continuation and Options",
"Generate Ending",
"Novel Correct Ending",
"""
DATASETS = [
('xquad', 'xquad.vi'),
# Context & Answer is in VI
('mlqa', 'mlqa.vi.vi'),
#('paws-x', 'vi'),
('GEM/xlsum', "vietnamese"),
#('GEM/wiki_lingua', 'vi'),
LANGS = [
#"id",
#"sw",
#"ta",
#"vi",
#"zh",
"tr",
"et",
"ht",
"th",
"it",
"qu",
]
"""

LANG = "vi"

### AR Datasets
"""
DATASETS = [
('xquad', 'xquad.ar'),
# Context & Answer is in AR
('mlqa', 'mlqa.ar.ar'),
#('paws-x', 'vi'),
#('GEM/BiSECT', "fr"),
#('GEM/wiki_lingua', 'es'),
('khalidalt/tydiqa-primary', 'arabic'),
('khalidalt/tydiqa-goldp', 'arabic'),
]

LANGS = [
"ar",
"es",
"eu",
"hi",
"id",
"zh",
"my",
"ru",
"sw",
"te",
]

SOURCE_DATASET = TARGET_DATASET = "Muennighoff/xstory_cloze"
SOURCE_LANG = "en"

### XWINOGRAD

PROMPTS = [
"Replace",
"stand for",
"True or False",
"does underscore refer to",
"underscore refer to",
]
LANG = "ar"
"""

LANGS = [
"fr",
"pt",
"zh",
]

SOURCE_DATASET = TARGET_DATASET = "Muennighoff/xwinograd"
SOURCE_LANG = "en"
# Choose datasets & lang, e.g.

DATASETS = [('GEM/xlsum', "chinese_traditional"),]
LANG = "zh-Hant"


# Path to key
Expand Down Expand Up @@ -147,22 +206,28 @@ def normalize_string(zh_string, en_string):


template_collection = TemplateCollection()
source_templates = template_collection.get_dataset(SOURCE_DATASET, SOURCE_LANG)

for lang in LANGS:
target_templates = template_collection.get_dataset(TARGET_DATASET, lang)
for (ds_name, subset_name) in DATASETS:

subset_name_eng = subset_name
if ds_name in DS_TO_ENG_PROMPT:
subset_name_eng = DS_TO_ENG_PROMPT[ds_name]

source_templates = template_collection.get_dataset(ds_name, subset_name_eng)
#for lang in LANGS:
target_templates = template_collection.get_dataset(ds_name, subset_name)
for uid, template in source_templates.templates.items():
if template.name.strip() not in PROMPTS:
continue
print(f"Translating {template.name.strip()} to {lang}")
#if not("xp3long" in template.name.strip()):# not in PROMPTS:
# continue
print(f"Translating {template.name.strip()} to {LANG}")
answer_choices = []
if template.answer_choices is not None:
choices = template.answer_choices.split("|||")
for c in choices:
answer_choices.append(normalize_string(translate(lang, c.strip()), c.strip()))
answer_choices.append(normalize_string(translate(LANG, c.strip()), c.strip()))
or_jinja = template.jinja.strip()
jinja = normalize_string(translate(lang, or_jinja), or_jinja)
template_name = template.name.strip() + f"_{lang}mt"
jinja = normalize_string(translate(LANG, or_jinja), or_jinja)
template_name = template.name.strip() + f"_{LANG}mt"
target_template = Template(
template_name, jinja=jinja, reference="", answer_choices=" ||| ".join(answer_choices)
)
Expand Down
1 change: 1 addition & 0 deletions promptsource/templates.py
Original file line number Diff line number Diff line change
Expand Up @@ -841,6 +841,7 @@ def read_from_file(self) -> Dict:
"Please ignore this warning if you are creating new prompts for this dataset."
)
return {}
print(self.dataset_name, self.yaml_path)
yaml_dict = yaml.load(open(self.yaml_path, "r"), Loader=yaml.FullLoader)
return yaml_dict[self.TEMPLATES_KEY]

Expand Down
41 changes: 41 additions & 0 deletions promptsource/templates/GEM/BiSECT/es/templates.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
dataset: GEM/BiSECT
subset: es
templates:
5c715c55-e860-4cd3-b6cc-f2466a79690a: !Template
answer_choices: ''
id: 5c715c55-e860-4cd3-b6cc-f2466a79690a
jinja: '{{source}}. Esta frase es difícil de entender. Una versión más simple
con significado equivalente es la siguiente: ||| {{target}}'
metadata: !TemplateMetadata
choices_in_prompt: null
languages: null
metrics: null
original_task: null
name: equimeaning_esmt
reference: ''
97498532-926b-4adf-b08b-f8f2d57028b6: !Template
answer_choices: ''
id: 97498532-926b-4adf-b08b-f8f2d57028b6
jinja: "Divida y simplifique la siguiente oración conservando su significado completo:\
\ {{source}}\n Versión simplificada: ||| {{target}}"
metadata: !TemplateMetadata
choices_in_prompt: null
languages: null
metrics: null
original_task: null
name: fullmeaning_esmt
reference: ''
d2f29ff0-4bfd-458d-91f3-92a5c1b8c1cd: !Template
answer_choices: ''
id: d2f29ff0-4bfd-458d-91f3-92a5c1b8c1cd
jinja: '{{source}}

La frase anterior es muy complicada. Por favor, proporcione una versión sinónima
simplificada que consta de varias oraciones: ||| {{target}}'
metadata: !TemplateMetadata
choices_in_prompt: null
languages: null
metrics: null
original_task: null
name: synonymous_esmt
reference: ''
41 changes: 41 additions & 0 deletions promptsource/templates/GEM/BiSECT/fr/templates.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
dataset: GEM/BiSECT
subset: fr
templates:
72ade3f4-3abe-4d5d-8a0e-9e7b0c8024cd: !Template
answer_choices: ''
id: 72ade3f4-3abe-4d5d-8a0e-9e7b0c8024cd
jinja: "Divisez et simplifiez la phrase suivante tout en conservant son sens complet :\
\ {{source}}\n Version simplifiée : ||| {{target}}"
metadata: !TemplateMetadata
choices_in_prompt: null
languages: null
metrics: null
original_task: null
name: fullmeaning_frmt
reference: ''
a244023f-dcec-4352-8026-1404f048eda5: !Template
answer_choices: ''
id: a244023f-dcec-4352-8026-1404f048eda5
jinja: '{{source}}

La phrase ci-dessus est très compliquée. Veuillez me fournir une version synonyme
simplifiée composée de plusieurs phrases : ||| {{target}}'
metadata: !TemplateMetadata
choices_in_prompt: null
languages: null
metrics: null
original_task: null
name: synonymous_frmt
reference: ''
eea19d70-be10-4cfa-abcf-4dc56753a0aa: !Template
answer_choices: ''
id: eea19d70-be10-4cfa-abcf-4dc56753a0aa
jinja: '{{source}}. Cette phrase est difficile à comprendre. Une version plus
simple avec une signification équivalente est la suivante : ||| {{target}}'
metadata: !TemplateMetadata
choices_in_prompt: null
languages: null
metrics: null
original_task: null
name: equimeaning_frmt
reference: ''
27 changes: 27 additions & 0 deletions promptsource/templates/GEM/wiki_lingua/zh/templates.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -71,3 +71,30 @@ templates:
original_task: true
name: summarize_above_zh
reference: xsum templates
dfa7b514-7385-4855-bb90-253073a34fde: !Template
answer_choices: null
id: dfa7b514-7385-4855-bb90-253073a34fde
jinja: "{{target}}\n\n鉴于上述总结,为它写一个详细的文本。||| {{source}}"
metadata: !TemplateMetadata
choices_in_prompt: false
languages: []
metrics:
- ROUGE
- BLEU
original_task: true
name: xp3longwritearticle_zhmt
reference: ''
dff8b414-7485-4855-bb90-253073a34fde: !Template
answer_choices: null
id: dff8b414-7485-4855-bb90-253073a34fde
jinja: "{{target}}\n\n我对此很感兴趣,但我只有几分钟的时间。
你能不能给我最多前500个字符的详细解释关于那个? ||| {{source[:500]}}"
metadata: !TemplateMetadata
choices_in_prompt: false
languages: []
metrics:
- ROUGE
- BLEU
original_task: true
name: xp3longchars_zhmt
reference: ''
Loading